In [1]:
import warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

import gc
import os
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold

# Silence every warning category for the whole session. Note that this also
# hides pandas deprecation/future warnings raised by the cells below.
warnings.simplefilter('ignore')
# Register tqdm's pandas integration (adds `progress_apply` & co. to pandas objects).
tqdm.pandas()
%matplotlib inline

# Show every column/row and up to 200 characters per cell when displaying frames.
# The fully qualified `display.*` keys are used on purpose: option names are
# matched as patterns, and the short forms 'max_columns' / 'max_rows' are
# ambiguous on recent pandas (they also match `styler.render.*`), which raises
# an OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# Random seed; passed as `random_state` to the StratifiedKFold split further down.
seed = 2020

In [3]:
# !pip install lightgbm

In [4]:
# Load the training set and the test set (test rows are later identified by a
# missing `isDefault` value).
df_train = pd.read_csv('raw_data/train.csv')
df_test = pd.read_csv('raw_data/testA.csv')

# Stack train and test into one frame so the feature engineering below is
# applied to both at once. `DataFrame.append` was deprecated in pandas 1.4 and
# removed in 2.0; `pd.concat(..., ignore_index=True)` is the supported
# equivalent and also produces the fresh 0..n-1 index that the former
# `reset_index(drop=True)` call created.
df_feature = pd.concat([df_train, df_test], ignore_index=True)

In [5]:
df_feature.head()

Unnamed: 0,annualIncome,applicationType,delinquency_2years,dti,earliesCreditLine,employmentLength,employmentTitle,ficoRangeHigh,ficoRangeLow,grade,homeOwnership,id,initialListStatus,installment,interestRate,isDefault,issueDate,loanAmnt,n0,n1,n10,n11,n12,n13,n14,n2,n3,n4,n5,n6,n7,n8,n9,openAcc,policyCode,postCode,pubRec,pubRecBankruptcies,purpose,regionCode,revolBal,revolUtil,subGrade,term,title,totalAcc,verificationStatus
0,110000.0,0,0.0,17.05,Aug-2001,2 years,320.0,734.0,730.0,E,2,0,0,917.97,19.52,1.0,2014-07-01,35000.0,0.0,2.0,7.0,0.0,0.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,1.0,137.0,0.0,0.0,1,32,24178.0,48.9,E2,5,1.0,27.0,2
1,46000.0,0,0.0,27.83,May-2002,5 years,219843.0,704.0,700.0,D,0,1,1,461.9,18.49,0.0,2012-08-01,18000.0,,,13.0,,,,,,,10.0,,,,,,13.0,1.0,156.0,0.0,0.0,0,18,15096.0,38.9,D2,5,1723.0,18.0,2
2,74000.0,0,0.0,22.77,May-2006,8 years,31698.0,679.0,675.0,D,0,2,0,298.17,16.99,0.0,2015-10-01,12000.0,0.0,0.0,11.0,0.0,0.0,0.0,4.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,1.0,337.0,0.0,0.0,0,14,4606.0,51.8,D3,5,0.0,27.0,2
3,118000.0,0,0.0,17.21,May-1999,10+ years,46854.0,689.0,685.0,A,1,3,1,340.96,7.26,0.0,2015-08-01,11000.0,6.0,4.0,9.0,0.0,0.0,0.0,1.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,1.0,148.0,0.0,0.0,4,11,9948.0,52.6,A4,3,4.0,28.0,1
4,29000.0,0,0.0,32.16,Aug-1977,,54.0,694.0,690.0,C,1,4,0,101.07,12.99,0.0,2016-03-01,3000.0,1.0,2.0,12.0,0.0,0.0,0.0,4.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,1.0,301.0,0.0,0.0,10,21,2942.0,32.0,C2,3,11.0,27.0,2


In [6]:
df_feature.isDefault.value_counts()

0.0    640390
1.0    159610
Name: isDefault, dtype: int64

# Feature engineering (特征工程)

In [7]:
# Remove the `policyCode` column in place.
# NOTE(review): every sample row displayed above shows policyCode == 1.0, so the
# column is presumably constant — confirm before relying on that.
df_feature.drop(columns=['policyCode'], inplace=True)

In [8]:
# Ordinal encoding of the loan grade letters: 'A' -> 1, 'B' -> 2, ..., 'G' -> 7.
grade_map = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
# Replace the grade letters by their ordinal rank (letters missing from the map become NaN).
df_feature['grade'] = df_feature['grade'].map(grade_map)


def subGrade_map(x):
    """Encode a two-character sub-grade such as ``'D2'`` as an ordinal integer.

    The first character is looked up in the module-level ``grade_map`` and the
    second character is the step within that grade, giving
    ``grade_map[letter] * 5 + step - 1`` (e.g. ``'A1'`` -> 5, ``'D2'`` -> 21).

    Parameters
    ----------
    x : str or missing value
        Sub-grade label made of exactly one grade letter and one digit.

    Returns
    -------
    int, or ``x`` unchanged when ``x`` is missing (None/NaN).

    Raises
    ------
    ValueError
        If ``x`` is a string that does not consist of exactly two characters,
        or whose second character is not a digit.
    KeyError
        If the grade letter is not a key of ``grade_map``.
    """
    # Pass missing values through instead of crashing on `list(NaN)`; this
    # mirrors how `employmentLength_to_int` treats missing input.
    if pd.isnull(x):
        return x
    grade, num = x
    return grade_map[grade] * 5 + int(num) - 1


# Replace each sub-grade label by the ordinal code computed by `subGrade_map`.
df_feature['subGrade'] = df_feature['subGrade'].map(subGrade_map)

In [9]:
def employmentLength_to_int(s):
    """Convert a label like ``'5 years'`` to ``np.int8(5)``.

    Only the first whitespace-separated token of ``s`` is parsed. Missing
    values (None/NaN) are returned unchanged.
    """
    if not pd.isnull(s):
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s


# Normalise the two non-numeric labels to the '<n> years' form and convert
# every label to its integer year count (missing values stay missing).
#
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df_feature['employmentLength']`: an in-place
# method on a column selection is chained assignment, which pandas warns about
# and which no longer updates the parent frame under Copy-on-Write.
df_feature['employmentLength'] = (
    df_feature['employmentLength']
    .replace({'10+ years': '10 years', '< 1 year': '0 years'})
    .apply(employmentLength_to_int)
)

In [10]:
# Year in which the loan was issued, parsed from the 'YYYY-MM-DD' issue date.
df_feature['issueDate_year'] = pd.to_datetime(
    df_feature['issueDate'], format='%Y-%m-%d').dt.year

# Year of the earliest credit line: the part after the dash in values such as 'Aug-2001'.
df_feature['earliesCreditLine_year'] = (
    df_feature['earliesCreditLine']
    .str.split('-', expand=True)[1]
    .astype('int')
)

# Difference in years between loan issue and the earliest credit line.
df_feature['issueDate_year_earliesCreditLine_year_minus'] = (
    df_feature['issueDate_year'] - df_feature['earliesCreditLine_year']
)

In [11]:
# Loan amount divided by the term and then by the annual income.
# NOTE(review): `term` looks like a number of years (3 / 5 in the sample rows) — confirm.
# NOTE(review): a zero `annualIncome` would produce inf/NaN here — confirm whether that occurs.
df_feature['debt_ratio_year'] = (
    df_feature['loanAmnt']
    .div(df_feature['term'])
    .div(df_feature['annualIncome'])
)

In [12]:
# Columns treated as categorical; they drive the count and default-ratio
# features built below.
cate_features = [
    'applicationType',
    'employmentLength',
    'employmentTitle',
    'grade',
    'homeOwnership',
    'initialListStatus',
    'postCode',
    'purpose',
    'regionCode',
    'subGrade',
    'title',
    'verificationStatus',
]

# Columns treated as numeric. 'employmentLength' and 'subGrade' appear in both lists.
# NOTE(review): this list is not referenced anywhere else in the visible part
# of the notebook — confirm whether it is still needed.
dense_features = [
    'annualIncome',
    'delinquency_2years',
    'dti',
    'employmentLength',
    'ficoRangeHigh',
    'ficoRangeLow',
    'installment',
    'interestRate',
    'loanAmnt',
    'openAcc',
    'pubRec',
    'pubRecBankruptcies',
    'revolBal',
    'revolUtil',
    'subGrade',
    'term',
    'totalAcc',
]

In [13]:
# Frequency encoding over train + test: for each categorical column, the number
# of rows sharing the same value ('<col>_cnt').
for f in tqdm(cate_features):
    df_feature['{}_cnt'.format(f)] = df_feature.groupby([f])[
        f].transform('count')

# Pairwise frequency encoding: number of rows sharing the same (f1, f2) value
# pair, stored as '<f1>_<f2>_cnt' for both orderings of every pair.
for f1 in tqdm(cate_features):
    for f2 in cate_features:
        if f1 == f2:
            continue
        col = '{}_{}_cnt'.format(f1, f2)
        mirrored = '{}_{}_cnt'.format(f2, f1)
        if mirrored in df_feature.columns:
            # The (f2, f1) grouping yields exactly the same per-row counts, so
            # reuse it instead of running the expensive groupby a second time.
            df_feature[col] = df_feature[mirrored]
        else:
            # Count a grouping key itself (never null inside its own group) so
            # the result is the group size. The previous code selected `f`, a
            # stale variable left over from the loop above, which silently
            # counted non-null values of an unrelated column.
            df_feature[col] = df_feature.groupby([f1, f2])[
                f1].transform('count')

100%|█████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  7.58it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 12/12 [00:41<00:00,  3.46s/it]


In [14]:
# Default rate ("fraud rate"): grouping keys for the target-encoding features.
# First every single categorical column, then every ordered pair of two
# different categorical columns.
gps = [[f] for f in cate_features]
gps += [[f1, f2] for f1 in cate_features for f2 in cate_features if f1 != f2]
                        
def statis_feat(df_know, df_unknow):
    """Attach per-group mean ``isDefault`` columns to ``df_unknow``.

    For every key list in the module-level ``gps``, the mean of ``isDefault``
    is computed per group on ``df_know`` and left-merged onto ``df_unknow`` as
    a column named ``'<keys joined by _>_default_ratio'``. Rows of
    ``df_unknow`` whose key combination does not occur in ``df_know`` get NaN.

    Parameters
    ----------
    df_know : DataFrame
        Rows whose ``isDefault`` values are used to compute the group means.
    df_unknow : DataFrame
        Rows that receive the computed ratios.

    Returns
    -------
    DataFrame
        The result of the successive left merges.
    """
    for keys in tqdm(gps):
        ratio_col = '{}_default_ratio'.format('_'.join(keys))
        ratio = (
            df_know.groupby(keys)['isDefault']
            .mean()
            .rename(ratio_col)
            .reset_index()
        )
        df_unknow = df_unknow.merge(ratio, on=keys, how='left')

    return df_unknow


# Split the combined frame back into labelled (train) and unlabelled (test) rows.
df_train = df_feature[df_feature['isDefault'].notnull()].reset_index(drop=True)
df_test = df_feature[df_feature['isDefault'].isnull()]

# Out-of-fold encoding for the training rows: the ratios attached to each
# validation fold are computed from the other four folds only, so a row's own
# label never contributes to its features.
kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
oof_parts = []
for train_index, val_index in kf.split(df_train, df_train['isDefault']):
    oof_parts.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index]))
    gc.collect()
df_stas_feat = pd.concat(oof_parts, axis=0)
del oof_parts

# Test rows are encoded with ratios computed from the full training set.
df_test = statis_feat(df_train, df_test)
df_feature = pd.concat([df_stas_feat, df_test], axis=0).reset_index(drop=True)

# Release the intermediate frames.
del df_stas_feat, df_train, df_test
gc.collect()

100%|███████████████████████████████████████████████████████████████████████████████████| 144/144 [00:30<00:00,  4.68it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 144/144 [00:30<00:00,  4.74it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 144/144 [00:30<00:00,  4.73it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 144/144 [00:29<00:00,  4.82it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 144/144 [00:29<00:00,  4.84it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 144/144 [00:35<00:00,  4.04it/s]


23

In [15]:
df_feature.head()

Unnamed: 0,annualIncome,applicationType,delinquency_2years,dti,earliesCreditLine,employmentLength,employmentTitle,ficoRangeHigh,ficoRangeLow,grade,homeOwnership,id,initialListStatus,installment,interestRate,isDefault,issueDate,loanAmnt,n0,n1,n10,n11,n12,n13,n14,n2,n3,n4,n5,n6,n7,n8,n9,openAcc,postCode,pubRec,pubRecBankruptcies,purpose,regionCode,revolBal,revolUtil,subGrade,term,title,totalAcc,verificationStatus,issueDate_year,earliesCreditLine_year,issueDate_year_earliesCreditLine_year_minus,debt_ratio_year,applicationType_cnt,employmentLength_cnt,employmentTitle_cnt,grade_cnt,homeOwnership_cnt,initialListStatus_cnt,postCode_cnt,purpose_cnt,regionCode_cnt,subGrade_cnt,title_cnt,verificationStatus_cnt,applicationType_employmentLength_cnt,applicationType_employmentTitle_cnt,applicationType_grade_cnt,applicationType_homeOwnership_cnt,applicationType_initialListStatus_cnt,applicationType_postCode_cnt,applicationType_purpose_cnt,applicationType_regionCode_cnt,applicationType_subGrade_cnt,applicationType_title_cnt,applicationType_verificationStatus_cnt,employmentLength_applicationType_cnt,employmentLength_employmentTitle_cnt,employmentLength_grade_cnt,employmentLength_homeOwnership_cnt,employmentLength_initialListStatus_cnt,employmentLength_postCode_cnt,employmentLength_purpose_cnt,employmentLength_regionCode_cnt,employmentLength_subGrade_cnt,employmentLength_title_cnt,employmentLength_verificationStatus_cnt,employmentTitle_applicationType_cnt,employmentTitle_employmentLength_cnt,employmentTitle_grade_cnt,employmentTitle_homeOwnership_cnt,employmentTitle_initialListStatus_cnt,employmentTitle_postCode_cnt,employmentTitle_purpose_cnt,employmentTitle_regionCode_cnt,employmentTitle_subGrade_cnt,employmentTitle_title_cnt,employmentTitle_verificationStatus_cnt,grade_applicationType_cnt,grade_employmentLength_cnt,grade_employmentTitle_cnt,grade_homeOwnership_cnt,grade_initialListStatus_cnt,grade_postCode_cnt,grade_purpose_cnt,grade_regionCode_cnt,grade_subGrade_cnt,grade_title_c
nt,grade_verificationStatus_cnt,homeOwnership_applicationType_cnt,homeOwnership_employmentLength_cnt,homeOwnership_employmentTitle_cnt,homeOwnership_grade_cnt,homeOwnership_initialListStatus_cnt,homeOwnership_postCode_cnt,homeOwnership_purpose_cnt,homeOwnership_regionCode_cnt,homeOwnership_subGrade_cnt,homeOwnership_title_cnt,homeOwnership_verificationStatus_cnt,initialListStatus_applicationType_cnt,initialListStatus_employmentLength_cnt,initialListStatus_employmentTitle_cnt,initialListStatus_grade_cnt,initialListStatus_homeOwnership_cnt,initialListStatus_postCode_cnt,initialListStatus_purpose_cnt,initialListStatus_regionCode_cnt,initialListStatus_subGrade_cnt,initialListStatus_title_cnt,initialListStatus_verificationStatus_cnt,postCode_applicationType_cnt,postCode_employmentLength_cnt,postCode_employmentTitle_cnt,postCode_grade_cnt,postCode_homeOwnership_cnt,postCode_initialListStatus_cnt,postCode_purpose_cnt,postCode_regionCode_cnt,postCode_subGrade_cnt,postCode_title_cnt,postCode_verificationStatus_cnt,purpose_applicationType_cnt,purpose_employmentLength_cnt,purpose_employmentTitle_cnt,purpose_grade_cnt,purpose_homeOwnership_cnt,purpose_initialListStatus_cnt,purpose_postCode_cnt,purpose_regionCode_cnt,purpose_subGrade_cnt,purpose_title_cnt,purpose_verificationStatus_cnt,regionCode_applicationType_cnt,regionCode_employmentLength_cnt,regionCode_employmentTitle_cnt,regionCode_grade_cnt,regionCode_homeOwnership_cnt,regionCode_initialListStatus_cnt,regionCode_postCode_cnt,regionCode_purpose_cnt,regionCode_subGrade_cnt,regionCode_title_cnt,regionCode_verificationStatus_cnt,subGrade_applicationType_cnt,subGrade_employmentLength_cnt,subGrade_employmentTitle_cnt,subGrade_grade_cnt,subGrade_homeOwnership_cnt,subGrade_initialListStatus_cnt,subGrade_postCode_cnt,subGrade_purpose_cnt,subGrade_regionCode_cnt,subGrade_title_cnt,subGrade_verificationStatus_cnt,title_applicationType_cnt,title_employmentLength_cnt,title_employmentTitle_cnt,title_grade_cnt,title_homeOwnership_cnt,t
itle_initialListStatus_cnt,title_postCode_cnt,title_purpose_cnt,title_regionCode_cnt,title_subGrade_cnt,title_verificationStatus_cnt,verificationStatus_applicationType_cnt,verificationStatus_employmentLength_cnt,verificationStatus_employmentTitle_cnt,verificationStatus_grade_cnt,verificationStatus_homeOwnership_cnt,verificationStatus_initialListStatus_cnt,verificationStatus_postCode_cnt,verificationStatus_purpose_cnt,verificationStatus_regionCode_cnt,verificationStatus_subGrade_cnt,verificationStatus_title_cnt,applicationType_default_ratio,employmentLength_default_ratio,employmentTitle_default_ratio,grade_default_ratio,homeOwnership_default_ratio,initialListStatus_default_ratio,postCode_default_ratio,purpose_default_ratio,regionCode_default_ratio,subGrade_default_ratio,title_default_ratio,verificationStatus_default_ratio,applicationType_employmentLength_default_ratio,applicationType_employmentTitle_default_ratio,applicationType_grade_default_ratio,applicationType_homeOwnership_default_ratio,applicationType_initialListStatus_default_ratio,applicationType_postCode_default_ratio,applicationType_purpose_default_ratio,applicationType_regionCode_default_ratio,applicationType_subGrade_default_ratio,applicationType_title_default_ratio,applicationType_verificationStatus_default_ratio,employmentLength_applicationType_default_ratio,employmentLength_employmentTitle_default_ratio,employmentLength_grade_default_ratio,employmentLength_homeOwnership_default_ratio,employmentLength_initialListStatus_default_ratio,employmentLength_postCode_default_ratio,employmentLength_purpose_default_ratio,employmentLength_regionCode_default_ratio,employmentLength_subGrade_default_ratio,employmentLength_title_default_ratio,employmentLength_verificationStatus_default_ratio,employmentTitle_applicationType_default_ratio,employmentTitle_employmentLength_default_ratio,employmentTitle_grade_default_ratio,employmentTitle_homeOwnership_default_ratio,employmentTitle_initialListStatus_default_ratio,employment
Title_postCode_default_ratio,employmentTitle_purpose_default_ratio,employmentTitle_regionCode_default_ratio,employmentTitle_subGrade_default_ratio,employmentTitle_title_default_ratio,employmentTitle_verificationStatus_default_ratio,grade_applicationType_default_ratio,grade_employmentLength_default_ratio,grade_employmentTitle_default_ratio,grade_homeOwnership_default_ratio,grade_initialListStatus_default_ratio,grade_postCode_default_ratio,grade_purpose_default_ratio,grade_regionCode_default_ratio,grade_subGrade_default_ratio,grade_title_default_ratio,grade_verificationStatus_default_ratio,homeOwnership_applicationType_default_ratio,homeOwnership_employmentLength_default_ratio,homeOwnership_employmentTitle_default_ratio,homeOwnership_grade_default_ratio,homeOwnership_initialListStatus_default_ratio,homeOwnership_postCode_default_ratio,homeOwnership_purpose_default_ratio,homeOwnership_regionCode_default_ratio,homeOwnership_subGrade_default_ratio,homeOwnership_title_default_ratio,homeOwnership_verificationStatus_default_ratio,initialListStatus_applicationType_default_ratio,initialListStatus_employmentLength_default_ratio,initialListStatus_employmentTitle_default_ratio,initialListStatus_grade_default_ratio,initialListStatus_homeOwnership_default_ratio,initialListStatus_postCode_default_ratio,initialListStatus_purpose_default_ratio,initialListStatus_regionCode_default_ratio,initialListStatus_subGrade_default_ratio,initialListStatus_title_default_ratio,initialListStatus_verificationStatus_default_ratio,postCode_applicationType_default_ratio,postCode_employmentLength_default_ratio,postCode_employmentTitle_default_ratio,postCode_grade_default_ratio,postCode_homeOwnership_default_ratio,postCode_initialListStatus_default_ratio,postCode_purpose_default_ratio,postCode_regionCode_default_ratio,postCode_subGrade_default_ratio,postCode_title_default_ratio,postCode_verificationStatus_default_ratio,purpose_applicationType_default_ratio,purpose_employmentLength_default_ratio,purpose_e
mploymentTitle_default_ratio,purpose_grade_default_ratio,purpose_homeOwnership_default_ratio,purpose_initialListStatus_default_ratio,purpose_postCode_default_ratio,purpose_regionCode_default_ratio,purpose_subGrade_default_ratio,purpose_title_default_ratio,purpose_verificationStatus_default_ratio,regionCode_applicationType_default_ratio,regionCode_employmentLength_default_ratio,regionCode_employmentTitle_default_ratio,regionCode_grade_default_ratio,regionCode_homeOwnership_default_ratio,regionCode_initialListStatus_default_ratio,regionCode_postCode_default_ratio,regionCode_purpose_default_ratio,regionCode_subGrade_default_ratio,regionCode_title_default_ratio,regionCode_verificationStatus_default_ratio,subGrade_applicationType_default_ratio,subGrade_employmentLength_default_ratio,subGrade_employmentTitle_default_ratio,subGrade_grade_default_ratio,subGrade_homeOwnership_default_ratio,subGrade_initialListStatus_default_ratio,subGrade_postCode_default_ratio,subGrade_purpose_default_ratio,subGrade_regionCode_default_ratio,subGrade_title_default_ratio,subGrade_verificationStatus_default_ratio,title_applicationType_default_ratio,title_employmentLength_default_ratio,title_employmentTitle_default_ratio,title_grade_default_ratio,title_homeOwnership_default_ratio,title_initialListStatus_default_ratio,title_postCode_default_ratio,title_purpose_default_ratio,title_regionCode_default_ratio,title_subGrade_default_ratio,title_verificationStatus_default_ratio,verificationStatus_applicationType_default_ratio,verificationStatus_employmentLength_default_ratio,verificationStatus_employmentTitle_default_ratio,verificationStatus_grade_default_ratio,verificationStatus_homeOwnership_default_ratio,verificationStatus_initialListStatus_default_ratio,verificationStatus_postCode_default_ratio,verificationStatus_purpose_default_ratio,verificationStatus_regionCode_default_ratio,verificationStatus_subGrade_default_ratio,verificationStatus_title_default_ratio
0,46000.0,0,0.0,27.83,May-2002,5.0,219843.0,704.0,700.0,4,0,1,1,461.9,18.49,0.0,2012-08-01,18000.0,,,13.0,,,,,,,10.0,,,,,,13.0,156.0,0.0,0.0,0,18,15096.0,38.9,21,5,1723.0,18.0,2,2012,2002,10,0.078261,980693,62645.0,151.0,149377,494678,416892,4751.0,580226,21572,33241,37.0,311132,61706.0,151.0,145912,481648,413022,4662.0,568237,21064,32426,37.0,303690,61706.0,7.0,9312.0,28984.0,27473.0,314.0,35846.0,1335.0,2085.0,4.0,18362.0,151.0,7.0,20.0,83.0,121.0,2.0,92.0,5.0,4.0,1.0,90.0,145912,9312.0,20.0,67798,73095,655.0,93490,3213,33241,8.0,59182,481648,28984.0,83.0,67798,199082,2128.0,287217,10719,14925,23.0,160130,413022,27473.0,121.0,73095,199082,2020.0,243152,9080,16525,31.0,146424,4662.0,314.0,2.0,655.0,2128.0,2020.0,2790.0,4747.0,145.0,1.0,1592.0,568237,35846.0,92.0,93490,287217,243152,2790.0,12902,20770,27.0,189929,21064,1335.0,5.0,3213,10719,9080,4747.0,12902,700,3.0,7009,32426,2085.0,4.0,33241,14925,16525,145.0,20770,700,2.0,12593,37.0,4.0,1.0,8.0,23.0,31.0,1.0,27.0,3.0,2.0,24.0,303690,18362.0,90.0,59182,160130,146424,1592.0,189929,7009,12593,24.0,0.198519,0.1965,0.182796,0.302938,0.171817,0.196211,0.156353,0.2111,0.159035,0.294491,0.095238,0.238263,0.196008,0.182796,0.302257,0.170799,0.195492,0.156639,0.210028,0.159301,0.294651,0.095238,0.237194,0.196008,0.0,0.295359,0.170428,0.191164,0.131707,0.205548,0.153396,0.294297,0.0,0.23785,0.182796,0.0,0.266667,0.26087,0.219178,,0.236364,0.0,0.0,,0.175439,0.302257,0.295359,0.266667,0.274273,0.277774,0.226601,0.309783,0.242454,0.294491,0.25,0.311324,0.170799,0.170428,0.26087,0.274273,0.170545,0.092038,0.183384,0.114618,0.260709,0.076923,0.207866,0.195492,0.191164,0.219178,0.277774,0.170545,0.161663,0.206971,0.170041,0.271423,0.125,0.22676,0.156639,0.131707,,0.226601,0.092038,0.161663,0.169022,0.156404,0.213483,,0.185185,0.210028,0.205548,0.236364,0.309783,0.183384,0.206971,0.169022,0.167394,0.299375,0.133333,0.248802,0.159301,0.153396,0.0,0.242454,0.114618,0.170041,0.156404,0.167394,0.249438,1.0,0.192325,0.294651,0.294297,0
.0,0.294491,0.260709,0.271423,0.213483,0.299375,0.249438,1.0,0.30138,0.095238,0.0,,0.25,0.076923,0.125,,0.133333,1.0,1.0,0.071429,0.237194,0.23785,0.175439,0.311324,0.207866,0.22676,0.185185,0.248802,0.192325,0.30138,0.071429
1,47500.0,0,0.0,24.36,Mar-2000,4.0,386.0,669.0,665.0,3,1,23,0,684.33,14.08,1.0,2018-01-01,20000.0,0.0,4.0,7.0,0.0,0.0,0.0,1.0,4.0,4.0,4.0,7.0,9.0,6.0,10.0,4.0,7.0,226.0,0.0,0.0,9,21,19497.0,53.1,17,3,10.0,19.0,1,2018,2000,18,0.140351,980693,59818.0,275.0,283819,397051,583108,2429.0,11560,71163,55769,10103.0,387568,58878.0,273.0,277596,392585,567671,2363.0,11232,69827,54572,9780.0,382010,58878.0,29.0,16913.0,27208.0,34001.0,140.0,694.0,4413.0,3303.0,600.0,23854.0,273.0,29.0,81.0,70.0,189.0,1.0,2.0,13.0,16.0,2.0,98.0,277596,16913.0,81.0,118735,171020,687.0,3591,20563,55769,3276.0,114416,392585,27208.0,70.0,118735,221907,951.0,4960,28919,23222,4288.0,161834,567671,34001.0,189.0,171020,221907,1457.0,6604,41589,33842,6377.0,239409,2363.0,140.0,1.0,687.0,951.0,1457.0,44.0,2426.0,114.0,41.0,954.0,11232,694.0,2.0,3591,4960,6604,44.0,842,723,10082.0,4383,69827,4413.0,13.0,20563,28919,41589,2426.0,842,3992,742.0,28042,54572,3303.0,16.0,55769,23222,33842,114.0,723,3992,667.0,22517,9780.0,600.0,2.0,3276.0,4288.0,6377.0,41.0,10082.0,742.0,667.0,3966.0,382010,23854.0,98.0,114416,161834,239409,954.0,4383,28042,22517,3966.0,0.198519,0.198453,0.151685,0.225775,0.231709,0.201877,0.194,0.218425,0.214701,0.224439,0.226599,0.209223,0.197877,0.152542,0.22481,0.2304,0.200725,0.19236,0.215227,0.213953,0.223881,0.223194,0.208617,0.197877,0.222222,0.220896,0.224315,0.20033,0.170213,0.221973,0.223918,0.218584,0.228205,0.211291,0.152542,0.222222,0.232143,0.183673,0.147541,,1.0,0.4,0.166667,1.0,0.079365,0.22481,0.220896,0.232143,0.25131,0.239951,0.25,0.231771,0.241694,0.224439,0.236717,0.233434,0.2304,0.224315,0.183673,0.25131,0.23897,0.245378,0.25466,0.257456,0.253778,0.26362,0.242804,0.200725,0.20033,0.147541,0.239951,0.23897,0.199546,0.223273,0.214066,0.240328,0.225104,0.20971,0.19236,0.170213,,0.25,0.245378,0.199546,0.115385,0.192924,0.243243,0.08,0.230104,0.215227,0.221973,1.0,0.231771,0.25466,0.223273,0.115385,0.217636,0.20852,0.226865,0.223001,0.213953,0.223918,0.4,0.241694,0.257456,0.21
4066,0.192924,0.217636,0.231346,0.22766,0.223165,0.223881,0.218584,0.166667,0.224439,0.253778,0.240328,0.243243,0.20852,0.231346,0.216019,0.234065,0.223194,0.228205,1.0,0.236717,0.26362,0.225104,0.08,0.226865,0.22766,0.216019,0.229604,0.208617,0.211291,0.079365,0.233434,0.242804,0.20971,0.230104,0.223001,0.223165,0.234065,0.229604
2,40000.0,0,2.0,12.57,Apr-2000,10.0,2780.0,669.0,665.0,1,0,28,0,266.31,8.59,0.0,2016-08-01,8425.0,0.0,5.0,17.0,0.0,0.0,0.0,1.0,9.0,9.0,8.0,8.0,5.0,17.0,18.0,9.0,17.0,245.0,1.0,1.0,0,26,7993.0,33.9,9,3,0.0,23.0,0,2016,2000,16,0.070208,980693,328525.0,149.0,174588,494678,583108,2546.0,580226,23086,47674,491400.0,301300,323591.0,143.0,171997,481648,567671,2504.0,568237,22724,47016,479722.0,294993,323591.0,40.0,59278.0,199068.0,197111.0,896.0,195388.0,7451.0,15968.0,167561.0,98538.0,143.0,40.0,14.0,49.0,95.0,1.0,85.0,7.0,8.0,85.0,40.0,171997,59278.0,14.0,100381,117213,528.0,85590,4991,47674,72704.0,80620,481648,199068.0,49.0,100381,295596,1151.0,287217,10362,25652,243453.0,152373,567671,197111.0,95.0,117213,295596,1483.0,337074,13626,31821,316922.0,178991,2504.0,896.0,1.0,528.0,1151.0,1483.0,1511.0,2542.0,133.0,1253.0,793.0,568237,195388.0,85.0,85590,287217,337074,1511.0,13584,25078,490766.0,164687,22724,7451.0,7.0,4991,10362,13626,2542.0,13584,1289,11262.0,7322,47016,15968.0,8.0,47674,25652,31821,133.0,25078,1289,21210.0,18138,479722.0,167561.0,85.0,72704.0,243453.0,316922.0,1253.0,490766.0,11262.0,21210.0,137836.0,294993,98538.0,40.0,80620,152373,178991,793.0,164687,7322,18138,137836.0,0.198519,0.186543,0.260417,0.060166,0.171817,0.201877,0.186118,0.2111,0.186601,0.08505,0.217787,0.147101,0.186093,0.268817,0.059663,0.170799,0.200725,0.185741,0.210028,0.18553,0.084435,0.216663,0.145717,0.186093,0.409091,0.057594,0.163327,0.187522,0.151304,0.197594,0.176768,0.08348,0.201481,0.137693,0.268817,0.409091,0.0,0.166667,0.3,,0.355932,0.5,0.0,0.355932,0.1,0.059663,0.057594,0.0,0.05173,0.060572,0.051051,0.060805,0.056116,0.08505,0.061423,0.053647,0.170799,0.163327,0.166667,0.05173,0.172674,0.132,0.183384,0.153858,0.075746,0.187677,0.126693,0.200725,0.187522,0.3,0.060572,0.172674,0.170082,0.214088,0.191439,0.085033,0.215492,0.148568,0.185741,0.151304,,0.051051,0.132,0.170082,0.198953,0.186232,0.107143,0.205736,0.099609,0.210028,0.197594,0.355932,0.060805,0.183384,0.214088,0.19895
3,0.196315,0.084761,0.21778,0.155872,0.18553,0.176768,0.5,0.056116,0.153858,0.191439,0.186232,0.196315,0.087282,0.203198,0.138766,0.084435,0.08348,0.0,0.08505,0.075746,0.085033,0.107143,0.084761,0.087282,0.086222,0.080819,0.216663,0.201481,0.355932,0.061423,0.187677,0.215492,0.205736,0.21778,0.203198,0.086222,0.160245,0.145717,0.137693,0.1,0.053647,0.126693,0.148568,0.099609,0.155872,0.138766,0.080819,0.160245
3,30000.0,0,0.0,20.2,Jul-2007,5.0,132.0,689.0,685.0,4,2,32,1,349.55,15.59,0.0,2015-01-01,10000.0,0.0,2.0,7.0,0.0,0.0,0.0,4.0,3.0,3.0,3.0,4.0,9.0,6.0,8.0,3.0,7.0,413.0,0.0,0.0,0,37,6772.0,41.3,20,3,0.0,17.0,1,2015,2007,8,0.111111,980693,62645.0,3315.0,149377,107910,416892,521.0,580226,7470,38205,491400.0,387568,61706.0,3279.0,145912,106101,413022,511.0,568237,7272,37450,479722.0,382010,61706.0,229.0,9312.0,6455.0,27473.0,26.0,35846.0,446.0,2360.0,29106.0,24332.0,3279.0,229.0,592.0,407.0,1270.0,6.0,1903.0,28.0,162.0,1825.0,1440.0,145912,9312.0,592.0,16499,73095,90.0,93490,1198,38205,80256.0,58554,106101,6455.0,407.0,16499,42474,82.0,56828,1104,4142,49910.0,43454,413022,27473.0,1270.0,73095,42474,224.0,243152,3103,19184,174478.0,148159,511.0,26.0,6.0,90.0,82.0,224.0,320.0,521.0,29.0,279.0,192.0,568237,35846.0,1903.0,93490,56828,243152,320.0,4257,23842,490766.0,225610,7272,446.0,28.0,1198,1104,3103,521.0,4257,305,3588.0,2931,37450,2360.0,162.0,38205,4142,19184,29.0,23842,305,20365.0,15212,479722.0,29106.0,1825.0,80256.0,49910.0,174478.0,279.0,490766.0,3588.0,20365.0,204118.0,382010,24332.0,1440.0,58554,43454,148159,192.0,225610,2931,15212,204118.0,0.198519,0.1965,0.268362,0.302938,0.207853,0.196211,0.278788,0.2111,0.245263,0.277955,0.217787,0.209223,0.196008,0.267016,0.302257,0.206263,0.195492,0.277778,0.210028,0.243918,0.27769,0.216663,0.208617,0.196008,0.319444,0.295359,0.198097,0.191164,0.142857,0.205548,0.240283,0.273629,0.21519,0.210716,0.267016,0.319444,0.352785,0.188976,0.280788,0.0,0.285366,0.052632,0.324074,0.284139,0.285246,0.302257,0.295359,0.352785,0.305109,0.277774,0.396552,0.309783,0.34214,0.277955,0.318708,0.313129,0.206263,0.198097,0.188976,0.305109,0.207478,0.387755,0.219285,0.253879,0.283979,0.224909,0.215086,0.195492,0.191164,0.280788,0.277774,0.207478,0.244444,0.206971,0.224849,0.253801,0.221935,0.208437,0.277778,0.142857,0.0,0.396552,0.387755,0.244444,0.274038,0.278788,0.466667,0.288043,0.299145,0.210028,0.205548,0.285366,0.309783,0.219285,0.206971,
0.274038,0.252119,0.281452,0.21778,0.219645,0.243918,0.240283,0.052632,0.34214,0.253879,0.224849,0.278788,0.252119,0.302198,0.265359,0.272537,0.27769,0.273629,0.324074,0.277955,0.283979,0.253801,0.466667,0.281452,0.302198,0.289248,0.290678,0.216663,0.21519,0.284139,0.318708,0.224909,0.221935,0.288043,0.21778,0.265359,0.289248,0.223279,0.208617,0.210716,0.285246,0.313129,0.215086,0.208437,0.299145,0.219645,0.272537,0.290678,0.223279
4,52000.0,0,0.0,18.44,Jul-2004,10.0,81832.0,709.0,705.0,3,1,42,1,298.13,13.35,1.0,2014-10-01,13000.0,0.0,3.0,7.0,0.0,0.0,0.0,1.0,5.0,5.0,4.0,5.0,5.0,6.0,8.0,5.0,7.0,524.0,0.0,0.0,4,20,17703.0,38.8,16,5,4.0,13.0,1,2014,2004,10,0.05,980693,328525.0,3.0,283819,397051,416892,1190.0,219331,11454,58859,185386.0,387568,323591.0,3.0,277596,392585,413022,1180.0,216045,11237,57751,182191.0,382010,323591.0,1.0,92220.0,92526.0,131414.0,299.0,68507.0,3717.0,19146.0,58340.0,126358.0,3.0,1.0,2.0,3.0,3.0,1.0,1.0,1.0,1.0,1.0,2.0,277596,92220.0,2.0,118735,112799,296.0,54829,3224,58859,46251.0,114416,392585,92526.0,3.0,118735,175144,579.0,90618,2940,24964,76076.0,161834,413022,131414.0,3.0,112799,175144,529.0,85758,4769,24400,60380.0,148159,1180.0,299.0,1.0,296.0,579.0,529.0,265.0,1189.0,70.0,220.0,487.0,216045,68507.0,1.0,54829,90618,85758,265.0,2262,11828,185061.0,84936,11237,3717.0,1.0,3224,2940,4769,1189.0,2262,675,1887.0,4729,57751,19146.0,1.0,58859,24964,24400,70.0,11828,675,9930.0,23481,182191.0,58340.0,1.0,46251.0,76076.0,60380.0,220.0,185061.0,1887.0,9930.0,76647.0,382010,126358.0,2.0,114416,161834,148159,487.0,84936,4729,23481,76647.0,0.198519,0.186543,0.5,0.225775,0.231709,0.196211,0.196335,0.169704,0.234083,0.206614,0.176027,0.209223,0.186093,0.5,0.22481,0.2304,0.195492,0.194188,0.169266,0.231709,0.205435,0.175586,0.208617,0.186093,,0.209869,0.236903,0.185077,0.19598,0.161127,0.218491,0.196421,0.166206,0.195361,0.5,,0.0,0.5,0.5,,,,,,0.0,0.22481,0.209869,0.0,0.25131,0.204216,0.198895,0.231318,0.257762,0.206614,0.242309,0.233434,0.2304,0.236903,0.5,0.25131,0.222544,0.222222,0.196344,0.253123,0.229465,0.205132,0.242804,0.195492,0.185077,0.5,0.204216,0.222544,0.190202,0.163872,0.215349,0.187063,0.178506,0.208437,0.194188,0.19598,,0.198895,0.222222,0.190202,0.147239,0.196592,0.190476,0.151079,0.207547,0.169266,0.161127,,0.231318,0.196344,0.163872,0.147239,0.204752,0.212917,0.175973,0.18028,0.231709,0.218491,,0.257762,0.253123,0.215349,0.196592,0.204752,0.262651,0.215232,0.24868
2,0.205435,0.196421,,0.206614,0.229465,0.187063,0.190476,0.212917,0.262651,0.222153,0.212162,0.175586,0.166206,,0.242309,0.205132,0.178506,0.151079,0.175973,0.215232,0.222153,0.184116,0.208617,0.195361,0.0,0.233434,0.242804,0.208437,0.207547,0.18028,0.248682,0.212162,0.184116


In [16]:
# # Function to reduce the memory usage
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2
#     for col in tqdm([f for f in df.columns if f not in ['query_time']]):
#         col_type = df[col].dtypes
#         if col_type in numerics:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
#                         np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
#                         np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
#                         np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
#                         np.int64).max:
#                     df[col] = df[col].astype(np.int64)
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(
#                         np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
#                         np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose:
#         print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
#             end_mem, 100 * (start_mem - end_mem) / start_mem))
#     return df

# df_feature = reduce_mem_usage(df_feature)

In [17]:
# Create the output directory if it does not exist yet, then persist the
# engineered feature table as a pickle file.
os.makedirs('data', exist_ok=True)
df_feature.to_pickle('data/feature.pkl')