In [1]:
import warnings
from datetime import datetime, timedelta
import numpy as np
import pandas as pd

import gc
import os
from tqdm import tqdm

from sklearn.model_selection import StratifiedKFold

# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')
# Turn on tqdm's pandas integration.
tqdm.pandas()
%matplotlib inline

# Show every column and row when a frame is displayed, and up to 200 characters per cell.
# Fully qualified option names are used on purpose: the short forms 'max_columns' and
# 'max_rows' also match the 'styler.render.*' options in recent pandas releases, which
# makes pd.set_option raise OptionError ("Pattern matched multiple keys").
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_colwidth', 200)

In [2]:
# Random seed; passed as random_state to the StratifiedKFold split further down.
seed = 2021

# 数据准备

In [3]:
# Load the training set and the test set.
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('testA.csv')

# df_feature holds the training rows followed by the test rows under a fresh
# 0..n-1 index (1,000,000 rows in total according to the original note).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in pandas 2.0; ignore_index=True replaces the separate reset_index call.
df_feature = pd.concat([df_train, df_test], ignore_index=True)

In [4]:
# Preview the first rows of the combined train + test frame.
df_feature.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2 years,2,110000.0,2,2014-07-01,1.0,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,Aug-2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5 years,0,46000.0,2,2012-08-01,0.0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,May-2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8 years,0,74000.0,2,2015-10-01,0.0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,May-2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10+ years,1,118000.0,1,2015-08-01,0.0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,May-1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,29000.0,2,2016-03-01,0.0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,Aug-1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [5]:
# Label distribution: isDefault is 0 for 640,390 rows and 1 for 159,610 rows.
df_feature.isDefault.value_counts()

0.0    640390
1.0    159610
Name: isDefault, dtype: int64

# 特征工程

In [6]:
# Drop the policyCode column.
# NOTE(review): it is 1.0 in every previewed row, so it is presumably constant — confirm.
del df_feature['policyCode']

In [7]:
# Encode the letter grade as an ordinal rank: 'A' -> 1, 'B' -> 2, ..., 'G' -> 7.
grade_map = {letter: rank for rank, letter in enumerate('ABCDEFG', start=1)}
df_feature['grade'] = df_feature['grade'].map(grade_map)

# Encode subGrade (e.g. 'C4') as an ordinal integer with a custom mapping function.
def subGrade_map(x):
    """Convert a sub-grade label such as ``'E2'`` to an ordinal integer.

    The label is a grade letter followed by a single digit; the result is
    ``grade_map[letter] * 5 + digit - 1`` (so ``'A1'`` -> 5 and ``'E2'`` -> 26).

    Missing values (``None`` / ``NaN``) are returned unchanged instead of
    raising ``TypeError``, which matches how ``employmentLength_to_int``
    treats nulls.

    Raises:
        KeyError: if the grade letter is not a key of ``grade_map``.
        ValueError: if the label is not exactly two characters long or its
            second character is not a digit.
    """
    if pd.isnull(x):
        return x
    grade, num = x  # a two-character string unpacks into (letter, digit)
    return grade_map[grade] * 5 + int(num) - 1

df_feature['subGrade'] = df_feature['subGrade'].map(subGrade_map)

In [8]:
# Convert an employmentLength label into a number of years.
def employmentLength_to_int(s):
    """Return the leading token of a label like ``'3 years'`` as ``np.int8``.

    Null inputs are handed back untouched.
    """
    if not pd.isnull(s):
        leading_token = s.split()[0]
        return np.int8(leading_token)
    return s

# Rewrite the two non-numeric labels so every label starts with a number, then
# convert the labels to integers.
# The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the selected column: under pandas Copy-on-Write
# an in-place call on df[col] no longer updates df, which would leave
# '10+ years' / '< 1 year' in place and make the integer conversion fail.
df_feature['employmentLength'] = (
    df_feature['employmentLength']
    .replace({'10+ years': '10 years', '< 1 year': '0 years'})
    .apply(employmentLength_to_int)
)

In [9]:
# Year in which the loan was issued, parsed from the 'YYYY-MM-DD' issueDate string.
df_feature['issueDate_year'] = pd.to_datetime(df_feature['issueDate'], format='%Y-%m-%d').dt.year

# Year part of earliesCreditLine, i.e. the text after the '-' (e.g. 'Aug-2001' -> 2001).
df_feature['earliesCreditLine_year'] = (
    df_feature['earliesCreditLine'].str.split('-', expand=True)[1].astype('int')
)

# Number of years between the earliest credit line and the issue date.
df_feature['issueDate_year_earliesCreditLine_year_minus'] = (
    df_feature['issueDate_year'] - df_feature['earliesCreditLine_year']
)

In [11]:
# Loan amount per unit of term, expressed as a fraction of annual income.
# NOTE(review): term looks like a number of years (3 / 5 in the preview) — confirm.
# NOTE(review): rows with annualIncome equal to 0 or missing would produce inf / NaN here — confirm downstream handling.
df_feature['debt_ratio_year'] = df_feature['loanAmnt'] / df_feature['term'] / df_feature['annualIncome']

In [12]:
# Columns treated as categorical; the count and default-ratio encodings iterate over this list.
cate_features = [
    'applicationType',
    'employmentLength',
    'employmentTitle',
    'grade',
    'homeOwnership',
    'initialListStatus',
    'postCode',
    'purpose',
    'regionCode',
    'subGrade',
    'title',
    'verificationStatus',
]

# Columns treated as numeric (not referenced elsewhere in the visible part of the notebook).
dense_features = [
    'annualIncome',
    'delinquency_2years',
    'dti',
    'employmentLength',
    'ficoRangeHigh',
    'ficoRangeLow',
    'installment',
    'interestRate',
    'loanAmnt',
    'openAcc',
    'pubRec',
    'pubRecBankruptcies',
    'revolBal',
    'revolUtil',
    'subGrade',
    'term',
    'totalAcc',
]

In [13]:
# Frequency encoding: for each categorical column, the number of rows that share its value.
for f in tqdm(cate_features):
    df_feature['{}_cnt'.format(f)] = df_feature.groupby([f])[f].transform('count')

# Pairwise frequency encoding: the number of rows that share the (f1, f2) combination.
# Both orderings of every pair are generated, so '<a>_<b>_cnt' and '<b>_<a>_cnt'
# carry the same counts; both are kept so the generated column names do not change.
for f1 in tqdm(cate_features):
    for f2 in cate_features:
        if f1 != f2:
            # Count over f1, one of the grouping keys. The previous code selected
            # `f`, the variable left over from the loop above (always the last
            # entry of cate_features), so the counts depended on an unrelated column.
            df_feature['{}_{}_cnt'.format(f1, f2)] = df_feature.groupby([f1, f2])[f1].transform('count')

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:01<00:00,  7.24it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:29<00:00,  2.44s/it]


In [22]:
# Default ratio (target encoding).
# Grouping keys: every single categorical column, followed by every ordered
# pair of two different categorical columns.
gps = [[f] for f in cate_features]
gps.extend([f1, f2] for f1 in cate_features for f2 in cate_features if f1 != f2)
                        
def statis_feat(df_know, df_unknow):
    """Attach per-group mean ``isDefault`` columns to ``df_unknow``.

    For each key list in the module-level ``gps``, the mean of ``isDefault``
    is computed per group on ``df_know`` and left-merged onto ``df_unknow``
    as a column named ``'<keys joined by _>_default_ratio'``.

    Args:
        df_know: frame providing ``isDefault`` and the grouping columns.
        df_unknow: frame that receives the new columns.

    Returns:
        The merged frame (a new object; the merge does not modify the input).
    """
    for group_by in tqdm(gps):
        ratio_name = '{}_default_ratio'.format('_'.join(group_by))
        ratio = df_know.groupby(group_by)['isDefault'].mean().rename(ratio_name).reset_index()
        df_unknow = df_unknow.merge(ratio, on=group_by, how='left')

    return df_unknow

# Split into the training set (rows with a label) and the test set (rows without one).
df_train = df_feature[df_feature['isDefault'].notnull()].reset_index(drop=True)
df_test = df_feature[df_feature['isDefault'].isnull()]


# Out-of-fold default ratios for the training rows: each fold's validation part
# gets ratios computed only from the other folds, so a row's own label never
# contributes to its own feature.
kf = StratifiedKFold(n_splits=5, random_state=seed, shuffle=True)
fold_frames = []
for train_index, val_index in kf.split(df_train, df_train['isDefault']):
    df_fold_train = df_train.iloc[train_index]
    df_fold_val = df_train.iloc[val_index]

    # Collect the encoded folds and concatenate once afterwards, instead of
    # re-copying an ever-growing frame with pd.concat on every iteration.
    fold_frames.append(statis_feat(df_fold_train, df_fold_val))

    del df_fold_train, df_fold_val
    gc.collect()

# Test rows get ratios computed from the whole training set.
df_test = statis_feat(df_train, df_test)

# Same row order as before (folds in split order, then the test rows), with a fresh 0..n-1 index.
df_feature = pd.concat(fold_frames + [df_test], axis=0, ignore_index=True)

del fold_frames, df_train, df_test
gc.collect()

100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [02:29<00:00,  1.04s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [02:24<00:00,  1.01s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [02:18<00:00,  1.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [02:18<00:00,  1.04it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [02:32<00:00,  1.06s/it]
100%|████████████████████████████████████████████████████████████████████████████████| 144/144 [03:53<00:00,  1.62s/it]


26

In [23]:
# Preview the first rows of the frame with the count and default-ratio columns added.
df_feature.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,issueDate_year,earliesCreditLine_year,issueDate_year_earliesCreditLine_year_minus,debt_ratio_year,applicationType_cnt,employmentLength_cnt,employmentTitle_cnt,grade_cnt,homeOwnership_cnt,initialListStatus_cnt,postCode_cnt,purpose_cnt,regionCode_cnt,subGrade_cnt,title_cnt,verificationStatus_cnt,applicationType_employmentLength_cnt,applicationType_employmentTitle_cnt,applicationType_grade_cnt,applicationType_homeOwnership_cnt,applicationType_initialListStatus_cnt,applicationType_postCode_cnt,applicationType_purpose_cnt,applicationType_regionCode_cnt,applicationType_subGrade_cnt,applicationType_title_cnt,applicationType_verificationStatus_cnt,employmentLength_applicationType_cnt,employmentLength_employmentTitle_cnt,employmentLength_grade_cnt,employmentLength_homeOwnership_cnt,employmentLength_initialListStatus_cnt,employmentLength_postCode_cnt,employmentLength_purpose_cnt,employmentLength_regionCode_cnt,employmentLength_subGrade_cnt,employmentLength_title_cnt,employmentLength_verificationStatus_cnt,employmentTitle_applicationType_cnt,employmentTitle_employmentLength_cnt,employmentTitle_grade_cnt,employmentTitle_homeOwnership_cnt,employmentTitle_initialListStatus_cnt,employmentTitle_postCode_cnt,employmentTitle_purpose_cnt,employmentTitle_regionCode_cnt,employmentTitle_subGrade_cnt,employmentTitle_title_cnt,employmentTitle_verificationStatus_cnt,grade_applicationType_cnt,grade_employmentLength_cnt,grade_employmentTitle_cnt,grade_homeOwnership_cnt,grade_initialListStatus_cnt,grade_postCode_cnt,grade_purpose_cnt,grade_regionCode_cnt,grade_subGrade_cnt,grade_title_c
nt,grade_verificationStatus_cnt,homeOwnership_applicationType_cnt,homeOwnership_employmentLength_cnt,homeOwnership_employmentTitle_cnt,homeOwnership_grade_cnt,homeOwnership_initialListStatus_cnt,homeOwnership_postCode_cnt,homeOwnership_purpose_cnt,homeOwnership_regionCode_cnt,homeOwnership_subGrade_cnt,homeOwnership_title_cnt,homeOwnership_verificationStatus_cnt,initialListStatus_applicationType_cnt,initialListStatus_employmentLength_cnt,initialListStatus_employmentTitle_cnt,initialListStatus_grade_cnt,initialListStatus_homeOwnership_cnt,initialListStatus_postCode_cnt,initialListStatus_purpose_cnt,initialListStatus_regionCode_cnt,initialListStatus_subGrade_cnt,initialListStatus_title_cnt,initialListStatus_verificationStatus_cnt,postCode_applicationType_cnt,postCode_employmentLength_cnt,postCode_employmentTitle_cnt,postCode_grade_cnt,postCode_homeOwnership_cnt,postCode_initialListStatus_cnt,postCode_purpose_cnt,postCode_regionCode_cnt,postCode_subGrade_cnt,postCode_title_cnt,postCode_verificationStatus_cnt,purpose_applicationType_cnt,purpose_employmentLength_cnt,purpose_employmentTitle_cnt,purpose_grade_cnt,purpose_homeOwnership_cnt,purpose_initialListStatus_cnt,purpose_postCode_cnt,purpose_regionCode_cnt,purpose_subGrade_cnt,purpose_title_cnt,purpose_verificationStatus_cnt,regionCode_applicationType_cnt,regionCode_employmentLength_cnt,regionCode_employmentTitle_cnt,regionCode_grade_cnt,regionCode_homeOwnership_cnt,regionCode_initialListStatus_cnt,regionCode_postCode_cnt,regionCode_purpose_cnt,regionCode_subGrade_cnt,regionCode_title_cnt,regionCode_verificationStatus_cnt,subGrade_applicationType_cnt,subGrade_employmentLength_cnt,subGrade_employmentTitle_cnt,subGrade_grade_cnt,subGrade_homeOwnership_cnt,subGrade_initialListStatus_cnt,subGrade_postCode_cnt,subGrade_purpose_cnt,subGrade_regionCode_cnt,subGrade_title_cnt,subGrade_verificationStatus_cnt,title_applicationType_cnt,title_employmentLength_cnt,title_employmentTitle_cnt,title_grade_cnt,title_homeOwnership_cnt,t
itle_initialListStatus_cnt,title_postCode_cnt,title_purpose_cnt,title_regionCode_cnt,title_subGrade_cnt,title_verificationStatus_cnt,verificationStatus_applicationType_cnt,verificationStatus_employmentLength_cnt,verificationStatus_employmentTitle_cnt,verificationStatus_grade_cnt,verificationStatus_homeOwnership_cnt,verificationStatus_initialListStatus_cnt,verificationStatus_postCode_cnt,verificationStatus_purpose_cnt,verificationStatus_regionCode_cnt,verificationStatus_subGrade_cnt,verificationStatus_title_cnt,applicationType_default_ratio,employmentLength_default_ratio,employmentTitle_default_ratio,grade_default_ratio,homeOwnership_default_ratio,initialListStatus_default_ratio,postCode_default_ratio,purpose_default_ratio,regionCode_default_ratio,subGrade_default_ratio,title_default_ratio,verificationStatus_default_ratio,applicationType_employmentLength_default_ratio,applicationType_employmentTitle_default_ratio,applicationType_grade_default_ratio,applicationType_homeOwnership_default_ratio,applicationType_initialListStatus_default_ratio,applicationType_postCode_default_ratio,applicationType_purpose_default_ratio,applicationType_regionCode_default_ratio,applicationType_subGrade_default_ratio,applicationType_title_default_ratio,applicationType_verificationStatus_default_ratio,employmentLength_applicationType_default_ratio,employmentLength_employmentTitle_default_ratio,employmentLength_grade_default_ratio,employmentLength_homeOwnership_default_ratio,employmentLength_initialListStatus_default_ratio,employmentLength_postCode_default_ratio,employmentLength_purpose_default_ratio,employmentLength_regionCode_default_ratio,employmentLength_subGrade_default_ratio,employmentLength_title_default_ratio,employmentLength_verificationStatus_default_ratio,employmentTitle_applicationType_default_ratio,employmentTitle_employmentLength_default_ratio,employmentTitle_grade_default_ratio,employmentTitle_homeOwnership_default_ratio,employmentTitle_initialListStatus_default_ratio,employment
Title_postCode_default_ratio,employmentTitle_purpose_default_ratio,employmentTitle_regionCode_default_ratio,employmentTitle_subGrade_default_ratio,employmentTitle_title_default_ratio,employmentTitle_verificationStatus_default_ratio,grade_applicationType_default_ratio,grade_employmentLength_default_ratio,grade_employmentTitle_default_ratio,grade_homeOwnership_default_ratio,grade_initialListStatus_default_ratio,grade_postCode_default_ratio,grade_purpose_default_ratio,grade_regionCode_default_ratio,grade_subGrade_default_ratio,grade_title_default_ratio,grade_verificationStatus_default_ratio,homeOwnership_applicationType_default_ratio,homeOwnership_employmentLength_default_ratio,homeOwnership_employmentTitle_default_ratio,homeOwnership_grade_default_ratio,homeOwnership_initialListStatus_default_ratio,homeOwnership_postCode_default_ratio,homeOwnership_purpose_default_ratio,homeOwnership_regionCode_default_ratio,homeOwnership_subGrade_default_ratio,homeOwnership_title_default_ratio,homeOwnership_verificationStatus_default_ratio,initialListStatus_applicationType_default_ratio,initialListStatus_employmentLength_default_ratio,initialListStatus_employmentTitle_default_ratio,initialListStatus_grade_default_ratio,initialListStatus_homeOwnership_default_ratio,initialListStatus_postCode_default_ratio,initialListStatus_purpose_default_ratio,initialListStatus_regionCode_default_ratio,initialListStatus_subGrade_default_ratio,initialListStatus_title_default_ratio,initialListStatus_verificationStatus_default_ratio,postCode_applicationType_default_ratio,postCode_employmentLength_default_ratio,postCode_employmentTitle_default_ratio,postCode_grade_default_ratio,postCode_homeOwnership_default_ratio,postCode_initialListStatus_default_ratio,postCode_purpose_default_ratio,postCode_regionCode_default_ratio,postCode_subGrade_default_ratio,postCode_title_default_ratio,postCode_verificationStatus_default_ratio,purpose_applicationType_default_ratio,purpose_employmentLength_default_ratio,purpose_e
mploymentTitle_default_ratio,purpose_grade_default_ratio,purpose_homeOwnership_default_ratio,purpose_initialListStatus_default_ratio,purpose_postCode_default_ratio,purpose_regionCode_default_ratio,purpose_subGrade_default_ratio,purpose_title_default_ratio,purpose_verificationStatus_default_ratio,regionCode_applicationType_default_ratio,regionCode_employmentLength_default_ratio,regionCode_employmentTitle_default_ratio,regionCode_grade_default_ratio,regionCode_homeOwnership_default_ratio,regionCode_initialListStatus_default_ratio,regionCode_postCode_default_ratio,regionCode_purpose_default_ratio,regionCode_subGrade_default_ratio,regionCode_title_default_ratio,regionCode_verificationStatus_default_ratio,subGrade_applicationType_default_ratio,subGrade_employmentLength_default_ratio,subGrade_employmentTitle_default_ratio,subGrade_grade_default_ratio,subGrade_homeOwnership_default_ratio,subGrade_initialListStatus_default_ratio,subGrade_postCode_default_ratio,subGrade_purpose_default_ratio,subGrade_regionCode_default_ratio,subGrade_title_default_ratio,subGrade_verificationStatus_default_ratio,title_applicationType_default_ratio,title_employmentLength_default_ratio,title_employmentTitle_default_ratio,title_grade_default_ratio,title_homeOwnership_default_ratio,title_initialListStatus_default_ratio,title_postCode_default_ratio,title_purpose_default_ratio,title_regionCode_default_ratio,title_subGrade_default_ratio,title_verificationStatus_default_ratio,verificationStatus_applicationType_default_ratio,verificationStatus_employmentLength_default_ratio,verificationStatus_employmentTitle_default_ratio,verificationStatus_grade_default_ratio,verificationStatus_homeOwnership_default_ratio,verificationStatus_initialListStatus_default_ratio,verificationStatus_postCode_default_ratio,verificationStatus_purpose_default_ratio,verificationStatus_regionCode_default_ratio,verificationStatus_subGrade_default_ratio,verificationStatus_title_default_ratio
0,7,11500.0,3,14.98,398.54,3,17,214017.0,1.0,1,30000.0,2,2014-01-01,0.0,0,100.0,4,32.6,0.0,665.0,669.0,8.0,1.0,1.0,14021.0,59.7,33.0,1,0,Dec-1994,0.0,0.0,4.0,4.0,4.0,4.0,16.0,10.0,5.0,21.0,4.0,8.0,0.0,0.0,0.0,2.0,2014,1994,20,0.127778,980693,65671.0,1.0,283819,397051,416892,2675.0,580226,17777,55769,491400.0,311132,64749.0,1.0,277596,392585,413022,2643.0,568237,17402,54572,479722.0,303690,64749.0,1.0,18879.0,35375.0,27549.0,177.0,37576.0,1158.0,3681.0,31640.0,16447.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,277596,18879.0,1.0,118735,112799,746.0,172440,5049,55769,148236.0,95437,392585,35375.0,1.0,118735,175144,1151.0,236009,5884,23222,197906.0,118627,413022,27549.0,1.0,112799,175144,1113.0,243152,7272,21927,174478.0,146424,2643.0,177.0,1.0,746.0,1151.0,1113.0,1548.0,2669.0,146.0,1340.0,786.0,568237,37576.0,1.0,172440,236009,243152,1548.0,10295,33956,490766.0,189929,17402,1158.0,1.0,5049,5884,7272,2669.0,10295,1016,8837.0,5210,54572,3681.0,1.0,55769,23222,21927,146.0,33956,1016,29223.0,18964,479722.0,31640.0,1.0,148236.0,197906.0,174478.0,1340.0,490766.0,8837.0,29223.0,149446.0,303690,16447.0,1.0,95437,118627,146424,786.0,189929,5210,18964,149446.0,0.198536,0.204626,,0.225034,0.232049,0.195876,0.207456,0.21172,0.196593,0.224752,0.218223,0.237844,0.204445,,0.224211,0.230807,0.195213,0.206939,0.210594,0.196591,0.223914,0.217043,0.23662,0.204445,,0.22661,0.221516,0.192845,0.114286,0.22084,0.213592,0.232264,0.229994,0.242846,,,,,,,,,,,,0.224211,0.22661,,0.250539,0.203598,0.213992,0.228398,0.227315,0.224752,0.234171,0.232887,0.230807,0.221516,,0.250539,0.222757,0.228966,0.243642,0.233147,0.25368,0.253763,0.277685,0.195213,0.192845,,0.203598,0.222757,0.18732,0.207224,0.18906,0.202539,0.2217,0.226562,0.206939,0.114286,,0.213992,0.228966,0.18732,0.220145,0.207706,0.225806,0.227328,0.210953,0.210594,0.22084,,0.228398,0.243642,0.207224,0.220145,0.216059,0.227911,0.218239,0.249396,0.196591,0.213592,,0.227315,0.233147,0.18906,0.207706,0.216059,0.230882,0.224845,0.235669,0.223
914,0.232264,,0.224752,0.25368,0.202539,0.225806,0.227911,0.230882,0.23094,0.229047,0.217043,0.229994,,0.234171,0.253763,0.2217,0.227328,0.218239,0.224845,0.23094,0.263712,0.23662,0.242846,,0.232887,0.277685,0.226562,0.210953,0.249396,0.235669,0.229047,0.263712
1,18,6000.0,3,21.0,226.06,5,26,323019.0,10.0,1,40000.0,0,2012-12-01,1.0,3,140.0,8,28.95,3.0,660.0,664.0,6.0,0.0,0.0,6804.0,84.0,29.0,0,0,Aug-2001,49758.0,1.0,2.0,4.0,4.0,2.0,13.0,7.0,4.0,22.0,4.0,6.0,0.0,0.0,1.0,0.0,2012,2001,11,0.05,980693,328525.0,1.0,69671,397051,583108,2850.0,22021,145952,15921,1.0,301300,323591.0,1.0,68176,392585,567671,2793.0,21648,143521,15660,1.0,294993,323591.0,1.0,22877.0,92526.0,197111.0,1104.0,6054.0,47810.0,5181.0,1.0,98538.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,68176,22877.0,1.0,29645,33764,197.0,1323,9775,15921,1.0,9222,392585,92526.0,1.0,29645,221907,1578.0,10080,84134,6812,1.0,116590,567671,197111.0,1.0,33764,221907,1610.0,12636,83012,7209,1.0,178991,2793.0,1104.0,1.0,197.0,1578.0,1610.0,41.0,2847.0,48.0,1.0,880.0,21648,6054.0,1.0,1323,10080,12636,41.0,3092,295,1.0,8057,143521,47810.0,1.0,9775,84134,83012,2847.0,3092,2259,1.0,42763,15660,5181.0,1.0,15921,6812,7209,48.0,295,2259,1.0,2148,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,294993,98538.0,1.0,9222,116590,178991,880.0,8057,42763,2148,1.0,0.198536,0.187346,,0.384146,0.232049,0.202106,0.209616,0.187407,0.19619,0.376326,,0.147269,0.186913,,0.38323,0.230807,0.200948,0.208402,0.18682,0.195476,0.375929,,0.146011,0.186913,,0.370267,0.238519,0.188758,0.211618,0.164202,0.188666,0.363718,,0.13834,,,,,,,,,,,,0.38323,0.370267,,0.421298,0.413154,0.379845,0.427059,0.381937,0.376326,,0.33789,0.230807,0.238519,,0.421298,0.239382,0.255403,0.21932,0.230806,0.405313,,0.170902,0.200948,0.188758,,0.413154,0.239382,0.21165,0.19825,0.199891,0.406068,,0.148922,0.208402,0.211618,,0.379845,0.255403,0.21165,0.392857,0.208874,0.4375,,0.152727,0.18682,0.164202,,0.427059,0.21932,0.19825,0.392857,0.187936,0.401042,,0.128086,0.195476,0.188666,,0.381937,0.230806,0.199891,0.208874,0.187936,0.376518,,0.14642,0.375929,0.363718,,0.376326,0.405313,0.406068,0.4375,0.401042,0.376518,,0.342407,,,,,,,,,,,,0.146011,0.13834,,0.33789,0.170902,0.148922,0.152727,0.128086,0.14642,0.342407,
2,20,15500.0,5,15.99,376.85,3,19,369690.0,4.0,0,77000.0,0,2016-12-01,0.0,4,570.0,43,27.42,0.0,660.0,664.0,23.0,1.0,1.0,19752.0,72.4,57.0,0,0,May-2004,4.0,0.0,3.0,11.0,11.0,5.0,8.0,33.0,16.0,22.0,11.0,23.0,0.0,0.0,0.0,7.0,2016,2004,12,0.04026,980693,59818.0,1.0,283819,494678,583108,881.0,219331,9581,50189,185386.0,301300,58878.0,1.0,277596,481648,567671,870.0,216045,9360,48826,182191.0,294993,58878.0,1.0,16913.0,26662.0,34001.0,57.0,13143.0,539.0,3003.0,11064.0,18998.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,277596,16913.0,1.0,134304,171020,232.0,54829,2827,50189,46251.0,73966,481648,26662.0,1.0,134304,295596,470.0,104503,5947,23779,87773.0,152373,567671,34001.0,1.0,171020,295596,511.0,133573,5669,30997,125006.0,178991,870.0,57.0,1.0,232.0,470.0,511.0,227.0,880.0,39.0,185.0,301.0,216045,13143.0,1.0,54829,104503,133573,227.0,2217,8408,185061.0,74022,9360,539.0,1.0,2827,5947,5669,880.0,2217,499,1919.0,3008,48826,3003.0,1.0,50189,23779,30997,39.0,8408,499,7082.0,10946,182191.0,11064.0,1.0,46251.0,87773.0,125006.0,185.0,185061.0,1919.0,7082.0,62125.0,294993,18998.0,1.0,73966,152373,178991,301.0,74022,3008,10946,62125.0,0.198536,0.198392,,0.225034,0.171791,0.202106,0.174229,0.169163,0.21459,0.261863,0.174947,0.147269,0.197655,,0.224211,0.170815,0.200948,0.170055,0.168891,0.212355,0.260857,0.174682,0.146011,0.197655,,0.22255,0.171246,0.199342,0.142857,0.155978,0.187861,0.258131,0.161978,0.148008,,,,,,,,,,,,0.224211,0.22255,,0.200109,0.239058,0.153846,0.230153,0.234744,0.261863,0.239933,0.202993,0.170815,0.171246,,0.200109,0.172781,0.14966,0.144264,0.208861,0.236077,0.148995,0.126821,0.200948,0.199342,,0.239058,0.172781,0.188272,0.172847,0.217585,0.277009,0.174077,0.148922,0.170055,0.142857,,0.153846,0.14966,0.188272,0.157143,0.174229,0.047619,0.181034,0.117949,0.168891,0.155978,,0.230153,0.144264,0.172847,0.157143,0.179379,0.271701,0.174876,0.130746,0.212355,0.187861,,0.234744,0.208861,0.217585,0.174229,0.179379,0.254777,0.188216,0.152117,0.260857,0.258131,,0.261863,0.
236077,0.277009,0.047619,0.271701,0.254777,0.280211,0.237725,0.174682,0.161978,,0.239933,0.148995,0.174077,0.181034,0.174876,0.188216,0.280211,0.134411,0.146011,0.148008,,0.202993,0.126821,0.148922,0.117949,0.130746,0.152117,0.237725,0.134411
3,36,35000.0,5,22.7,980.65,5,29,240649.0,3.0,1,130000.0,2,2013-07-01,1.0,0,74.0,30,9.31,0.0,680.0,684.0,10.0,0.0,0.0,21322.0,54.7,27.0,1,0,Jun-1987,4208.0,0.0,6.0,7.0,7.0,6.0,16.0,6.0,9.0,21.0,7.0,10.0,0.0,0.0,0.0,3.0,2013,1987,26,0.053846,980693,80163.0,5.0,69671,397051,416892,8032.0,580226,35933,10767,11.0,311132,78956.0,5.0,68176,392585,413022,7954.0,568237,35480,10408,11.0,303690,78956.0,1.0,5549.0,38519.0,33570.0,696.0,45780.0,2838.0,866.0,1.0,22032.0,5.0,1.0,2.0,2.0,5.0,1.0,3.0,1.0,1.0,1.0,2.0,68176,5549.0,2.0,29645,35907,547.0,45076,2506,10767,2.0,31968,392585,38519.0,2.0,29645,175144,4746.0,236009,16857,4487,3.0,118627,413022,33570.0,5.0,35907,175144,3490.0,243152,15501,4881,9.0,146424,7954.0,696.0,1.0,547.0,4746.0,3490.0,4618.0,8021.0,87.0,1.0,2458.0,568237,45780.0,3.0,45076,236009,243152,4618.0,20678,6956,9.0,189929,35480,2838.0,1.0,2506,16857,15501,8021.0,20678,397,2.0,11367,10408,866.0,1.0,10767,4487,4881,87.0,6956,397,2.0,5211,11.0,1.0,1.0,2.0,3.0,9.0,1.0,9.0,2.0,2.0,3.0,303690,22032.0,2.0,31968,118627,146424,2458.0,189929,11367,5211,3.0,0.198536,0.201122,0.0,0.384146,0.232049,0.195876,0.210879,0.21172,0.212718,0.42188,0.0,0.237844,0.200748,0.0,0.38323,0.230807,0.195213,0.211091,0.210594,0.212361,0.420266,0.0,0.23662,0.200748,,0.400618,0.22341,0.19725,0.210162,0.210283,0.205592,0.451439,,0.240994,0.0,,0.0,,0.0,,0.0,,,,0.0,0.38323,0.400618,0.0,0.421298,0.356629,0.411243,0.394894,0.385714,0.42188,0.0,0.386101,0.230807,0.22341,,0.421298,0.222757,0.22981,0.243642,0.233646,0.457407,0.0,0.277685,0.195213,0.19725,0.0,0.356629,0.222757,0.203146,0.207224,0.205113,0.397186,0.0,0.226562,0.211091,0.210162,,0.411243,0.22981,0.203146,0.223783,0.211212,0.333333,,0.232764,0.210594,0.210283,0.0,0.394894,0.243642,0.207224,0.223783,0.229296,0.430985,0.0,0.249396,0.212361,0.205592,,0.385714,0.233646,0.205113,0.211212,0.229296,0.401515,,0.242997,0.420266,0.451439,,0.42188,0.457407,0.397186,0.333333,0.430985,0.401515,0.0,0.42733,0.0,,,0.0,0.0,0.0,,0.0,,0.0,0.0,0.23662,0.2409
94,0.0,0.386101,0.277685,0.226562,0.232764,0.249396,0.242997,0.42733,0.0
4,38,15000.0,5,19.24,391.1,5,26,104837.0,8.0,0,52000.0,1,2015-01-01,1.0,0,73.0,23,5.47,0.0,660.0,664.0,4.0,0.0,0.0,7017.0,92.3,15.0,0,0,Oct-2000,0.0,1.0,2.0,3.0,3.0,2.0,4.0,3.0,3.0,7.0,3.0,4.0,0.0,0.0,0.0,1.0,2015,2000,15,0.057692,980693,45168.0,1.0,69671,494678,583108,1751.0,580226,26246,15921,491400.0,387568,44648.0,1.0,68176,481648,567671,1730.0,568237,25767,15660,479722.0,382010,44648.0,1.0,3232.0,23891.0,25922.0,55.0,26734.0,1048.0,763.0,22551.0,17468.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,68176,3232.0,1.0,32390,33764,145.0,45076,1852,15921,38738.0,28481,481648,23891.0,1.0,32390,295596,735.0,287217,15378,7346,243453.0,182175,567671,25922.0,1.0,33764,295596,1030.0,337074,15398,7209,316922.0,239409,1730.0,55.0,1.0,145.0,735.0,1030.0,981.0,1751.0,34.0,851.0,650.0,568237,26734.0,1.0,45076,287217,337074,981.0,15146,10277,490766.0,225610,25767,1048.0,1.0,1852,15378,15398,1751.0,15146,431,13149.0,9837,15660,763.0,1.0,15921,7346,7209,34.0,10277,431,8741.0,6598,479722.0,22551.0,1.0,38738.0,243453.0,316922.0,851.0,490766.0,13149.0,8741.0,204118.0,382010,17468.0,1.0,28481,182175,239409,650.0,225610,9837,6598,204118.0,0.198536,0.19613,,0.384146,0.171791,0.202106,0.277076,0.21172,0.20776,0.376326,0.218223,0.209303,0.195624,,0.38323,0.170815,0.200948,0.276965,0.210594,0.20647,0.375929,0.217043,0.208764,0.195624,,0.396272,0.166645,0.1957,0.388889,0.209828,0.213752,0.383877,0.215904,0.206144,,,,,,,,,,,,0.38323,0.396272,,0.35052,0.413154,0.477778,0.394894,0.386441,0.376326,0.408332,0.396956,0.170815,0.166645,,0.35052,0.172781,0.256729,0.183975,0.187966,0.350042,0.188051,0.17822,0.200948,0.1957,,0.413154,0.172781,0.262751,0.214958,0.205838,0.406068,0.216309,0.210331,0.276965,0.388889,,0.477778,0.256729,0.262751,0.285024,0.277076,0.565217,0.277879,0.3,0.210594,0.209828,,0.394894,0.183975,0.214958,0.285024,0.217302,0.385426,0.218239,0.220577,0.20647,0.213752,,0.386441,0.187966,0.205838,0.277076,0.217302,0.371025,0.223472,0.210484,0.375929,0.383877,,0.376326,0.350042,0.40606
8,0.565217,0.385426,0.371025,0.399399,0.38615,0.217043,0.215904,,0.408332,0.188051,0.216309,0.277879,0.218239,0.223472,0.399399,0.223984,0.208764,0.206144,,0.396956,0.17822,0.210331,0.3,0.220577,0.210484,0.38615,0.223984


In [None]:
# # Function to reduce the memory usage
# def reduce_mem_usage(df, verbose=True):
#     numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
#     start_mem = df.memory_usage().sum() / 1024**2
#     for col in tqdm([f for f in df.columns if f not in ['query_time']]):
#         col_type = df[col].dtypes
#         if col_type in numerics:
#             c_min = df[col].min()
#             c_max = df[col].max()
#             if str(col_type)[:3] == 'int':
#                 if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(
#                         np.int8).max:
#                     df[col] = df[col].astype(np.int8)
#                 elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(
#                         np.int16).max:
#                     df[col] = df[col].astype(np.int16)
#                 elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(
#                         np.int32).max:
#                     df[col] = df[col].astype(np.int32)
#                 elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(
#                         np.int64).max:
#                     df[col] = df[col].astype(np.int64)
#             else:
#                 if c_min > np.finfo(np.float16).min and c_max < np.finfo(
#                         np.float16).max:
#                     df[col] = df[col].astype(np.float16)
#                 elif c_min > np.finfo(np.float32).min and c_max < np.finfo(
#                         np.float32).max:
#                     df[col] = df[col].astype(np.float32)
#                 else:
#                     df[col] = df[col].astype(np.float64)
#     end_mem = df.memory_usage().sum() / 1024**2
#     if verbose:
#         print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(
#             end_mem, 100 * (start_mem - end_mem) / start_mem))
#     return df

# df_feature = reduce_mem_usage(df_feature)

In [25]:
# Create the output directory if it does not exist yet, then persist the feature frame as a pickle.
os.makedirs('data', exist_ok=True)
df_feature.to_pickle('data/feature.pkl')