In [1]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

train = pd.read_csv('E:\\PythonWorkSpace\\HelloWorld\\train.csv')
testA = pd.read_csv('E:\\PythonWorkSpace\\HelloWorld\\testA.csv')

pd.set_option('display.max_columns', None)

# train.head()

data = pd.concat([train, testA], axis=0, ignore_index=True)

data.head()

  import pandas.util.testing as tm


Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2 years,2,110000.0,2,2014-07-01,1.0,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,Aug-2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5 years,0,46000.0,2,2012-08-01,0.0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,May-2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8 years,0,74000.0,2,2015-10-01,0.0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,May-2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10+ years,1,118000.0,1,2015-08-01,0.0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,May-1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,29000.0,2,2016-03-01,0.0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,Aug-1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [3]:
# 数据预处理
print(sorted(data['grade'].unique()))  #unique函数去除其中重复的元素，并按元素由大到小返回一个新的无元素重复的元组或者列表
print(sorted(data['subGrade'].unique()))

#可以看到很多变量不能直接训练，比如grade、subGrade、employmentLength、issueDate、earliesCreditLine，需要进行预处理

data['employmentLength'].value_counts(dropna=False).sort_index()
# value_counts()是一种查看表格某列中有多少个不同值的快捷方法,并计算每个不同值有在该列中有多少重复值。
# dropna = False 表示保留
# sort_index 索引排序

['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']


1 year        65671
10+ years    328525
2 years       90565
3 years       80163
4 years       59818
5 years       62645
6 years       46582
7 years       44230
8 years       45168
9 years       37866
< 1 year      80226
NaN           58541
Name: employmentLength, dtype: int64

In [4]:
# 首先将employmentLength（就业年限（年））转换成数值
# inplace = True：不创建新的对象，直接对原始对象进行修改
# 对符号进行单独处理
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True) 
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

# 处理字母
def employmentLength_to_int(s):
    # 是否为缺失值NaN
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0]) # [0]表示将空格前的数字进行切割

# 应用该函数
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)
# 再次查看
data['employmentLength'].value_counts(dropna=False).sort_index()

0.0      80226
1.0      65671
2.0      90565
3.0      80163
4.0      59818
5.0      62645
6.0      46582
7.0      44230
8.0      45168
9.0      37866
10.0    328525
NaN      58541
Name: employmentLength, dtype: int64

In [5]:
# 对earliesCreditLine（借款人最早报告的信用额度开立的月份）进行预处理
data['earliesCreditLine'].sample(5)

611865    Aug-2006
270321    Nov-1993
684230    May-2006
612096    Aug-1999
938903    Nov-1980
Name: earliesCreditLine, dtype: object

In [6]:
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:])) 
# 数组索引值的负数表示倒数的数，截取从倒4个到结尾
# lambda 匿名函数，传入s，冒号后是返回的结果
data['earliesCreditLine'].describe()

count    1000000.000000
mean        1998.688632
std            7.606231
min         1944.000000
25%         1995.000000
50%         2000.000000
75%         2004.000000
max         2015.000000
Name: earliesCreditLine, dtype: float64

In [7]:
data['earliesCreditLine'].value_counts(dropna=False).sort_index()

1944        2
1945        1
1946        2
1949        1
1950        7
        ...  
2011    12282
2012     8304
2013     4375
2014     1863
2015      251
Name: earliesCreditLine, Length: 70, dtype: int64

In [12]:
data.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,annualIncome,verificationStatus,issueDate,isDefault,purpose,postCode,regionCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2.0,2,110000.0,2,2014-07-01,1.0,1,137.0,32,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5.0,0,46000.0,2,2012-08-01,0.0,0,156.0,18,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8.0,0,74000.0,2,2015-10-01,0.0,0,337.0,14,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10.0,1,118000.0,1,2015-08-01,0.0,4,148.0,11,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,29000.0,2,2016-03-01,0.0,10,301.0,21,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [13]:
# 类别特征处理

# 部分类别特征
# grade  贷款等级
# subGrade  贷款等级之子级
# employmentTitle  就业职称
# homeOwnership  借款人在登记时提供的房屋所有权状况
# verificationStatus  验证状态
# purpose  借款人在贷款申请时的贷款用途类别
# postCode  借款人在贷款申请中提供的邮政编码的前3位数字
# regionCode  地区编码
# applicationType  表明贷款是个人申请还是与两个共同借款人的联合申请
# initialListStatus  贷款的初始列表状态
# title  借款人提供的贷款名称
# policyCode  公开可用的策略_代码=1，新产品不公开可用的策略_代码=2
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']

# 查看类别数量
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903
policyCode 类型数： 1


In [14]:
# 转换类型数在2之上，又不是高维稀疏的，拆解成新的列
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

data.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,employmentTitle,employmentLength,annualIncome,issueDate,isDefault,postCode,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,title,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,subGrade_A2,subGrade_A3,subGrade_A4,subGrade_A5,subGrade_B1,subGrade_B2,subGrade_B3,subGrade_B4,subGrade_B5,subGrade_C1,subGrade_C2,subGrade_C3,subGrade_C4,subGrade_C5,subGrade_D1,subGrade_D2,subGrade_D3,subGrade_D4,subGrade_D5,subGrade_E1,subGrade_E2,subGrade_E3,subGrade_E4,subGrade_E5,subGrade_F1,subGrade_F2,subGrade_F3,subGrade_F4,subGrade_F5,subGrade_G1,subGrade_G2,subGrade_G3,subGrade_G4,subGrade_G5,homeOwnership_1,homeOwnership_2,homeOwnership_3,homeOwnership_4,homeOwnership_5,verificationStatus_1,verificationStatus_2,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,purpose_11,purpose_12,purpose_13,regionCode_1,regionCode_2,regionCode_3,regionCode_4,regionCode_5,regionCode_6,regionCode_7,regionCode_8,regionCode_9,regionCode_10,regionCode_11,regionCode_12,regionCode_13,regionCode_14,regionCode_15,regionCode_16,regionCode_17,regionCode_18,regionCode_19,regionCode_20,regionCode_21,regionCode_22,regionCode_23,regionCode_24,regionCode_25,regionCode_26,regionCode_27,regionCode_28,regionCode_29,regionCode_30,regionCode_31,regionCode_32,regionCode_33,regionCode_34,regionCode_35,regionCode_36,regionCode_37,regionCode_38,regionCode_39,regionCode_40,regionCode_41,regionCode_42,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50
0,0,35000.0,5,19.52,917.97,320.0,2.0,110000.0,2014-07-01,1.0,137.0,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001,1.0,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,18000.0,5,18.49,461.9,219843.0,5.0,46000.0,2012-08-01,0.0,156.0,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002,1723.0,1.0,,,,,10.0,,,,,,13.0,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,2,12000.0,5,16.99,298.17,31698.0,8.0,74000.0,2015-10-01,0.0,337.0,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006,0.0,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,3,11000.0,3,7.26,340.96,46854.0,10.0,118000.0,2015-08-01,0.0,148.0,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999,4.0,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,4,3000.0,3,12.99,101.07,54.0,,29000.0,2016-03-01,0.0,301.0,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977,11.0,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# 高维类别特征需要进行转换
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count') # 保持了与原始数据集相同数量的项目，以总数量为值
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int) # 根据出现的数量分级
    # ascending=False代表降序
    # rank默认升序，数量多的越后面，例如
    # In [120]:obj = pd.Series([7,-5,7,4,2,0,4])
    # In [121]:obj.rank()
    # Out [121]:
    # 0    6.5        # index=0代表的是obj第一个数7,7在obj中排名第6和第7，取平均值6.5
    # 1    1.0        # index=1代表obj第二个数-5.-5在obj中排名第1，取值1
    # 2    6.5        # index=3依次计算，类推
    # 3    4.5
    # 4    3.0
    # 5    2.0
    # 6    4.5
    # astype，变量类型转换
    # dtype: float64
    del data[f] #删除原来的列
    
data.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,employmentLength,annualIncome,issueDate,isDefault,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,subGrade_A2,subGrade_A3,subGrade_A4,subGrade_A5,subGrade_B1,subGrade_B2,subGrade_B3,subGrade_B4,subGrade_B5,subGrade_C1,subGrade_C2,subGrade_C3,subGrade_C4,subGrade_C5,subGrade_D1,subGrade_D2,subGrade_D3,subGrade_D4,subGrade_D5,subGrade_E1,subGrade_E2,subGrade_E3,subGrade_E4,subGrade_E5,subGrade_F1,subGrade_F2,subGrade_F3,subGrade_F4,subGrade_F5,subGrade_G1,subGrade_G2,subGrade_G3,subGrade_G4,subGrade_G5,homeOwnership_1,homeOwnership_2,homeOwnership_3,homeOwnership_4,homeOwnership_5,verificationStatus_1,verificationStatus_2,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,purpose_11,purpose_12,purpose_13,regionCode_1,regionCode_2,regionCode_3,regionCode_4,regionCode_5,regionCode_6,regionCode_7,regionCode_8,regionCode_9,regionCode_10,regionCode_11,regionCode_12,regionCode_13,regionCode_14,regionCode_15,regionCode_16,regionCode_17,regionCode_18,regionCode_19,regionCode_20,regionCode_21,regionCode_22,regionCode_23,regionCode_24,regionCode_25,regionCode_26,regionCode_27,regionCode_28,regionCode_29,regionCode_30,regionCode_31,regionCode_32,regionCode_33,regionCode_34,regionCode_35,regionCode_36,regionCode_37,regionCode_38,regionCode_39,regionCode_40,regionCode_41,regionCode_42,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50,employmentTitle_cnts,employmentTitle_rank,postCode_cnts,postCode_rank,title_cnts,title_rank
0,0,35000.0,5,19.52,917.97,2.0,110000.0,2014-07-01,1.0,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1392.0,1392,2646.0,2646,8687.0,8687
1,1,18000.0,5,18.49,461.9,5.0,46000.0,2012-08-01,0.0,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002,1.0,,,,,10.0,,,,,,13.0,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,151.0,151,4751.0,4751,37.0,37
2,2,12000.0,5,16.99,298.17,8.0,74000.0,2015-10-01,0.0,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,2,2167.0,2167,491400.0,491400
3,3,11000.0,3,7.26,340.96,10.0,118000.0,2015-08-01,0.0,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,2,689.0,689,185386.0,185386
4,4,3000.0,3,12.99,101.07,,29000.0,2016-03-01,0.0,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63978.0,63978,2161.0,2161,5896.0,5896


In [16]:
# 训练数据/测试数据准备

In [17]:
features = [f for f in data.columns if f not in ['id','issueDate','isDefault']] # 特征值，除了贷款清单分配的唯一信用证标识、贷款发放的月份、是否有违约的可能

# 数据清洗时，会将带空值的行删除，此时DataFrame或Series类型的数据不再是连续的索引，可以使用reset_index()重置索引。
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault'] # 将训练集索引对应的 是否有违约的可能 isDefault提取出来

In [18]:
x_train.head()

Unnamed: 0,loanAmnt,term,interestRate,installment,employmentLength,annualIncome,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,subGrade_A2,subGrade_A3,subGrade_A4,subGrade_A5,subGrade_B1,subGrade_B2,subGrade_B3,subGrade_B4,subGrade_B5,subGrade_C1,subGrade_C2,subGrade_C3,subGrade_C4,subGrade_C5,subGrade_D1,subGrade_D2,subGrade_D3,subGrade_D4,subGrade_D5,subGrade_E1,subGrade_E2,subGrade_E3,subGrade_E4,subGrade_E5,subGrade_F1,subGrade_F2,subGrade_F3,subGrade_F4,subGrade_F5,subGrade_G1,subGrade_G2,subGrade_G3,subGrade_G4,subGrade_G5,homeOwnership_1,homeOwnership_2,homeOwnership_3,homeOwnership_4,homeOwnership_5,verificationStatus_1,verificationStatus_2,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,purpose_11,purpose_12,purpose_13,regionCode_1,regionCode_2,regionCode_3,regionCode_4,regionCode_5,regionCode_6,regionCode_7,regionCode_8,regionCode_9,regionCode_10,regionCode_11,regionCode_12,regionCode_13,regionCode_14,regionCode_15,regionCode_16,regionCode_17,regionCode_18,regionCode_19,regionCode_20,regionCode_21,regionCode_22,regionCode_23,regionCode_24,regionCode_25,regionCode_26,regionCode_27,regionCode_28,regionCode_29,regionCode_30,regionCode_31,regionCode_32,regionCode_33,regionCode_34,regionCode_35,regionCode_36,regionCode_37,regionCode_38,regionCode_39,regionCode_40,regionCode_41,regionCode_42,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50,employmentTitle_cnts,employmentTitle_rank,postCode_cnts,postCode_rank,title_cnts,title_rank
0,35000.0,5,19.52,917.97,2.0,110000.0,17.05,0.0,730.0,734.0,7.0,0.0,0.0,24178.0,48.9,27.0,0,0,2001,1.0,0.0,2.0,2.0,2.0,4.0,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1392.0,1392,2646.0,2646,8687.0,8687
1,18000.0,5,18.49,461.9,5.0,46000.0,27.83,0.0,700.0,704.0,13.0,0.0,0.0,15096.0,38.9,18.0,1,0,2002,1.0,,,,,10.0,,,,,,13.0,,,,,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,151.0,151,4751.0,4751,37.0,37
2,12000.0,5,16.99,298.17,8.0,74000.0,22.77,0.0,675.0,679.0,11.0,0.0,0.0,4606.0,51.8,27.0,0,0,2006,1.0,0.0,0.0,3.0,3.0,0.0,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,2,2167.0,2167,491400.0,491400
3,11000.0,3,7.26,340.96,10.0,118000.0,17.21,0.0,685.0,689.0,9.0,0.0,0.0,9948.0,52.6,28.0,1,0,1999,1.0,6.0,4.0,6.0,6.0,4.0,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,2,689.0,689,185386.0,185386
4,3000.0,3,12.99,101.07,,29000.0,32.16,0.0,690.0,694.0,12.0,0.0,0.0,2942.0,32.0,27.0,0,0,1977,1.0,1.0,2.0,7.0,7.0,2.0,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,63978.0,63978,2161.0,2161,5896.0,5896


In [19]:
x_test.head()

Unnamed: 0,loanAmnt,term,interestRate,installment,employmentLength,annualIncome,dti,delinquency_2years,ficoRangeLow,ficoRangeHigh,openAcc,pubRec,pubRecBankruptcies,revolBal,revolUtil,totalAcc,initialListStatus,applicationType,earliesCreditLine,policyCode,n0,n1,n2,n3,n4,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14,grade_B,grade_C,grade_D,grade_E,grade_F,grade_G,subGrade_A2,subGrade_A3,subGrade_A4,subGrade_A5,subGrade_B1,subGrade_B2,subGrade_B3,subGrade_B4,subGrade_B5,subGrade_C1,subGrade_C2,subGrade_C3,subGrade_C4,subGrade_C5,subGrade_D1,subGrade_D2,subGrade_D3,subGrade_D4,subGrade_D5,subGrade_E1,subGrade_E2,subGrade_E3,subGrade_E4,subGrade_E5,subGrade_F1,subGrade_F2,subGrade_F3,subGrade_F4,subGrade_F5,subGrade_G1,subGrade_G2,subGrade_G3,subGrade_G4,subGrade_G5,homeOwnership_1,homeOwnership_2,homeOwnership_3,homeOwnership_4,homeOwnership_5,verificationStatus_1,verificationStatus_2,purpose_1,purpose_2,purpose_3,purpose_4,purpose_5,purpose_6,purpose_7,purpose_8,purpose_9,purpose_10,purpose_11,purpose_12,purpose_13,regionCode_1,regionCode_2,regionCode_3,regionCode_4,regionCode_5,regionCode_6,regionCode_7,regionCode_8,regionCode_9,regionCode_10,regionCode_11,regionCode_12,regionCode_13,regionCode_14,regionCode_15,regionCode_16,regionCode_17,regionCode_18,regionCode_19,regionCode_20,regionCode_21,regionCode_22,regionCode_23,regionCode_24,regionCode_25,regionCode_26,regionCode_27,regionCode_28,regionCode_29,regionCode_30,regionCode_31,regionCode_32,regionCode_33,regionCode_34,regionCode_35,regionCode_36,regionCode_37,regionCode_38,regionCode_39,regionCode_40,regionCode_41,regionCode_42,regionCode_43,regionCode_44,regionCode_45,regionCode_46,regionCode_47,regionCode_48,regionCode_49,regionCode_50,employmentTitle_cnts,employmentTitle_rank,postCode_cnts,postCode_rank,title_cnts,title_rank
0,14000.0,3,10.99,458.28,10.0,80000.0,10.56,1.0,715.0,719.0,17.0,0.0,0.0,9846.0,30.7,29.0,0,0,1974,1.0,1.0,4.0,6.0,6.0,6.0,8.0,4.0,15.0,19.0,6.0,17.0,0.0,0.0,1.0,3.0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,50.0,9,998.0,202,491400.0,98066
1,20000.0,5,14.65,472.14,10.0,50000.0,21.4,2.0,670.0,674.0,5.0,0.0,0.0,8946.0,56.6,14.0,0,0,2001,1.0,2.0,1.0,3.0,3.0,1.0,1.0,3.0,3.0,9.0,3.0,5.0,0.0,0.0,2.0,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1,5954.0,1172,55782.0,11034
2,12000.0,3,19.99,445.91,2.0,60000.0,33.5,0.0,710.0,714.0,12.0,0.0,0.0,970.0,17.6,43.0,1,0,2006,1.0,0.0,1.0,4.0,4.0,1.0,1.0,36.0,5.0,6.0,4.0,12.0,0.0,0.0,0.0,7.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2.0,1,1568.0,300,491400.0,98065
3,17500.0,5,14.31,410.02,4.0,37000.0,13.95,0.0,685.0,689.0,10.0,1.0,1.0,10249.0,52.3,18.0,0,0,2002,1.0,0.0,2.0,2.0,2.0,4.0,7.0,2.0,8.0,14.0,2.0,10.0,0.0,0.0,0.0,3.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,39.0,8,1689.0,340,185386.0,37175
4,35000.0,3,17.09,1249.42,0.0,80000.0,24.97,0.0,685.0,689.0,19.0,0.0,0.0,33199.0,35.6,22.0,0,0,2000,1.0,0.0,8.0,11.0,11.0,9.0,11.0,3.0,16.0,18.0,11.0,19.0,0.0,0.0,0.0,1.0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1.0,1,2616.0,536,491400.0,98064


In [20]:
y_train.head()

0    1.0
1    0.0
2    0.0
3    0.0
4    0.0
Name: isDefault, dtype: float64

In [21]:
# clf、clf_name表示选择的模型，train_x 训练集，y_train是训练集对应的是否有违约可能的数据集，test_x是测试集
def cv_model(clf, train_x, train_y, test_x, clf_name): 
    folds = 5 
    seed = 2020
    # k-交叉验证KFold，数据集被分成5份，4份训练集和1份验证集
    # n_splits分成的份数，shuffle表示是否打乱划分，random_state 表示是否固定随机起点
    # 减少出现过拟合的问题，就是模型可以很好的匹配训练数据，却不能很好在预测训练集外的数据
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # shape[0]就是读取矩阵第一维度的长度，numpy.zeros返回一个给定大小和类型（默认numpy.float64）的用0填充的数组
    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []
    
    # KFold.split 返回的是索引数组
    # from sklearn.model_selection import KFold
    # kf = KFold(n_splits=5, random_state=43, shuffle=True)
    # a=[[1,2],[3,4],[5,6],[7,8],[9,10]]
    # b=[1,2,3,4,5]
    # for i,j in kf.split(a,b):
    #     print(i,j)
    # [0 1 2 4] [3]
    # [0 1 3 4] [2]
    # [0 2 3 4] [1]
    # [1 2 3 4] [0]
    # [0 1 2 3] [4]
    
    # 对交叉验证KFold分成的五份数据进行遍历，train_index  4个训练集的索引集合，1个验证集的索引集合
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        # iloc是基于索引位来选取数据集，例如 iloc[0:4] 就是选取 0，1，2，3这四行，需要注意的是这里是前闭后开集合
        # trn_x 训练集选择的行
        # trn_y 训练集选择的行对应的违约可能性
        # val_x 验证集选择的行
        # val_y 验证集选择的行对应的违约可能性
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]
        
        # lightgbm模型
        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y) # 训练矩阵
            valid_matrix = clf.Dataset(val_x, label=val_y) # 验证矩阵

            params = {
                'boosting_type': 'gbdt', # 训练方式
                'objective': 'binary', #目标 二分类
                'metric': 'auc', # 损失函数
                'min_child_weight': 5, # 叶子上的最小样本数
                'num_leaves': 2 ** 5, # 一棵树的叶数，2的5次方
                'lambda_l2': 10, # 一个浮点数，表示L2正则化系数。默认为0
                'feature_fraction': 0.8, # 随机选择 80% 的特征在每次迭代中进行训练
                'bagging_fraction': 0.8, # 装袋法，每次迭代时用的数据比例
                'bagging_freq': 4, # 每 4 次迭代执行一次装袋
                'learning_rate': 0.1, # 学习率过小时，收敛过程将变得十分缓慢。而当学习率过大时，梯度可能会在最小值附近来回震荡，甚至可能无法收敛
                'seed': 2020, # 随机数种子
                'nthread': 28, # 线程
                'n_jobs':24, # 工作的core数量
                'silent': True, # 没有运行信息输出
                'verbose': -1, # <0 显示致命的, =0 显示错误 (警告), >0 显示信息
            }

            # 迭代的数目 50000（训练集总共80w条，测试集总共20w条）
            # valid_sets 用于对训练过程中进行评估列表中的元素
            # verbose_eval如果为True ,则对valid_sets中元素的评估结果会输出在结果中；如果输入数字，假设为5，则每隔5个迭代输出一次。
            # early_stopping_rounds,早期停止次数，假设为200，验证集的误差迭代到一定程度在200次内不能再继续降低，就停止迭代。
            
            # 使用交叉验证训练
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            # 验证集预测
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            # 测试集预测
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            # 使用交叉验证训练
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        # 记录训练集（实则为训练集的交叉验证的验证集）的预测结果
        train[valid_index] = val_pred
        print(val_pred)
        # 记录测试集的预测结果
        test = test_pred / kf.n_splits
        print(test_pred)
        print(test)
        # 记录验证集的预测结果和验证集的实际结果
        # ROC(Receiver Operating Characteristic)受试者工作特征曲线
        # AUC(Area Under the Curve)ROC曲线下与坐标轴围成的面积
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        # 准确率
        print(cv_scores)
    
    # 五次交叉验证的准确率
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    # 平均值
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    # 标准差
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [22]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_train, xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_train, xgb_test

def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat") 
    return cat_train, cat_test

In [23]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742898	valid_1's auc: 0.730406
[400]	training's auc: 0.755553	valid_1's auc: 0.731185
[600]	training's auc: 0.766567	valid_1's auc: 0.731421
[800]	training's auc: 0.77656	valid_1's auc: 0.731297
Early stopping, best iteration is:
[658]	training's auc: 0.769561	valid_1's auc: 0.731571
[0.48105957 0.12368071 0.09843215 ... 0.36348375 0.0528753  0.06277614]
[0.06344998 0.33361567 0.54264037 ... 0.15151295 0.21221886 0.01837744]
[0.01269    0.06672313 0.10852807 ... 0.03030259 0.04244377 0.00367549]
[0.7315707699391985]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.743889	valid_1's auc: 0.726598
[400]	training's auc: 0.756346	valid_1's auc: 0.727829
[600]	training's auc: 0.767237	valid_1's auc: 0.728122
[800]	trainin

In [39]:
lgb_test

array([0.01638982, 0.06136415, 0.12579608, ..., 0.04464922, 0.04525388,
       0.00501263])

In [21]:
xgb_train, xgb_test = xgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69631	eval-auc:0.69783
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.72779	eval-auc:0.72438
[400]	train-auc:0.73550	eval-auc:0.72842
[600]	train-auc:0.74080	eval-auc:0.73043
[800]	train-auc:0.74513	eval-auc:0.73153
[1000]	train-auc:0.74885	eval-auc:0.73232
[1200]	train-auc:0.75235	eval-auc:0.73297
[1400]	train-auc:0.75551	eval-auc:0.73337
[1600]	train-auc:0.75864	eval-auc:0.73367
[1800]	train-auc:0.76152	eval-auc:0.73390
[2000]	train-auc:0.76443	eval-auc:0.73414
[2200]	train-auc:0.76701	eval-auc:0.73424
[24

In [40]:
xgb_test

array([0.01505861, 0.0648168 , 0.12824766, ..., 0.03895742, 0.04455039,
       0.00432092], dtype=float32)

In [41]:
rh_test = lgb_test*0.5 + xgb_test*0.5

In [42]:
rh_test

array([0.01572422, 0.06309048, 0.12702187, ..., 0.04180332, 0.04490213,
       0.00466677])

In [43]:
testA['isDefault'] = rh_test

In [44]:
testA[['id','isDefault']].to_csv('test_sub.csv', index=False)