In [1]:
#罗官 20184352120 18软智01班
#天池大数据竞赛——贷款违约预测 网址:https://tianchi.aliyun.com/competition/entrance/531830/introduction
#训练集：train.csv 测试集：testA.csv
###################################################################################################
###################################################################################################
#导入包
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import time
import warnings
warnings.filterwarnings('ignore')

In [2]:
#导入数据集
train = pd.read_csv('train.csv')
print('训练集大小：',train.shape)
testA = pd.read_csv('testA.csv')
print('测试集大小：',testA.shape)

训练集大小： (800000, 47)
测试集大小： (200000, 46)


In [3]:
numerical_fea = list(train.select_dtypes(exclude=['object']).columns)
category_fea = list(filter(lambda x: x not in numerical_fea,list(train.columns)))
label = 'isDefault'
numerical_fea.remove(label)

In [4]:
#查看缺省值情况
train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           1
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  1
regionCode                0
dti                     239
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies      405
revolBal                  0
revolUtil               531
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     1
policyCode                0
n0                    40270
n1                    40270
n2                    40270
n3                  

In [5]:
#按照中位数填充数值型特征
train[numerical_fea] = train[numerical_fea].fillna(train[numerical_fea].median())
testA[numerical_fea] = testA[numerical_fea].fillna(train[numerical_fea].median())
#按照众数填充类别型特征
train[category_fea] = train[category_fea].fillna(train[category_fea].mode())
testA[category_fea] = testA[category_fea].fillna(train[category_fea].mode())

In [6]:
#查看更新后的缺省值情况
train.isnull().sum()

id                        0
loanAmnt                  0
term                      0
interestRate              0
installment               0
grade                     0
subGrade                  0
employmentTitle           0
employmentLength      46799
homeOwnership             0
annualIncome              0
verificationStatus        0
issueDate                 0
isDefault                 0
purpose                   0
postCode                  0
regionCode                0
dti                       0
delinquency_2years        0
ficoRangeLow              0
ficoRangeHigh             0
openAcc                   0
pubRec                    0
pubRecBankruptcies        0
revolBal                  0
revolUtil                 0
totalAcc                  0
initialListStatus         0
applicationType           0
earliesCreditLine         0
title                     0
policyCode                0
n0                        0
n1                        0
n2                        0
n3                  

In [7]:
data = pd.concat([train, testA], axis=0, ignore_index=True)
data.head()
print('合并后的数据集大小:',data.shape)

合并后的数据集大小: (1000000, 47)


In [8]:
#数据类型
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 47 columns):
 #   Column              Non-Null Count    Dtype  
---  ------              --------------    -----  
 0   id                  1000000 non-null  int64  
 1   loanAmnt            1000000 non-null  float64
 2   term                1000000 non-null  int64  
 3   interestRate        1000000 non-null  float64
 4   installment         1000000 non-null  float64
 5   grade               1000000 non-null  object 
 6   subGrade            1000000 non-null  object 
 7   employmentTitle     1000000 non-null  float64
 8   employmentLength    941459 non-null   object 
 9   homeOwnership       1000000 non-null  int64  
 10  annualIncome        1000000 non-null  float64
 11  verificationStatus  1000000 non-null  int64  
 12  issueDate           1000000 non-null  object 
 13  isDefault           800000 non-null   float64
 14  purpose             1000000 non-null  int64  
 15  postCode        

In [9]:
#unique()函数：对于一维数组或者列表，unique函数去除其中重复的元素，并按元素由大到小返回一个新的无元素重复的元组或者列表
#返回数据集 grade列 和 subGrade列按从大到小顺序排序
print(sorted(data['grade'].unique()))
print(sorted(data['subGrade'].unique()))

['A', 'B', 'C', 'D', 'E', 'F', 'G']
['A1', 'A2', 'A3', 'A4', 'A5', 'B1', 'B2', 'B3', 'B4', 'B5', 'C1', 'C2', 'C3', 'C4', 'C5', 'D1', 'D2', 'D3', 'D4', 'D5', 'E1', 'E2', 'E3', 'E4', 'E5', 'F1', 'F2', 'F3', 'F4', 'F5', 'G1', 'G2', 'G3', 'G4', 'G5']


In [10]:
#value_counts()是一种查看表格某列中有多少个不同值的快捷方法，并计算每个不同值有在该列中有多少重复值。
#sort_index()排序
#返回特征employmentLength的不同值
data['employmentLength'].value_counts(dropna=False).sort_index()

1 year        65671
10+ years    328525
2 years       90565
3 years       80163
4 years       59818
5 years       62645
6 years       46582
7 years       44230
8 years       45168
9 years       37866
< 1 year      80226
NaN           58541
Name: employmentLength, dtype: int64

In [11]:
# - 首先对employmentLength进行转换到数值
#将>10 years替换成10 years,将 <1 years替换成 0 years
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [12]:
#对转换后的特征值进行统计排序
data['employmentLength'].value_counts(dropna=False).sort_index()

0.0      80226
1.0      65671
2.0      90565
3.0      80163
4.0      59818
5.0      62645
6.0      46582
7.0      44230
8.0      45168
9.0      37866
10.0    328525
NaN      58541
Name: employmentLength, dtype: int64

In [13]:
data['employmentLength'] = data['employmentLength'].fillna(data['employmentLength'].mean())

In [14]:
# - 对earliesCreditLine进行预处理
#sample():从序列a中随机抽取n个元素，并将n个元素生以list形式返回。
data['earliesCreditLine'].sample(5)

524964    Sep-2006
931197    Apr-2003
703820    Apr-1986
161184    Sep-2007
468782    Mar-2004
Name: earliesCreditLine, dtype: object

In [15]:
#缺省值处理
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

In [16]:
#describe():返回pandas的Series和DataFrame这两个核心数据结构的统计变量。
data['earliesCreditLine'].describe()

count    1000000.000000
mean        1998.688632
std            7.606231
min         1944.000000
25%         1995.000000
50%         2000.000000
75%         2004.000000
max         2015.000000
Name: earliesCreditLine, dtype: float64

In [17]:
#返回data前5行数据，employmentLength已经进行了预处理 
data.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2.0,2,...,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5.0,0,...,7.0,7.0,7.0,13.0,5.0,13.0,0.0,0.0,0.0,2.0
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8.0,0,...,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10.0,1,...,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,5.965504,1,...,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [18]:
# - 类别特征处理
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', \
                 'applicationType', 'initialListStatus', 'title', 'policyCode']
for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903
policyCode 类型数： 1


In [19]:
#类型数在2之上，又不是高维稀疏的,且纯分类特征
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [20]:
# 高维类别特征需要进行转换
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]

In [21]:
#训练数据/测试数据准备
features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]

train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']

In [22]:
#模型训练
#直接构建了一个函数，可以调用两种树模型，方便快捷
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []
#lightbgm模型
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)
            #参数设置
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
     #catboost模型           
        if clf_name == "cat":
            #参数设置
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            #迭代20000次
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return train, test

In [23]:
def lgb_model(x_train, y_train, x_test):
    lgb_train, lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_train, lgb_test
def cat_model(x_train, y_train, x_test):
    cat_train, cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat") 
    return cat_train, cat_test

In [24]:
lgb_train, lgb_test = lgb_model(x_train, y_train, x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.742265	valid_1's auc: 0.729966
[400]	training's auc: 0.754881	valid_1's auc: 0.731007
[600]	training's auc: 0.765341	valid_1's auc: 0.731189
[800]	training's auc: 0.775294	valid_1's auc: 0.731062
Early stopping, best iteration is:
[642]	training's auc: 0.767542	valid_1's auc: 0.731263
[('interestRate', 151147.80319571495), ('ficoRangeLow', 19137.154885053635), ('dti', 19032.258338928223), ('term', 18235.050982952118), ('annualIncome', 15239.957035541534), ('revolBal', 13044.770475387573), ('loanAmnt', 11443.995622634888), ('installment', 11363.327191114426), ('homeOwnership_1', 10882.65342092514), ('employmentTitle_cnts', 9811.251173257828), ('title_cnts', 9260.674059867859), ('earliesCreditLine', 8759.672102689743), ('revolUtil', 8075.976095199585), ('n2', 7459.235381364822), ('n14', 6951.058540821075), ('grade_B', 6672.29139

In [25]:
cat_train, cat_test = cat_model(x_train, y_train, x_test)

************************************ 1 ************************************
0:	learn: 0.3985252	test: 0.3966187	best: 0.3966187 (0)	total: 244ms	remaining: 1h 21m 10s
500:	learn: 0.3772831	test: 0.3760231	best: 0.3760231 (500)	total: 35.7s	remaining: 23m 8s
1000:	learn: 0.3757322	test: 0.3752631	best: 0.3752631 (1000)	total: 1m 10s	remaining: 22m 21s
1500:	learn: 0.3746605	test: 0.3749365	best: 0.3749365 (1500)	total: 1m 45s	remaining: 21m 39s
2000:	learn: 0.3737442	test: 0.3747361	best: 0.3747351 (1998)	total: 2m 20s	remaining: 21m 4s
2500:	learn: 0.3729246	test: 0.3746068	best: 0.3746065 (2499)	total: 2m 55s	remaining: 20m 24s
3000:	learn: 0.3721496	test: 0.3745194	best: 0.3745185 (2999)	total: 3m 31s	remaining: 19m 58s
3500:	learn: 0.3714302	test: 0.3744597	best: 0.3744587 (3489)	total: 4m 5s	remaining: 19m 18s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3744586891
bestIteration = 3489

Shrink model to first 3490 iterations.
[0.7322693541754786]
*************

In [26]:
rh_test = lgb_test*0.5 + cat_test*0.5

In [27]:
testA['isDefault'] = rh_test

In [28]:
testA[['id','isDefault']].to_csv('test_sub.csv', index=False)

In [29]:
testA_result = pd.read_csv('test_sub.csv')

In [30]:
from sklearn.metrics import r2_score
score= r2_score(testA_result['isDefault'].values, lgb_test)
print('拟合效果:',score)

拟合效果: 0.9858982020890855


In [32]:
from sklearn.linear_model import LogisticRegression 
lr = LogisticRegression()
lr.fit(x_train, y_train)
print("训练准确度(acc): %f" %(lr.score(x_train, y_train)))

训练准确度(acc): 0.800456
