In [47]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

train_data = pd.read_csv('E:/Loan_default/train.csv')
test_data = pd.read_csv('E:/Loan_default/testA.csv')

In [48]:
label = train_data["isDefault"]

In [49]:
# 将训练集和测试集进行连接
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [50]:
# 首先对employmentLength进行转换到数值（就业年限）
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [51]:
# 对earliesCreditLine进行预处理（取年份，舍去月份）月份之后进行提取
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

In [52]:
# 获取数值型特征(int or float)
numerical_fea = list(train_data.select_dtypes(exclude=['object']).columns)
numerical_fea.remove("isDefault")
print("数值型特征：" + str(numerical_fea))
print()
# 类别特征
category_fea = list(filter(lambda x: x not in numerical_fea,list(train_data.columns)))
print("object类型特征：" + str(category_fea))

数值型特征：['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

object类型特征：['grade', 'subGrade', 'employmentLength', 'issueDate', 'isDefault', 'earliesCreditLine']


In [53]:
#按照平均数填充数值型特征
data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median())
data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median())
#按照众数填充类别型特征
data[category_fea] = data[category_fea].fillna(data[category_fea].mode())
data[category_fea] = data[category_fea].fillna(data[category_fea].mode())

In [54]:
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title']

for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903


In [55]:
# 类型数在2之上，又不是高维稀疏的转换成one-hot编码
data = pd.get_dummies(data, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [56]:
# 高维类别特征需要进行转换（labelEmcode）
for col in tqdm(['grade', 'subGrade', 'employmentTitle', 'postCode', 'title']):
    le = LabelEncoder()
    le.fit(list(data[col].astype(str).values))
    data[col] = le.transform(list(data[col].astype(str).values))
print('Label Encoding 完成')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:06<00:00,  1.37s/it]

Label Encoding 完成





In [57]:
#转化成时间格式
for data in [data]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

In [65]:
features = [f for f in data.columns if f not in ['id','issueDate', "policyCode", "isDefault"]]

In [66]:
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

In [67]:
train = train[features]
test = test[features]

In [69]:
def lgb_model(clf, train_x, train_y, test_x):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []
    
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        
        train_matrix = clf.Dataset(trn_x, label=trn_y)
        valid_matrix = clf.Dataset(val_x, label=val_y)

        params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
        }

        model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
        val_pred = model.predict(val_x, num_iteration=model.best_iteration)
        test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("scotrainre_list:" + str(cv_scores))
    print("score_mean:" + str(np.mean(cv_scores)))
    print("score_std:" + str(np.std(cv_scores)))
    return train, test
    

In [70]:
train, test = lgb_model(lgb,train,label,test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.747577	valid_1's auc: 0.73613
[400]	training's auc: 0.759683	valid_1's auc: 0.737034
[600]	training's auc: 0.769404	valid_1's auc: 0.737373
[800]	training's auc: 0.778812	valid_1's auc: 0.737147
Early stopping, best iteration is:
[629]	training's auc: 0.770887	valid_1's auc: 0.737409
[0.7374089726344591]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.748467	valid_1's auc: 0.732793
[400]	training's auc: 0.760113	valid_1's auc: 0.733569
[600]	training's auc: 0.770514	valid_1's auc: 0.733794
Early stopping, best iteration is:
[592]	training's auc: 0.770157	valid_1's auc: 0.73383
[0.7374089726344591, 0.733829998420346]
************************************ 3 ************************************
Training until validatio

In [71]:
test_data['isDefault'] = test

In [72]:
test_data[['id','isDefault']].to_csv('test_sub_tianchong.csv', index=False)