In [25]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
# 数据读取
train_data = pd.read_csv('I:/Loan_default/train.csv')
test_data = pd.read_csv('I:/Loan_default/testA.csv')

In [26]:
# 标签
label = train_data["isDefault"]

In [27]:
# 将训练集和测试集进行连接
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [28]:
"""
id	为贷款清单分配的唯一信用证标识
loanAmnt	贷款金额    ----连续变量，直接使用
term	贷款期限（year） ----离散变量，直接使用
interestRate	贷款利率 ----连续型变量，直接使用
installment	分期付款金额   ----连续型变量，直接使用
grade	贷款等级    ----有优先级的，用labelEncode编码
subGrade	贷款等级之子级  ----有优先级的，用labelEncode编码
employmentTitle	就业职称   ----类别特征，但是有29W种，使用labelEncode编码(暂时不用)
employmentLength	就业年限（年）   ----转换成0-10的int类型
homeOwnership	借款人在登记时提供的房屋所有权状况  ----离散变量，直接使用
annualIncome	年收入 ----连续型变量，直接使用
verificationStatus	验证状态  ----离散型变量，直接使用
issueDate	贷款发放的月份   ----提取时间特征
purpose	借款人在贷款申请时的贷款用途类别  ----类别特征，无优先级，onehot
postCode	借款人在贷款申请中提供的邮政编码的前3位数字  ----类别特征，无优先级，onehot(暂时不用)
regionCode	地区编码  ----类别特征，无优先级，onehot
dti	债务收入比   ----连续型特征，直接使用
delinquency_2years	借款人过去2年信用档案中逾期30天以上的违约事件数   ----连续型特征，直接使用
ficoRangeLow	借款人在贷款发放时的fico所属的下限范围  ----连续型变量
ficoRangeHigh	借款人在贷款发放时的fico所属的上限范围  ----连续型变量--可以提取平均值特征
openAcc	借款人信用档案中未结信用额度的数量   ----连续特征，直接使用
pubRec	贬损公共记录的数量 -----连续特征，直接使用
pubRecBankruptcies	公开记录清除的数量  ----连续特征，直接使用
revolBal	信贷周转余额合计    ----连续特征，直接使用
revolUtil	循环额度利用率，或借款人使用的相对于所有可用循环信贷的信贷金额  ----连续特征，直接使用
totalAcc	借款人信用档案中当前的信用额度总数 ----连续特征，直接使用
initialListStatus	贷款的初始列表状态   ----离散型变量，直接使用
applicationType	表明贷款是个人申请还是与两个共同借款人的联合申请 ----离散型变量，直接使用
earliesCreditLine	借款人最早报告的信用额度开立的月份 ----提取年份和月份
title	借款人提供的贷款名称    ----暂时不用
policyCode	公开可用的策略_代码=1新产品不公开可用的策略_代码=2   ----全部为1无用特征
n系列匿名特征	匿名特征n0-n14，为一些贷款人行为计数特征的处理
n11和n12相差悬殊暂时舍弃该特征
其余特征为连续特征直接使用
"""
print()




In [29]:
# 首先对employmentLength进行转换到数值（就业年限）
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [30]:
# 对earliesCreditLine进行预处理取年份和月份，年份直接使用int类型，月份转换成one-hot编码
data['year'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['month'] = data['earliesCreditLine'].apply(lambda x: str(x[0:3]))

In [31]:
# （labelEncode）
for col in tqdm(['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title','month']):
    le = LabelEncoder()
    le.fit(list(data[col].astype(str).values))
    data[col] = le.transform(list(data[col].astype(str).values))
print('Label Encoding 完成')

100%|██████████████████████████████████████████████████████████████████████████████████| 12/12 [00:08<00:00,  1.37it/s]

Label Encoding 完成





In [32]:
#转化成时间格式
for data in [data]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

In [35]:
# fico特征
data["ficoRangeAve"] = data["ficoRangeLow"] + (data["ficoRangeHigh"] - data["ficoRangeLow"])/2

In [36]:
data = data.drop(["issueDate", "earliesCreditLine"],axis=1)

In [37]:
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

train = train.drop(["isDefault"], axis=1)
test = test.drop(["isDefault"], axis=1)

In [40]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 1108
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'tree_method':'gpu_hist',
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            #print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:100])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'tree_method':'gpu_hist',
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test

def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_test

def cat_model(x_train, y_train, x_test):
    cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_test

In [46]:
xgb_test = xgb_model(train, label, test)

************************************ 1 ************************************
Parameters: { silent } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


[0]	train-auc:0.69869	eval-auc:0.69753
Multiple eval metrics have been passed: 'eval-auc' will be used for early stopping.

Will train until eval-auc hasn't improved in 200 rounds.
[200]	train-auc:0.73282	eval-auc:0.72755
[400]	train-auc:0.74059	eval-auc:0.73151
[600]	train-auc:0.74596	eval-auc:0.73341
[800]	train-auc:0.75036	eval-auc:0.73447
[1000]	train-auc:0.75411	eval-auc:0.73525
[1200]	train-auc:0.75761	eval-auc:0.73567
[1400]	train-auc:0.76081	eval-auc:0.73592
[1600]	train-auc:0.76385	eval-auc:0.73617
[1800]	train-auc:0.76681	eval-auc:0.73642
[2000]	train-auc:0.76965	eval-auc:0.73655
[2200]	train-auc:0.77241	eval-auc:0.73668
[24

In [41]:
lgb_test = lgb_model(train, label, test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.748206	valid_1's auc: 0.733914
[400]	training's auc: 0.76097	valid_1's auc: 0.735093
[600]	training's auc: 0.772241	valid_1's auc: 0.735178
[800]	training's auc: 0.782401	valid_1's auc: 0.735159
Early stopping, best iteration is:
[681]	training's auc: 0.776339	valid_1's auc: 0.735263
[0.735262615496092]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.747995	valid_1's auc: 0.734999
[400]	training's auc: 0.760687	valid_1's auc: 0.736198
[600]	training's auc: 0.771806	valid_1's auc: 0.736601
[800]	training's auc: 0.781858	valid_1's auc: 0.736545
Early stopping, best iteration is:
[696]	training's auc: 0.776842	valid_1's auc: 0.736673
[0.735262615496092, 0.7366730311424805]
************************************ 3 ******

In [44]:
cat_test = cat_model(train, label, test)

************************************ 1 ************************************
0:	learn: 0.3982296	test: 0.3978290	best: 0.3978290 (0)	total: 49.9ms	remaining: 16m 38s
500:	learn: 0.3754553	test: 0.3762918	best: 0.3762918 (500)	total: 20.9s	remaining: 13m 35s
1000:	learn: 0.3739562	test: 0.3755597	best: 0.3755597 (1000)	total: 41.7s	remaining: 13m 11s
1500:	learn: 0.3729419	test: 0.3752439	best: 0.3752439 (1500)	total: 1m 3s	remaining: 13m
2000:	learn: 0.3720799	test: 0.3750602	best: 0.3750602 (2000)	total: 1m 25s	remaining: 12m 45s
2500:	learn: 0.3712877	test: 0.3749657	best: 0.3749648 (2499)	total: 1m 46s	remaining: 12m 25s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3749200066
bestIteration = 2839

Shrink model to first 2840 iterations.
[0.7350720014504196]
************************************ 2 ************************************
0:	learn: 0.3982072	test: 0.3979428	best: 0.3979428 (0)	total: 48.4ms	remaining: 16m 8s
500:	learn: 0.3755788	test: 0.3756716	best: 

In [48]:
print(xgb_test.sum())

7318.0215


In [42]:
print(lgb_test.sum())

8027.510205244015


In [45]:
print(cat_test.sum())

8011.070273002388


In [49]:
# 0.7355
rh_test = lgb_test*0.3 + xgb_test*0.4 + cat_test*0.3
test_data['isDefault'] = rh_test
test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)

In [47]:
test_data['isDefault'] = xgb_test
test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)