In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
# 数据读取
train_data = pd.read_csv('I:/Loan_default/train.csv')
test_data = pd.read_csv('I:/Loan_default/testA.csv')

In [2]:
# 标签
label = train_data["isDefault"]

In [3]:
# 将训练集和测试集进行连接
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [4]:
# 首先对employmentLength进行转换到数值（就业年限）
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [5]:
# 对earliesCreditLine进行预处理取年份和月份，年份直接使用int类型，月份转换成one-hot编码
data['year'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['month'] = data['earliesCreditLine'].apply(lambda x: str(x[0:3]))

In [6]:
# 获取数值型特征(int or float)
numerical_fea = list(train_data.select_dtypes(exclude=['object']).columns)
numerical_fea.remove("isDefault")
print("数值型特征：" + str(numerical_fea))
print()
# 类别特征
category_fea = list(filter(lambda x: x not in numerical_fea,list(train_data.columns)))
print("object类型特征：" + str(category_fea))

数值型特征：['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'employmentTitle', 'homeOwnership', 'annualIncome', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14']

object类型特征：['grade', 'subGrade', 'employmentLength', 'issueDate', 'isDefault', 'earliesCreditLine']


In [7]:
"""填充完分数降了"""
# #按照平均数填充数值型特征
# data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median())
# data[numerical_fea] = data[numerical_fea].fillna(data[numerical_fea].median())
# #按照众数填充类别型特征
# data[category_fea] = data[category_fea].fillna(data[category_fea].mode())
# data[category_fea] = data[category_fea].fillna(data[category_fea].mode())

'填充完分数降了'

In [8]:
# 部分类别特征
cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title']

for f in cate_features:
    print(f, '类型数：', data[f].nunique())

grade 类型数： 7
subGrade 类型数： 35
employmentTitle 类型数： 298101
homeOwnership 类型数： 6
verificationStatus 类型数： 3
purpose 类型数： 14
postCode 类型数： 935
regionCode 类型数： 51
applicationType 类型数： 2
initialListStatus 类型数： 2
title 类型数： 47903


In [9]:
# 类型数在2之上，又不是高维稀疏的转换成one-hot编码
data = pd.get_dummies(data, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode', 'month'], drop_first=True)

In [10]:
# 高维类别特征需要进行转换（labelEmcode）
for col in tqdm(['grade', 'subGrade', 'employmentTitle', 'postCode', 'title']):
    le = LabelEncoder()
    le.fit(list(data[col].astype(str).values))
    data[col] = le.transform(list(data[col].astype(str).values))
print('Label Encoding 完成')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:04<00:00,  1.11it/s]

Label Encoding 完成





In [11]:
#转化成时间格式
for data in [data]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

In [12]:
"""
id:无用特征
issueDate:构造成为时间特征
policyCode：唯一特征无意义
isDefault：标签
"""
features = [f for f in data.columns if f not in ['id','issueDate', "policyCode", "isDefault","earliesCreditLine"]]

In [13]:
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

In [14]:
train = train[features]
test = test[features]

In [15]:
# def lgb_model(clf, train_x, train_y, test_x):
#     folds = 5
#     seed = 1108
#     kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

#     train = np.zeros(train_x.shape[0])
#     test = np.zeros(test_x.shape[0])

#     cv_scores = []
    
#     for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
#         print('************************************ {} ************************************'.format(str(i+1)))
#         trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        
#         train_matrix = clf.Dataset(trn_x, label=trn_y)
#         valid_matrix = clf.Dataset(val_x, label=val_y)

#         params = {
#                 'boosting_type': 'gbdt',
#                 'objective': 'binary',
#                 'metric': 'auc',
#                 'min_child_weight': 5,
#                 'num_leaves': 2 ** 5,
#                 'lambda_l2': 10,
#                 'feature_fraction': 0.8,
#                 'bagging_fraction': 0.8,
#                 'bagging_freq': 4,
#                 'learning_rate': 0.1,
#                 'seed': 2020,
#                 'nthread': 28,
#                 'n_jobs':24,
#                 'silent': True,
#                 'verbose': -1,
#         }

#         model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
#         val_pred = model.predict(val_x, num_iteration=model.best_iteration)
#         test_pred = model.predict(test_x, num_iteration=model.best_iteration)

#         train[valid_index] = val_pred
#         test = test_pred / kf.n_splits
#         cv_scores.append(roc_auc_score(val_y, val_pred))
        
#         print(cv_scores)
       
#     print("scotrainre_list:" + str(cv_scores))
#     print("score_mean:" + str(np.mean(cv_scores)))
#     print("score_std:" + str(np.std(cv_scores)))
#     return train, test
    

In [16]:
# train_lgb, test_lgb = lgb_model(lgb,train,label,test)

In [35]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
            # print(list(sorted(zip(features, model.feature_importance("gain")), key=lambda x: x[1], reverse=True))[:20])
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
        train[valid_index] = val_pred
        test = test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
       
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test

def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_test

def cat_model(x_train, y_train, x_test):
    cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_test

In [None]:
xgb_test = xgb_model(train, label, test)

In [32]:
lgb_test = lgb_model(train, label, test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.748264	valid_1's auc: 0.736581
[400]	training's auc: 0.760463	valid_1's auc: 0.737697
[600]	training's auc: 0.770741	valid_1's auc: 0.737789
Early stopping, best iteration is:
[546]	training's auc: 0.768013	valid_1's auc: 0.737879
[0.7378792764530935]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.749135	valid_1's auc: 0.733188
[400]	training's auc: 0.761151	valid_1's auc: 0.733894
[600]	training's auc: 0.771305	valid_1's auc: 0.734359
[800]	training's auc: 0.781055	valid_1's auc: 0.73433
Early stopping, best iteration is:
[600]	training's auc: 0.771305	valid_1's auc: 0.734359
[0.7378792764530935, 0.734358796091941]
************************************ 3 ************************************
Training until validati

In [36]:
cat_test = cat_model(train, label, test)

************************************ 1 ************************************
0:	learn: 0.3985448	test: 0.3966307	best: 0.3966307 (0)	total: 44.2ms	remaining: 14m 43s
500:	learn: 0.3757458	test: 0.3744098	best: 0.3744098 (500)	total: 20.4s	remaining: 13m 15s
1000:	learn: 0.3742569	test: 0.3737271	best: 0.3737265 (999)	total: 40.4s	remaining: 12m 46s
1500:	learn: 0.3732264	test: 0.3734521	best: 0.3734521 (1500)	total: 1m	remaining: 12m 23s
2000:	learn: 0.3723491	test: 0.3732803	best: 0.3732799 (1999)	total: 1m 20s	remaining: 12m 3s
2500:	learn: 0.3715715	test: 0.3731855	best: 0.3731855 (2500)	total: 1m 40s	remaining: 11m 41s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3731568729
bestIteration = 2632

Shrink model to first 2633 iterations.
[0.7376935598491381]
************************************ 2 ************************************
0:	learn: 0.3979770	test: 0.3989152	best: 0.3989152 (0)	total: 46.9ms	remaining: 15m 38s
500:	learn: 0.3750411	test: 0.3771476	best: 

In [19]:
xgb_test.sum()

7976.796

In [34]:
lgb_test.sum()

7969.466260951486

In [37]:
cat_test.sum()

7984.467826934987

In [40]:
# 0.7355
rh_test = lgb_test*0.3 + xgb_test*0.4 + cat_test*0.3
test_data['isDefault'] = rh_test
test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)

In [39]:
# 0.7333
rh_test = cat_test
test_data['isDefault'] = rh_test
test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)

In [None]:
# 0.7353
rh_test = lgb_test*0.5 + xgb_test*0.5
test_data['isDefault'] = rh_test
test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)