In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')
# 数据读取
train_data = pd.read_csv('E:/Loan/train.csv')
test_data = pd.read_csv('E:/Loan/testA.csv')

In [2]:
# 标签
label = train_data["isDefault"]

In [3]:
# 将训练集和测试集进行连接
data = pd.concat([train_data, test_data], axis=0, ignore_index=True)

In [4]:
"""
id	为贷款清单分配的唯一信用证标识
loanAmnt	贷款金额    ----连续变量，直接使用
term	贷款期限（year） ----离散变量，直接使用
interestRate	贷款利率 ----连续型变量，直接使用
installment	分期付款金额   ----连续型变量，直接使用
grade	贷款等级    ----有优先级的，用labelEncode编码
subGrade	贷款等级之子级  ----有优先级的，用labelEncode编码
employmentTitle	就业职称   ----类别特征，但是有29W种，使用labelEncode编码(暂时不用)
employmentLength	就业年限（年）   ----转换成0-10的int类型
homeOwnership	借款人在登记时提供的房屋所有权状况  ----离散变量，直接使用
annualIncome	年收入 ----连续型变量，直接使用
verificationStatus	验证状态  ----离散型变量，直接使用
issueDate	贷款发放的月份   ----提取时间特征
purpose	借款人在贷款申请时的贷款用途类别  ----类别特征，无优先级，onehot
postCode	借款人在贷款申请中提供的邮政编码的前3位数字  ----类别特征，无优先级，onehot(暂时不用)
regionCode	地区编码  ----类别特征，无优先级，onehot
dti	债务收入比   ----连续型特征，直接使用
delinquency_2years	借款人过去2年信用档案中逾期30天以上的违约事件数   ----连续型特征，直接使用
ficoRangeLow	借款人在贷款发放时的fico所属的下限范围  ----连续型变量
ficoRangeHigh	借款人在贷款发放时的fico所属的上限范围  ----连续型变量--可以提取平均值特征
openAcc	借款人信用档案中未结信用额度的数量   ----连续特征，直接使用
pubRec	贬损公共记录的数量 -----连续特征，直接使用
pubRecBankruptcies	公开记录清除的数量  ----连续特征，直接使用
revolBal	信贷周转余额合计    ----连续特征，直接使用
revolUtil	循环额度利用率，或借款人使用的相对于所有可用循环信贷的信贷金额  ----连续特征，直接使用
totalAcc	借款人信用档案中当前的信用额度总数 ----连续特征，直接使用
initialListStatus	贷款的初始列表状态   ----离散型变量，直接使用
applicationType	表明贷款是个人申请还是与两个共同借款人的联合申请 ----离散型变量，直接使用
earliesCreditLine	借款人最早报告的信用额度开立的月份 ----提取年份和月份
title	借款人提供的贷款名称    ----暂时不用
policyCode	公开可用的策略_代码=1新产品不公开可用的策略_代码=2   ----全部为1无用特征
n系列匿名特征	匿名特征n0-n14，为一些贷款人行为计数特征的处理
n11和n12相差悬殊暂时舍弃该特征
其余特征为连续特征直接使用
"""
print()




In [5]:
# 删除dti为负值的数据(经验证，该数据有两条属于训练集)
# data = data.drop(data[data["dti"] < 0].index)

In [6]:
# 年应还
data["year_due"] = data["dti"] * data["annualIncome"] / 100

In [7]:
def DtiFun(x):
    if x < 36:
        return 0
    elif x < 100:
        return 1
    elif x < 1000:
        return 2
    else:
        return 3
# 根据dti判断是否属于健康范围
data["dti_grade"] = data["dti"].apply(DtiFun)

In [8]:
# 贷款时间总收入占贷款金额的比重
data["loanAmnt_annuallncome_rate"] = (data["term"] * data["annualIncome"]) / data["loanAmnt"]

In [9]:
# 根据职称分类算平均年收入
temp = data[["employmentTitle", "annualIncome"]].groupby("employmentTitle").mean()
temp = temp.rename(columns={"annualIncome":"employmentTitle_annualIncome_aver"})
data = pd.merge(data,temp,how="left",on="employmentTitle")

In [5]:
# # 标签区间特征
# 贷款期限——违约率
temp = pd.DataFrame(data.groupby(["term"])["isDefault"].sum() / data.groupby(["term"])["isDefault"].count())
temp = temp.rename(columns={"isDefault":"term_isDefault_ave"})
data = pd.merge(data,temp,how="left",on="term")

# 贷款金额——违约率
# temp = pd.DataFrame(data.groupby(["loanAmnt"])["isDefault"].sum() / data.groupby(["loanAmnt"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"loanAmnt_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="loanAmnt")

# # 职称——违约率
# temp = pd.DataFrame(data.groupby(["employmentTitle"])["isDefault"].sum() / data.groupby(["employmentTitle"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"employmentTitle_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="employmentTitle")

# # 就业年限——违约率
# temp = pd.DataFrame(data.groupby(["employmentLength"])["isDefault"].sum() / data.groupby(["employmentLength"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"employmentLength_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="employmentLength")

# # 房屋所有权——违约率
# temp = pd.DataFrame(data.groupby(["homeOwnership"])["isDefault"].sum() / data.groupby(["homeOwnership"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"homeOwnership_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="homeOwnership")

# 年收入——违约率
# temp = pd.DataFrame(data.groupby(["annualIncome"])["isDefault"].sum() / data.groupby(["annualIncome"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"annualIncome_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="annualIncome")

# # 邮编——违约率
# temp = pd.DataFrame(data.groupby(["postCode"])["isDefault"].sum() / data.groupby(["postCode"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"postCode_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="postCode")

# # 邮编——违约率
# temp = pd.DataFrame(data.groupby(["dti"])["isDefault"].sum() / data.groupby(["dti"])["isDefault"].count())
# temp = temp.rename(columns={"isDefault":"dti_isDefault_ave"})
# data = pd.merge(data,temp,how="left",on="dti")

In [16]:
# 月收入
data['monthincome'] = data['annualIncome'] / 12

# 月分期付款金额 / 月收入
data["monthly_installment_amount"] = data["installment"] / data["monthincome"]

14.01      1
15.69      1
16.08      1
16.31      1
16.47      1
          ..
1618.03    1
1647.03    1
1691.28    1
1714.54    4
1715.42    2
Name: installment, Length: 77132, dtype: int64

In [None]:
# fico特征
data["ficoRangeAve"] = data["ficoRangeLow"] + (data["ficoRangeHigh"] - data["ficoRangeLow"])/2

In [None]:
# 年利息 interestRate * loanAmnt
data['allmoney'] = list(map(lambda x,y:x*y,data['loanAmnt'],
                                data['interestRate']/100))

# 年利息/年收入 
data['howlong_return'] = data['allmoney'].values / data['annualIncome']

In [None]:
# 贷款金额 / 信用额度总数
data["LoanAmnt_total_rate"] = data["loanAmnt"] / data["totalAcc"]

In [21]:
# 若存在消极信息(有违约、破坏公物、信用未结)则标记为1
data["Negative"] = 0
data[data["delinquency_2years"] > 0]["Negative"] = 1
data[data["openAcc"] > 0]["Negative"] = 1
data[data["pubRec"] > 0]["Negative"] = 1

In [11]:
# 首先对employmentLength进行转换到数值（就业年限）
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [12]:
# 对earliesCreditLine进行预处理取年份和月份，年份直接使用int类型，月份转换成one-hot编码
data['year'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))
data['month'] = data['earliesCreditLine'].apply(lambda x: str(x[0:3]))

In [13]:
# # 部分类别特征
# cate_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title']

# for f in cate_features:
#     print(f, '类型数：', data[f].nunique())

In [14]:
# grade = dict(zip(sorted(list(set(data['grade']))),range(0,len(set(data['grade'])))))
# data['grade_id'] = data['grade'].map(grade)

# sub_grade = dict(zip(sorted(list(set(data['subGrade']))),range(0,len(set(data['subGrade'])))))
# data['subgrade_id'] = data['subGrade'].map(sub_grade)

In [15]:
# 类型数在2之上，又不是高维稀疏的转换成one-hot编码
data = pd.get_dummies(data, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode', 'month'], drop_first=True)

In [16]:
# （labelEncode）
for col in tqdm(['grade', 'subGrade', 'employmentTitle', 'postCode', 'title']):
    le = LabelEncoder()
    le.fit(list(data[col].astype(str).values))
    data[col] = le.transform(list(data[col].astype(str).values))
print('Label Encoding 完成')

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:07<00:00,  1.40s/it]

Label Encoding 完成





In [17]:
#转化成时间格式
for data in [data]:
    data['issueDate'] = pd.to_datetime(data['issueDate'],format='%Y-%m-%d')
    startdate = datetime.datetime.strptime('2007-06-01', '%Y-%m-%d')
    #构造时间特征
    data['issueDateDT'] = data['issueDate'].apply(lambda x: x-startdate).dt.days

In [23]:
for col in ['grade', 'subGrade']: 
    temp_dict = data.groupby([col])['isDefault'].agg(['mean']).reset_index().rename(columns={'mean': col + '_target_mean'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_mean'].to_dict()

    data[col + '_target_mean'] = data[col].map(temp_dict)

In [24]:
for col in ['grade', 'subGrade']: 
    temp_dict = data.groupby([col])['isDefault'].agg(['std']).reset_index().rename(columns={'std': col + '_target_std'})
    temp_dict.index = temp_dict[col].values
    temp_dict = temp_dict[col + '_target_std'].to_dict()

    data[col + '_target_std'] = data[col].map(temp_dict)

In [25]:
data = data.drop(["id","issueDate", "earliesCreditLine"],axis=1)

In [26]:
# 提取一些排序特征
data['issueDate_Rank'] = data["issueDateDT"].rank()
data['revolBal_Rank'] = data["revolBal"].rank()
data['dti_Rank'] = data["dti"].rank()
data['revolUtil_Rank'] = data["revolUtil"].rank()
data['employmentTitle_Rank'] = data["employmentTitle"].rank()
data['annualIncome_Rank'] = data["annualIncome"].rank()
data['postCode_Rank'] = data["postCode"].rank()
data['year_Rank'] = data["year"].rank()
data['loanAmnt_annuallncome_rate_Rank'] = data["loanAmnt_annuallncome_rate"].rank()
data['interestRate_Rank'] = data["interestRate"].rank()
data['howlong_return_Rank'] = data["howlong_return"].rank()
data['installment_Rank'] = data["installment"].rank()
data['employmentLength_Rank'] = data["employmentLength"].rank()
data['loanAmnt_Rank'] = data["loanAmnt"].rank()
data['n6_Rank'] = data["n6"].rank()

In [27]:
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

train = train.drop(["isDefault"], axis=1)
test = test.drop(["isDefault"], axis=1)

In [28]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 5
    seed = 1108
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    train = np.zeros(train_x.shape[0])
    test_pred = np.zeros(test_x.shape[0])
    test = np.zeros(test_x.shape[0])
    importance = np.zeros(train_x.columns.shape[0])
    
    cv_scores = []
    feature_names = train_x.columns.tolist()
    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, trn_y, val_x, val_y = train_x.iloc[train_index], train_y[train_index], train_x.iloc[valid_index], train_y[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'tree_method':'gpu_hist',
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.05,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)
            
                
        if clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)
            
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_weight': 1.5,
                      'max_depth': 5,
                      'lambda': 10,
                      'subsample': 0.7,
                      'tree_method':'gpu_hist',
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }
            
            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]
            
            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)
                 
        if clf_name == "cat":
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}
            
            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)
            
            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)
            
            
        # importance += model.feature_importance() / 5
        
        train[valid_index] = val_pred
        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))
        
        print(cv_scores)
    
    # df = pd.DataFrame({ 'column': feature_names, 'importance': importance}).sort_values(by='importance')           
    # df.to_csv("./importance.csv")
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test

def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_test

def cat_model(x_train, y_train, x_test):
    cat_test = cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")
    return cat_test

In [29]:
xgb_test = xgb_model(train, label, test)

In [30]:
lgb_test = lgb_model(train, label, test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.774443	valid_1's auc: 0.766825
[400]	training's auc: 0.782765	valid_1's auc: 0.769319
[600]	training's auc: 0.788808	valid_1's auc: 0.769938
[800]	training's auc: 0.794624	valid_1's auc: 0.770502
[1000]	training's auc: 0.799741	valid_1's auc: 0.770682
[1200]	training's auc: 0.80449	valid_1's auc: 0.770846
[1400]	training's auc: 0.809421	valid_1's auc: 0.770923
Early stopping, best iteration is:
[1335]	training's auc: 0.807932	valid_1's auc: 0.770946
[0.7709456944036956]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.773886	valid_1's auc: 0.768426
[400]	training's auc: 0.782285	valid_1's auc: 0.77102
[600]	training's auc: 0.78838	valid_1's auc: 0.771636
[800]	training's auc: 0.794025	valid_1's auc: 0.772014
[1000]	

In [31]:
cat_test = cat_model(train, label, test)

In [32]:
print(xgb_test.sum())

NameError: name 'xgb_test' is not defined

In [33]:
print(lgb_test.sum())

38125.35246299481


In [None]:
print(cat_test.sum())

In [34]:
# 0.7355
rh_test = lgb_test * 0.33 + xgb_test * 0.34 + cat_test * 0.33
test_data['isDefault'] = rh_test
test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)

In [None]:
# test_data['isDefault'] = lgb_test
# test_data[['id','isDefault']].to_csv('test_sub.csv', index=False)