In [1]:
# -*- coding: utf-8 -*-

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
import warnings
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
warnings.filterwarnings('ignore')

train = pd.read_csv('train.csv')
testA = pd.read_csv('testA.csv')

In [2]:
# 将训练集和测试集进行连接
data = pd.concat([train, testA], axis=0, ignore_index=True)

In [3]:
# 对employmentLength列进行数值转换
data['employmentLength'].replace(to_replace='10+ years', value='10 years', inplace=True)
data['employmentLength'].replace('< 1 year', '0 years', inplace=True)

def employmentLength_to_int(s):
    if pd.isnull(s):
        return s
    else:
        return np.int8(s.split()[0])
    
data['employmentLength'] = data['employmentLength'].apply(employmentLength_to_int)

In [4]:
data['earliesCreditLine'].head()

0    Aug-2001
1    May-2002
2    May-2006
3    May-1999
4    Aug-1977
Name: earliesCreditLine, dtype: object

In [5]:
# 对earliesCreditLine进行预处理（取年份）
data['earliesCreditLine'] = data['earliesCreditLine'].apply(lambda s: int(s[-4:]))

In [6]:
# 将类型数在2之上，又不是高维稀疏的转换成one-hot编码
data = pd.get_dummies(data, columns=['grade', 'subGrade', 'homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

In [7]:
# 高维类别特征需要进行转换
for f in ['employmentTitle', 'postCode', 'title']:
    data[f+'_cnts'] = data.groupby([f])['id'].transform('count')
    data[f+'_rank'] = data.groupby([f])['id'].rank(ascending=False).astype(int)
    del data[f]

In [8]:
features = [f for f in data.columns if f not in ['id','issueDate','isDefault']]

In [9]:
features

['loanAmnt',
 'term',
 'interestRate',
 'installment',
 'employmentLength',
 'annualIncome',
 'dti',
 'delinquency_2years',
 'ficoRangeLow',
 'ficoRangeHigh',
 'openAcc',
 'pubRec',
 'pubRecBankruptcies',
 'revolBal',
 'revolUtil',
 'totalAcc',
 'initialListStatus',
 'applicationType',
 'earliesCreditLine',
 'policyCode',
 'n0',
 'n1',
 'n2',
 'n3',
 'n4',
 'n5',
 'n6',
 'n7',
 'n8',
 'n9',
 'n10',
 'n11',
 'n12',
 'n13',
 'n14',
 'grade_B',
 'grade_C',
 'grade_D',
 'grade_E',
 'grade_F',
 'grade_G',
 'subGrade_A2',
 'subGrade_A3',
 'subGrade_A4',
 'subGrade_A5',
 'subGrade_B1',
 'subGrade_B2',
 'subGrade_B3',
 'subGrade_B4',
 'subGrade_B5',
 'subGrade_C1',
 'subGrade_C2',
 'subGrade_C3',
 'subGrade_C4',
 'subGrade_C5',
 'subGrade_D1',
 'subGrade_D2',
 'subGrade_D3',
 'subGrade_D4',
 'subGrade_D5',
 'subGrade_E1',
 'subGrade_E2',
 'subGrade_E3',
 'subGrade_E4',
 'subGrade_E5',
 'subGrade_F1',
 'subGrade_F2',
 'subGrade_F3',
 'subGrade_F4',
 'subGrade_F5',
 'subGrade_G1',
 'subGrade_G2'

In [10]:
train = data[data.isDefault.notnull()].reset_index(drop=True)
test = data[data.isDefault.isnull()].reset_index(drop=True)

x_train = train[features]
x_test = test[features]

y_train = train['isDefault']

In [25]:
train_pred = np.zeros((x_train.shape[0], ))
test_pred = np.zeros((x_test.shape[0], ))

In [26]:
kf = KFold(n_splits=5, shuffle=True, random_state=1108)

In [34]:
for n_fold, (train_idx, valid_idx) in enumerate(kf.split(x_train,y_train)):
        trn_x, trn_y = x_train.iloc[train_idx], y_train[train_idx]
        val_x, val_y = x_train.iloc[valid_idx], y_train[valid_idx]
        # 数据加载
        train_matrix = lgb.Dataset(trn_x, label=trn_y)
        valid_matrix = lgb.Dataset(val_x, label=val_y)

In [36]:
params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }
model = lgb.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)

Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.743603	valid_1's auc: 0.727496
[400]	training's auc: 0.756112	valid_1's auc: 0.728742
[600]	training's auc: 0.766738	valid_1's auc: 0.729354
[800]	training's auc: 0.77643	valid_1's auc: 0.729085
Early stopping, best iteration is:
[617]	training's auc: 0.767671	valid_1's auc: 0.729421


In [38]:
test_pred = model.predict(x_test, num_iteration=model.best_iteration)

In [39]:
result = test[['id']]
result['isDefault'] = test_pred
result.head()

Unnamed: 0,id,isDefault
0,800000,0.059774
1,800001,0.375426
2,800002,0.622455
3,800003,0.277488
4,800004,0.316236


In [40]:
result.to_csv('result.csv',index=False)

In [35]:
def lgb_model(train_x,train_y,test_x):
    # 装结果
    train_pred = np.zeros((train_x.shape[0], ))
    test_pred = np.zeros((test_x.shape[0], ))

    # Kfold
    kf = KFold(n_splits=5, shuffle=True, random_state=1108)
    
    # 参数
    params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }
    
    for n_fold, (train_idx, valid_idx) in enumerate(kf.split(train_x,train_y)):
        trn_x, trn_y = train_x.iloc[train_idx], train_y[train_idx]
        val_x, val_y = train_x.iloc[valid_idx], train_y[valid_idx]
        # 数据加载
        train_matrix = lgb.Dataset(trn_x, label=trn_y)
        valid_matrix = lgb.Dataset(val_x, label=val_y)
        
        lgb = lgb.train(
            params=params,
            train_set=train_matrix,
            valid_sets=[train_matrix,valid_matrix],
            early_stopping_rounds=200,
            verbose_eval=200
        )
        
        train_pred[valid_idx] = lgb.predict(val_x, num_iteration=clf.best_iteration)
        test_pred += lgb.predict(test[feature], num_iteration=clf.best_iteration)/5
    
    result = test[['id']]
    result['isDefault'] = test_pred
    
    return result

    
result_lgb = lgb_model(train,test,y_train)

ValueError: Found input variables with inconsistent numbers of samples: [800000, 200000]