In [20]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc

import numpy as np
import pandas as pd
pd.set_option('max_columns', None)
pd.set_option('max_rows', 200)
pd.set_option('float_format', lambda x: '%.3f' % x)

from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

# 读取数据

In [2]:
train_data = pd.read_csv('raw_data/train_public.csv')

print(train_data.shape)
train_data.head()

(10000, 39)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault
0,1040418,240418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193,13,2.43,0,556.364,649.091,3,0,0.0,7734.231,91.8,0,0,1-Dec,5,1,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0
1,1025197,225197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491,30,11.005,1,715.0,893.75,3,0,0.0,31329.0,54.8,1,0,Apr-90,40642,1,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0
2,1009360,209360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459,8,6.409,0,774.545,903.636,5,0,0.0,18514.0,57.692,1,0,Oct-91,154,1,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0
3,1039708,239708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157,8,9.205,0,750.0,875.0,3,0,0.0,20707.0,42.6,0,0,1-Jun,0,1,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0
4,1027483,227483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38,21,15.578,0,609.091,710.606,15,0,0.0,14016.154,30.462,0,0,2-May,0,1,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0


In [3]:
train_internet = pd.read_csv('raw_data/train_internet.csv')

print(train_internet.shape)
train_internet.head()

(750000, 42)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,sub_class,work_type,employer_type,industry,work_year,house_exist,house_loan_status,censor_status,marriage,offsprings,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,early_return,early_return_amount,early_return_amount_3mon,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,f5,is_default
0,119262,0,12000.0,5,11.53,264.1,B,B5,职员,普通企业,采矿业,,0,0,2,0,0,2015-06-01,0,814.0,4,5.07,1.0,670.0,674.0,1.0,0,0,0.0,3855.0,23.1,0,Mar-1984,0.0,1.0,1.0,0.0,8.0,17.0,8.0,1.0,1
1,369815,1,8000.0,3,13.98,273.35,C,C3,其他,普通企业,国际组织,10+ years,0,1,2,1,3,2010-10-01,2,240.0,21,15.04,0.0,725.0,729.0,0.0,0,0,0.0,118632.0,99.9,1,Jan-1992,94.0,1.0,,,,,,,0
2,787833,2,20000.0,5,17.99,507.76,D,D2,工人,上市企业,信息传输、软件和信息技术服务业,10+ years,0,0,1,0,0,2016-08-01,0,164.0,20,17.38,1.0,675.0,679.0,0.0,0,0,0.0,15670.0,72.5,0,Oct-1996,0.0,1.0,6.0,0.0,10.0,8.0,3.0,0.0,0
3,671675,3,10700.0,3,10.16,346.07,B,B1,职员,普通企业,电力、热力生产供应业,2 years,2,0,2,0,0,2013-05-01,4,48.0,10,27.87,0.0,710.0,714.0,0.0,0,0,0.0,18859.0,78.6,0,Jul-2000,41646.0,1.0,3.0,0.0,4.0,11.0,6.0,0.0,0
4,245160,4,8000.0,3,8.24,251.58,B,B1,其他,政府机构,金融业,5 years,1,2,0,0,0,2017-04-01,4,122.0,9,3.47,0.0,660.0,664.0,0.0,0,0,0.0,8337.0,67.8,1,Mar-2000,4.0,1.0,3.0,0.0,8.0,6.0,4.0,1.0,0


In [4]:
test_data = pd.read_csv('raw_data/test_public.csv')

print(test_data.shape)
test_data.head()

(5000, 38)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon
0,1000575,200575,2890.909,3,10.791,88.01,B,幼教与中小学校,住宿和餐饮业,5 years,0,1,2017/12/1,0,314,0,23.04,0,745.0,869.167,7,0,0.0,8647.692,31.846,1,0,3-Mar,0,1,2.0,0.0,15.0,5.0,4.0,3,773,89.192
1,1028125,228125,7272.727,3,9.99,258.1,B,普通企业,批发和零售业,10+ years,1,1,2015/7/1,5,29,19,27.755,0,681.818,738.636,24,0,0.0,9406.154,18.277,0,0,Dec-99,6,1,8.0,0.0,8.0,29.0,14.0,1,1894,218.538
2,1010694,210694,26295.455,3,15.763,764.03,C,普通企业,住宿和餐饮业,10+ years,0,2,2013/4/1,0,488,24,25.495,1,758.182,947.727,11,0,0.0,26414.769,62.3,1,0,Apr-99,268,1,6.0,0.0,4.0,10.0,6.0,1,5670,1221.231
3,1026712,226712,22690.909,5,19.305,524.3,D,普通企业,采矿业,10+ years,0,2,2017/12/1,0,489,30,10.62,0,572.727,620.455,8,0,0.0,1198.0,7.7,0,0,Jul-00,0,1,4.0,0.0,12.0,10.0,8.0,2,4800,443.077
4,1002895,202895,14545.455,3,7.139,490.32,A,世界五百强,金融业,1 year,0,0,2016/6/1,2,418,45,6.611,0,638.182,691.364,15,0,0.0,3920.0,8.831,1,0,7-May,5,1,4.0,0.0,7.0,14.0,9.0,0,3516,649.108


In [5]:
train_data['isDefault'].value_counts(dropna=True)

0    8317
1    1683
Name: isDefault, dtype: int64

In [6]:
train_internet = train_internet.rename(columns={'is_default': 'isDefault'})
train_internet['isDefault'].value_counts(dropna=True)

0    600327
1    149673
Name: isDefault, dtype: int64

# 数据整理

In [7]:
drop1 = ['sub_class', 'work_type', 'house_loan_status', 'marriage', 'offsprings', 'f5']
drop2 = ['known_outstanding_loan', 'known_dero', 'app_type']

train_internet.drop(drop1 + ['user_id'], axis=1, inplace=True)
train_data.drop(drop2 + ['user_id'], axis=1, inplace=True)

train_data = pd.concat([train_data, train_internet]).reset_index(drop=True)
print(train_data.shape)
train_data.head()

(760000, 35)


Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault
0,1040418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193.0,13,2.43,0.0,556.364,649.091,0.0,7734.231,91.8,0,1-Dec,5.0,1.0,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0
1,1025197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491.0,30,11.005,1.0,715.0,893.75,0.0,31329.0,54.8,1,Apr-90,40642.0,1.0,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0
2,1009360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459.0,8,6.409,0.0,774.545,903.636,0.0,18514.0,57.692,1,Oct-91,154.0,1.0,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0
3,1039708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157.0,8,9.205,0.0,750.0,875.0,0.0,20707.0,42.6,0,1-Jun,0.0,1.0,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0
4,1027483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38.0,21,15.578,0.0,609.091,710.606,0.0,14016.154,30.462,0,2-May,0.0,1.0,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0


In [8]:
test_data.drop(drop2 + ['user_id'], axis=1, inplace=True)

print(test_data.shape)
test_data.head()

(5000, 34)


Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon
0,1000575,2890.909,3,10.791,88.01,B,幼教与中小学校,住宿和餐饮业,5 years,0,1,2017/12/1,0,314,0,23.04,0,745.0,869.167,0.0,8647.692,31.846,1,3-Mar,0,1,2.0,0.0,15.0,5.0,4.0,3,773,89.192
1,1028125,7272.727,3,9.99,258.1,B,普通企业,批发和零售业,10+ years,1,1,2015/7/1,5,29,19,27.755,0,681.818,738.636,0.0,9406.154,18.277,0,Dec-99,6,1,8.0,0.0,8.0,29.0,14.0,1,1894,218.538
2,1010694,26295.455,3,15.763,764.03,C,普通企业,住宿和餐饮业,10+ years,0,2,2013/4/1,0,488,24,25.495,1,758.182,947.727,0.0,26414.769,62.3,1,Apr-99,268,1,6.0,0.0,4.0,10.0,6.0,1,5670,1221.231
3,1026712,22690.909,5,19.305,524.3,D,普通企业,采矿业,10+ years,0,2,2017/12/1,0,489,30,10.62,0,572.727,620.455,0.0,1198.0,7.7,0,Jul-00,0,1,4.0,0.0,12.0,10.0,8.0,2,4800,443.077
4,1002895,14545.455,3,7.139,490.32,A,世界五百强,金融业,1 year,0,0,2016/6/1,2,418,45,6.611,0,638.182,691.364,0.0,3920.0,8.831,1,7-May,5,1,4.0,0.0,7.0,14.0,9.0,0,3516,649.108


In [9]:
data = pd.concat([train_data, test_data])

print(data.shape)
data.head()

(765000, 35)


Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault
0,1040418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193.0,13,2.43,0.0,556.364,649.091,0.0,7734.231,91.8,0,1-Dec,5.0,1.0,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0.0
1,1025197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491.0,30,11.005,1.0,715.0,893.75,0.0,31329.0,54.8,1,Apr-90,40642.0,1.0,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0.0
2,1009360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459.0,8,6.409,0.0,774.545,903.636,0.0,18514.0,57.692,1,Oct-91,154.0,1.0,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0.0
3,1039708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157.0,8,9.205,0.0,750.0,875.0,0.0,20707.0,42.6,0,1-Jun,0.0,1.0,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0.0
4,1027483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38.0,21,15.578,0.0,609.091,710.606,0.0,14016.154,30.462,0,2-May,0.0,1.0,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0.0


In [10]:
data['issue_date'] = pd.to_datetime(data['issue_date'])
data['issue_mon'] = data['issue_date'].dt.year * 100 + data['issue_date'].dt.month
data.drop(['issue_date'], axis=1, inplace=True)

In [11]:
data['class'] = data['class'].map({
    'A': 0, 'B': 1, 'C': 2, 'D': 3,
    'E': 4, 'F': 5, 'G': 6
})

In [12]:
lbe = LabelEncoder()
data['employer_type'] = lbe.fit_transform(data['employer_type'])

In [13]:
lbe = LabelEncoder()
data['industry'] = lbe.fit_transform(data['industry'])

In [14]:
data['work_year'] = data['work_year'].map({
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4,
    '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9,
    '10+ years': 10
})

data['work_year'].fillna(-1, inplace=True)

In [15]:
def clean_mon(x):
    mons = {'jan':1, 'feb':2, 'mar':3, 'apr':4,  'may':5,  'jun':6,
            'jul':7, 'aug':8, 'sep':9, 'oct':10, 'nov':11, 'dec':12}
    year_group = re.search('(\d+)', x)
    if year_group:
        year = int(year_group.group(1))
        if year < 22:
            year += 2000
        elif 100 > year > 22:
            year += 1900
        else:
            year = 2022
    else:
        year = 2022
        
    month_group = re.search('([a-zA-Z]+)', x)
    if month_group:
        mon = month_group.group(1).lower()
        month = mons[mon]
    else:
        month = 0
        
    return year*100 + month

data['earlies_credit_mon'] = data['earlies_credit_mon'].apply(lambda x: clean_mon(x))

In [16]:
data.head()

Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,issue_mon
0,1040418,31818.182,3,11.466,1174.91,2,3,13,3.0,0,1,2,193.0,13,2.43,0.0,556.364,649.091,0.0,7734.231,91.8,0,200112,5.0,1.0,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0.0,201610
1,1025197,28000.0,5,16.841,670.69,2,3,13,10.0,0,2,0,491.0,30,11.005,1.0,715.0,893.75,0.0,31329.0,54.8,1,199004,40642.0,1.0,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0.0,201306
2,1009360,17272.727,3,8.9,603.32,0,3,3,10.0,1,0,4,459.0,8,6.409,0.0,774.545,903.636,0.0,18514.0,57.692,1,199110,154.0,1.0,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0.0,201401
3,1039708,20000.0,3,4.788,602.3,0,1,10,6.0,0,1,0,157.0,8,9.205,0.0,750.0,875.0,0.0,20707.0,42.6,0,200106,0.0,1.0,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0.0,201507
4,1027483,15272.727,3,12.79,470.31,2,3,2,0.0,2,1,0,38.0,21,15.578,0.0,609.091,710.606,0.0,14016.154,30.462,0,200205,0.0,1.0,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0.0,201607


# 特征工程

In [17]:
# TODO

# 模型

In [18]:
train = data[data['isDefault'].notna()]
test  = data[data['isDefault'].isna()]

ycol = 'isDefault'
feature_names = list(
    filter(lambda x: x not in [ycol, 'loan_id'], train.columns))

model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.1,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=2021,
                           is_unbalance=True,
                           metric='auc')


oof = []
prediction = test[['loan_id']]
prediction[ycol] = 0
df_importance_list = []

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['loan_id', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    pred_test = lgb_model.predict_proba(
        test[feature_names], num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test[:, 1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()
    



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[313]	train's auc: 0.818205	valid's auc: 0.806095


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[377]	train's auc: 0.821227	valid's auc: 0.805393


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[395]	train's auc: 0.82171	valid's auc: 0.805808


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[438]	train's auc: 0.8229	valid's auc: 0.807024


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[392]	train's auc: 0.821207	valid's auc: 0.806122


In [19]:
df_importance = pd.concat(df_importance_list)
df_importance = df_importance.groupby(['column'])['importance'].agg(
    'mean').sort_values(ascending=False).reset_index()
df_importance

Unnamed: 0,column,importance
0,issue_mon,1015.6
1,recircle_b,723.8
2,interest,685.2
3,debt_loan_ratio,649.4
4,region,641.0
5,recircle_u,634.0
6,post_code,622.2
7,monthly_payment,608.8
8,total_loan,527.2
9,f2,501.6


In [26]:
oof = pd.concat(oof)
print('roc_auc_score:', roc_auc_score(oof['isDefault'], oof['pred']))

roc_auc_score: 0.8060829091394448


# 提交

In [29]:
prediction.columns = ['id', 'isDefault']
prediction.head()

Unnamed: 0,id,isDefault
0,1000575,0.077
1,1028125,0.147
2,1010694,0.007
3,1026712,0.02
4,1002895,0.019


In [28]:
prediction['isDefault'].describe()

count   5000.000
mean       0.273
std        0.307
min        0.000
25%        0.030
50%        0.087
75%        0.572
max        0.954
Name: isDefault, dtype: float64

In [30]:
prediction.to_csv('baseline.csv', index=False)