In [1]:
import warnings
warnings.simplefilter('ignore')

import os
import re
import gc

import numpy as np
import pandas as pd
# Display configuration: show every column, up to 200 rows, floats with 3 decimals.
# The fully qualified 'display.*' keys are used on purpose: the short forms
# ('max_columns', 'max_rows') are resolved by pattern matching and become ambiguous
# on pandas versions that also register 'styler.render.max_columns' / 'max_rows'.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 200)
pd.set_option('display.float_format', lambda x: '%.3f' % x)

from tqdm.notebook import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import roc_auc_score

import lightgbm as lgb

In [2]:
!pip install tqdm --upgrade

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
Requirement already up-to-date: tqdm in /opt/conda/envs/python35-paddle120-env/lib/python3.7/site-packages (4.62.3)


# 读取数据

In [3]:
# Load the public training set and tag every row with target=1.
# 'target' is the source indicator that the classifier further down is trained on.
train_data = pd.read_csv('../raw_data/train_public.csv').assign(target=1)

print(train_data.shape)
train_data.head()

(10000, 40)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,known_outstanding_loan,known_dero,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,app_type,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,target
0,1040418,240418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193,13,2.43,0,556.364,649.091,3,0,0.0,7734.231,91.8,0,0,1-Dec,5,1,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0,1
1,1025197,225197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491,30,11.005,1,715.0,893.75,3,0,0.0,31329.0,54.8,1,0,Apr-90,40642,1,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0,1
2,1009360,209360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459,8,6.409,0,774.545,903.636,5,0,0.0,18514.0,57.692,1,0,Oct-91,154,1,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0,1
3,1039708,239708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157,8,9.205,0,750.0,875.0,3,0,0.0,20707.0,42.6,0,0,1-Jun,0,1,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0,1
4,1027483,227483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38,21,15.578,0,609.091,710.606,15,0,0.0,14016.154,30.462,0,0,2-May,0,1,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0,1


In [4]:
# Load the internet training set and tag every row with target=0
# (the opposite source indicator to the public set).
train_internet = pd.read_csv('../raw_data/train_internet.csv').assign(target=0)

print(train_internet.shape)
train_internet.head()

(750000, 43)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,sub_class,work_type,employer_type,industry,work_year,house_exist,house_loan_status,censor_status,marriage,offsprings,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,early_return,early_return_amount,early_return_amount_3mon,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,f5,is_default,target
0,119262,0,12000.0,5,11.53,264.1,B,B5,职员,普通企业,采矿业,,0,0,2,0,0,2015-06-01,0,814.0,4,5.07,1.0,670.0,674.0,1.0,0,0,0.0,3855.0,23.1,0,Mar-1984,0.0,1.0,1.0,0.0,8.0,17.0,8.0,1.0,1,0
1,369815,1,8000.0,3,13.98,273.35,C,C3,其他,普通企业,国际组织,10+ years,0,1,2,1,3,2010-10-01,2,240.0,21,15.04,0.0,725.0,729.0,0.0,0,0,0.0,118632.0,99.9,1,Jan-1992,94.0,1.0,,,,,,,0,0
2,787833,2,20000.0,5,17.99,507.76,D,D2,工人,上市企业,信息传输、软件和信息技术服务业,10+ years,0,0,1,0,0,2016-08-01,0,164.0,20,17.38,1.0,675.0,679.0,0.0,0,0,0.0,15670.0,72.5,0,Oct-1996,0.0,1.0,6.0,0.0,10.0,8.0,3.0,0.0,0,0
3,671675,3,10700.0,3,10.16,346.07,B,B1,职员,普通企业,电力、热力生产供应业,2 years,2,0,2,0,0,2013-05-01,4,48.0,10,27.87,0.0,710.0,714.0,0.0,0,0,0.0,18859.0,78.6,0,Jul-2000,41646.0,1.0,3.0,0.0,4.0,11.0,6.0,0.0,0,0
4,245160,4,8000.0,3,8.24,251.58,B,B1,其他,政府机构,金融业,5 years,1,2,0,0,0,2017-04-01,4,122.0,9,3.47,0.0,660.0,664.0,0.0,0,0,0.0,8337.0,67.8,1,Mar-2000,4.0,1.0,3.0,0.0,8.0,6.0,4.0,1.0,0,0


In [5]:
# Class balance of the default label in the public set (NaN is excluded by default).
train_data['isDefault'].value_counts()

0    8317
1    1683
Name: isDefault, dtype: int64

In [6]:
# The internet set names its label 'is_default'; rename it to 'isDefault' so the
# column lines up with the public set.
train_internet = train_internet.rename({'is_default': 'isDefault'}, axis='columns')
train_internet['isDefault'].value_counts()

0    600327
1    149673
Name: isDefault, dtype: int64

# 数据整理

In [7]:
# drop1: columns dropped from the internet set; drop2: columns dropped from the
# public set. 'user_id' is dropped from both.
drop1 = ['sub_class', 'work_type', 'house_loan_status', 'marriage', 'offsprings', 'f5']
drop2 = ['known_outstanding_loan', 'known_dero', 'app_type']

train_internet = train_internet.drop(columns=drop1 + ['user_id'])
train_data = train_data.drop(columns=drop2 + ['user_id'])

# Stack public rows first, then internet rows, under a fresh 0..n-1 index.
train_data = pd.concat([train_data, train_internet], ignore_index=True)
print(train_data.shape)
train_data.head()

(760000, 36)


Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,target
0,1040418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193.0,13,2.43,0.0,556.364,649.091,0.0,7734.231,91.8,0,1-Dec,5.0,1.0,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0,1
1,1025197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491.0,30,11.005,1.0,715.0,893.75,0.0,31329.0,54.8,1,Apr-90,40642.0,1.0,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0,1
2,1009360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459.0,8,6.409,0.0,774.545,903.636,0.0,18514.0,57.692,1,Oct-91,154.0,1.0,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0,1
3,1039708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157.0,8,9.205,0.0,750.0,875.0,0.0,20707.0,42.6,0,1-Jun,0.0,1.0,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0,1
4,1027483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38.0,21,15.578,0.0,609.091,710.606,0.0,14016.154,30.462,0,2-May,0.0,1.0,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0,1


In [8]:
# Work on a copy so the feature engineering below does not modify `train_data`.
data = train_data.copy()

print(data.shape)
data.head()

(760000, 36)


Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,target
0,1040418,31818.182,3,11.466,1174.91,C,政府机构,金融业,3 years,0,1,2016/10/1,2,193.0,13,2.43,0.0,556.364,649.091,0.0,7734.231,91.8,0,1-Dec,5.0,1.0,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0,1
1,1025197,28000.0,5,16.841,670.69,C,政府机构,金融业,10+ years,0,2,2013/6/1,0,491.0,30,11.005,1.0,715.0,893.75,0.0,31329.0,54.8,1,Apr-90,40642.0,1.0,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0,1
2,1009360,17272.727,3,8.9,603.32,A,政府机构,公共服务、社会组织,10+ years,1,0,2014/1/1,4,459.0,8,6.409,0.0,774.545,903.636,0.0,18514.0,57.692,1,Oct-91,154.0,1.0,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0,1
3,1039708,20000.0,3,4.788,602.3,A,世界五百强,文化和体育业,6 years,0,1,2015/7/1,0,157.0,8,9.205,0.0,750.0,875.0,0.0,20707.0,42.6,0,1-Jun,0.0,1.0,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0,1
4,1027483,15272.727,3,12.79,470.31,C,政府机构,信息传输、软件和信息技术服务业,< 1 year,2,1,2016/7/1,0,38.0,21,15.578,0.0,609.091,710.606,0.0,14016.154,30.462,0,2-May,0.0,1.0,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0,1


In [9]:
# Collapse the issue date into an integer YYYYMM feature, then discard the raw column.
issue_dt = pd.to_datetime(data['issue_date'])
data['issue_mon'] = issue_dt.dt.year * 100 + issue_dt.dt.month
data = data.drop(columns=['issue_date'])

In [10]:
# Ordinal-encode the loan grade letter: 'A' -> 0, 'B' -> 1, ..., 'G' -> 6.
# Values outside A-G become NaN.
grade_to_code = {grade: code for code, grade in enumerate('ABCDEFG')}
data['class'] = data['class'].map(grade_to_code)

In [11]:
# Replace the employer type strings with integer labels.
lbe = LabelEncoder()
lbe.fit(data['employer_type'])
data['employer_type'] = lbe.transform(data['employer_type'])

In [12]:
# Replace the industry strings with integer labels (a fresh encoder; `lbe` is rebound).
lbe = LabelEncoder()
lbe.fit(data['industry'])
data['industry'] = lbe.transform(data['industry'])

In [13]:
# Convert the employment-length text to a number of years
# ('< 1 year' -> 0, ..., '10+ years' -> 10).
work_year_codes = {
    '< 1 year': 0, '1 year': 1, '2 years': 2, '3 years': 3, '4 years': 4,
    '5 years': 5, '6 years': 6, '7 years': 7, '8 years': 8, '9 years': 9,
    '10+ years': 10
}

# Anything not in the mapping (including missing values) becomes NaN and is then
# filled with -1. The result is assigned back explicitly: calling
# `fillna(..., inplace=True)` on `data['work_year']` acts on a column selection,
# which warns on recent pandas and has no effect under Copy-on-Write.
data['work_year'] = data['work_year'].map(work_year_codes).fillna(-1)

In [14]:
def clean_mon(x):
    """Convert an ``earlies_credit_mon`` string into an integer ``YYYYMM`` code.

    The first run of digits in ``x`` is taken as the year and the first run of
    letters as a three-letter English month abbreviation (case-insensitive), so
    ``'Apr-90'``, ``'1-Dec'`` and ``'Mar-1984'`` are all accepted.

    Year handling:
      * ``0..21``  -> ``2000..2021``
      * ``22..99`` -> ``1922..1999``
      * ``>= 100`` -> used unchanged (already a full year)
      * no digits  -> ``2022``

    Month handling: a recognised abbreviation maps to ``1..12``; if ``x`` contains
    no letters the month is ``0``.

    Args:
        x: The raw string value.

    Returns:
        int: ``year * 100 + month``.

    Raises:
        KeyError: If the letters found are not one of the twelve abbreviations.
        TypeError: If ``x`` is not a string (raised by ``re.search``).
    """
    mons = {'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4,  'may': 5,  'jun': 6,
            'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12}

    year_group = re.search(r'(\d+)', x)
    if year_group:
        year = int(year_group.group(1))
        if year < 22:
            year += 2000
        elif year < 100:
            # Includes 22 itself: the previous `100 > year > 22` test let a
            # two-digit 22 fall through unexpanded (e.g. '22-May' -> 2205).
            year += 1900
    else:
        year = 2022

    month_group = re.search(r'([a-zA-Z]+)', x)
    if month_group:
        month = mons[month_group.group(1).lower()]
    else:
        month = 0

    return year * 100 + month

# Convert every earliest-credit-month string to its integer YYYYMM code.
data['earlies_credit_mon'] = data['earlies_credit_mon'].apply(clean_mon)

In [15]:
# Preview the first rows of `data` after the encoding steps.
data.head()

Unnamed: 0,loan_id,total_loan,year_of_loan,interest,monthly_payment,class,employer_type,industry,work_year,house_exist,censor_status,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,early_return,early_return_amount,early_return_amount_3mon,isDefault,target,issue_mon
0,1040418,31818.182,3,11.466,1174.91,2,3,13,3.0,0,1,2,193.0,13,2.43,0.0,556.364,649.091,0.0,7734.231,91.8,0,200112,5.0,1.0,1.0,0.0,4.0,5.0,4.0,3,9927,0.0,0,1,201610
1,1025197,28000.0,5,16.841,670.69,2,3,13,10.0,0,2,0,491.0,30,11.005,1.0,715.0,893.75,0.0,31329.0,54.8,1,199004,40642.0,1.0,7.0,0.0,4.0,45.0,22.0,0,0,0.0,0,1,201306
2,1009360,17272.727,3,8.9,603.32,0,3,3,10.0,1,0,4,459.0,8,6.409,0.0,774.545,903.636,0.0,18514.0,57.692,1,199110,154.0,1.0,6.0,0.0,6.0,28.0,19.0,0,0,0.0,0,1,201401
3,1039708,20000.0,3,4.788,602.3,0,1,10,6.0,0,1,0,157.0,8,9.205,0.0,750.0,875.0,0.0,20707.0,42.6,0,200106,0.0,1.0,5.0,0.0,10.0,15.0,9.0,0,0,0.0,0,1,201507
4,1027483,15272.727,3,12.79,470.31,2,3,2,0.0,2,1,0,38.0,21,15.578,0.0,609.091,710.606,0.0,14016.154,30.462,0,200205,0.0,1.0,10.0,0.0,6.0,15.0,4.0,0,0,0.0,0,1,201607


# 模型

In [16]:
# Train a binary classifier on `target` (1 = public row, 0 = internet row) with
# 5-fold stratified CV, and score every internet row with the fold-averaged
# probability of class 1.
train = data.copy()
test  = data[data['target'] == 0].copy()

# NOTE(review): `test` is a subset of `train`, so each fold's model has been fitted
# on most of the rows it scores below; the averaged `prediction` is therefore partly
# in-sample. The out-of-fold scores collected in `oof` are the out-of-sample
# alternative - confirm which one is intended for the selection step.

ycol = 'target'
# NOTE(review): 'isDefault' is not excluded here, so the default label is used as
# an input feature - confirm this is intended.
feature_names = list(
    filter(lambda x: x not in [ycol, 'loan_id', 'policy_code'], train.columns))

# NOTE(review): LightGBM only applies row bagging when bagging_freq/subsample_freq
# is > 0, which is not set here - confirm whether subsample=0.8 is meant to have
# an effect.
model = lgb.LGBMClassifier(objective='binary',
                           boosting_type='gbdt',
                           tree_learner='serial',
                           num_leaves=32,
                           max_depth=6,
                           learning_rate=0.1,
                           n_estimators=10000,
                           subsample=0.8,
                           feature_fraction=0.6,
                           reg_alpha=0.5,
                           reg_lambda=0.5,
                           random_state=2021,
                           is_unbalance=True,
                           metric='auc')


oof = []
# Take an explicit copy: assigning a new column to the bare `test[['loan_id']]`
# selection is chained assignment on a slice of `test`.
prediction = test[['loan_id']].copy()
prediction[ycol] = 0.0
df_importance_list = []

# The test feature matrix is the same for every fold, so build it once.
X_test = test[feature_names]

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2021)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    X_train = train.iloc[trn_idx][feature_names]
    Y_train = train.iloc[trn_idx][ycol]

    X_val = train.iloc[val_idx][feature_names]
    Y_val = train.iloc[val_idx][ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # Early stopping monitors AUC on the eval sets and stops after 50 rounds
    # without improvement.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric='auc',
                          early_stopping_rounds=50)

    # Out-of-fold probabilities of class 1 for this fold's validation rows.
    pred_val = lgb_model.predict_proba(
        X_val, num_iteration=lgb_model.best_iteration_)
    df_oof = train.iloc[val_idx][['loan_id', ycol]].copy()
    df_oof['pred'] = pred_val[:, 1]
    oof.append(df_oof)

    # Accumulate this fold's share of the averaged test prediction.
    pred_test = lgb_model.predict_proba(
        X_test, num_iteration=lgb_model.best_iteration_)
    prediction[ycol] += pred_test[:, 1] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release the per-fold objects before the next iteration.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    gc.collect()
    



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	train's auc: 1	valid's auc: 0.999995


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[36]	train's auc: 0.999999	valid's auc: 1


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	train's auc: 0.999999	valid's auc: 1


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[75]	train's auc: 0.999999	valid's auc: 1


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[51]	train's auc: 0.999999	valid's auc: 1


In [17]:
# Average each feature's importance over the folds and rank from most to least important.
df_importance = (
    pd.concat(df_importance_list)
    .groupby('column')['importance']
    .mean()
    .sort_values(ascending=False)
    .reset_index()
)
df_importance

Unnamed: 0,column,importance
0,scoring_low,324.6
1,scoring_high,165.2
2,interest,154.8
3,early_return_amount,95.6
4,recircle_u,83.6
5,total_loan,70.0
6,early_return_amount_3mon,64.8
7,class,48.6
8,early_return,47.4
9,f0,37.4


In [18]:
# Combine the per-fold out-of-fold frames and report the overall AUC.
# Note that `oof` is rebound from a list of frames to a single DataFrame here.
oof = pd.concat(oof)
oof_auc = roc_auc_score(oof['target'], oof['pred'])
print('roc_auc_score:', oof_auc)

roc_auc_score: 0.9999992305333333


In [19]:
# Show the index, dtypes and non-null counts of the averaged test predictions.
prediction.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 750000 entries, 10000 to 759999
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype  
---  ------   --------------   -----  
 0   loan_id  750000 non-null  int64  
 1   target   750000 non-null  float64
dtypes: float64(1), int64(1)
memory usage: 17.2 MB


In [20]:
# Summary statistics of the averaged class-1 probability over the internet rows.
prediction['target'].describe()

count   750000.000
mean         0.011
std          0.016
min          0.001
25%          0.006
50%          0.007
75%          0.008
max          0.999
Name: target, dtype: float64

In [21]:
# Keep the loan_ids of internet rows whose averaged class-1 probability is at least 0.1.
is_selected = prediction['target'] >= 0.1
save_list = prediction.loc[is_selected, 'loan_id'].tolist()
len(save_list)

4248

In [22]:
# Reload the raw internet set (all original columns) and keep only the selected loans.
train_internet = pd.read_csv('../raw_data/train_internet.csv')
keep_mask = train_internet['loan_id'].isin(save_list)
train_internet = train_internet.loc[keep_mask].copy()
print(train_internet.shape)
train_internet.head()

(4248, 42)


Unnamed: 0,loan_id,user_id,total_loan,year_of_loan,interest,monthly_payment,class,sub_class,work_type,employer_type,industry,work_year,house_exist,house_loan_status,censor_status,marriage,offsprings,issue_date,use,post_code,region,debt_loan_ratio,del_in_18month,scoring_low,scoring_high,pub_dero_bankrup,early_return,early_return_amount,early_return_amount_3mon,recircle_b,recircle_u,initial_list_status,earlies_credit_mon,title,policy_code,f0,f1,f2,f3,f4,f5,is_default
240,414778,240,16000.0,5,19.42,418.76,D,D3,其他,普通企业,电力、热力生产供应业,10+ years,2,2,2,0,0,2018-03-01,4,114.0,13,31.87,0.0,720.0,724.0,0.0,0,0,0.0,7582.0,70.9,0,Apr-1997,4.0,1.0,4.0,0.0,8.0,7.0,3.0,0.0,1
359,697588,359,20000.0,5,7.34,399.25,A,A4,其他,政府机构,信息传输、软件和信息技术服务业,,0,2,0,0,0,2018-02-01,2,69.0,26,10.21,0.0,825.0,829.0,0.0,0,0,0.0,10274.0,6.9,0,Sep-1986,5.0,1.0,4.0,0.0,4.0,25.0,19.0,0.0,0
403,88991,403,2400.0,3,15.61,83.92,D,D1,职员,上市企业,电力、热力生产供应业,2 years,1,1,1,1,5,2015-05-01,4,402.0,0,20.9,0.0,780.0,784.0,0.0,0,0,0.0,752.0,4.4,1,Mar-2009,4.0,1.0,4.0,0.0,6.0,5.0,4.0,0.0,0
491,210862,491,25200.0,5,12.99,573.25,C,C2,职员,普通企业,批发和零售业,8 years,0,1,0,0,0,2016-03-01,0,129.0,30,14.35,0.0,810.0,814.0,0.0,0,0,0.0,17915.0,27.8,0,Mar-1989,0.0,1.0,5.0,0.0,5.0,16.0,12.0,0.0,1
665,365687,665,14250.0,3,13.99,486.97,C,C4,工人,普通企业,信息传输、软件和信息技术服务业,9 years,0,0,1,1,2,2015-03-01,0,109.0,21,22.2,0.0,720.0,724.0,0.0,0,0,0.0,42906.0,69.1,1,Feb-1994,0.0,1.0,7.0,0.0,0.0,14.0,6.0,0.0,0


In [23]:
# Rename the label column to 'isDefault' and show the class balance of the selected rows.
train_internet = train_internet.rename({'is_default': 'isDefault'}, axis='columns')
train_internet['isDefault'].value_counts()

0    3239
1    1009
Name: isDefault, dtype: int64

In [24]:
# Cast total_loan to 64-bit integers (any fractional part is truncated).
train_internet['total_loan'] = train_internet['total_loan'].astype(np.int64)

In [25]:
# Create the output directory first so the write does not fail on a fresh checkout.
os.makedirs('../user_data', exist_ok=True)
# The row count is embedded in the file name; the index is not written.
train_internet.to_csv(f'../user_data/train_internet_{len(train_internet)}.csv', index=False)