In [3]:
import numpy as np 
import pandas as pd
import os
import pickle
import gc

import ydata_profiling as ypf
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix

import lightgbm as lgb

import warnings
warnings.filterwarnings('ignore')

import japanize_matplotlib
%matplotlib inline

In [5]:
# コンペ： Home Credit Default Risk
# 評価指標AUC
# 目的変数：　貸し倒れ有無
# モデル： 貸し倒れの有無を分類する２値分類モデル（予測値は0から1の連続値）

app_train = pd.read_csv('./application_train.csv')
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
app_train.memory_usage().sum() / 1024**2

286.22710037231445

In [112]:
def reduce_mem_usage(df):
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [11]:
app_train = reduce_mem_usage(app_train)

Memory usage of dataframe is 286.23 MB
Memory usage after optimization is: 59.54 MB
Decreased by 79.2%


In [12]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,0,0,0,0,,,,,,
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
x_train = app_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = app_train['TARGET']
id_train = app_train[['SK_ID_CURR']]

for col in x_train.columns:
    if x_train[col].dtype == 'O':
        x_train[col] = x_train[col].astype('category')

In [17]:
for col in x_train.columns:
    if x_train[col].dtype == 'O':
        x_train[col] = x_train[col].astype('category')

In [18]:
x_train.dtypes

NAME_CONTRACT_TYPE            category
CODE_GENDER                   category
FLAG_OWN_CAR                  category
FLAG_OWN_REALTY               category
CNT_CHILDREN                      int8
                                ...   
AMT_REQ_CREDIT_BUREAU_DAY      float16
AMT_REQ_CREDIT_BUREAU_WEEK     float16
AMT_REQ_CREDIT_BUREAU_MON      float16
AMT_REQ_CREDIT_BUREAU_QRT      float16
AMT_REQ_CREDIT_BUREAU_YEAR     float16
Length: 120, dtype: object

In [19]:
y_train.mean()

0.08072881945686496

In [21]:
y_train.value_counts()

282686

In [23]:
y_train.value_counts()[1] / y_train.value_counts().sum()

0.08072881945686496

In [45]:
# foldごとのindexのリスト作成
cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x_train, y_train))

# 0fold目のindexのリスト取得
nfold = 0
idx_tr, idx_va = cv[nfold][0], cv[nfold][1]

# 学習データと検証データに分離

x_tr, y_tr, id_tr = x_train.loc[idx_tr, :], y_train[idx_tr], id_train.loc[idx_tr, :]
x_va, y_va, id_va = x_train.loc[idx_va, :], y_train[idx_va], id_train.loc[idx_va, :]
print(x_tr.shape, y_tr.shape, id_tr.shape)
print(x_va.shape, y_va.shape, id_va.shape)

(246008, 120) (246008,) (246008, 1)
(61503, 120) (61503,) (61503, 1)


In [47]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

# モデルの学習
model = lgb.LGBMClassifier(**params)
model.fit(x_tr,
          y_tr,
          eval_set=[(x_tr,y_tr), (x_va,y_va)],
          callbacks=[
              lgb.early_stopping(stopping_rounds=100, verbose=True),
              lgb.log_evaluation(100),
          ],
         )

# モデルの保存
with open("model_lgb_fold0.pickle", "wb") as f:
    pickle.dump(model, f, protocol=4)

[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024745 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11367
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
Early stopping, best iteration is:
[217]	training's auc: 0.812578	valid_1's auc: 0.758595


In [51]:
# 学習データの推論値取得とROC計算
y_tr_pred = model.predict_proba(x_tr)[:,1]
metric_tr = roc_auc_score(y_tr, y_tr_pred)

y_va_pred = model.predict_proba(x_va)[:,1]
metric_va = roc_auc_score(y_va, y_va_pred)

metrics = []
metrics.append([nfold, metric_tr, metric_va])
print(metric_tr, metric_va)

0.8125779578420791 0.7585952814061514


In [52]:
train_oof = np.zeros(len(x_train))
train_oof[idx_va] = y_va_pred

In [56]:
imp_fold = pd.DataFrame({'col': x_train.columns, 'imp': model.feature_importances_, 'nfold': nfold})
imp_fold.sort_values('imp', ascending=False)[:10] # 重要度トップ１０を表示

Unnamed: 0,col,imp,nfold
41,EXT_SOURCE_3,66225.020483,0
40,EXT_SOURCE_2,52568.833805,0
38,ORGANIZATION_TYPE,20218.523523,0
39,EXT_SOURCE_1,19776.252288,0
6,AMT_CREDIT,8111.321247,0
8,AMT_GOODS_PRICE,7120.960365,0
15,DAYS_BIRTH,7042.223005,0
7,AMT_ANNUITY,6992.551795,0
16,DAYS_EMPLOYED,5236.51412,0
26,OCCUPATION_TYPE,4376.651746,0


In [70]:
# 重要度を格納する5-fold用データフレームの作成
imp = pd.DataFrame()

imp = pd.concat([imp, imp_fold])

In [71]:
imp

Unnamed: 0,col,imp,nfold
0,NAME_CONTRACT_TYPE,1456.625567,0
1,CODE_GENDER,3585.535678,0
2,FLAG_OWN_CAR,154.326403,0
3,FLAG_OWN_REALTY,234.007279,0
4,CNT_CHILDREN,86.682109,0
...,...,...,...
115,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,0
116,AMT_REQ_CREDIT_BUREAU_WEEK,78.961071,0
117,AMT_REQ_CREDIT_BUREAU_MON,305.250370,0
118,AMT_REQ_CREDIT_BUREAU_QRT,971.668136,0


In [62]:
metrics = np.array(metrics)
print(metrics)

print('[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}'.format(
    metrics[:,1].mean(),metrics[:,1].std(),
    metrics[:,2].mean(),metrics[:,2].std(),
))
print('[oof] {:.4f}'.format(
    roc_auc_score(y_train, train_oof)
))

[[0.         0.81257796 0.75859528]]
[cv] tr:0.8126+-0.0000, va:0.7586+-0.0000
[oof] 0.5103


In [63]:
train_oof = pd.concat([
    id_train,
    pd.DataFrame({'true': y_train, 'pred': train_oof})
], axis=1)
train_oof.head()

Unnamed: 0,SK_ID_CURR,true,pred
0,100002,1,0.0
1,100003,0,0.0
2,100004,0,0.031866
3,100006,0,0.0
4,100007,0,0.0


In [74]:
imp = imp.groupby('col')['imp'].agg(['mean', 'std']).reset_index(drop=False)
imp.columns = ['col', 'imp', 'imp_std']
imp

Unnamed: 0,col,imp,imp_std
0,AMT_ANNUITY,6992.551795,
1,AMT_CREDIT,8111.321247,
2,AMT_GOODS_PRICE,7120.960365,
3,AMT_INCOME_TOTAL,1595.740609,
4,AMT_REQ_CREDIT_BUREAU_DAY,128.842901,
...,...,...,...
115,YEARS_BEGINEXPLUATATION_MEDI,497.554882,
116,YEARS_BEGINEXPLUATATION_MODE,521.439980,
117,YEARS_BUILD_AVG,571.474816,
118,YEARS_BUILD_MEDI,97.963029,


In [86]:
# モデルの精度を繰り返し測るための関数
def train_lgb(input_x,
              input_y,
              input_id,
              params,
              list_nfold=[0,1,2,3,4],
              n_splits=5,
             ):
    train_oof = np.zeros(len(x_train))
    metrics = []
    imp = pd.DataFrame()

    cv = list(StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=123).split(input_x, input_y))

    for nfold in list_nfold:
        print('-'*20, nfold, '-'*20)

        # データセットの作成
        idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
        x_tr, y_tr, id_tr = input_x.loc[idx_tr, :], input_y[idx_tr], input_id.loc[idx_tr, :]
        x_va, y_va, id_va = input_x.loc[idx_va, :], input_y[idx_va], input_id.loc[idx_va, :]
        print(x_tr.shape, x_va.shape)

        # 学習
        model = lgb.LGBMClassifier(**params)
        model.fit(x_tr,
                  y_tr,
                  eval_set=[(x_tr,y_tr), (x_va,y_va)],
                  callbacks=[
                      lgb.early_stopping(stopping_rounds=100, verbose=True),
                      lgb.log_evaluation(100),
                  ],
                 )
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)

        # モデルの保存
        with open(fname_lgb, "wb") as f:
            pickle.dump(model, f, protocol=4)

        # 学習データの推論値取得とROC計算
        y_tr_pred = model.predict_proba(x_tr)[:,1]
        y_va_pred = model.predict_proba(x_va)[:,1]
        metric_tr = roc_auc_score(y_tr, y_tr_pred)
        metric_va = roc_auc_score(y_va, y_va_pred)
        metrics.append([nfold, metric_tr, metric_va])
        print(metric_tr, metric_va)

        # oof
        train_oof[idx_va] = y_va_pred

        # imp
        _imp = pd.DataFrame({'col': input_x.columns, 'imp': model.feature_importances_, 'nfold': nfold})
        imp = pd.concat([imp, _imp])

    print('-'*20, 'result', '-'*20)
    metrics = np.array(metrics)
    print(metrics)
    print('[cv] tr:{:.4f}+-{:.4f}, va:{:.4f}+-{:.4f}'.format(
        metrics[:,1].mean(),metrics[:,1].std(),
        metrics[:,2].mean(),metrics[:,2].std(),
    ))
    print('[oof] {:.4f}'.format(
        roc_auc_score(input_y, train_oof)
    ))

    train_oof = pd.concat([
        input_id,
        pd.DataFrame({'true': input_y, 'pred': train_oof})
    ], axis=1)

    # 重要度
    imp = imp.groupby('col')['imp'].agg(['mean', 'std']).reset_index(drop=False)
    imp.columns = ['col', 'imp', 'imp_std']

    return train_oof, imp, metrics

In [85]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary', 
    'metric': 'auc',
    'learning_rate': 0.05,
    'num_leaves': 32,
    'n_estimators': 100000,
    "random_state": 123,
    "importance_type": "gain",
}

train_oof, imp, metrics = train_lgb(
    x_train,
    y_train,
    id_train,
    params,
    list_nfold=[0,1,2,3,4],
    n_splits=5
)

-------------------- 0 --------------------
(246008, 120) (61503, 120)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.024778 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 11367
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.782506	valid_1's auc: 0.755903
[200]	training's auc: 0.808961	valid_1's auc: 0.758356
[300]	training's auc: 0.829245	valid_1's auc: 0.757774
Early stopping, best iteration is:
[217]	training's auc: 0.812578	valid_1's auc: 0.758595
0.8125779578420791 0.7585952814061514
------------

In [87]:
imp.sort_values('imp', ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
38,EXT_SOURCE_3,65353.907478,1558.201212
37,EXT_SOURCE_2,54545.388309,1251.798934
102,ORGANIZATION_TYPE,21441.917474,1450.24619
36,EXT_SOURCE_1,20051.934248,685.852224
1,AMT_CREDIT,8263.228728,410.384434
22,DAYS_BIRTH,7645.58911,689.458833
2,AMT_GOODS_PRICE,7263.054566,405.837031
0,AMT_ANNUITY,6762.95364,479.302045
23,DAYS_EMPLOYED,5810.288375,552.93773
101,OCCUPATION_TYPE,5502.675859,831.872392


In [91]:
app_test = pd.read_csv('./application_test.csv')
app_test = reduce_mem_usage(app_test)

x_test = app_test.drop(columns=['SK_ID_CURR'])
id_test = app_test[['SK_ID_CURR']]

Memory usage of dataframe is 45.00 MB
Memory usage after optimization is: 9.40 MB
Decreased by 79.1%


In [92]:
for col in x_test.columns:
    if x_test[col].dtype == 'O':
        x_test[col] = x_test[col].astype('category')

In [95]:
with open('model_lgb_fold0.pickle', 'rb') as f:
    model = pickle.load(f)

In [101]:
test_pred_fold = model.predict_proba(x_test)[:, 1]
test_pred = np.zeros((len(x_test), 5))
test_pred[:, 0] = test_pred_fold

In [103]:
test_pred_mean = test_pred.mean(axis=1)
test_pred_mean

array([0.00657238, 0.02387448, 0.00423262, ..., 0.00671567, 0.01320681,
       0.0334756 ])

In [104]:
df_test_pred = pd.concat([
    id_test,
    pd.DataFrame({'pred': test_pred_mean}),
], axis=1)
df_test_pred.head()

Unnamed: 0,SK_ID_CURR,pred
0,100001,0.006572
1,100005,0.023874
2,100013,0.004233
3,100028,0.008966
4,100038,0.030794


In [171]:
def predict_lgb(input_x,
                input_id,
                list_nfold=[0,1,2,3,4],
               ):
    pred = np.zeros((len(input_x), len(list_nfold)))

    for nfold in list_nfold:
        print('-'*20, nfold, '-'*20)
        fname_lgb = 'model_lgb_fold{}.pickle'.format(nfold)
        with open(fname_lgb, 'rb') as f:
            model = pickle.load(f)
        pred[:, nfold] = model.predict_proba(input_x)[:,1]

    pred = pd.concat([
        input_id,
        pd.DataFrame({'pred': pred.mean(axis=1)}),
    ], axis=1)

    print('Done')
    return pred


In [159]:
test_pred = predict_lgb(x_test, id_test, list_nfold=[0,1,2,3,4])

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done


In [111]:
df_submit = test_pred.rename(columns={'pred': 'TARGET'})
df_submit.to_csv('submission_baseline.csv', index=None)

In [129]:
# 特徴量エンジニアリング
display(app_train['DAYS_EMPLOYED'].value_counts())
print('{:.4f}'.format((app_train['DAYS_EMPLOYED'] > 0).mean()))
print('{:.4f}'.format((app_train['DAYS_EMPLOYED'] > 0).sum()))

DAYS_EMPLOYED
-200.0      156
-224.0      152
-199.0      151
-230.0      151
-212.0      150
           ... 
-13961.0      1
-11827.0      1
-10176.0      1
-9459.0       1
-8694.0       1
Name: count, Length: 12573, dtype: int64

0.0000
0.0000


In [132]:
app_train[np.isnan(app_train['DAYS_EMPLOYED'])]
# app_train['DAYS_EMPLOYED'] = app_train['DAYS_EMPLOYED'].replace(365243, np.nan)

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,FLAG_DOCUMENT_18,FLAG_DOCUMENT_19,FLAG_DOCUMENT_20,FLAG_DOCUMENT_21,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_WEEK,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_YEAR
8,100011,0,Cash loans,F,N,Y,0,112500.00000,1019610.0,33826.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,1.0
11,100015,0,Cash loans,F,N,Y,0,38419.15625,148365.0,10678.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
23,100027,0,Cash loans,F,N,Y,0,83250.00000,239850.0,23850.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
38,100045,0,Cash loans,F,N,Y,0,99000.00000,247275.0,17338.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,2.0
43,100050,0,Cash loans,F,N,Y,0,108000.00000,746280.0,42970.5,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
307469,456209,0,Cash loans,F,N,Y,0,202500.00000,703728.0,29943.0,...,0,0,0,0,0.0,0.0,0.0,1.0,0.0,0.0
307483,456227,0,Cash loans,F,N,Y,0,99000.00000,247275.0,16479.0,...,0,0,0,0,0.0,0.0,0.0,0.0,0.0,5.0
307487,456231,0,Cash loans,M,N,Y,0,117000.00000,1071909.0,31473.0,...,0,0,0,0,0.0,0.0,0.0,0.0,1.0,8.0
307505,456249,0,Cash loans,F,N,Y,0,112500.00000,225000.0,22050.0,...,0,0,0,0,0.0,0.0,0.0,2.0,0.0,0.0


In [134]:
# 仮説１：所得金額が同じでも、家族人数が多い方が経済的な負担が大きいので貸し倒れしやすい→総所得金額を世帯人数で割った値
app_train['INCOME_div_PERSON'] = app_train['AMT_INCOME_TOTAL'] / app_train['CNT_FAM_MEMBERS']

# 仮説２：所得金額が同じでも、終了期間が短い方が優秀で貸し倒れしにくい→総所得金額を就労期間が割った値
app_train['INCOME_div_EMPLOYED'] = app_train['AMT_INCOME_TOTAL'] / app_train['DAYS_EMPLOYED']

# 仮説３：外部機関によるスコア（EXT_SOURCE_1〜３）が平均的に高い方が貸し倒れしにくい→外部スコアの平均値など
app_train['EXT_SOURCE_MEAN'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
app_train['EXT_SOURCE_max'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].max(axis=1)
app_train['EXT_SOURCE_min'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].min(axis=1)
app_train['EXT_SOURCE_std'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
app_train['EXT_SOURCE_count'] = app_train[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].notnull().sum(axis=1)

# 仮説４：年齢に占める終了期間が長い方が貸し倒れしにくい（離職可能性が低く支払い不能状態になりづらい）。→就労期間を年齢で割った値
app_train['DAYS_EMPLOYED_div_BIRTH'] = app_train['DAYS_EMPLOYED'] / app_train['DAYS_BIRTH']

# 仮説５：所得金額に占める年金支払額が少ない方が貸し倒れしにくい→年金支払額を所得金額で割った値
app_train['ANNUITY_div_INCOME'] = app_train['AMT_ANNUITY'] / app_train['AMT_INCOME_TOTAL']

# 仮説６：借入金に占める年金支払額が少ない方が貸し倒れしにくい→年金支払額を借入金で割った値
app_train['ANNUITY_div_CREDIT'] = app_train['AMT_ANNUITY'] / app_train['AMT_CREDIT']


In [135]:
app_train.head()

Unnamed: 0,SK_ID_CURR,TARGET,NAME_CONTRACT_TYPE,CODE_GENDER,FLAG_OWN_CAR,FLAG_OWN_REALTY,CNT_CHILDREN,AMT_INCOME_TOTAL,AMT_CREDIT,AMT_ANNUITY,...,INCOME_div_PERSON,INCOME_div_EMPLOYED,EXT_SOURCE_MEAN,EXT_SOURCE_max,EXT_SOURCE_min,EXT_SOURCE_std,EXT_SOURCE_count,DAYS_EMPLOYED_div_BIRTH,ANNUITY_div_INCOME,ANNUITY_div_CREDIT
0,100002,1,Cash loans,M,N,Y,0,202500.0,406597.5,24700.5,...,202500.0,-317.896389,0.161743,0.262939,0.083008,0.092041,3,0.067329,0.121978,0.060749
1,100003,0,Cash loans,F,N,N,0,270000.0,1293502.5,35698.5,...,135000.0,-227.272727,0.466797,0.62207,0.311279,0.219727,2,0.070862,0.132217,0.027598
2,100004,0,Revolving loans,M,Y,Y,0,67500.0,135000.0,6750.0,...,67500.0,-300.0,0.642578,0.729492,0.556152,0.122559,2,0.011814,0.1,0.05
3,100006,0,Cash loans,F,N,Y,0,135000.0,312682.5,29686.5,...,67500.0,-44.422507,0.650391,0.650391,0.650391,,1,0.159905,0.2199,0.094941
4,100007,0,Cash loans,M,N,Y,0,121500.0,513000.0,21865.5,...,121500.0,-39.993417,0.322754,0.322754,0.322754,,1,0.152418,0.179963,0.042623


In [136]:
x_train = app_train.drop(columns=['TARGET', 'SK_ID_CURR'])
y_train = app_train['TARGET']
id_train = app_train[['SK_ID_CURR']]

for col in x_train.columns:
    if x_train[col].dtype == 'O':
        x_train[col] = x_train[col].astype('category')

In [137]:
train_oof, imp, metrics = train_lgb(x_train,
                                    y_train,
                                    id_train,
                                    params,
                                    list_nfold=[0,1,2,3,4],
                                    n_splits=5
                                   )


-------------------- 0 --------------------
(246008, 130) (61503, 130)
[LightGBM] [Info] Number of positive: 19860, number of negative: 226148
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13680
[LightGBM] [Info] Number of data points in the train set: 246008, number of used features: 126
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.080729 -> initscore=-2.432482
[LightGBM] [Info] Start training from score -2.432482
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.787895	valid_1's auc: 0.761408
[200]	training's auc: 0.81693	valid_1's auc: 0.765243
[300]	training's auc: 0.838558	valid_1's auc: 0.765348
Early stopping, best iteration is:
[217]	training's auc: 0.820432	valid_1's auc: 0.765606
0.8204320356491586 0.7656060604927121
-------------

In [140]:
imp.sort_values('imp', ascending=False)[:10]

Unnamed: 0,col,imp,imp_std
42,EXT_SOURCE_MEAN,113116.122069,1786.948047
10,ANNUITY_div_CREDIT,22151.650249,898.087993
112,ORGANIZATION_TYPE,20187.355369,1438.208866
41,EXT_SOURCE_3,10549.22125,1015.861249
24,DAYS_BIRTH,6863.921363,709.067033
45,EXT_SOURCE_min,6826.756135,440.769525
39,EXT_SOURCE_1,6085.67802,796.181708
0,AMT_ANNUITY,5342.644769,621.544562
2,AMT_GOODS_PRICE,5332.378775,472.867742
1,AMT_CREDIT,4722.058802,766.762029


In [141]:
# 仮説１：所得金額が同じでも、家族人数が多い方が経済的な負担が大きいので貸し倒れしやすい→総所得金額を世帯人数で割った値
app_test['INCOME_div_PERSON'] = app_test['AMT_INCOME_TOTAL'] / app_test['CNT_FAM_MEMBERS']

# 仮説２：所得金額が同じでも、終了期間が短い方が優秀で貸し倒れしにくい→総所得金額を就労期間が割った値
app_test['INCOME_div_EMPLOYED'] = app_test['AMT_INCOME_TOTAL'] / app_test['DAYS_EMPLOYED']

# 仮説３：外部機関によるスコア（EXT_SOURCE_1〜３）が平均的に高い方が貸し倒れしにくい→外部スコアの平均値など
app_test['EXT_SOURCE_MEAN'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
app_test['EXT_SOURCE_max'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].max(axis=1)
app_test['EXT_SOURCE_min'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].min(axis=1)
app_test['EXT_SOURCE_std'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
app_test['EXT_SOURCE_count'] = app_test[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].notnull().sum(axis=1)

# 仮説４：年齢に占める終了期間が長い方が貸し倒れしにくい（離職可能性が低く支払い不能状態になりづらい）。→就労期間を年齢で割った値
app_test['DAYS_EMPLOYED_div_BIRTH'] = app_test['DAYS_EMPLOYED'] / app_test['DAYS_BIRTH']

# 仮説５：所得金額に占める年金支払額が少ない方が貸し倒れしにくい→年金支払額を所得金額で割った値
app_test['ANNUITY_div_INCOME'] = app_test['AMT_ANNUITY'] / app_test['AMT_INCOME_TOTAL']

# 仮説６：借入金に占める年金支払額が少ない方が貸し倒れしにくい→年金支払額を借入金で割った値
app_test['ANNUITY_div_CREDIT'] = app_test['AMT_ANNUITY'] / app_test['AMT_CREDIT']


In [168]:
x_test = app_test.drop(columns=['SK_ID_CURR'])
id_test = app_train[['SK_ID_CURR']]

for col in x_test.columns:
    if x_test[col].dtype == 'O':
        x_test[col] = x_test[col].astype('category')

In [170]:
test_pred = predict_lgb(x_test,
                        id_test,
                        list_nfold=[0,1,2,3,4],
                       )
test_pred

-------------------- 0 --------------------
-------------------- 1 --------------------
-------------------- 2 --------------------
-------------------- 3 --------------------
-------------------- 4 --------------------
Done


Unnamed: 0,SK_ID_CURR,pred
0,100002,0.006572
1,100003,0.023874
2,100004,0.004233
3,100006,0.008966
4,100007,0.030794
...,...,...
307506,456251,
307507,456252,
307508,456253,
307509,456254,


In [163]:
df_submit = test_pred.rename(columns={'pred': 'TARGET'})
df_submit.to_csv('submission_home_credit_2.csv', index=None)

In [169]:
len(id_test)

307511

In [165]:
test_pred.isnull().sum()

SK_ID_CURR         0
pred          258767
dtype: int64