In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import numpy as np
import pandas as pd
from bayes_opt import BayesianOptimization
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from lightgbm import LGBMClassifier

In [3]:
def get_apps_dataset():
    app_train = pd.read_csv('../data/home-credit-default-risk/application_train.csv')
    app_test = pd.read_csv('../data/home-credit-default-risk/application_test.csv')
    apps = pd.concat([app_train, app_test])
    prev = pd.read_csv('../data/home-credit-default-risk/previous_application.csv')
    
    return apps, prev


def get_apps_processed(apps):
    apps['APPS_EXT_SOURCE_MEAN'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].mean(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps[['EXT_SOURCE_1', 'EXT_SOURCE_2', 'EXT_SOURCE_3']].std(axis=1)
    apps['APPS_EXT_SOURCE_STD'] = apps['APPS_EXT_SOURCE_STD'].fillna(apps['APPS_EXT_SOURCE_STD'].mean())
    
    apps['APPS_ANNUITY_CREDIT_RATIO'] = apps['AMT_ANNUITY'] / apps['AMT_CREDIT']
    apps['APPS_GOODS_CREDIT_RATIO'] = apps['AMT_GOODS_PRICE'] / apps['AMT_CREDIT']
    apps['APPS_CREDIT_GOODS_DIFF'] = apps['AMT_CREDIT'] - apps['AMT_GOODS_PRICE']
    
    apps['APPS_ANNUITY_INCOME_RATIO'] = apps['AMT_ANNUITY'] / apps['AMT_INCOME_TOTAL']
    apps['APPS_CREDIT_INCOME_RATIO'] = apps['AMT_CREDIT'] / apps['AMT_INCOME_TOTAL']
    apps['APPS_GOODS_INCOME_RATIO'] = apps['AMT_GOODS_PRICE'] / apps['AMT_INCOME_TOTAL']
    apps['APPS_CNT_FAM_INCOME_RATIO'] = apps['AMT_INCOME_TOTAL'] / apps['CNT_FAM_MEMBERS']
    
    apps['APPS_EMPLOYED_BIRTH_RATIO'] = apps['DAYS_EMPLOYED'] / apps['DAYS_BIRTH']
    apps['APPS_INCOME_EMPLOYED_RATIO'] = apps['AMT_INCOME_TOTAL'] / apps['DAYS_EMPLOYED']
    apps['APPS_INCOME_BIRTH_RATIO'] = apps['AMT_INCOME_TOTAL'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_BIRTH_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_BIRTH']
    apps['APPS_CAR_EMPLOYED_RATIO'] = apps['OWN_CAR_AGE'] / apps['DAYS_EMPLOYED']
    
    return apps

In [4]:
def get_prev_processed(prev):
    
    prev['PREV_CREDIT_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_CREDIT']
    prev['PREV_GOODS_DIFF'] = prev['AMT_APPLICATION'] - prev['AMT_GOODS_PRICE']
    prev['PREV_CREDIT_APPL_RATIO'] = prev['AMT_CREDIT'] / prev['AMT_APPLICATION']
    prev['PREV_ANNUITY_APPL_RATIO'] = prev['AMT_ANNUITY'] / prev['AMT_APPLICATION']
    prev['PREV_GOODS_APPL_RATIO'] = prev['AMT_GOODS_PRICE'] / prev['AMT_APPLICATION']
    
    prev['DAYS_FIRST_DRAWING'].replace(365243, np.nan, inplace=True)
    prev['DAYS_FIRST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE_1ST_VERSION'].replace(365243, np.nan, inplace=True)
    prev['DAYS_LAST_DUE'].replace(365243, np.nan, inplace=True)
    prev['DAYS_TERMINATION'].replace(365243, np.nan, inplace=True)

    prev['PREV_DAYS_LAST_DUE_DIFF'] = prev['DAYS_LAST_DUE_1ST_VERSION'] - prev['DAYS_LAST_DUE']
    
    all_pay = prev['AMT_ANNUITY'] * prev['CNT_PAYMENT']
    prev['PREV_INTERESTS_RATE'] = (all_pay / prev['AMT_CREDIT'] - 1) / prev['CNT_PAYMENT']
    
    return prev


def get_prev_amt_agg(prev):    

    agg_dict = {
         # 기존 컬럼. 
        'SK_ID_CURR':['count'],
        'AMT_CREDIT':['mean', 'max', 'sum'],
        'AMT_ANNUITY':['mean', 'max', 'sum'], 
        'AMT_APPLICATION':['mean', 'max', 'sum'],
        'AMT_DOWN_PAYMENT':['mean', 'max', 'sum'],
        'AMT_GOODS_PRICE':['mean', 'max', 'sum'],
        'RATE_DOWN_PAYMENT': ['min', 'max', 'mean'],
        'DAYS_DECISION': ['min', 'max', 'mean'],
        'CNT_PAYMENT': ['mean', 'sum'],
        # 가공 컬럼
        'PREV_CREDIT_DIFF':['mean', 'max', 'sum'], 
        'PREV_CREDIT_APPL_RATIO':['mean', 'max'],
        'PREV_GOODS_DIFF':['mean', 'max', 'sum'],
        'PREV_GOODS_APPL_RATIO':['mean', 'max'],
        'PREV_DAYS_LAST_DUE_DIFF':['mean', 'max', 'sum'],
        'PREV_INTERESTS_RATE':['mean', 'max']
    }
    
    prev_group = prev.groupby('SK_ID_CURR')
    prev_amt_agg = prev_group.agg(agg_dict)
    prev_amt_agg.columns = ['PREV_' + ('_').join(column).upper() for column in prev_amt_agg.columns.ravel()]
    prev_amt_agg = prev_amt_agg.reset_index()
    
    return prev_amt_agg


def get_prev_refused_appr_agg(prev):

    prev_refused_appr_group = prev[prev['NAME_CONTRACT_STATUS'].isin(['Approved', 'Refused'])].groupby(['SK_ID_CURR', 'NAME_CONTRACT_STATUS'])
    prev_refused_appr_agg = prev_refused_appr_group['SK_ID_CURR'].count().unstack()
    prev_refused_appr_agg = prev_refused_appr_agg.fillna(0)
    prev_refused_appr_agg.columns = ['PREV_APPROVED_COUNT', 'PREV_REFUSED_COUNT']
    prev_refused_appr_agg = prev_refused_appr_agg.reset_index()
    
    return prev_refused_appr_agg


def get_prev_agg(prev):
    
    prev = get_prev_processed(prev)
    prev_amt_agg = get_prev_amt_agg(prev)
    prev_refused_appr_agg = get_prev_refused_appr_agg(prev)
    
    prev_agg = prev_amt_agg.merge(prev_refused_appr_agg, on='SK_ID_CURR', how='left')
    prev_agg['PREV_REFUSED_RATIO'] = prev_agg['PREV_REFUSED_COUNT'] / prev_agg['PREV_SK_ID_CURR_COUNT']
    prev_agg['PREV_APPROVED_RATIO'] = prev_agg['PREV_APPROVED_COUNT'] / prev_agg['PREV_SK_ID_CURR_COUNT']
    prev_agg = prev_agg.drop(['PREV_REFUSED_COUNT', 'PREV_APPROVED_COUNT'], axis=1)
    
    return prev_agg


def get_apps_all_with_prev_agg(apps, prev):

    apps_all = get_apps_processed(apps)
    prev_agg = get_prev_agg(prev)
    print('prev_agg shape:', prev_agg.shape)
    print('apps_all before merge shape:', apps_all.shape)
    apps_all = apps_all.merge(prev_agg, on='SK_ID_CURR', how='left')
    print('apps_all after merge with prev_agg shape:', apps_all.shape)
    
    return apps_all


def get_apps_all_encoded(apps_all):
    
    object_columns = apps_all.dtypes[apps_all.dtypes == 'object'].index.tolist()
    for column in object_columns:
        apps_all[column] = pd.factorize(apps_all[column])[0]
    
    return apps_all


def get_apps_all_train_test(apps_all):

    apps_all_train = apps_all[~apps_all['TARGET'].isnull()]
    apps_all_test = apps_all[apps_all['TARGET'].isnull()]
    apps_all_test = apps_all_test.drop('TARGET', axis=1)
    
    return apps_all_train, apps_all_test


def train_apps_all(apps_all_train):
    
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']

    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=0)
    print(f'train shape: {train_x.shape}, valid shape: {valid_x.shape}')
    
    clf = LGBMClassifier(n_jobs=-1, n_estimators=1000, learning_rate=0.02, num_leaves=32, 
                         subsample=0.8, max_depth=12, silent=-1, verbose=-1)
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=100)
    
    return clf

In [5]:
apps, prev = get_apps_dataset()
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
print(f'apps_all_train.shape: {apps_all_train.shape}, apps_all_test.shape: {apps_all_test.shape}')

prev_agg shape: (338857, 42)
apps_all before merge shape: (356255, 136)
apps_all after merge with prev_agg shape: (356255, 177)
apps_all_train.shape: (307511, 177), apps_all_test.shape: (48744, 176)


In [6]:
ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
target_app = apps_all_train['TARGET']
train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=0)

In [7]:
bayesian_params = {
    'max_depth': (6, 16),
    'num_leaves': (24, 64),
    'min_child_samples': (10, 200),
    'min_child_weight': (1, 50),
    'subsample': (0.5, 1),
    'colsample_bytree': (0.5, 1),
    'max_bin': (10, 500),
    'reg_lambda': (0.001, 10),
    'reg_alpha': (0.01, 50),
}

In [8]:
def lgb_roc_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample,
                colsample_bytree, max_bin, reg_lambda, reg_alpha):
    
    params = {
        "num_iterations": 500, 
        "learning_rate": 0.02,
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample': max(min(subsample, 1), 0),
        'colsample_bytree': max(min(colsample_bytree, 1), 0),
        'max_bin': max(int(round(max_bin)), 10),
        'reg_lambda': max(reg_lambda, 0),
        'reg_alpha': max(reg_alpha, 0),
    }
    print(params)
    
    lgb_model = LGBMClassifier(**params)
    lgb_model.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
                  eval_metric='auc', verbose=100, early_stopping_rounds= 100)
    valid_proba = lgb_model.predict_proba(valid_x)[:, 1]
    roc_auc = roc_auc_score(valid_y, valid_proba)
    
    return roc_auc

In [9]:
lgbBO = BayesianOptimization(f=lgb_roc_eval, pbounds=bayesian_params, random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
{'num_iterations': 500, 'learning_rate': 0.02, 'max_depth': 12, 'num_leaves': 50, 'min_child_samples': 114, 'min_child_weight': 22, 'subsample': 0.9818313802505146, 'colsample_bytree': 0.7744067519636624, 'max_bin': 360, 'reg_lambda': 8.917838234820016, 'reg_alpha': 21.884984691022}
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.769542	training's binary_logloss: 0.246863	valid_1's auc: 0.755399	valid_1's binary_logloss: 0.247029
[200]	training's auc: 0.786767	training's binary_logloss: 0.239327	valid_1's auc: 0.766757	valid_1's binary_logloss: 0.24212
[300]	training's auc: 0.798311	training's binary_logloss: 0.234892	valid_1's auc: 0.772621	valid_1's binary_logloss: 0.240097
[400]	tra

| [95m 6       [0m | [95m 0.7787  [0m | [95m 0.6405  [0m | [95m 435.0   [0m | [95m 13.5    [0m | [95m 169.3   [0m | [95m 26.92   [0m | [95m 57.69   [0m | [95m 5.768   [0m | [95m 9.196   [0m | [95m 0.613   [0m |
{'num_iterations': 500, 'learning_rate': 0.02, 'max_depth': 12, 'num_leaves': 53, 'min_child_samples': 199, 'min_child_weight': 14, 'subsample': 0.821334265834883, 'colsample_bytree': 0.7458994896937481, 'max_bin': 435, 'reg_lambda': 8.539189077900696, 'reg_alpha': 24.607737391152277}
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.770824	training's binary_logloss: 0.246817	valid_1's auc: 0.755844	valid_1's binary_logloss: 0.247075
[200]	training's auc: 0.787721	training's binary_logloss: 0.239152	valid_1's auc: 0.766709	valid_1's binary_logloss: 0.242149
[300]	training's auc: 0.799065	training's binary_logloss: 0.234728	valid_1's auc: 0.772531	valid_1's binary_logloss: 0.240108
[400]	training's auc: 0.808209	training's

| [0m 12      [0m | [0m 0.7776  [0m | [0m 0.9716  [0m | [0m 467.8   [0m | [0m 8.134   [0m | [0m 165.7   [0m | [0m 10.05   [0m | [0m 62.97   [0m | [0m 10.45   [0m | [0m 7.26    [0m | [0m 0.8869  [0m |
{'num_iterations': 500, 'learning_rate': 0.02, 'max_depth': 13, 'num_leaves': 44, 'min_child_samples': 116, 'min_child_weight': 27, 'subsample': 0.5242793368963636, 'colsample_bytree': 0.7515227016933802, 'max_bin': 358, 'reg_lambda': 9.179902256499673, 'reg_alpha': 28.27854101642585}
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.766494	training's binary_logloss: 0.247811	valid_1's auc: 0.753836	valid_1's binary_logloss: 0.247501
[200]	training's auc: 0.782986	training's binary_logloss: 0.240664	valid_1's auc: 0.765628	valid_1's binary_logloss: 0.242552
[300]	training's auc: 0.793386	training's binary_logloss: 0.236654	valid_1's auc: 0.771477	valid_1's binary_logloss: 0.240488
[400]	training's auc: 0.801442	training's binary_log

| [0m 18      [0m | [0m 0.7786  [0m | [0m 0.7667  [0m | [0m 365.6   [0m | [0m 8.933   [0m | [0m 114.8   [0m | [0m 5.31    [0m | [0m 60.16   [0m | [0m 14.37   [0m | [0m 4.147   [0m | [0m 0.6532  [0m |
{'num_iterations': 500, 'learning_rate': 0.02, 'max_depth': 16, 'num_leaves': 46, 'min_child_samples': 159, 'min_child_weight': 22, 'subsample': 0.9750060472743405, 'colsample_bytree': 0.9064892234635891, 'max_bin': 435, 'reg_lambda': 8.49184767806519, 'reg_alpha': 4.001032818745956}
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.7723	training's binary_logloss: 0.245583	valid_1's auc: 0.756313	valid_1's binary_logloss: 0.246345
[200]	training's auc: 0.792502	training's binary_logloss: 0.237203	valid_1's auc: 0.768761	valid_1's binary_logloss: 0.241284
[300]	training's auc: 0.80668	training's binary_logloss: 0.231904	valid_1's auc: 0.774688	valid_1's binary_logloss: 0.23928
[400]	training's auc: 0.818002	training's binary_logloss:

| [0m 24      [0m | [0m 0.7785  [0m | [0m 0.6652  [0m | [0m 413.1   [0m | [0m 13.74   [0m | [0m 158.4   [0m | [0m 31.35   [0m | [0m 62.35   [0m | [0m 10.76   [0m | [0m 9.811   [0m | [0m 0.5561  [0m |
{'num_iterations': 500, 'learning_rate': 0.02, 'max_depth': 10, 'num_leaves': 63, 'min_child_samples': 105, 'min_child_weight': 16, 'subsample': 0.5109666846187021, 'colsample_bytree': 0.6817288558916161, 'max_bin': 364, 'reg_lambda': 9.226656044273998, 'reg_alpha': 0.019211789902964148}
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.781327	training's binary_logloss: 0.243263	valid_1's auc: 0.759397	valid_1's binary_logloss: 0.245662
[200]	training's auc: 0.803272	training's binary_logloss: 0.233688	valid_1's auc: 0.770101	valid_1's binary_logloss: 0.240876
[300]	training's auc: 0.819375	training's binary_logloss: 0.22738	valid_1's auc: 0.77516	valid_1's binary_logloss: 0.239119
[400]	training's auc: 0.832313	training's binary_lo

In [10]:
lgbBO.res

[{'target': 0.7779410185743948,
  'params': {'colsample_bytree': 0.7744067519636624,
   'max_bin': 360.44278952248555,
   'max_depth': 12.027633760716439,
   'min_child_samples': 113.52780476941041,
   'min_child_weight': 21.75908516760633,
   'num_leaves': 49.835764522666246,
   'reg_alpha': 21.884984691022,
   'reg_lambda': 8.917838234820016,
   'subsample': 0.9818313802505146}},
 {'target': 0.7770706588920003,
  'params': {'colsample_bytree': 0.6917207594128889,
   'max_bin': 397.94526866050563,
   'max_depth': 11.288949197529044,
   'min_child_samples': 117.92846660784714,
   'min_child_weight': 46.35423527634039,
   'num_leaves': 26.841442327915477,
   'reg_alpha': 4.36559369208002,
   'reg_lambda': 0.20316375600581688,
   'subsample': 0.916309922773969}},
 {'target': 0.7784572757986459,
  'params': {'colsample_bytree': 0.8890783754749252,
   'max_bin': 436.30595264094137,
   'max_depth': 15.78618342232764,
   'min_child_samples': 161.8401272011775,
   'min_child_weight': 23.61248

In [11]:
target_list = []
for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
print(f'maximum target index: {np.argmax(np.array(target_list))}')

[0.7779410185743948, 0.7770706588920003, 0.7784572757986459, 0.7765905850952646, 0.776161186066826, 0.7786794720787867, 0.7777650992807662, 0.777492578001182, 0.7782009176907888, 0.7771538493281773, 0.7781304537066802, 0.7775902517559199, 0.7768626009748988, 0.7783587998217655, 0.7778827836646265, 0.7775666102693805, 0.7784106586699453, 0.7786255065952609, 0.7785845435282205, 0.7772490650742707, 0.7786681286576917, 0.7782786706652841, 0.7790445617713937, 0.7784509751459386, 0.7784889459243873, 0.778369572060695, 0.7786230598177262, 0.7771435327514249, 0.7782767934654706, 0.7748394994159502]
maximum target index: 22


In [12]:
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.7790445617713937, 'params': {'colsample_bytree': 0.6013787284477659, 'max_bin': 348.7945541058695, 'max_depth': 12.095550661878493, 'min_child_samples': 107.76369211353001, 'min_child_weight': 8.843095617583431, 'num_leaves': 61.56115763939606, 'reg_alpha': 11.964113352102862, 'reg_lambda': 5.513581899056866, 'subsample': 0.773731718803031}}


In [13]:
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']
    
    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=0)
    print(f'train_shape: {train_x.shape}, valid_shape: {valid_x.shape}')
    
    clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000, 
        learning_rate=0.02, 
        max_depth=round(max_dict['params']['max_depth']),
        num_leaves=round(max_dict['params']['num_leaves']),
        colsample_bytree=round(max_dict['params']['colsample_bytree'], 3),
        subsample=round(max_dict['params']['subsample'], 3),
        max_bin=round(max_dict['params']['max_bin']),
        reg_alpha=round(max_dict['params']['reg_alpha'], 3),
        reg_lambda=round(max_dict['params']['reg_lambda'], 3),
        min_child_weight=round(max_dict['params']['min_child_weight']),
        min_child_samples=round(max_dict['params']['min_child_samples']),
        silent=-1,
        verbose=-1,        
    )
    
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=100)
    
    return clf

In [14]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)

prev_agg shape: (338857, 42)
apps_all before merge shape: (356255, 136)
apps_all after merge with prev_agg shape: (356255, 177)
train_shape: (215257, 175), valid_shape: (92254, 175)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.776689	training's binary_logloss: 0.244952	valid_1's auc: 0.758041	valid_1's binary_logloss: 0.246369
[200]	training's auc: 0.795598	training's binary_logloss: 0.236388	valid_1's auc: 0.768785	valid_1's binary_logloss: 0.241426
[300]	training's auc: 0.80939	training's binary_logloss: 0.230996	valid_1's auc: 0.774216	valid_1's binary_logloss: 0.239459
[400]	training's auc: 0.821202	training's binary_logloss: 0.226578	valid_1's auc: 0.777129	valid_1's binary_logloss: 0.238476
[500]	training's auc: 0.831667	training's binary_logloss: 0.222605	valid_1's auc: 0.779044	valid_1's binary_logloss: 0.237868
[600]	training's auc: 0.84113	training's binary_logloss: 0.219	valid_1's auc: 0.780108	valid_1's binary_logloss: 0.237505
[700]

In [15]:
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('../result/hyper_parameter_tuning_result_01.csv', index=False)

In [16]:
bayesian_params = {
    'max_depth': (6, 16), 
    'num_leaves': (24, 64), 
    'min_data_in_leaf': (10, 200), # min_child_samples
    'min_child_weight': (1, 50),
    'bagging_fraction': (0.5, 1.0), # subsample
    'feature_fraction': (0.5, 1.0), # colsample_bytree
    'max_bin': (10, 500),
    'lambda_l2': (0.001, 10), # reg_lambda
    'lambda_l1': (0.01, 50), # reg_alpha
}

In [17]:
import lightgbm as lgb

train_data = lgb.Dataset(data=ftr_app, label=target_app, free_raw_data=False)

def lgb_roc_eval_cv(max_depth, num_leaves, min_data_in_leaf, min_child_weight, 
                    bagging_fraction, feature_fraction, max_bin, lambda_l2, lambda_l1):
    
    params = {
        'num_iterations': 500, 
        'learning_rate': 0.02,
        'early_stopping_rounds': 100,
        'metric': 'auc',
        'max_depth': int(round(max_depth)),
        'num_leaves': int(round(num_leaves)), 
        'min_data_in_leaf': int(round(min_data_in_leaf)),
        'min_child_weight': int(round(min_child_weight)),
        'bagging_fraction': max(min(bagging_fraction, 1), 0), 
        'feature_fraction': max(min(feature_fraction, 1), 0),
        'max_bin':  max(int(round(max_bin)), 10),
        'lambda_l2': max(lambda_l2, 0),
        'lambda_l1': max(lambda_l1, 0),
    }
    
    cv_result = lgb.cv(params, train_data, nfold=3, seed=0, verbose_eval=100, early_stopping_rounds=50, metrics=['auc'])
    
    return max(cv_result['auc-mean'])

In [18]:
lgbBO = BayesianOptimization(f=lgb_roc_eval_cv, pbounds=bayesian_params, random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | baggin... | featur... | lambda_l1 | lambda_l2 |  max_bin  | max_depth | min_ch... | min_da... | num_le... |
-------------------------------------------------------------------------------------------------------------------------------------
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20434
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20434
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 20434
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 165
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_

[200]	cv_agg's auc: 0.762218 + 0.00223035
[300]	cv_agg's auc: 0.766924 + 0.00210518


[400]	cv_agg's auc: 0.769147 + 0.00194789


[500]	cv_agg's auc: 0.770606 + 0.00173552
| [0m 2       [0m | [0m 0.7706  [0m | [0m 0.6917  [0m | [0m 0.8959  [0m | [0m 26.45   [0m | [0m 5.681   [0m | [0m 463.5   [0m | [0m 6.71    [0m | [0m 5.269   [0m | [0m 13.84   [0m | [0m 57.3    [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21909
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 167
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21909
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 167
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 21909
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 167
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_agg'

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 17664
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17664
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 17664
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 165
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_agg's auc: 0.756272 + 0.00200983
[200]	cv_agg's auc: 0.76429 + 0.00175676
[300]	cv_agg's auc: 0.76847 + 0.00185334
[400]	cv_agg's auc: 0.770

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40936
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 167
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 40936
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 167
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_agg's auc: 0.751269 + 0.0026493
[200]	cv_agg's auc: 0.762964 + 0.00253685
[300]	cv_agg's auc: 0.76803 + 0.00219627
[400]	cv_agg's auc: 0.770619 + 0.0019399
[500]	cv_agg's auc: 0.772134 + 0.00183119
| [0m 11      [0m | [0m 0.7721  [0m | [0m 0.7641  [0m | [0m 0.9781  [0m | [0m 7.537   [0m | [0m 6.76    [0m | [0m 470.9   [0m | [0m 7.874   [0m | [0m 46.02   [0m | [0m 153.5   [0m | [0m 40.77   [0m |
You can set `force_col_wise=true` to

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34464
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 167
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34464
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 167
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 34464
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 167
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729




[100]	cv_agg's auc: 0.753583 + 0.00224717






[200]	cv_agg's auc: 0.762901 + 0.00255202






[300]	cv_agg's auc: 0.76713 + 0.00227676




[400]	cv_agg's auc: 0.769014 + 0.00227629






[500]	cv_agg's auc: 0.770164 + 0.00236604
| [0m 16      [0m | [0m 0.7702  [0m | [0m 0.5     [0m | [0m 1.0     [0m | [0m 0.01    [0m | [0m 0.001   [0m | [0m 388.1   [0m | [0m 6.0     [0m | [0m 50.0    [0m | [0m 133.8   [0m | [0m 64.0    [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27043
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27043
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 27043
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 165
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_agg'

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 28778
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 165
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729




[100]	cv_agg's auc: 0.74849 + 0.0024407






[200]	cv_agg's auc: 0.758594 + 0.0024888






[300]	cv_agg's auc: 0.763884 + 0.00238771




[400]	cv_agg's auc: 0.766497 + 0.00234919






[500]	cv_agg's auc: 0.768146 + 0.00217444
| [0m 20      [0m | [0m 0.7681  [0m | [0m 0.5     [0m | [0m 0.8916  [0m | [0m 50.0    [0m | [0m 2.811   [0m | [0m 318.4   [0m | [0m 6.0     [0m | [0m 50.0    [0m | [0m 188.9   [0m | [0m 64.0    [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23840
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23840
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 23840
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 165
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_agg'

[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[100]	cv_agg's auc: 0.752268 + 0.00233717
[200]	cv_agg's auc: 0.762613 + 0.00206768
[300]	cv_agg's auc: 0.767714 + 0.00180925
[400]	cv_agg's auc: 0.770441 + 0.0017251
[500]	cv_agg's auc: 0.772098 + 0.00170697
| [0m 23      [0m | [0m 0.7721  [0m | [0m 0.8387  [0m | [0m 0.6137  [0m | [0m 2.03    [0m | [0m 8.781   [0m | [0m 495.0   [0m | [0m 7.616   [0m | [0m 40.23   [0m | [0m 199.8   [0m | [0m 34.46   [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42074
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 166
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42074
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 166
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 

[400]	cv_agg's auc: 0.769635 + 0.00187473
[500]	cv_agg's auc: 0.771299 + 0.0018438
| [0m 27      [0m | [0m 0.7713  [0m | [0m 0.9425  [0m | [0m 0.6197  [0m | [0m 2.495   [0m | [0m 1.449   [0m | [0m 480.6   [0m | [0m 8.087   [0m | [0m 3.911   [0m | [0m 156.2   [0m | [0m 28.23   [0m |
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42161
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42161
[LightGBM] [Info] Number of data points in the train set: 205007, number of used features: 165
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 42161
[LightGBM] [Info] Number of data points in the train set: 205008, number of used features: 165
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start training from score 0.080729
[LightGBM] [Info] Start t

In [19]:
max_dict = lgbBO.res[np.argmax(np.array(target_list))]
print(max_dict)

{'target': 0.7720982152503876, 'params': {'bagging_fraction': 0.8386515465125856, 'feature_fraction': 0.6137100057774653, 'lambda_l1': 2.029918642228829, 'lambda_l2': 8.780977921949715, 'max_bin': 495.033560028543, 'max_depth': 7.61623668276145, 'min_child_weight': 40.232413573410355, 'min_data_in_leaf': 199.82554538351746, 'num_leaves': 34.460922099466856}}


In [20]:
def train_apps_all(apps_all_train):
    ftr_app = apps_all_train.drop(['SK_ID_CURR', 'TARGET'], axis=1)
    target_app = apps_all_train['TARGET']
    
    train_x, valid_x, train_y, valid_y = train_test_split(ftr_app, target_app, test_size=0.3, random_state=0)
    print(f'train_shape: {train_x.shape}, valid_shape: {valid_x.shape}')
    
    clf = LGBMClassifier(
        n_jobs=-1,
        n_estimators=1000, 
        learning_rate=0.02, 
        max_depth=round(max_dict['params']['max_depth']),
        num_leaves=round(max_dict['params']['num_leaves']),
        colsample_bytree=round(max_dict['params']['feature_fraction'], 3),
        subsample=round(max_dict['params']['bagging_fraction'], 3),
        max_bin=round(max_dict['params']['max_bin']),
        reg_alpha=round(max_dict['params']['lambda_l1'], 3),
        reg_lambda=round(max_dict['params']['lambda_l2'], 3),
        min_child_weight=round(max_dict['params']['min_child_weight']),
        min_child_samples=round(max_dict['params']['min_data_in_leaf']),
        silent=-1,
        verbose=-1,        
    )
    
    clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], 
            eval_metric='auc', verbose=100, early_stopping_rounds=100)
    
    return clf

In [21]:
apps_all = get_apps_all_with_prev_agg(apps, prev)
apps_all = get_apps_all_encoded(apps_all)
apps_all_train, apps_all_test = get_apps_all_train_test(apps_all)
clf = train_apps_all(apps_all_train)

prev_agg shape: (338857, 42)
apps_all before merge shape: (356255, 136)
apps_all after merge with prev_agg shape: (356255, 177)
train_shape: (215257, 175), valid_shape: (92254, 175)
Training until validation scores don't improve for 100 rounds
[100]	training's auc: 0.767574	training's binary_logloss: 0.247444	valid_1's auc: 0.755815	valid_1's binary_logloss: 0.246967
[200]	training's auc: 0.784581	training's binary_logloss: 0.239994	valid_1's auc: 0.766951	valid_1's binary_logloss: 0.241976
[300]	training's auc: 0.795286	training's binary_logloss: 0.235804	valid_1's auc: 0.77226	valid_1's binary_logloss: 0.24004
[400]	training's auc: 0.803836	training's binary_logloss: 0.232632	valid_1's auc: 0.775254	valid_1's binary_logloss: 0.239031
[500]	training's auc: 0.811465	training's binary_logloss: 0.2299	valid_1's auc: 0.776926	valid_1's binary_logloss: 0.238474
[600]	training's auc: 0.817912	training's binary_logloss: 0.227553	valid_1's auc: 0.777989	valid_1's binary_logloss: 0.238136
[700

In [22]:
preds = clf.predict_proba(apps_all_test.drop('SK_ID_CURR', axis=1))[:, 1]
apps_all_test['TARGET'] = preds
apps_all_test[['SK_ID_CURR', 'TARGET']].to_csv('../result/hyper_parameter_tuning_result_02.csv', index=False)