In [1]:
# import modules
import warnings
warnings.filterwarnings('ignore')


import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import log_loss

from math import floor
from scipy.stats import norm

sns.set()
sns.set_color_codes()
sns.set_theme(style="ticks", palette="pastel")
print('<Check the versions of modules>')
print("- Seaborn version : ", sns.__version__)


# upload data
train = pd.read_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/train.csv',index_col=0)
test = pd.read_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/test.csv',index_col=0)
submit = pd.read_csv('/Users/gangtaro/competition_data/DACON/14thMonthlyDacon/open/sample_submission.csv')

<Check the versions of modules>
- Seaborn version :  0.11.1


In [2]:
# outliers
train.drop((train[train['child_num']>=6]).index, inplace = True)

In [3]:
for df in [train, test]:
    # no information
    df.drop('FLAG_MOBIL',axis = 1, inplace = True)
    
    
    df.family_type[(df.family_type == 'Single / not married')&(df.family_size - df.child_num == 2)] = 'Married'
    df.family_type[df.family_type == 'Single / not married'] = 'Absolutely Single'

    whether_married_new = df.family_type.copy()
    whether_married_new.replace({'Married':2, 'Civil marriage':2, 'Separated':1, 'Absolutely Single':1, 'Widow':1}, inplace = True)
    df.family_size = whether_married_new + df.child_num

    df.occyp_type.fillna('Missing occyp', inplace=True)
    df.loc[(df.DAYS_EMPLOYED == 0)&(df.occyp_type == 'Missing occyp'), 'occyp_type'] = 'inoccyp'
    df.loc[(df.DAYS_EMPLOYED != 0)&(df.occyp_type == 'Missing occyp'), 'occyp_type'] = 'blank'
    df.occyp_type[df.income_type == 'Pensioner'] = 'inoccyp'
    df.DAYS_EMPLOYED[df.income_type == 'Pensioner'] = 0

    df['ID'] = \
    df['gender'].astype('str') + \
    df['car'].astype('str') + \
    df['reality'].astype('str') + '_' + \
    df['child_num'].astype('str') + '_' + \
    df['income_total'].astype('str') + '_' + \
    df['income_type'].astype('str') + '_' + \
    df['family_type'].astype('str') + '_' + \
    df['house_type'].astype('str') + '_' + \
    df['phone'].astype('str') + '_' + \
    df['email'].astype('str') + '_' + \
    df['family_size'].astype('str') + '_' + \
    df['DAYS_BIRTH'].astype('str') + '_' + \
    df['DAYS_EMPLOYED'].astype('str') + '_' + \
    df['occyp_type'].astype('str') 

    # 로그 변환(income_total)
    df['income_total_log'] = np.log(df['income_total']+1)

    # 음수로 표현된 수치형 변수 양수화
    df['DAYS_BIRTH']    = -df['DAYS_BIRTH']
    df['DAYS_EMPLOYED'] = -df['DAYS_EMPLOYED']
    df['begin_month']   = -df['begin_month']

    # 동산/부동산 보유 정보
    df['group_property'] = df['car'].astype(str)   + df['reality'].astype(str)

    # 연락처 종류에 관한 정보
    df['group_contacts'] = df['phone'].astype(str) + df['work_phone'].astype(str) + df['email'].astype(str)

    # 어떤 성별의, 어떤 가정을 이루고 있는지에 대한 정보
    df['group_whoare_u'] = df['gender'].astype(str)+ df['family_type'].astype(str) + (df['child_num'] > 0).replace({True : 'hav child', False : 'dont hav child'}) 

    # 소득원천, 직업종류에 대한 결합 정보                                                                                               
    df['group_your_job'] = df['income_type'] +'_'+ df['occyp_type']

    # 현재 하고 있는 일을 시작하게 된 나이(날짜) = 일을 하지 않았던 기간 
    df['DAYS_nowork'] = df['DAYS_BIRTH'] - df['DAYS_EMPLOYED']

    # 소득과 관련된 변수와 조합
    df['income_per_child'] = df['income_total']/ df['child_num']  
    df['income_per_family']= df['income_total']/ df['family_size']
    df['income_per_age']   = df['income_total']/ df['DAYS_BIRTH']

    # 날짜표기 변환(begin_month) : 4로 나누어 계절로 분할
    df['begin_season'] = df['begin_month'].apply(lambda x: floor(x/4))

    # 날짜표기 변환(DAYS_EMPLOYED) : 개월로 binning
    df['mm_EMPLOYED'] = df['DAYS_EMPLOYED'].apply(lambda x: floor(x/30.4375))

    # 날쨔 표기 변환(DAYS_EMPLOYED): 연차로 binning
    df['yy_EMPLOYED'] = df['DAYS_EMPLOYED'].apply(lambda x: floor(x/365.25))     

In [4]:
# 카드사용자의 능력에 대한 정보
personal_info = train.drop(['credit', 'begin_month'], axis = 1).drop_duplicates(subset='ID', keep='first', inplace=False, ignore_index=True)

F, F_birth_labels = pd.qcut(personal_info.DAYS_BIRTH[personal_info.gender == 'F'], q = 20, retbins=True)
M, M_birth_labels = pd.qcut(personal_info.DAYS_BIRTH[personal_info.gender == 'M'], q = 20, retbins=True)
F_birth_labels.put(0, F_birth_labels[0]-.001)
M_birth_labels.put(0, M_birth_labels[0]-.001)

personal_info['age_F_bins'] = F
personal_info['n_F_age'] = personal_info.groupby('age_F_bins').gender.count()
personal_info['age_M_bins'] = M
personal_info['n_M_age'] = personal_info.groupby('age_M_bins').gender.count()

personal_info['income_mean_F_age']  = personal_info.groupby([pd.cut(personal_info.DAYS_BIRTH, bins = F_birth_labels), 'gender']).income_total.transform(np.mean)
personal_info['income_mean_M_age']  = personal_info.groupby([pd.cut(personal_info.DAYS_BIRTH, bins = M_birth_labels), 'gender']).income_total.transform(np.mean)
personal_info['income_std_F_age']   = personal_info.groupby([pd.cut(personal_info.DAYS_BIRTH, bins = F_birth_labels), 'gender']).income_total.transform(np.std)
personal_info['income_std_M_age']   = personal_info.groupby([pd.cut(personal_info.DAYS_BIRTH, bins = M_birth_labels), 'gender']).income_total.transform(np.std)

F_age_info = personal_info.groupby('age_F_bins')[['income_mean_F_age', 'income_std_F_age', 'n_F_age']].apply(np.mean)
M_age_info = personal_info.groupby('age_M_bins')[['income_mean_M_age', 'income_std_M_age', 'n_M_age']].apply(np.mean)

In [5]:
pre_ = test.DAYS_BIRTH.iloc[6031]
test.DAYS_BIRTH.iloc[6031] = 7757

for df in [train, test]:
    df['age_F_bins'] = pd.cut(df.DAYS_BIRTH, bins = F_birth_labels)
    df['age_M_bins'] = pd.cut(df.DAYS_BIRTH, bins = M_birth_labels)

#예외 발생(test)
test.DAYS_BIRTH.iloc[6031] = pre_

train = pd.merge(train, F_age_info, how = 'left', on = 'age_F_bins')
train = pd.merge(train, M_age_info, how = 'left', on = 'age_M_bins')
test = pd.merge(test, F_age_info, how = 'left', on = 'age_F_bins')
test = pd.merge(test, M_age_info, how = 'left', on = 'age_M_bins')

for df in [train, test]:
    df['ability_income_per_age_by_gen'] = 0
    df['ability_income_per_age_by_gen'][df.gender == 'F'] = ((df['income_total'] - df['income_mean_F_age'])/(df['income_std_F_age']/df['n_F_age'].apply(np.sqrt)))[df.gender == 'F']
    df['ability_income_per_age_by_gen'][df.gender == 'M'] = ((df['income_total'] - df['income_mean_M_age'])/(df['income_std_M_age']/df['n_M_age'].apply(np.sqrt)))[df.gender == 'M']    

In [6]:
# 여유성 자금 확보에 대한 정보
## 자녀양육비
### 자녀 1인 기준 기본 양육비
one_child_fee = (personal_info.income_total/personal_info.family_size)[(personal_info.DAYS_BIRTH > 33*365) & (personal_info.DAYS_BIRTH < 37*365) & (personal_info.child_num == 1)].mean()

### 교육 수준에 따른 소득의 평균을 프레임에 저장
df_weight_edu = personal_info.groupby('edu_type').income_total.apply(np.mean)
df_weight_edu.name = 'weight_edu'
df_weight_edu = df_weight_edu.reset_index()

d_age_mu  = 35.00059114405094 #계수1
d_age_std = 7.977814929279566 #계수2
train = pd.merge(train, df_weight_edu, how = 'left', on = 'edu_type')
train['weight_chd'] = train['child_num'].transform(lambda x: np.log(1+x)/np.log(2))
train['weight_age'] = train['DAYS_BIRTH'].transform(lambda x: norm(d_age_mu, scale = d_age_std).pdf((x/365.25))/norm(d_age_mu, scale = d_age_std).pdf(d_age_mu))
test  = pd.merge(test , df_weight_edu, how = 'left', on = 'edu_type') 
test['weight_chd'] = test['child_num'].transform(lambda x: np.log(1+x)/np.log(2))
test['weight_age'] = test['DAYS_BIRTH'].transform(lambda x: norm(d_age_mu, scale = d_age_std).pdf((x/365.25))/norm(d_age_mu, scale = d_age_std).pdf(d_age_mu))


### 가정한 공식에 따른 "자녀 양육비"
train['child_fee_total'] = one_child_fee*train['weight_edu']*train['weight_chd']*train['weight_age']
test['child_fee_total'] = one_child_fee*test['weight_edu']*test['weight_chd']*test['weight_age']


## 차량 유지비용
### 소득에 따른 자동차 유지비에 대한 가중치 함수
def car_weight(x, loc_par_car):
    _max = personal_info.income_total.max()
    _med = loc_par_car
    if x < _med : 
        return 1
    else:
        return 1+(x-_med)/(_max-_med)*5

### 가정한 공식에 따른 "차량 유지비용"
loc_par_car = 183585 #계수3
train['car_fees_total'] = train.income_total.median()*0.1*(train.car == 'Y').astype('int')*train.income_total.apply(lambda x: car_weight(x, loc_par_car))
test['car_fees_total'] = train.income_total.median()*0.1*(test.car == 'Y').astype('int')*test.income_total.apply(lambda x: car_weight(x, loc_par_car))

## 여유성 자금 확보 변수 생성
c_1 = 1.4633527457954487 
c_2 = 1.2772905851450718
train['save_income'] = train['income_total'] - c_1*train['child_fee_total'] - c_2*train['car_fees_total']
test['save_income'] = test['income_total'] - c_1*test['child_fee_total'] - c_2*test['car_fees_total']

In [7]:
drop_list = ['income_total',
             'age_F_bins', 'age_M_bins', 
             'income_mean_F_age', 'income_std_F_age', 'n_F_age',
             'income_mean_M_age', 'income_std_M_age', 'n_M_age', 
             'weight_edu', 'weight_chd', 'weight_age', 'child_fee_total', 'car_fees_total'
            ]

for df in [train, test]:
    df.drop(drop_list, axis = 1, inplace = True)    

In [23]:
# save the train/test data which is prepared to do modeling by pickle
with open('train.pickle', 'wb') as f:
    pickle.dump(train, f, pickle.HIGHEST_PROTOCOL)
with open('test.pickle', 'wb') as f:
    pickle.dump(test, f, pickle.HIGHEST_PROTOCOL)    

In [24]:
numerical_feats = train.dtypes[train.dtypes != "object"].index.tolist()
numerical_feats.remove('credit')
print("Number of Numerical features: ", len(numerical_feats))

categorical_feats = train.dtypes[train.dtypes == "object"].index.tolist()
print("Number of Categorical features: ", len(categorical_feats))

Number of Numerical features:  18
Number of Categorical features:  13


In [9]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 26451 entries, 0 to 26450
Data columns (total 32 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   gender                         26451 non-null  object 
 1   car                            26451 non-null  object 
 2   reality                        26451 non-null  object 
 3   child_num                      26451 non-null  int64  
 4   income_type                    26451 non-null  object 
 5   edu_type                       26451 non-null  object 
 6   family_type                    26451 non-null  object 
 7   house_type                     26451 non-null  object 
 8   DAYS_BIRTH                     26451 non-null  int64  
 9   DAYS_EMPLOYED                  26451 non-null  int64  
 10  work_phone                     26451 non-null  int64  
 11  phone                          26451 non-null  int64  
 12  email                          26451 non-null 

In [10]:
import optuna
from optuna.samplers import TPESampler

# preset option for model
n_class  =3
n_fold = 10
seed = 55

target = 'credit'
X = train.drop(target, axis=1)
y = train[target]
X_test = test

def objective(trial):
    skfold = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=seed)
    folds=[]
    for train_idx, valid_idx in skfold.split(X, y):
            folds.append((train_idx, valid_idx))
            
    cat_pred = np.zeros((X.shape[0], n_class))
    cat_pred_test = np.zeros((X_test.shape[0], n_class))
    cat_cols = categorical_feats

    param = {
        #"objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
        "learning_rate":     trial.suggest_float("learning_rate", 0.01, 0.05),
        "n_estimators":      trial.suggest_int("n_estimators", 2400, 4400, 200),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.2),
        "depth": trial.suggest_int("depth", 8, 16, 1),
        #"boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        #"bootstrap_type": trial.suggest_categorical(
        #    "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        #),
        "subsample" : trial.suggest_float("subsample", 0.6, 0.98), #only=> bootstrap_type = 'Bernoulli'
        "used_ram_limit": "14gb",
    }

    #if param["bootstrap_type"] == "Bayesian":
    #    param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    #elif param["bootstrap_type"] == "Bernoulli":
    #    param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        
    model_folds = []    
    for fold in range(n_fold):
        print(f'\n----------------- Fold {fold+1} -----------------\n')
        train_idx, valid_idx = folds[fold]
        X_train, X_valid, y_train, y_valid = X.iloc[train_idx], X.iloc[valid_idx], y[train_idx], y[valid_idx]
        train_data = Pool(data=X_train, label=y_train, cat_features=cat_cols)
        valid_data = Pool(data=X_valid, label=y_valid, cat_features=cat_cols)

        model_cat = CatBoostClassifier(**param, 
                                       boosting_type = 'Ordered', 
                                       bootstrap_type = 'Bernoulli')
        model_cat.fit(train_data, eval_set=valid_data, use_best_model=True, early_stopping_rounds=100, verbose=200)
        
        model_folds.append(model_cat)
        cat_pred[valid_idx] = model_cat.predict_proba(X_valid)
        cat_pred_test += model_cat.predict_proba(X_test) / n_fold
        print(f'CV Log Loss Score: {log_loss(y_valid, cat_pred[valid_idx]):.6f}')

    print(f'\tLog Loss: {log_loss(y, cat_pred):.6f}')
    logloss = log_loss(y, cat_pred)
    return logloss

In [11]:
study_make_model_params = optuna.create_study(direction="minimize")
study_make_model_params.optimize(objective)

[32m[I 2021-07-04 08:52:14,022][0m A new study created in memory with name: no-name-ea94be68-6c53-4077-b477-d173087a2ea4[0m



----------------- Fold 1 -----------------

0:	learn: 1.0873115	test: 1.0873242	best: 1.0873242 (0)	total: 67.1ms	remaining: 3m 47s
200:	learn: 0.7905962	test: 0.7799713	best: 0.7799713 (200)	total: 4.17s	remaining: 1m 6s
400:	learn: 0.7445592	test: 0.7209176	best: 0.7209176 (400)	total: 8.18s	remaining: 1m 1s
600:	learn: 0.7275407	test: 0.6980181	best: 0.6980126 (599)	total: 12.6s	remaining: 58.8s
800:	learn: 0.7163698	test: 0.6835692	best: 0.6835692 (800)	total: 17.9s	remaining: 58.1s
1000:	learn: 0.7108149	test: 0.6786037	best: 0.6786037 (1000)	total: 22.8s	remaining: 54.6s
1200:	learn: 0.7057896	test: 0.6759531	best: 0.6759372 (1188)	total: 28s	remaining: 51.3s
1400:	learn: 0.7021060	test: 0.6746338	best: 0.6746206 (1391)	total: 32.9s	remaining: 47s
1600:	learn: 0.6981612	test: 0.6736541	best: 0.6736494 (1594)	total: 38.1s	remaining: 42.8s
1800:	learn: 0.6943677	test: 0.6722667	best: 0.6722093 (1794)	total: 43.4s	remaining: 38.5s
2000:	learn: 0.6902657	test: 0.6717021	best: 0.6717

CV Log Loss Score: 0.665562

----------------- Fold 7 -----------------

0:	learn: 1.0873118	test: 1.0873053	best: 1.0873053 (0)	total: 24.4ms	remaining: 1m 22s
200:	learn: 0.7881430	test: 0.7705962	best: 0.7705895 (199)	total: 4.16s	remaining: 1m 6s
400:	learn: 0.7580360	test: 0.7333831	best: 0.7333831 (400)	total: 7.83s	remaining: 58.5s
600:	learn: 0.7300121	test: 0.6919765	best: 0.6919737 (597)	total: 12.4s	remaining: 57.8s
800:	learn: 0.7184780	test: 0.6754636	best: 0.6754636 (800)	total: 17.6s	remaining: 57s
1000:	learn: 0.7124391	test: 0.6698202	best: 0.6698202 (1000)	total: 22.8s	remaining: 54.5s
1200:	learn: 0.7083647	test: 0.6673999	best: 0.6673999 (1200)	total: 27.6s	remaining: 50.6s
1400:	learn: 0.7031974	test: 0.6651160	best: 0.6651160 (1400)	total: 33.3s	remaining: 47.5s
1600:	learn: 0.6998806	test: 0.6640305	best: 0.6640305 (1600)	total: 38.5s	remaining: 43.3s
1800:	learn: 0.6962444	test: 0.6633782	best: 0.6633740 (1796)	total: 44.2s	remaining: 39.2s
2000:	learn: 0.692723

[32m[I 2021-07-04 09:03:15,205][0m Trial 0 finished with value: 0.665683727124693 and parameters: {'learning_rate': 0.025639478386043843, 'n_estimators': 3400, 'colsample_bylevel': 0.034244272134555324, 'depth': 9, 'subsample': 0.7261702509869272}. Best is trial 0 with value: 0.665683727124693.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0804241	test: 1.0804447	best: 1.0804447 (0)	total: 24.1ms	remaining: 1m 31s
200:	learn: 0.7175159	test: 0.6770375	best: 0.6770375 (200)	total: 11.5s	remaining: 3m 26s
400:	learn: 0.7001486	test: 0.6702795	best: 0.6702243 (398)	total: 33.6s	remaining: 4m 44s
600:	learn: 0.6806754	test: 0.6680461	best: 0.6680461 (600)	total: 1m 3s	remaining: 5m 36s
800:	learn: 0.6628901	test: 0.6674175	best: 0.6671176 (719)	total: 1m 31s	remaining: 5m 41s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6671175745
bestIteration = 719

Shrink model to first 720 iterations.
CV Log Loss Score: 0.667118

----------------- Fold 2 -----------------

0:	learn: 1.0745851	test: 1.0744996	best: 1.0744996 (0)	total: 66.3ms	remaining: 4m 11s
200:	learn: 0.7181782	test: 0.6702222	best: 0.6702222 (200)	total: 12.6s	remaining: 3m 45s
400:	learn: 0.7020520	test: 0.6662426	best: 0.6662132 (397)	total: 34.5s	remaining: 4m 52s
600:	learn: 0.683646

[32m[I 2021-07-04 09:16:47,291][0m Trial 1 finished with value: 0.6633573503896221 and parameters: {'learning_rate': 0.04160502952238434, 'n_estimators': 3800, 'colsample_bylevel': 0.1832000117969863, 'depth': 9, 'subsample': 0.8101208520516356}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0786668	test: 1.0786894	best: 1.0786894 (0)	total: 16.5ms	remaining: 1m 12s
200:	learn: 0.7178850	test: 0.6776434	best: 0.6776434 (200)	total: 15s	remaining: 5m 14s
400:	learn: 0.6956108	test: 0.6714936	best: 0.6714858 (399)	total: 1m 24s	remaining: 14m 6s
600:	learn: 0.6721000	test: 0.6700615	best: 0.6700196 (592)	total: 2m 55s	remaining: 18m 31s
800:	learn: 0.6488936	test: 0.6693230	best: 0.6689319 (719)	total: 4m 23s	remaining: 19m 44s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6689318672
bestIteration = 719

Shrink model to first 720 iterations.
CV Log Loss Score: 0.668932

----------------- Fold 2 -----------------

0:	learn: 1.0786836	test: 1.0786247	best: 1.0786247 (0)	total: 32.8ms	remaining: 2m 24s
200:	learn: 0.7161059	test: 0.6692692	best: 0.6692685 (199)	total: 22s	remaining: 7m 40s
400:	learn: 0.6940473	test: 0.6659300	best: 0.6658030 (389)	total: 1m 33s	remaining: 15m 30s
600:	learn: 0.6682

[32m[I 2021-07-04 09:52:48,965][0m Trial 2 finished with value: 0.6634479979427051 and parameters: {'learning_rate': 0.045721884216426596, 'n_estimators': 4400, 'colsample_bylevel': 0.16548282906074485, 'depth': 11, 'subsample': 0.9679705760891386}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0809347	test: 1.0809547	best: 1.0809547 (0)	total: 16.9ms	remaining: 53.9s
200:	learn: 0.7196106	test: 0.6820394	best: 0.6820268 (199)	total: 6.68s	remaining: 1m 39s
400:	learn: 0.7046408	test: 0.6732726	best: 0.6732591 (398)	total: 15.7s	remaining: 1m 49s
600:	learn: 0.6896693	test: 0.6707955	best: 0.6707717 (595)	total: 26.2s	remaining: 1m 53s
800:	learn: 0.6758466	test: 0.6703492	best: 0.6703038 (783)	total: 37.1s	remaining: 1m 51s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6702078248
bestIteration = 845

Shrink model to first 846 iterations.
CV Log Loss Score: 0.670208

----------------- Fold 2 -----------------

0:	learn: 1.0809321	test: 1.0809249	best: 1.0809249 (0)	total: 44ms	remaining: 2m 20s
200:	learn: 0.7205988	test: 0.6760225	best: 0.6760225 (200)	total: 6.91s	remaining: 1m 43s
400:	learn: 0.7043187	test: 0.6673872	best: 0.6673449 (399)	total: 16.1s	remaining: 1m 52s
600:	learn: 0.6904516	te

[32m[I 2021-07-04 09:59:31,737][0m Trial 3 finished with value: 0.6642327528961203 and parameters: {'learning_rate': 0.04041215540542881, 'n_estimators': 3200, 'colsample_bylevel': 0.09331790915027285, 'depth': 8, 'subsample': 0.6992674668944645}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0804288	test: 1.0804494	best: 1.0804494 (0)	total: 19.9ms	remaining: 47.6s
200:	learn: 0.7191538	test: 0.6792795	best: 0.6792795 (200)	total: 7.67s	remaining: 1m 23s
400:	learn: 0.7042530	test: 0.6719840	best: 0.6719840 (400)	total: 18.4s	remaining: 1m 31s
600:	learn: 0.6887454	test: 0.6696113	best: 0.6695801 (599)	total: 32s	remaining: 1m 35s
800:	learn: 0.6726936	test: 0.6689673	best: 0.6689126 (788)	total: 45.4s	remaining: 1m 30s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6685061874
bestIteration = 895

Shrink model to first 896 iterations.
CV Log Loss Score: 0.668506

----------------- Fold 2 -----------------

0:	learn: 1.0750026	test: 1.0749830	best: 1.0749830 (0)	total: 75.8ms	remaining: 3m 1s
200:	learn: 0.7192126	test: 0.6744871	best: 0.6744871 (200)	total: 7.5s	remaining: 1m 22s
400:	learn: 0.7030139	test: 0.6678665	best: 0.6677612 (390)	total: 18.8s	remaining: 1m 33s
600:	learn: 0.6861556	test

[32m[I 2021-07-04 10:07:25,653][0m Trial 4 finished with value: 0.6641162987125543 and parameters: {'learning_rate': 0.04159397390768506, 'n_estimators': 2400, 'colsample_bylevel': 0.15221470001243506, 'depth': 8, 'subsample': 0.638305299754207}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0919215	test: 1.0919290	best: 1.0919290 (0)	total: 13.7ms	remaining: 54.8s
200:	learn: 0.7473485	test: 0.7239129	best: 0.7239129 (200)	total: 7.04s	remaining: 2m 13s
400:	learn: 0.7229058	test: 0.6848352	best: 0.6848352 (400)	total: 15s	remaining: 2m 14s
600:	learn: 0.7171854	test: 0.6776483	best: 0.6776483 (600)	total: 23.4s	remaining: 2m 12s
800:	learn: 0.7124352	test: 0.6741563	best: 0.6741563 (800)	total: 33.1s	remaining: 2m 12s
1000:	learn: 0.7058075	test: 0.6721714	best: 0.6721714 (1000)	total: 47.3s	remaining: 2m 21s
1200:	learn: 0.6995922	test: 0.6710880	best: 0.6710787 (1198)	total: 1m 1s	remaining: 2m 23s
1400:	learn: 0.6935572	test: 0.6704323	best: 0.6704002 (1385)	total: 1m 16s	remaining: 2m 21s
1600:	learn: 0.6873589	test: 0.6699095	best: 0.6699095 (1600)	total: 1m 30s	remaining: 2m 15s
1800:	learn: 0.6814722	test: 0.6696591	best: 0.6696591 (1800)	total: 1m 44s	remaining: 2m 7s
2000:	learn: 0.6757877	test: 0.6693610	

800:	learn: 0.7137528	test: 0.6591955	best: 0.6591955 (800)	total: 34.1s	remaining: 2m 16s
1000:	learn: 0.7069679	test: 0.6577953	best: 0.6577760 (999)	total: 49.2s	remaining: 2m 27s
1200:	learn: 0.7006289	test: 0.6568200	best: 0.6568200 (1200)	total: 1m 4s	remaining: 2m 29s
1400:	learn: 0.6947875	test: 0.6562951	best: 0.6562951 (1400)	total: 1m 19s	remaining: 2m 27s
1600:	learn: 0.6885780	test: 0.6561762	best: 0.6561730 (1597)	total: 1m 34s	remaining: 2m 21s
1800:	learn: 0.6827270	test: 0.6559161	best: 0.6558957 (1737)	total: 1m 49s	remaining: 2m 14s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6557818917
bestIteration = 1861

Shrink model to first 1862 iterations.
CV Log Loss Score: 0.655782

----------------- Fold 9 -----------------

0:	learn: 1.0897945	test: 1.0898354	best: 1.0898354 (0)	total: 78.8ms	remaining: 5m 15s
200:	learn: 0.7451169	test: 0.7123170	best: 0.7123170 (200)	total: 8.31s	remaining: 2m 37s
400:	learn: 0.7223755	test: 0.6750087	best: 0.675

[32m[I 2021-07-04 10:26:29,415][0m Trial 5 finished with value: 0.6637191031795945 and parameters: {'learning_rate': 0.015099238118660865, 'n_estimators': 4000, 'colsample_bylevel': 0.17099932252211464, 'depth': 8, 'subsample': 0.6978905144445353}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0792813	test: 1.0793032	best: 1.0793032 (0)	total: 13.9ms	remaining: 58.4s
200:	learn: 0.7416542	test: 0.7191406	best: 0.7191406 (200)	total: 6.23s	remaining: 2m 3s
400:	learn: 0.7113061	test: 0.6788474	best: 0.6788474 (400)	total: 18.7s	remaining: 2m 57s
600:	learn: 0.6972442	test: 0.6732530	best: 0.6732530 (600)	total: 35.3s	remaining: 3m 31s
800:	learn: 0.6851974	test: 0.6720429	best: 0.6719346 (799)	total: 52.9s	remaining: 3m 44s
1000:	learn: 0.6754452	test: 0.6717276	best: 0.6717276 (1000)	total: 1m 9s	remaining: 3m 41s
1200:	learn: 0.6648590	test: 0.6707263	best: 0.6707263 (1200)	total: 1m 25s	remaining: 3m 34s
1400:	learn: 0.6554935	test: 0.6706454	best: 0.6705648 (1349)	total: 1m 40s	remaining: 3m 21s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6703659886
bestIteration = 1422

Shrink model to first 1423 iterations.
CV Log Loss Score: 0.670366

----------------- Fold 2 -----------------

0:	learn: 

[32m[I 2021-07-04 10:40:56,237][0m Trial 6 finished with value: 0.6647922733524976 and parameters: {'learning_rate': 0.04428021681941441, 'n_estimators': 4200, 'colsample_bylevel': 0.04310146422975949, 'depth': 11, 'subsample': 0.712504383062696}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0898365	test: 1.0898464	best: 1.0898464 (0)	total: 17.7ms	remaining: 1m 7s
200:	learn: 0.7325552	test: 0.7013393	best: 0.7013393 (200)	total: 31s	remaining: 9m 15s
400:	learn: 0.7176363	test: 0.6795161	best: 0.6795161 (400)	total: 1m 5s	remaining: 9m 18s
600:	learn: 0.7111140	test: 0.6738640	best: 0.6738610 (598)	total: 1m 40s	remaining: 8m 54s
800:	learn: 0.6966128	test: 0.6713313	best: 0.6713313 (800)	total: 4m 21s	remaining: 16m 17s
1000:	learn: 0.6820036	test: 0.6703164	best: 0.6702780 (991)	total: 7m 12s	remaining: 20m 10s
1200:	learn: 0.6666523	test: 0.6692043	best: 0.6692043 (1200)	total: 10m	remaining: 21m 39s
1400:	learn: 0.6530528	test: 0.6690336	best: 0.6689624 (1348)	total: 12m 48s	remaining: 21m 56s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6689623665
bestIteration = 1348

Shrink model to first 1349 iterations.
CV Log Loss Score: 0.668962

----------------- Fold 2 -----------------

0:	lear

Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6686649229
bestIteration = 1199

Shrink model to first 1200 iterations.
CV Log Loss Score: 0.668665
	Log Loss: 0.663664


[32m[I 2021-07-04 12:32:17,967][0m Trial 7 finished with value: 0.6636643307805705 and parameters: {'learning_rate': 0.019852120244864435, 'n_estimators': 3800, 'colsample_bylevel': 0.14674631540241442, 'depth': 12, 'subsample': 0.6815462648363503}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0797650	test: 1.0797864	best: 1.0797864 (0)	total: 13.1ms	remaining: 44.6s
200:	learn: 0.7504812	test: 0.7277722	best: 0.7277720 (199)	total: 7.82s	remaining: 2m 4s
400:	learn: 0.7177524	test: 0.6859475	best: 0.6859475 (400)	total: 17.5s	remaining: 2m 11s
600:	learn: 0.7040042	test: 0.6744059	best: 0.6744059 (600)	total: 37.6s	remaining: 2m 55s
800:	learn: 0.6941491	test: 0.6714982	best: 0.6714368 (797)	total: 55.4s	remaining: 2m 59s
1000:	learn: 0.6851048	test: 0.6700684	best: 0.6700576 (995)	total: 1m 12s	remaining: 2m 54s
1200:	learn: 0.6769211	test: 0.6691591	best: 0.6691529 (1196)	total: 1m 28s	remaining: 2m 42s
1400:	learn: 0.6697970	test: 0.6682306	best: 0.6681919 (1334)	total: 1m 47s	remaining: 2m 32s
1600:	learn: 0.6620182	test: 0.6682246	best: 0.6680676 (1531)	total: 2m 3s	remaining: 2m 19s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6677101648
bestIteration = 1688

Shrink model to first 1689 it

200:	learn: 0.7545061	test: 0.7259818	best: 0.7259683 (199)	total: 5.64s	remaining: 1m 29s
400:	learn: 0.7189284	test: 0.6842476	best: 0.6842468 (397)	total: 16.4s	remaining: 2m 2s
600:	learn: 0.7055513	test: 0.6747390	best: 0.6747221 (597)	total: 35.7s	remaining: 2m 46s
800:	learn: 0.6960033	test: 0.6718461	best: 0.6718461 (800)	total: 56.1s	remaining: 3m 1s
1000:	learn: 0.6881870	test: 0.6712334	best: 0.6712083 (988)	total: 1m 12s	remaining: 2m 52s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6711424548
bestIteration = 1023

Shrink model to first 1024 iterations.
CV Log Loss Score: 0.671142
	Log Loss: 0.665043


[32m[I 2021-07-04 12:49:31,735][0m Trial 8 finished with value: 0.6650432143547209 and parameters: {'learning_rate': 0.04314682849543618, 'n_estimators': 3400, 'colsample_bylevel': 0.03553347148935506, 'depth': 12, 'subsample': 0.8933034790809897}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0854478	test: 1.0854627	best: 1.0854627 (0)	total: 12.3ms	remaining: 29.5s
200:	learn: 0.7895301	test: 0.7752808	best: 0.7752808 (200)	total: 3.82s	remaining: 41.8s
400:	learn: 0.7447429	test: 0.7193874	best: 0.7193873 (399)	total: 7.91s	remaining: 39.5s
600:	learn: 0.7249707	test: 0.6927947	best: 0.6927947 (600)	total: 13.3s	remaining: 40s
800:	learn: 0.7144855	test: 0.6801318	best: 0.6801318 (800)	total: 19.7s	remaining: 39.4s
1000:	learn: 0.7093073	test: 0.6764019	best: 0.6764019 (1000)	total: 25.8s	remaining: 36s
1200:	learn: 0.7041677	test: 0.6740798	best: 0.6740728 (1194)	total: 32.6s	remaining: 32.6s
1400:	learn: 0.7005676	test: 0.6723227	best: 0.6723227 (1400)	total: 38s	remaining: 27.1s
1600:	learn: 0.6971412	test: 0.6716690	best: 0.6716690 (1600)	total: 43.7s	remaining: 21.8s
1800:	learn: 0.6938515	test: 0.6707569	best: 0.6707569 (1800)	total: 49.1s	remaining: 16.3s
2000:	learn: 0.6900522	test: 0.6704785	best: 0.6704323

1000:	learn: 0.7096943	test: 0.6680658	best: 0.6680658 (1000)	total: 26.5s	remaining: 37s
1200:	learn: 0.7052664	test: 0.6654663	best: 0.6654663 (1200)	total: 32.7s	remaining: 32.6s
1400:	learn: 0.7014631	test: 0.6639854	best: 0.6639801 (1389)	total: 38.8s	remaining: 27.6s
1600:	learn: 0.6982161	test: 0.6630878	best: 0.6630876 (1598)	total: 44s	remaining: 21.9s
1800:	learn: 0.6947852	test: 0.6624865	best: 0.6624860 (1793)	total: 49.8s	remaining: 16.6s
2000:	learn: 0.6913401	test: 0.6619232	best: 0.6619194 (1999)	total: 55.1s	remaining: 11s
2200:	learn: 0.6887217	test: 0.6616893	best: 0.6616893 (2200)	total: 1m 1s	remaining: 5.53s
2399:	learn: 0.6858473	test: 0.6613811	best: 0.6613626 (2351)	total: 1m 6s	remaining: 0us

bestTest = 0.6613626446
bestIteration = 2351

Shrink model to first 2352 iterations.
CV Log Loss Score: 0.661363

----------------- Fold 8 -----------------

0:	learn: 1.0854581	test: 1.0854399	best: 1.0854399 (0)	total: 24ms	remaining: 57.6s
200:	learn: 0.7804860	test: 

[32m[I 2021-07-04 13:00:15,634][0m Trial 9 finished with value: 0.6656269397963898 and parameters: {'learning_rate': 0.029933382828200558, 'n_estimators': 2400, 'colsample_bylevel': 0.030470406163246824, 'depth': 10, 'subsample': 0.802013782407934}. Best is trial 1 with value: 0.6633573503896221.[0m



----------------- Fold 1 -----------------

0:	learn: 1.0829408	test: 1.0829585	best: 1.0829585 (0)	total: 17.8ms	remaining: 53.5s
200:	learn: 0.7181254	test: 0.6779857	best: 0.6779857 (200)	total: 2m 49s	remaining: 39m 37s
400:	learn: 0.6999033	test: 0.6702978	best: 0.6702978 (400)	total: 15m 6s	remaining: 1h 38m 11s
600:	learn: 0.6668854	test: 0.6689595	best: 0.6689579 (589)	total: 49m 49s	remaining: 3h 19m 13s
800:	learn: 0.6356423	test: 0.6675777	best: 0.6674239 (798)	total: 1h 24m 15s	remaining: 3h 51m 36s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6674239424
bestIteration = 798

Shrink model to first 799 iterations.
CV Log Loss Score: 0.667424

----------------- Fold 2 -----------------

0:	learn: 1.0781593	test: 1.0779584	best: 1.0779584 (0)	total: 28.7ms	remaining: 1m 26s
200:	learn: 0.7186528	test: 0.6725967	best: 0.6725967 (200)	total: 1m 45s	remaining: 24m 43s
400:	learn: 0.6987929	test: 0.6663944	best: 0.6663944 (400)	total: 14m 31s	remaining: 1h 

KeyboardInterrupt: 

In [12]:
study_make_model_params.best_params

{'learning_rate': 0.04160502952238434,
 'n_estimators': 3800,
 'colsample_bylevel': 0.1832000117969863,
 'depth': 9,
 'subsample': 0.8101208520516356}

In [13]:
params_df = study_make_model_params.trials_dataframe()
params_df.drop(10, inplace = True)
params_df.drop(['datetime_start', 'datetime_complete', 'duration', 'state'], axis = 1, inplace=True)

In [17]:
# save the outcome of tuning
with open('params_df.pickle', 'wb') as f:
    pickle.dump(params_df, f, pickle.HIGHEST_PROTOCOL)

params_df.sort_values('value').head(8)

Unnamed: 0,number,value,params_colsample_bylevel,params_depth,params_learning_rate,params_n_estimators,params_subsample
1,1,0.663357,0.1832,9,0.041605,3800,0.810121
2,2,0.663448,0.165483,11,0.045722,4400,0.967971
7,7,0.663664,0.146746,12,0.019852,3800,0.681546
5,5,0.663719,0.170999,8,0.015099,4000,0.697891
4,4,0.664116,0.152215,8,0.041594,2400,0.638305
3,3,0.664233,0.093318,8,0.040412,3200,0.699267
6,6,0.664792,0.043101,11,0.04428,4200,0.712504
8,8,0.665043,0.035533,12,0.043147,3400,0.893303


In [22]:
# best parameters as result tunning
params_df.sort_values('value').iloc[0,2:]

params_colsample_bylevel       0.183200
params_depth                   9.000000
params_learning_rate           0.041605
params_n_estimators         3800.000000
params_subsample               0.810121
Name: 1, dtype: float64