In [17]:
import os, sys, warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm_notebook as tqdm

import pandas as pd
import numpy as np
from datetime import datetime as dt
import re

import lightgbm as lgb
from catboost import Pool, CatBoostRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from pygam import LinearGAM, s, f

import optuna
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import StratifiedShuffleSplit as SSsplit
from sklearn.metrics import mean_squared_error as mse
def rmse(pred, true) : return np.sqrt(mse(true, pred))

import matplotlib.pyplot as plt
import seaborn as sns

## data Load

In [2]:
read_path = '../dataset/fin/'
write_path = '../dataset/result/'
train = pd.read_csv(read_path+'train_fe.csv')
test = pd.read_csv(read_path+'test_fe.csv')

In [3]:
files = os.listdir(read_path)
newFiles = []
histFiles = []
transFiles = []
isFile = re.compile(r'.*[.]csv')
isNew = re.compile(r'.*_new[.]csv')
isHist = re.compile(r'.*_hist[.]csv')
isTrans = re.compile(r'.*_trans[.]csv')
for file in tqdm(files):
    if re.match(isFile, file):
        locals()[file[:-4]] = pd.read_csv(read_path+file)
        if re.match(isNew, file): newFiles.append(file[:-4])
        elif re.match(isHist, file): histFiles.append(file[:-4])
        elif re.match(isTrans, file): transFiles.append(file[:-4])
        else : print('is it proper file name? : {}'.format(file))

HBox(children=(IntProgress(value=0, max=49), HTML(value='')))

is it proper file name? : test_fe.csv
is it proper file name? : train_fe.csv



In [4]:
tempCols = ['merchant_try_'+col if col!='card_id' else col for col in locals()['mertry_trans'].columns.tolist()]
locals()['mertry_trans'].columns = tempCols
tempCols = ['merchant_visit_'+col if col!='card_id' else col for col in locals()['mervisit_trans'].columns.tolist()]
locals()['mervisit_trans'].columns = tempCols

In [5]:
train_trans = train
for file in tqdm(transFiles):
    train_trans = train_trans.merge(locals()[file], how='left', on='card_id')
    if train_trans.shape[0] != train.shape[0] : print('it is wrong : {} : {}'.format(train_trans.shape, file))

HBox(children=(IntProgress(value=0, max=14), HTML(value='')))




In [46]:
modelCols = pd.read_csv('../dataset/modelCols/trans_401.csv')['modelCols'].values.tolist()

catCols = []
isFeature = re.compile(r'feature_[\d]')
isModeKey = re.compile(r'.*_modeKey')

for col in modelCols:
    if re.match(isFeature, col): catCols.append(col)
    elif re.match(isModeKey, col): catCols.append(col)

catCols2 = []
for col in catCols:
    if train_trans[col].isna().sum()==0:
        catCols2.append(modelCols.index(col))

## CV Data

In [48]:
split_rate = test.shape[0]/(train.shape[0]+test.shape[0])
split_y = train['outliers']
SSspliter = SSsplit(3, split_rate)
for i, (train_index, test_index) in enumerate(SSspliter.split(train, split_y)):
    locals()['x_train_'+str(i)] = train_trans[modelCols].iloc[train_index]
    locals()['x_validate_'+str(i)] = train_trans[modelCols].iloc[test_index]
    locals()['y_train_'+str(i)] = train_trans['target'].iloc[train_index]
    locals()['y_validate_'+str(i)] = train_trans['target'].iloc[test_index]

## CV

### LGB

In [38]:
def lgbobj(trial):
    global k
    params = {
    'num_leaves':trial.suggest_int('num_leaves', 10, 1000),
    'max_depth':trial.suggest_int('max_depth', -1, 400),
    'learning_rate':trial.suggest_uniform('learning_rate', 1e-4, 0.1),
    'num_estimators': trial.suggest_int('num_estimators', 500, 1500),
    'subsample_for_bin' : trial.suggest_int('subsample_for_bin', 20, 20000),
    'min_split_gain': trial.suggest_uniform('min_split_gain', 0.,1e-3),
    'min_child_samples': trial.suggest_int('min_child_samples', 1, 200),
    'reg_lambda':trial.suggest_uniform('reg_labmda', 0.1, 0.5),
    'drop_rate': trial.suggest_uniform('drop_rate', 0.01, 0.5),
    'boosting': trial.suggest_categorical('boosting', ['gbdt', 'goss', 'dart']),
    'objective': 'regression',
    'num_threads': 7,
    }
    total_score = 0
    for i in tqdm(range(3)):
        lgb_data = lgb.Dataset(globals()['x_train_'+str(i)], label = globals()['y_train_'+str(i)].values)
        bst = lgb.train(params, lgb_data)
        pred = bst.predict(globals()['x_validate_'+str(i)])
        total_score += rmse(pred, globals()['y_validate_'+str(i)].values)
    print('{} : complete'.format(k))
    k+=1
    return total_score/3
lgbstudy = optuna.create_study()

In [39]:
%%time
k = 0
lgbstudy.optimize(lgbobj, n_trials=300)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0 : complete


[I 2019-02-25 00:16:32,871] Finished a trial resulted in value: 148618257.20726156. Current best value is 148618257.20726156 with parameters: {'num_leaves': 822, 'max_depth': 122, 'learning_rate': 0.011369082947663022, 'num_estimators': 1447, 'subsample_for_bin': 18914, 'min_split_gain': 1.2725743807421753e-05, 'min_child_samples': 163, 'reg_labmda': 0.43877139074078175, 'drop_rate': 0.3020308438775831, 'boosting': 'goss'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

1 : complete


[I 2019-02-25 00:19:33,656] Finished a trial resulted in value: 3.731566529805566. Current best value is 3.731566529805566 with parameters: {'num_leaves': 835, 'max_depth': 162, 'learning_rate': 0.053659117486951804, 'num_estimators': 697, 'subsample_for_bin': 18684, 'min_split_gain': 0.0007133963113271788, 'min_child_samples': 8, 'reg_labmda': 0.28479801983449626, 'drop_rate': 0.09136349699403186, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

2 : complete


[I 2019-02-25 00:22:04,846] Finished a trial resulted in value: 3.7109155431030243. Current best value is 3.7109155431030243 with parameters: {'num_leaves': 943, 'max_depth': 14, 'learning_rate': 0.07716682149146482, 'num_estimators': 509, 'subsample_for_bin': 18867, 'min_split_gain': 2.4605629020916455e-05, 'min_child_samples': 23, 'reg_labmda': 0.19727946720477707, 'drop_rate': 0.12832554371950913, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

3 : complete


[I 2019-02-25 00:22:37,865] Finished a trial resulted in value: 4.316293663309502e+38. Current best value is 3.7109155431030243 with parameters: {'num_leaves': 943, 'max_depth': 14, 'learning_rate': 0.07716682149146482, 'num_estimators': 509, 'subsample_for_bin': 18867, 'min_split_gain': 2.4605629020916455e-05, 'min_child_samples': 23, 'reg_labmda': 0.19727946720477707, 'drop_rate': 0.12832554371950913, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

4 : complete


[I 2019-02-25 00:23:51,101] Finished a trial resulted in value: 3.7910214792487675. Current best value is 3.7109155431030243 with parameters: {'num_leaves': 943, 'max_depth': 14, 'learning_rate': 0.07716682149146482, 'num_estimators': 509, 'subsample_for_bin': 18867, 'min_split_gain': 2.4605629020916455e-05, 'min_child_samples': 23, 'reg_labmda': 0.19727946720477707, 'drop_rate': 0.12832554371950913, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

5 : complete


[I 2019-02-25 00:26:08,773] Finished a trial resulted in value: 3.7647108983637367. Current best value is 3.7109155431030243 with parameters: {'num_leaves': 943, 'max_depth': 14, 'learning_rate': 0.07716682149146482, 'num_estimators': 509, 'subsample_for_bin': 18867, 'min_split_gain': 2.4605629020916455e-05, 'min_child_samples': 23, 'reg_labmda': 0.19727946720477707, 'drop_rate': 0.12832554371950913, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

6 : complete


[I 2019-02-25 00:29:26,549] Finished a trial resulted in value: 3.729337641369075. Current best value is 3.7109155431030243 with parameters: {'num_leaves': 943, 'max_depth': 14, 'learning_rate': 0.07716682149146482, 'num_estimators': 509, 'subsample_for_bin': 18867, 'min_split_gain': 2.4605629020916455e-05, 'min_child_samples': 23, 'reg_labmda': 0.19727946720477707, 'drop_rate': 0.12832554371950913, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

7 : complete


[I 2019-02-25 00:31:36,754] Finished a trial resulted in value: 3.708951227709823. Current best value is 3.708951227709823 with parameters: {'num_leaves': 482, 'max_depth': 325, 'learning_rate': 0.04713560882537008, 'num_estimators': 624, 'subsample_for_bin': 17055, 'min_split_gain': 0.0006701232506893072, 'min_child_samples': 34, 'reg_labmda': 0.4257022541726505, 'drop_rate': 0.22213715013805807, 'boosting': 'dart'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

8 : complete


[I 2019-02-25 00:32:08,104] Finished a trial resulted in value: 3.6966819357107674. Current best value is 3.6966819357107674 with parameters: {'num_leaves': 77, 'max_depth': 38, 'learning_rate': 0.06230442971564923, 'num_estimators': 780, 'subsample_for_bin': 6334, 'min_split_gain': 0.0006248297427933434, 'min_child_samples': 107, 'reg_labmda': 0.4780428122460535, 'drop_rate': 0.11750649900794807, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

9 : complete


[I 2019-02-25 00:32:26,819] Finished a trial resulted in value: 3.702334006490117. Current best value is 3.6966819357107674 with parameters: {'num_leaves': 77, 'max_depth': 38, 'learning_rate': 0.06230442971564923, 'num_estimators': 780, 'subsample_for_bin': 6334, 'min_split_gain': 0.0006248297427933434, 'min_child_samples': 107, 'reg_labmda': 0.4780428122460535, 'drop_rate': 0.11750649900794807, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

10 : complete


[I 2019-02-25 00:33:43,207] Finished a trial resulted in value: 3.7477189294810316. Current best value is 3.6966819357107674 with parameters: {'num_leaves': 77, 'max_depth': 38, 'learning_rate': 0.06230442971564923, 'num_estimators': 780, 'subsample_for_bin': 6334, 'min_split_gain': 0.0006248297427933434, 'min_child_samples': 107, 'reg_labmda': 0.4780428122460535, 'drop_rate': 0.11750649900794807, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

11 : complete


[I 2019-02-25 00:34:06,401] Finished a trial resulted in value: 3.6931483245864083. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

12 : complete


[I 2019-02-25 00:35:37,649] Finished a trial resulted in value: 3.730537718311334. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

13 : complete


[I 2019-02-25 00:35:53,484] Finished a trial resulted in value: 3.701048826942109. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

14 : complete


[I 2019-02-25 00:38:11,287] Finished a trial resulted in value: 3.728850470896377. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

15 : complete


[I 2019-02-25 00:38:44,186] Finished a trial resulted in value: 3.710477425042009. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

16 : complete


[I 2019-02-25 00:39:28,568] Finished a trial resulted in value: 2.257211890834053e+37. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

17 : complete


[I 2019-02-25 00:39:39,371] Finished a trial resulted in value: 3.734545137627048. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

18 : complete


[I 2019-02-25 00:40:18,547] Finished a trial resulted in value: 3.697845575925518. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

19 : complete


[I 2019-02-25 00:41:34,424] Finished a trial resulted in value: 3.7379849772548717. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

20 : complete


[I 2019-02-25 00:44:04,870] Finished a trial resulted in value: 3.7211622205936297. Current best value is 3.6931483245864083 with parameters: {'num_leaves': 49, 'max_depth': 266, 'learning_rate': 0.07495510629144969, 'num_estimators': 1015, 'subsample_for_bin': 5165, 'min_split_gain': 0.00028011562725156145, 'min_child_samples': 71, 'reg_labmda': 0.49390609213375464, 'drop_rate': 0.06923165924199738, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

21 : complete


[I 2019-02-25 00:44:42,841] Finished a trial resulted in value: 3.6879455586031806. Current best value is 3.6879455586031806 with parameters: {'num_leaves': 78, 'max_depth': 82, 'learning_rate': 0.05521924185007325, 'num_estimators': 964, 'subsample_for_bin': 7753, 'min_split_gain': 0.0002875160936943945, 'min_child_samples': 143, 'reg_labmda': 0.33301257868156753, 'drop_rate': 0.13001417297809148, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

22 : complete


[I 2019-02-25 00:46:53,488] Finished a trial resulted in value: 3.685990040691595. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

23 : complete


[I 2019-02-25 00:49:11,520] Finished a trial resulted in value: 3.6867942272547847. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

24 : complete


[I 2019-02-25 00:51:20,511] Finished a trial resulted in value: 3.6875485026468073. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

25 : complete


[I 2019-02-25 00:53:47,321] Finished a trial resulted in value: 3.7298980512039335. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

26 : complete


[I 2019-02-25 00:54:46,338] Finished a trial resulted in value: 4.3665759867794963e+17. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

27 : complete


[I 2019-02-25 00:56:45,006] Finished a trial resulted in value: 3.692599048601604. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

28 : complete


[I 2019-02-25 00:57:40,115] Finished a trial resulted in value: 3814444826.9837356. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

29 : complete


[I 2019-02-25 01:00:18,847] Finished a trial resulted in value: 3.7070735373830037. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

30 : complete


[I 2019-02-25 01:03:13,331] Finished a trial resulted in value: 3.8535437188762818. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

31 : complete


[I 2019-02-25 01:05:02,823] Finished a trial resulted in value: 3.694459678106169. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

32 : complete


[I 2019-02-25 01:05:47,647] Finished a trial resulted in value: 4.119191099626848e+25. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

33 : complete


[I 2019-02-25 01:08:22,077] Finished a trial resulted in value: 3.726186315083336. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

34 : complete


[I 2019-02-25 01:10:16,776] Finished a trial resulted in value: 3.7508780964944695. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

35 : complete


[I 2019-02-25 01:12:39,922] Finished a trial resulted in value: 3.741492098631349. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

36 : complete


[I 2019-02-25 01:14:42,429] Finished a trial resulted in value: 3.6868190851550158. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

37 : complete


[I 2019-02-25 01:15:55,930] Finished a trial resulted in value: 3.6978387784719664. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

38 : complete


[I 2019-02-25 01:19:05,195] Finished a trial resulted in value: 3.702810774101465. Current best value is 3.685990040691595 with parameters: {'num_leaves': 519, 'max_depth': 182, 'learning_rate': 0.03300716550418671, 'num_estimators': 1114, 'subsample_for_bin': 8617, 'min_split_gain': 0.00031297823770644806, 'min_child_samples': 161, 'reg_labmda': 0.23545106059857868, 'drop_rate': 0.3026639017725872, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

39 : complete


[I 2019-02-25 01:21:06,886] Finished a trial resulted in value: 3.685617191493774. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

40 : complete


[I 2019-02-25 01:21:51,505] Finished a trial resulted in value: 2.9029674480430443e+37. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

41 : complete


[I 2019-02-25 01:23:52,989] Finished a trial resulted in value: 3.6976028187686905. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

42 : complete


[I 2019-02-25 01:26:14,267] Finished a trial resulted in value: 3.685662022207591. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

43 : complete


[I 2019-02-25 01:29:18,251] Finished a trial resulted in value: 3.70576779269795. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

44 : complete


[I 2019-02-25 01:30:50,008] Finished a trial resulted in value: 3.70588999069882. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

45 : complete


[I 2019-02-25 01:31:47,055] Finished a trial resulted in value: 3.7037517262291053. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

46 : complete


[I 2019-02-25 01:32:24,400] Finished a trial resulted in value: 3.918770063558795e+20. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

47 : complete


[I 2019-02-25 01:34:28,307] Finished a trial resulted in value: 3.6924436661480513. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

48 : complete


[I 2019-02-25 01:36:33,566] Finished a trial resulted in value: 3.695754743796609. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

49 : complete


[I 2019-02-25 01:40:08,618] Finished a trial resulted in value: 3.725369552018338. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

50 : complete


[I 2019-02-25 01:42:31,635] Finished a trial resulted in value: 3.6900788545038434. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

51 : complete


[I 2019-02-25 01:46:01,072] Finished a trial resulted in value: 3.7827130615367808. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

52 : complete


[I 2019-02-25 01:48:35,521] Finished a trial resulted in value: 3.7113820770275034. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

53 : complete


[I 2019-02-25 01:49:33,632] Finished a trial resulted in value: 1.3580369511652082e+27. Current best value is 3.685617191493774 with parameters: {'num_leaves': 935, 'max_depth': 245, 'learning_rate': 0.040630726310134826, 'num_estimators': 1436, 'subsample_for_bin': 6342, 'min_split_gain': 0.0004120715698725839, 'min_child_samples': 200, 'reg_labmda': 0.30042871016116973, 'drop_rate': 0.3708866605608058, 'boosting': 'gbdt'}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

KeyboardInterrupt: 

## xgb

In [40]:
def xgb_optuna(trial):
    global k
    model = XGBRegressor(max_depth=trial.suggest_int('max_depth', -1, 400), 
              learning_rate=trial.suggest_uniform('learning_rate', 1e-2, 0.5), 
              n_estimators=trial.suggest_int('n_estimators', 100, 1500),
              booster=trial.suggest_categorical('booster', ['gbtree', 'dart']), 
              min_child_weight=trial.suggest_int('min_child_weight', 1, 200),  
              subsample=trial.suggest_uniform('subsample', 0, 1), 
              reg_alpha=0, 
              reg_lambda=1, 
              scale_pos_weight=1, 
              base_score=0.5,  
              importance_type='gain',
              n_jobs=8,  
              silent=True, 
              objective='reg:linear', 
              gamma=0, 
              max_delta_step=0,
              colsample_bytree=1, 
              colsample_bylevel=1, )
    total_score = 0
    for i in tqdm(range(3)):
        model.fit(globals()['x_train_'+str(i)], globals()['y_train_'+str(i)].values)
        pred = model.predict(globals()['x_validate_'+str(i)])
        total_score += rmse(pred, globals()['y_validate_'+str(i)].values)
    print('{} : complete'.format(k))
    k+=1
    return total_score/3
xgbstudy = optuna.create_study()

In [41]:
%%time
k = 0
xgbstudy.optimize(xgb_optuna, n_trials=100)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

Exception ignored in: <function Booster.__del__ at 0x00000201F5728D90>
Traceback (most recent call last):
  File "C:\anaconda3\envs\py37\lib\site-packages\lightgbm\basic.py", line 1554, in __del__
    _safe_call(_LIB.LGBM_BoosterFree(self.handle))
KeyboardInterrupt: 


KeyboardInterrupt: 

## Cat Boost

In [53]:
def cat_optuna(trial):
    global k
    model = CatBoostRegressor(iterations=trial.suggest_int('iterations', 100, 2000),
                        learning_rate=trial.suggest_uniform('learning_rate', 1e-2, 0.5),
#                         depth=trial.suggest_int('depth', 3, 400),
                        silent=True,
                        early_stopping_rounds=10,
                        loss_function='RMSE',
                        thread_count=8,)
    total_score = 0
    for i in tqdm(range(3)):
        train_pool = Pool(globals()['x_train_'+str(i)], globals()['y_train_'+str(i)].values, cat_features=catCols2)
        test_pool = Pool(globals()['x_validate_'+str(i)], cat_features=catCols2) 
        model.fit(train_pool, silent=True)
        pred = model.predict(test_pool)
        total_score += rmse(pred, globals()['y_validate_'+str(i)].values)
    print('{} : complete'.format(k))
    k+=1
    return total_score/3
catstudy = optuna.create_study()

In [54]:
%%time
k = 0
catstudy.optimize(cat_optuna, n_trials=100)

HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

0 : complete


[I 2019-02-25 02:49:37,870] Finished a trial resulted in value: 3.7228686208535096. Current best value is 3.7228686208535096 with parameters: {'iterations': 840, 'learning_rate': 0.1893025064860477}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

1 : complete


[I 2019-02-25 02:53:42,722] Finished a trial resulted in value: 3.7218307265627417. Current best value is 3.7218307265627417 with parameters: {'iterations': 523, 'learning_rate': 0.22775658852973543}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

2 : complete


[I 2019-02-25 03:01:24,160] Finished a trial resulted in value: 3.7525296812530016. Current best value is 3.7218307265627417 with parameters: {'iterations': 523, 'learning_rate': 0.22775658852973543}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

3 : complete


[I 2019-02-25 03:07:53,245] Finished a trial resulted in value: 3.689683078775776. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

4 : complete


[I 2019-02-25 03:13:19,329] Finished a trial resulted in value: 3.696011853863969. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

5 : complete


[I 2019-02-25 03:24:28,360] Finished a trial resulted in value: 3.924956344667287. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

6 : complete


[I 2019-02-25 03:35:31,167] Finished a trial resulted in value: 3.928199865564821. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

7 : complete


[I 2019-02-25 03:47:09,798] Finished a trial resulted in value: 3.808419520941077. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

8 : complete


[I 2019-02-25 03:55:45,898] Finished a trial resulted in value: 3.691113866555391. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

9 : complete


[I 2019-02-25 03:58:34,174] Finished a trial resulted in value: 3.7037416936470158. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

10 : complete


[I 2019-02-25 04:12:47,916] Finished a trial resulted in value: 3.704061969387068. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

11 : complete


[I 2019-02-25 04:22:51,049] Finished a trial resulted in value: 3.8316060338151465. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

12 : complete


[I 2019-02-25 04:25:08,938] Finished a trial resulted in value: 3.6983003206002594. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

13 : complete


[I 2019-02-25 04:31:19,401] Finished a trial resulted in value: 3.7040462053272063. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

14 : complete


[I 2019-02-25 04:35:31,390] Finished a trial resulted in value: 3.696665803803992. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

15 : complete


[I 2019-02-25 04:49:08,769] Finished a trial resulted in value: 3.8665922224123546. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

16 : complete


[I 2019-02-25 04:56:43,347] Finished a trial resulted in value: 3.710625775842889. Current best value is 3.689683078775776 with parameters: {'iterations': 878, 'learning_rate': 0.042356140476210384}.


HBox(children=(IntProgress(value=0, max=3), HTML(value='')))

KeyboardInterrupt: 