### LightGBM and Bayesian Optimization

In [1]:
import os
import glob
import warnings
import numpy as np
import pandas as pd

from datetime import datetime
from tqdm import tqdm_notebook as tqdm
from sklearn.model_selection import train_test_split

import lightgbm as lgb
from bayes_opt import BayesianOptimization
from bayes_opt.observer import JSONLogger
from bayes_opt.event import Events

from constants import DATA_DIR
from utils import csv_concatenate, calculate_FPTS, calculate_MAE, calculate_RMSE, cross_val, load_full_dataset

In [2]:
# Notebook-wide configuration: silence all warnings and fix NumPy's global RNG seed
# so NumPy-based randomness is repeatable across runs.
warnings.filterwarnings(action="ignore")
np.random.seed(seed=23)

### LightGBM and Hyperparameter Tuning with Bayesian Optimization

In [3]:
def bayes_parameter_opt_lgb(X, y, init_round, opt_round, n_folds, random_seed):
    """Tune LightGBM regression hyperparameters with Bayesian optimisation.

    Every candidate parameter set is scored with ``lgb.cv`` (``n_folds`` unstratified
    folds, RMSE metric). The optimiser maximises the negated best mean CV RMSE, so a
    higher target means a lower error. Each optimisation step is also written by a
    ``JSONLogger`` to a timestamped ``.json`` file under
    ``DATA_DIR/Models/LightGBM/Params``.

    Args:
        X: Feature data, passed to ``lgb.Dataset`` as ``data``.
        y: Targets, passed to ``lgb.Dataset`` as ``label``.
        init_round: Number of initial exploration points (``init_points``).
        opt_round: Number of optimisation iterations (``n_iter``).
        n_folds: Number of cross-validation folds.
        random_seed: Seed used for both the CV split and the optimiser.

    Returns:
        dict: ``lgbBO.max['params']``, the best parameter set found. Values are the raw
        numbers proposed by the optimiser; integer-valued parameters are NOT rounded.
    """
    train_data = lgb.Dataset(data=X, label=y)

    def lgb_eval(feature_fraction, bagging_fraction, lambda_l1, lambda_l2, max_depth, num_leaves,
                 min_split_gain, min_child_weight, learning_rate, n_estimators):
        """Return the negated best mean CV RMSE for one candidate parameter set."""
        params = {
            "objective" : "regression",
            "max_bin": 255,
            "bagging_freq": 1,
            "min_child_samples": 20,
            "boosting": "gbdt",
            "verbosity": 1,
            "early_stopping_round": 200,
            "metric" : 'rmse'
        }

        # Clamp fractions to [0, 1] and penalties to >= 0, and round the
        # integer-valued parameters, because the optimiser proposes plain numbers.
        params['feature_fraction'] = max(min(feature_fraction, 1), 0)
        params['bagging_fraction'] = max(min(bagging_fraction, 1), 0)
        params['lambda_l1'] = max(lambda_l1, 0)
        params['lambda_l2'] = max(lambda_l2, 0)
        params['max_depth'] = int(round(max_depth))
        params['num_leaves'] = int(round(num_leaves))
        params['min_split_gain'] = min_split_gain
        params['min_child_weight'] = min_child_weight
        params['learning_rate'] = learning_rate
        params['n_estimators'] = int(round(n_estimators))

        cv_result = lgb.cv(params, train_data, nfold=n_folds, seed=random_seed,
                           verbose_eval=None, stratified=False)

        # Best (lowest) mean RMSE over all boosting rounds, and where it occurred.
        rmse_means = cv_result['rmse-mean']
        best_rmse = min(rmse_means)
        min_index = rmse_means.index(best_rmse)

        # Print RMSE for each round of lgbBO for rough tracking of the optimization process
        print("RMSE: {} +- {}".format(round(best_rmse, 5),
                                      round(cv_result['rmse-stdv'][min_index], 5)))

        # The optimiser maximises its target, so return the negated error.
        return -1.0 * best_rmse

    lgbBO = BayesianOptimization(lgb_eval, {'feature_fraction': (0.6, 0.9),
                                            'bagging_fraction': (0.8, 1),
                                            'lambda_l1': (0, 3),
                                            'lambda_l2': (0, 3),
                                            'max_depth': (5, 100),
                                            'num_leaves' : (10, 300),
                                            'min_split_gain': (0.001, 0.1),
                                            'min_child_weight': (0, 1),
                                            'learning_rate': (0.001, 0.1),
                                            'n_estimators': (50, 5000)
                                           },
                                 random_state=random_seed)

    # Save progress for each round into a JSON file which can be monitored in an editor (e.g. VSCode)
    # This somehow suppresses the terminal output (https://github.com/fmfn/BayesianOptimization/issues/167)
    log_dir = DATA_DIR + "/Models/LightGBM/Params"
    # Create the log directory up front so the logger's first write cannot fail on a fresh checkout.
    os.makedirs(log_dir, exist_ok=True)
    logger = JSONLogger(path=log_dir + "/{}.json".format(pd.Timestamp.now().strftime('%Y%m%d-%Hh%Mm')))

    # Early bayes_opt releases spell this event OPTMIZATION_STEP; later ones use
    # OPTIMIZATION_STEP. Resolve whichever name the installed version provides.
    step_event = getattr(Events, 'OPTIMIZATION_STEP', None)
    if step_event is None:
        step_event = Events.OPTMIZATION_STEP
    lgbBO.subscribe(step_event, logger)

    lgbBO.maximize(init_points=init_round, n_iter=opt_round, acq='ei')

    return lgbBO.max['params']

In [4]:
# Load the features (X) and targets (y) used by every cell below.
# NOTE(review): 'quad' presumably selects a dataset/feature-set variant; confirm in utils.load_full_dataset.
X, y = load_full_dataset('quad')

In [None]:
# Run the Bayesian optimisation; per-step results are logged to
# DATA_DIR/Models/LightGBM/Params/<timestamp>.json and can be inspected while it runs.
# Takes ~ 2 hrs for 50 rounds (20 init + 30 opt) on CPU
opt_params = bayes_parameter_opt_lgb(X, y,
                                     init_round=20,
                                     opt_round=30,
                                     n_folds=5,
                                     random_seed=23)

RMSE: 9.08604 +- 0.04703
RMSE: 9.09618 +- 0.04531
RMSE: 9.10782 +- 0.04711
RMSE: 9.08217 +- 0.0462
RMSE: 9.09302 +- 0.04852
RMSE: 9.08347 +- 0.04771
RMSE: 9.09529 +- 0.04418
RMSE: 9.08705 +- 0.04761
RMSE: 9.09951 +- 0.04444
RMSE: 9.09278 +- 0.04607


In [None]:
# Load the newest optimisation log, otherwise set path_params manually.
# The glob targets the directory the JSONLogger in bayes_parameter_opt_lgb writes to;
# log names are timestamps (YYYYmmdd-HHhMMm), so sorted order is chronological.
params_glob = DATA_DIR+'/Models/LightGBM/Params/*.json'
param_logs = sorted(glob.glob(params_glob))
if not param_logs:
    raise FileNotFoundError('No optimisation logs match ' + params_glob)
path_params = param_logs[-1]

# One JSON record per line; keep score and parameters, best (highest) target first.
df_params = pd.read_json(path_params, lines=True)
df_params = df_params.loc[:,['target', 'params']].sort_values(by='target', ascending=False).reset_index()

In [None]:
# Show the first five rows of the logged parameter sets.
df_params.head(5)

In [None]:
# Fall back to the first row of the logged parameter sets when the optimisation cell
# was skipped (opt_params undefined) or opt_params was explicitly set to None.
if globals().get('opt_params') is None:
    opt_params = df_params.loc[0, 'params']

In [None]:
# These three hyperparameters must be integers, but the tuned values may be floats,
# so round and cast whichever of them are present.
for int_param in ('max_depth', 'num_leaves', 'n_estimators'):
    if int_param in opt_params:
        opt_params[int_param] = int(round(opt_params[int_param]))

In [None]:
# Evaluate the tuned parameters on repeated random 80/20 hold-out splits.
err_buf = []
n_iters = 5

for i in tqdm(range(n_iters)):
    # Using the loop index as random_state gives a different, reproducible split each time.
    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=i)
    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_valid, label=y_valid, reference=d_train)
    watchlist = [d_valid]

    # valid_sets must be passed by keyword: the third positional parameter of
    # lgb.train is num_boost_round, not the list of validation sets.
    model = lgb.train(opt_params, d_train, valid_sets=watchlist, verbose_eval=1)

    preds = model.predict(X_valid)
    err = calculate_RMSE(preds, y_valid)
    err_buf.append(err)
    print('RMSE: ' + str(err))

print('\nMean RMSE: ' + str(np.mean(err_buf)) + ' +/- ' + str(np.std(err_buf)))

In [None]:
import xgboost as xgb

# Cross-validate an XGBoost regressor with a fixed set of hyperparameters on the same X, y.
opt_params_xgb = dict(
    max_depth=6,
    n_estimators=250,
    min_child_weight=4,
    colsample_bytree=0.6,
    colsample_bylevel=0.7,
    subsample=1.0,
    gamma=0.0,
    learning_rate=0.026944654231987667,
)

reg = xgb.XGBRegressor(**opt_params_xgb)
cross_val(reg, X, y, show_train=True)