# LigthGBM - REGRESSOR - BAYESIAN OPTIMIZATION

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMRegressor
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import mean_squared_error
import gc

In [2]:
import warnings
warnings.filterwarnings('ignore')

### GLOBAL VARIABLES

In [45]:
INPUT_PATH = '../../../data/train_test'
OUTPUT_PATH = '../../../models/light_gbm/hyperparameters'
SEED = 47
NITER = 10
N_ESTIMATORS = 1000 # It set the max number of estimators. 
                    # The real number will depend on early sopping round parameter

ESR = 50            # early_stopping_rounds 
CV = 3
SCORE = 'rmse'
handlingnull = False
NJOBS = -1
USEGPU = False

### FUNCTIONS

In [4]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [5]:
logger = JSONLogger(path=f'{OUTPUT_PATH}/bayesianopt_gbm_regressor_logs_d{datetime.now().date()}.json')

### LOAD DATASET

In [6]:
X_train_vector = np.load(f'{INPUT_PATH}/X_train.npy') # It loads a vector with the folds

In [7]:
Y_train_vector = np.load(f'{INPUT_PATH}/Y_train.npy')

In [8]:
X_val_vector = np.load(f'{INPUT_PATH}/X_val.npy') # It loads a vector with the folds

In [9]:
Y_val_vector = np.load(f'{INPUT_PATH}/Y_val.npy') # It loads a vector with the folds

In [10]:
FOLDS = X_train_vector.shape[0]

In [11]:
features = pd.read_csv(f'{INPUT_PATH}/features.csv', index_col=False)

In [12]:
categorical_feature = [0,1,2,3,4,5]

### TRAIN MODEL

#### Set Search hyperparameters

In [13]:
# ======== General Parameters ======= #

# Select the type of model to run at each iteration. gbtree or gblinear.
boosting = 'gbdt'


# ======== Booster Parameters ======== # 

# Analogous to learning rate in GBM. 
# Typical final values to be used: 0.01-0.2
eta = [0.01] 


# Learning Task Parameters
# This defines the loss function to be minimized. See documentation
# -  options: regression, regression_l1, huber, fair, poisson, quantile, 
# mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy, cross_entropy_lambda,
# lambdarank, aliases: objective_type, app, application
objective  = 'regression'


# The metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
metric = 'rmse'

[lightGBM params](https://lightgbm.readthedocs.io/en/latest/Parameters.html)


help(lgb.LGBMClassifier)

#### Define the search space

In [14]:
pds = {
    # Minimal number of data in one leaf. Can be used to deal with over-fitting
    # default = 20, type = int, aliases: min_data_per_leaf, min_data, min_child_samples, 
    'num_leaves': (6, 50),

    # Denotes the fraction of columns to be randomly samples for each tree.
    'feature_fraction': (0.4, 0.6),
    
    # Denotes the fraction of observations to be randomly samples for each tree.
    'bagging_fraction': (0.8, 1),

    # The maximum depth of a tree
    'max_depth': (3, 10 ),

    'min_split_gain': (0.001, 0.1),

    # Minimal sum hessian in one leaf. Like min_data_in_leaf, 
    # it can be used to deal with over-fitting
    # Default 1e-3
    'min_child_weight': (1e-5, 1e4),   

    #L1 regularization
    'reg_alpha': (0,  100),

    #L2 regularization
    'reg_lambda': (0, 100) 


}

In [15]:
pds

{'num_leaves': (6, 50),
 'feature_fraction': (0.4, 0.6),
 'bagging_fraction': (0.8, 1),
 'max_depth': (3, 10),
 'min_split_gain': (0.001, 0.1),
 'min_child_weight': (1e-05, 10000.0),
 'reg_alpha': (0, 100),
 'reg_lambda': (0, 100)}

### Bayesian optimization hyperparameters

In [46]:
def hyp_lgbm(num_leaves, feature_fraction, bagging_fraction, max_depth, min_split_gain, min_child_weight, reg_alpha, reg_lambda):
    
    best_score = []

    fit_params = {
    'num_boost_round': 100_000,
    'early_stopping_rounds': ESR,
    'verbose_eval': False,
    
    }
      
    bst_params = {'boosting_type': boosting,
              'objective': objective,
              'num_iterations': N_ESTIMATORS,
              'learning_rate':eta, 
              'metric': metric,
              'seed':SEED,
                } # Default parameters
    
    
    bst_params['num_leaves'] = int(round(num_leaves))
    bst_params['feature_fraction'] = feature_fraction
    bst_params['bagging_fraction'] = bagging_fraction
    bst_params['max_depth'] = int(round(max_depth))
    bst_params['min_split_gain'] = min_split_gain
    bst_params['min_child_weight'] = min_child_weight
    bst_params['min_split_gain'] = min_split_gain
    bst_params['reg_alpha'] = reg_alpha
    bst_params['reg_lambda'] = reg_lambda
    
    

    for fold in range(FOLDS):
        
        X_trn, X_val = X_train_vector[fold], X_val_vector[fold]
        y_trn, y_val = Y_train_vector[fold], Y_val_vector[fold]

        # Create lgb datasets
        train_set = lgb.Dataset(
            X_trn,
            label=y_trn,
            categorical_feature=categorical_feature
        )
        val_set = lgb.Dataset(
            X_val,
            label=y_val,
            categorical_feature=categorical_feature
        )



        model = lgb.train(
            bst_params,
            train_set,
            valid_sets=[train_set, val_set],
            valid_names=["train", "valid"],
            **fit_params,
        )


        best_score.append(model.best_score['valid'][metric])

        del X_trn, X_val, y_trn, y_val

        gc.collect()
    
  
       
    
    # Extract the best score
    score_avg = np.mean(best_score)
    
    # Loss must be minimized
    loss = - score_avg
      

    return loss

In [47]:
# Surrogate model
optimizer = BayesianOptimization(hyp_lgbm,pds,random_state=7)

In [48]:
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [49]:
# Optimize
optimizer.maximize(init_points=5, n_iter=NITER)

#### Best parameter

In [36]:
optimizer.max

{'target': -2.4225318046259834,
 'params': {'bagging_fraction': 0.8255678640312084,
  'feature_fraction': 0.4395763075107805,
  'max_depth': 6.563973542052434,
  'min_child_weight': 13.092195085919592,
  'min_split_gain': 0.03611876064469009,
  'num_leaves': 6.181847286285093,
  'reg_alpha': 89.75656980050594,
  'reg_lambda': 2.1215817789469127}}

#### Saving best hyperparameters

In [37]:
best_params = optimizer.max['params']

In [38]:
best_params['n_estimators'] = N_ESTIMATORS

In [61]:
best_params

{'bagging_fraction': 0.8152616578747914,
 'feature_fraction': 0.555983758448023,
 'max_depth': 6.068864620086254,
 'min_child_weight': 7234.65178107476,
 'min_split_gain': 0.09782096168766367,
 'num_leaves': 29.69381829805908,
 'reg_alpha': 50.11204636599379,
 'reg_lambda': 7.205113335976154,
 'n_estimators': 2000}

### SAVE HYPERPARAMETERS

In [62]:
np.save(f'{OUTPUT_PATH}/bayesianopt_gbm_regressor_bestparams_d{datetime.now().date()}.npy', best_params)