# XGBOOST - REGRESSOR - BAYESIAN OPTIMIZATION

In [None]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from bayes_opt.util import load_logs
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import mean_squared_error
import gc
import pickle

### GLOBAL VARIABLES

In [None]:
# Directory holding the pickled train/validation folds (X_train.pkl, Y_train.pkl, X_val.pkl, Y_val.pkl).
INPUT_PATH = '../../../data/train_test'
# Directory where the optimizer log and the best hyperparameters are written.
OUTPUT_PATH = '../../../models/xgboost/hyperparameters'
# File-name stems; NRUN is appended to both when the files are written.
HYPERPARAM_NAME = 'best_hyperparam_xgb_regressor_r'
LOG_NAME = 'bayesianopt_xgb_regressor_logs_r'
SEED = 47           # Seed passed to XGBoost
NITER = 20          # Number of Bayesian optimization iterations (n_iter)
N_ESTIMATORS = 1000 # Upper bound on the number of boosting rounds.
                    # The actual number depends on the early-stopping rounds parameter (ESR).

ESR = 50            # early_stopping_rounds
# NOTE(review): CV, SCORE, handlingnull, NJOBS and USEGPU are not referenced
# in the visible part of this notebook.
CV = 3
SCORE = 'rmse'
handlingnull = False
NJOBS = -1
USEGPU = False
NRUN = 1            # Run identifier appended to the output file names

### FUNCTIONS

In [None]:
def rmse(y_true, y_pred):
    """Return the root mean squared error between y_true and y_pred."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

In [None]:
# JSON logger for the optimization steps; the log file name is LOG_NAME + NRUN inside OUTPUT_PATH.
logger = JSONLogger(path=f'{OUTPUT_PATH}/{LOG_NAME}{NRUN}.json')

### LOAD DATASET

In [None]:
# Load the training features: a vector with one entry per fold.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f'{INPUT_PATH}/X_train.pkl', "rb") as fh:
    X_train_vector = pickle.load(fh)

In [None]:
# Load the training targets: a vector with one entry per fold.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f'{INPUT_PATH}/Y_train.pkl', "rb") as fh:
    Y_train_vector = pickle.load(fh)

In [None]:
# Load the validation features: a vector with one entry per fold.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f'{INPUT_PATH}/X_val.pkl', "rb") as fh:
    X_val_vector = pickle.load(fh)

In [None]:
# Load the validation targets: a vector with one entry per fold.
# The context manager closes the file handle once loading finishes.
# NOTE: pickle can execute arbitrary code on load; only read trusted files.
with open(f'{INPUT_PATH}/Y_val.pkl', "rb") as fh:
    Y_val_vector = pickle.load(fh)

In [None]:
# Number of folds = number of entries in the loaded training-feature vector.
FOLDS = len(X_train_vector)

### TRAIN MODEL

#### Set hyperparameters

In [None]:
# ======== General Parameters ======= #

# Type of booster to run at each iteration: gbtree or gblinear.
boosting = 'gbtree'


# Learning rate (step-size shrinkage), analogous to the learning rate in GBM.
# Typical final values to be used: 0.01-0.2
eta = 0.01


# ======== Learning Task Parameters ======= #
# The objective defines the loss function to be minimized.
#
# - reg:squarederror: regression with squared loss.
# - count:poisson: Poisson regression for count data, output mean of Poisson distribution.
#    max_delta_step is set to 0.7 by default in Poisson regression (used to safeguard optimization)

# - survival:cox: Cox regression for right-censored survival time data (negative values are
#    considered right censored). Note that predictions are returned on the hazard ratio scale
#    (i.e., as HR = exp(marginal_prediction) in the proportional hazard function h(t) = h0(t) * HR).
objective  = 'reg:squarederror'


# The metric to be used for validation data.
# - rmse: root mean square error
# - mae: mean absolute error
# - poisson-nloglik: negative log-likelihood for Poisson regression
# - rmsle: root mean square log error. Default metric of the reg:squaredlogerror objective.
#            This metric reduces errors generated by outliers in the dataset. But because the log function
#            is employed, rmsle might output nan when a prediction value is less than -1.
#            See reg:squaredlogerror for other requirements.
eval_metric = 'rmse'

[xgboost params](https://xgboost.readthedocs.io/en/latest/python/python_api.html)

#### Set search space

In [None]:
# Search space for the Bayesian optimizer: parameter name -> (lower bound, upper bound).
# All bounds are treated as continuous ranges.
pds ={
    # Minimum sum of instance weight (hessian) needed in a child.
    # Larger values make the algorithm more conservative, which helps against over-fitting.
    # XGBoost default: 1
    'min_child_weight':(1e-5, 1e4),
    
    # Minimum loss reduction required to make a further partition on a leaf node of the tree. 
    # The larger gamma is, the more conservative the algorithm will be.
    'gamma':(0, 5),
    
    # Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost 
    # randomly samples half of the training data prior to growing trees, which helps prevent overfitting.
    # Subsampling occurs once in every boosting iteration.
    'subsample':(0.5, 1),
  
    # Subsample ratio of columns when constructing each tree. 
    # Subsampling occurs once for every tree constructed.
    'colsample_bytree':(0.1, 1),
  
    # Maximum depth of a tree. 
    # Increasing this value will make the model more complex and more likely to overfit.
    # Sampled as a float; hyp_xgb rounds it to the nearest integer before training.
    'max_depth': (3, 10),
    
    # L1 regularization term on weights. Increasing this value will make the model more conservative.
    'reg_alpha': (0,  100),

    # L2 regularization term on weights. Increasing this value will make the model more conservative.
    'reg_lambda': (0, 100) 
}

In [None]:
def hyp_xgb(min_child_weight, gamma, subsample, colsample_bytree, max_depth, reg_alpha, reg_lambda):
    """Objective function for the Bayesian optimizer.

    Trains one XGBoost booster per pre-built fold with the given
    hyperparameters and early stopping on the fold's validation set, then
    returns the negated mean of the best validation scores. The optimizer
    maximizes its target, so negating a lower-is-better metric (such as
    rmse) turns the search into a minimization of that metric.

    Parameters
    ----------
    min_child_weight, gamma, subsample, colsample_bytree, reg_alpha, reg_lambda : float
        Values sampled from the search space; passed to XGBoost unchanged.
    max_depth : float
        Sampled as a continuous value and rounded to the nearest integer.

    Returns
    -------
    float
        Negative mean of the per-fold best validation scores.

    Raises
    ------
    ValueError
        If there are no folds to evaluate.
    """
    fit_params = {
        'num_boost_round': N_ESTIMATORS,
        'early_stopping_rounds': ESR,
        'verbose_eval': False,
    }

    # The number of trees is controlled by num_boost_round in xgb.train, so
    # no 'n_estimators' entry is passed here (the native API does not use it).
    bst_params = {
        'booster': boosting,
        'objective': objective,
        'learning_rate': eta,
        'eval_metric': eval_metric,
        'seed': SEED,
        'min_child_weight': min_child_weight,
        'gamma': gamma,
        'max_depth': int(round(max_depth)),  # the optimizer samples floats
        'subsample': subsample,
        'colsample_bytree': colsample_bytree,
        'reg_alpha': reg_alpha,
        'reg_lambda': reg_lambda,
    }

    fold_scores = []

    for fold in range(FOLDS):
        X_trn, X_val = X_train_vector[fold].values, X_val_vector[fold].values
        y_trn, y_val = Y_train_vector[fold].values, Y_val_vector[fold].values

        # Create the XGBoost datasets
        train_set = xgb.DMatrix(X_trn, label=y_trn)
        val_set = xgb.DMatrix(X_val, label=y_val)

        # XGBoost uses the LAST entry of `evals` for early stopping, so the
        # validation set must come last.
        evallist = [(train_set, 'train'), (val_set, 'eval')]

        model = xgb.train(
            bst_params,
            train_set,
            evals=evallist,
            **fit_params,
        )

        # With early stopping, best_score is a float holding the best value
        # of the evaluation metric on the last eval set (validation here).
        fold_scores.append(model.best_score)

        # Free the fold's data before the next iteration to limit peak memory.
        del X_trn, X_val, y_trn, y_val, train_set, val_set, model
        gc.collect()

    if not fold_scores:
        raise ValueError('hyp_xgb needs at least one fold to evaluate')

    # Average the best score over the folds
    score_avg = np.mean(fold_scores)

    # The optimizer maximizes, and the metric must be minimized: flip the sign.
    return -score_avg

In [None]:
# Build the Bayesian optimizer (surrogate model) over the search space `pds`
# with `hyp_xgb` as the function to maximize.
# NOTE(review): the optimizer is seeded with the literal 7 rather than SEED — confirm this is intended.
optimizer = BayesianOptimization(
    f=hyp_xgb,
    pbounds=pds,
    random_state=7,
)

In [None]:
# Register the JSON logger as a subscriber of the optimization-step event.
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

In [None]:
# Optimize: 5 initial exploration points followed by NITER optimization iterations.
optimizer.maximize(init_points=5, n_iter=NITER)

#### Best parameter

In [None]:
# Load the points recorded in this run's JSON log into the optimizer.
# NOTE(review): this is the same log written by the run above, so the points are
# presumably already known to `optimizer` — confirm this reload is only needed
# when resuming in a fresh session.
load_logs(optimizer, logs=[f'{OUTPUT_PATH}/{LOG_NAME}{NRUN}.json'])

In [None]:
optimizer.max

In [None]:
# Hyperparameters of the best point found so far (highest target, i.e. lowest loss).
best_params = optimizer.max['params']

In [None]:
# Store the maximum number of estimators used during the search.
best_params['n_estimators'] = N_ESTIMATORS
# The optimizer samples max_depth as a float, while the objective function
# evaluated int(round(max_depth)); save the value that was actually trained.
best_params['max_depth'] = int(round(best_params['max_depth']))

In [None]:
best_params 

### SAVE BEST HYPERPARAMETER

In [None]:
# Save the best hyperparameters. np.save stores a dict as a pickled 0-d object
# array, so read it back with np.load(path, allow_pickle=True).item().
np.save(f'{OUTPUT_PATH}/{HYPERPARAM_NAME}{NRUN}.npy', best_params)