# TRAIN XGBOOST REGRESSOR

In [None]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import xgboost as xgb
from xgboost.sklearn import XGBRegressor
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import mean_squared_error
import gc
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

### GLOBAL VARIABLES

In [None]:
# Directory holding the pickled per-fold train/validation splits.
INPUT_PATH = '../../../data/train_test'
FEATURES_PATH = '../../../data/features'
# Location and base name of the saved hyperparameter file (loaded as
# f'{HYPERPARAM_PATH}/{HYPERPARAM_NAME}.npy').
# NOTE(review): this directory is 'light_xgboost' while MODEL_PATH uses
# 'xgboost' - confirm this is intentional.
HYPERPARAM_PATH = '../../../models/light_xgboost/hyperparameters'
HYPERPARAM_NAME = 'best_hyperparam_xgb_regressor_rX'
# Destination directory and base name for the pickled list of trained models.
MODEL_PATH = '../../../models/xgboost/'
MODEL_NAME = 'xgb_reg_XXX'
# Random seed stored in the model parameters.
SEED = 47
N_ESTIMATORS = 1000 # Sets the maximum number of estimators.
                    # The actual number depends on the early-stopping rounds parameter.

ESR = 50            # early_stopping_rounds
# NOTE(review): CV, SCORE, handlingnull and USEGPU are not referenced in the
# visible part of this notebook - confirm whether they are still needed.
CV = 3
SCORE = 'rmse'
handlingnull = False
# Thread-count setting stored in the model parameters.
NJOBS = -1
USEGPU = False

### FUNCTIONS

In [None]:
def train_xgb(bst_params, fit_params, X_train_vector, Y_train_vector, X_val_vector, Y_val_vector):
    """Train one XGBoost booster per fold and return them as a list.

    Args:
        bst_params: Booster parameters passed as the first argument of
            ``xgb.train``.
        fit_params: Extra keyword arguments forwarded to ``xgb.train``
            (e.g. ``num_boost_round``, ``early_stopping_rounds``).
        X_train_vector: Sequence with the training features of each fold.
        Y_train_vector: Sequence with the training targets of each fold.
        X_val_vector: Sequence with the validation features of each fold.
        Y_val_vector: Sequence with the validation targets of each fold.

    Returns:
        A list with one trained model per fold, in fold order.

    Raises:
        ValueError: If the four fold sequences do not have the same length.
    """
    n_folds = len(X_train_vector)
    other_lengths = (len(Y_train_vector), len(X_val_vector), len(Y_val_vector))
    if any(length != n_folds for length in other_lengths):
        raise ValueError(
            "All fold vectors must have the same length, got "
            f"{(n_folds,) + other_lengths}"
        )

    models = []
    folds = zip(X_train_vector, Y_train_vector, X_val_vector, Y_val_vector)

    for fold, (X_trn, y_trn_raw, X_val, y_val_raw) in enumerate(folds):
        print(f"\n----- Fold: ({fold + 1} / {n_folds}) -----\n")

        # Targets are rounded to the nearest integer, cast to int and
        # clipped so that no label is below 0.
        y_trn = y_trn_raw.round().astype(int).clip(0)
        y_val = y_val_raw.round().astype(int).clip(0)

        train_set = xgb.DMatrix(X_trn, label=y_trn)
        val_set = xgb.DMatrix(X_val, label=y_val)

        # xgb.train uses the LAST entry of `evals` for early stopping, so the
        # validation set must come last; otherwise stopping is driven by the
        # training metric.
        evallist = [(train_set, 'train'), (val_set, 'eval')]

        model = xgb.train(
            bst_params,
            train_set,
            evals=evallist,
            **fit_params,
        )
        models.append(model)

        # Release the per-fold matrices before building the next ones.
        del train_set, val_set, evallist, y_trn, y_val
        gc.collect()

    return models

In [None]:
def show_feature_imp(lgb_model, features_list):
    """Plot a horizontal bar chart with the feature importance of one model.

    Args:
        lgb_model: A trained model. An XGBoost ``Booster`` (anything exposing
            ``get_score``) is supported; objects exposing a LightGBM-style
            ``feature_importance()`` are still accepted for backward
            compatibility. The parameter name is kept for existing callers.
        features_list: Feature names, in the column order used for training.
    """
    if hasattr(lgb_model, 'get_score'):
        # Booster.get_score returns a {feature_name: score} dict that omits
        # features never used in a split, so missing ones default to 0.
        # When the training matrix carried no column names, XGBoost falls
        # back to 'f0', 'f1', ... keys, which are looked up by position.
        scores = lgb_model.get_score(importance_type='weight')
        values = [
            scores.get(name, scores.get(f'f{idx}', 0))
            for idx, name in enumerate(features_list)
        ]
        title = 'XGBoost Features (importance: weight)'
    else:
        values = lgb_model.feature_importance()
        title = 'LightGBM Features (avg over folds)'

    feature_imp = pd.DataFrame(sorted(zip(values, features_list)), columns=['Value', 'Feature'])

    plt.figure(figsize=(10, 20))
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
    plt.title(title)
    plt.tight_layout()
    plt.show()


### LOAD DATASET

In [None]:
# Load the training features: a vector with one entry per fold.
# The context manager closes the file handle once unpickling finishes.
with open(f'{INPUT_PATH}/X_train.pkl', "rb") as fh:
    X_train_vector = pickle.load(fh)

In [None]:
# Load the training targets: a vector with one entry per fold.
# The context manager closes the file handle once unpickling finishes.
with open(f'{INPUT_PATH}/Y_train.pkl', "rb") as fh:
    Y_train_vector = pickle.load(fh)

In [None]:
# Load the validation features: a vector with one entry per fold.
# The context manager closes the file handle once unpickling finishes.
with open(f'{INPUT_PATH}/X_val.pkl', "rb") as fh:
    X_val_vector = pickle.load(fh)

In [None]:
# Load the validation targets: a vector with one entry per fold.
# The context manager closes the file handle once unpickling finishes.
with open(f'{INPUT_PATH}/Y_val.pkl', "rb") as fh:
    Y_val_vector = pickle.load(fh)

In [None]:
# Column names of the first training fold, as a plain Python list.
# NOTE(review): assumes every fold shares the same columns - confirm.
features = X_train_vector[0].columns.tolist()

In [None]:
# Names of the identifier columns, presumably meant to be handled as categorical.
# NOTE(review): not referenced by the visible training code - confirm whether it is still needed.
categorical_feature = ['item_id', 'cat_id', 'store_id']

### TRAIN MODEL

#### Set hyperparameters

In [None]:
# ======== General Parameters ======= #

# Type of model to run at each iteration: gbtree or gblinear.
boosting = 'gbtree'


# Analogous to the learning rate in GBM.
# Typical final values to be used: 0.01-0.2
eta = 0.01


# ======== Learning Task Parameters ======= #
# The objective defines the loss function to be minimized.
#
# - reg:squarederror: regression with squared loss.
# - count:poisson: Poisson regression for count data; the output is the mean of the Poisson distribution.
#    max_delta_step is set to 0.7 by default in Poisson regression (used to safeguard optimization).
# - survival:cox: Cox regression for right-censored survival time data (negative values are considered
#    right censored). Note that predictions are returned on the hazard ratio scale
#    (i.e., as HR = exp(marginal_prediction) in the proportional hazard function h(t) = h0(t) * HR).
objective  = 'reg:squarederror'


# The metric to be used for validation data.
# - rmse: root mean square error
# - mae: mean absolute error
# - poisson-nloglik: negative log-likelihood for Poisson regression
# - rmsle: root mean square log error. Default metric of the reg:squaredlogerror objective.
#            This metric reduces errors generated by outliers in the dataset. But because the log function
#            is employed, rmsle might output nan when a prediction value is less than -1.
#            See reg:squaredlogerror for other requirements.
eval_metric = 'rmse'

In [None]:
# Load the saved hyperparameters and convert the stored array back into a
# plain Python object.
# NOTE(review): allow_pickle=True unpickles arbitrary objects - only load
# hyperparameter files that come from a trusted source.
model_param = np.load(f'{HYPERPARAM_PATH}/{HYPERPARAM_NAME}.npy', allow_pickle=True).tolist()

In [None]:
# Round the loaded values and cast them to int before training.
model_param['max_depth'] = int(np.round(model_param['max_depth'], 0))
model_param['min_child_weight'] = int(np.round(model_param['min_child_weight'], 0))
model_param['num_leaves'] = int(np.round(model_param['num_leaves'], 0))

# Fixed settings taken from the notebook constants.
model_param['seed'] = SEED
model_param['booster'] = boosting
model_param['objective'] = objective
# NOTE(review): with the native xgb.train API the number of rounds is set by
# num_boost_round in fit_params; 'n_estimators' is presumably not consumed
# here - confirm before relying on it.
model_param['n_estimators'] = N_ESTIMATORS
# XGBoost's thread-count parameter is 'nthread' ('num_threads' is the
# LightGBM spelling and is not an XGBoost parameter).
model_param['nthread'] = NJOBS

In [None]:
# Keyword arguments forwarded to xgb.train for every fold.
fit_params = {
    # Upper bound on boosting rounds; training normally ends earlier through
    # early stopping.
    "num_boost_round": 100_000,
    # Reuse the notebook-level constant instead of repeating its value.
    "early_stopping_rounds": ESR,
    # Print the evaluation metrics every 100 rounds.
    "verbose_eval": 100,
}

In [None]:
# Train one booster per fold. The training function defined in this notebook
# is train_xgb; the previous call to train_lgb raised a NameError.
models = train_xgb(
    model_param, fit_params, X_train_vector, Y_train_vector, X_val_vector, Y_val_vector)

### SAVE MODEL

In [None]:
# Persist the list of per-fold models. The context manager flushes and closes
# the file even if pickling fails part-way through.
with open(f'{MODEL_PATH}/{MODEL_NAME}.model', 'wb') as fh:
    pickle.dump(models, fh)

### SHOW FEATURES IMPORTANCE

In [None]:
# Plot the feature importance of the first fold's model only.
show_feature_imp(models[0], features)