# TRAIN LightGBM REGRESSOR

In [None]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMRegressor
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import mean_squared_error
import gc
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

### GLOBAL VARIABLES

In [None]:
# --- Locations of the inputs and outputs of this notebook ---
INPUT_PATH = '../../../data/train_test'        # folder holding the pickled X/Y train and validation folds
FEATURES_PATH = '../../../data/features'       # NOTE(review): not referenced in the visible part of this notebook
HYPERPARAM_PATH = '../../../models/light_gbm/hyperparameters'  # folder of the saved hyperparameter file
HYPERPARAM_NAME = 'best_hyperparam_gbm_regressor_r1'           # hyperparameter file name (without '.npy')
MODEL_PATH = '../../../models/light_gbm/'      # folder where the trained models are pickled
MODEL_NAME = 'lgb_reg_004'                     # output file name (without '.model')

# --- Training settings ---
SEED = 47           # stored as the 'seed' entry of the model parameters
NITER = 10          # NOTE(review): not referenced in the visible part of this notebook
N_ESTIMATORS = 1000 # Sets the maximum number of estimators.
                    # The actual number depends on the early stopping rounds parameter.

ESR = 50            # early_stopping_rounds
CV = 3              # NOTE(review): not referenced in the visible part of this notebook
SCORE = 'rmse'      # NOTE(review): not referenced in the visible part of this notebook
handlingnull = False  # NOTE(review): not referenced in the visible part of this notebook
NJOBS = -1          # stored as the 'num_threads' entry of the model parameters
USEGPU = False      # NOTE(review): not referenced in the visible part of this notebook

### FUNCTIONS

In [None]:
def train_lgb(bst_params, fit_params, X_train_vector, Y_train_vector, X_val_vector, Y_val_vector):
    """Train one LightGBM booster per fold and return them in fold order.

    Args:
        bst_params: Parameter dict passed as the first argument of ``lgb.train``.
        fit_params: Extra keyword arguments unpacked into ``lgb.train``.
        X_train_vector: Per-fold training features; each element must expose
            ``.values``. Its length defines the number of folds.
        Y_train_vector: Per-fold training targets (indexed like ``X_train_vector``).
        X_val_vector: Per-fold validation features.
        Y_val_vector: Per-fold validation targets.

    Returns:
        A list with the object returned by ``lgb.train`` for every fold.

    Note:
        The function reads the module-level name ``categorical_feature``; it
        must be defined before the function is called.
    """
    n_folds = len(X_train_vector)
    boosters = []

    for fold_idx in range(n_folds):
        print(f"\n----- Fold: ({fold_idx + 1} / {n_folds}) -----\n")

        # Work on the underlying arrays of the current fold.
        train_features = X_train_vector[fold_idx].values
        valid_features = X_val_vector[fold_idx].values
        train_target = Y_train_vector[fold_idx].values
        valid_target = Y_val_vector[fold_idx].values

        # Both datasets share the same categorical column declaration.
        dataset_options = {"categorical_feature": categorical_feature}
        train_set = lgb.Dataset(train_features, label=train_target, **dataset_options)
        val_set = lgb.Dataset(valid_features, label=valid_target, **dataset_options)

        # The training set is also evaluated so both curves are reported.
        boosters.append(
            lgb.train(
                bst_params,
                train_set,
                valid_sets=[train_set, val_set],
                valid_names=["train", "valid"],
                **fit_params,
            )
        )

        # Drop the local array references before moving on to the next fold.
        del train_features, valid_features, train_target, valid_target
        gc.collect()

    return boosters

In [None]:
def show_feature_imp(lgb_model, features_list):
    """Display a horizontal bar chart of a model's feature importances.

    Args:
        lgb_model: Object exposing ``feature_importance()``.
        features_list: Feature names, paired positionally with the
            importance values.

    The bars are ordered by decreasing importance. The chart title mentions
    an average over folds, but only the single model passed in is plotted.
    """
    importances = lgb_model.feature_importance()
    ranked_pairs = sorted(zip(importances, features_list))
    importance_table = pd.DataFrame(ranked_pairs, columns=['Value', 'Feature'])
    plot_data = importance_table.sort_values(by="Value", ascending=False)

    plt.figure(figsize=(10, 20))
    sns.barplot(x="Value", y="Feature", data=plot_data)
    plt.title('LightGBM Features (avg over folds)')
    plt.tight_layout()
    plt.show()


### LOAD DATASET

In [None]:
# Load the training features: a vector with one entry per fold.
# The context manager closes the file handle as soon as the load finishes.
with open(f'{INPUT_PATH}/X_train.pkl', "rb") as pkl_file:
    X_train_vector = pickle.load(pkl_file)

In [None]:
# Load the training targets from Y_train.pkl.
# The context manager closes the file handle as soon as the load finishes.
with open(f'{INPUT_PATH}/Y_train.pkl', "rb") as pkl_file:
    Y_train_vector = pickle.load(pkl_file)

In [None]:
# Load the validation features: a vector with one entry per fold.
# The context manager closes the file handle as soon as the load finishes.
with open(f'{INPUT_PATH}/X_val.pkl', "rb") as pkl_file:
    X_val_vector = pickle.load(pkl_file)

In [None]:
# Load the validation targets: a vector with one entry per fold.
# The context manager closes the file handle as soon as the load finishes.
with open(f'{INPUT_PATH}/Y_val.pkl', "rb") as pkl_file:
    Y_val_vector = pickle.load(pkl_file)

In [None]:
# Feature names are taken from the column labels of the first training fold.
# Earlier alternative, kept for reference:
#   pd.read_csv(f'{INPUT_PATH}/features_features_v002.csv', index_col=False)
features = X_train_vector[0].columns.tolist()

In [None]:
# Indices 0-5, passed to lgb.Dataset as `categorical_feature` inside train_lgb,
# i.e. the first six columns of every fold are declared categorical.
categorical_feature = list(range(6))

### TRAIN MODEL

#### Set hyperparameters

In [None]:
# ======== General Parameters ======= #

# Boosting algorithm used by LightGBM. 'gbdt' selects gradient boosted
# decision trees (the LightGBM documentation also lists 'rf' and 'dart').
boosting = 'gbdt'


# ======== Booster Parameters ======== # 

# Learning rate (shrinkage) of the boosting process.
# Typical final values to be used: 0.01-0.2
# NOTE(review): `eta` is not copied into `model_param` in the visible part of
# this notebook - confirm whether the loaded hyperparameters already set it.
eta = 0.01 


# ======== Learning Task Parameters ======== #

# Loss function to be minimized. See the LightGBM documentation.
# - options: regression, regression_l1, huber, fair, poisson, quantile,
#   mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy,
#   cross_entropy_lambda, lambdarank
# - parameter aliases: objective_type, app, application
objective  = 'poisson'


# Metric to be evaluated on the validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
# NOTE(review): `metric` is not copied into `model_param` in the visible part
# of this notebook - confirm which metric early stopping actually monitors.
metric = 'rmse'

In [None]:
# Load the stored hyperparameters. The cells below assign string keys on the
# result, so the file is expected to contain a pickled dict (hence
# allow_pickle=True; only load files from a trusted location).
# The former `model_param = dict()` placeholder was removed: it was
# overwritten immediately by this assignment.
model_param = np.load(f'{HYPERPARAM_PATH}/{HYPERPARAM_NAME}.npy', allow_pickle=True).tolist()

In [None]:
# Complete / override the loaded hyperparameters with the notebook settings.

# Optional integer casts, currently disabled:
#model_param['max_depth'] = int(np.round(model_param['max_depth'],0))
#model_param['min_child_weight'] = int(np.round(model_param['min_child_weight'],0))
#model_param['num_leaves'] = int(np.round(model_param['num_leaves'],0))
model_param['seed'] = SEED
# LightGBM names this parameter `boosting` (aliases: boosting_type, boost).
# The previous key 'booster' is XGBoost's name and is not a LightGBM
# parameter, so the setting was not applied.
model_param['boosting'] = boosting
model_param['objective'] = objective
# NOTE(review): `n_estimators` is documented as an alias of `num_iterations`;
# check how it interacts with the `num_boost_round` passed through fit_params.
model_param['n_estimators'] = N_ESTIMATORS
model_param['num_threads'] = NJOBS

In [None]:
# Keyword arguments that train_lgb unpacks into lgb.train.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword
# arguments of lgb.train in older LightGBM releases; newer releases expect
# callbacks instead - confirm against the installed version.
fit_params = {
    "num_boost_round": 100_000,
    # Use the notebook-level constant so that changing ESR actually takes
    # effect (it was previously duplicated here as a literal 50).
    "early_stopping_rounds": ESR,
    "verbose_eval": 100,
}

In [None]:
# Train one booster per fold using the prepared parameters and fold vectors.
models = train_lgb(
    bst_params=model_param,
    fit_params=fit_params,
    X_train_vector=X_train_vector,
    Y_train_vector=Y_train_vector,
    X_val_vector=X_val_vector,
    Y_val_vector=Y_val_vector,
)

### SAVE MODEL

In [None]:
# Persist the list of per-fold models. The context manager guarantees the
# file is flushed and closed once the dump completes, instead of leaving the
# handle open until garbage collection.
with open(f'{MODEL_PATH}/{MODEL_NAME}.model', 'wb') as model_file:
    pickle.dump(models, model_file)

### SHOW FEATURES IMPORTANCE

In [None]:
# Plot the importances of a single model: index 1 is the second entry of the
# list returned by train_lgb, so at least two folds are required here.
show_feature_imp(models[1], features)