# TRAIN LightGBM REGRESSOR

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from sklearn import datasets
import lightgbm as lgb
from lightgbm import LGBMRegressor
import pandas as pd
from datetime import datetime
from time import time
from sklearn.metrics import mean_squared_error
import gc
import pickle

### GLOBAL VARIABLES

In [2]:
# ---- Locations on disk (relative to this notebook) ----
INPUT_PATH = '../../../data/train_test'    # X_/Y_ train and validation .npy files
FEATURES_PATH = '../../../data/features'   # features.csv
HYPERPARAM_PATH = '../../../models/light_gbm/hyperparameters'
# File stem of the saved hyperparameters; '.npy' is appended when loading.
# NOTE(review): 'dDDMMYYYY' looks like a date placeholder -- confirm the real file name.
HYPERPARAM_NAME = 'bayesianopt_gbm_regressor_bestparams_dDDMMYYYY'

# ---- Reproducibility and compute resources ----
SEED = 47
NJOBS = -1          # later assigned to the 'num_threads' model parameter
USEGPU = False      # not referenced in the visible cells

# ---- Training budget ----
N_ESTIMATORS = 1000 # Sets the maximum number of estimators.
                    # The actual number depends on the early stopping rounds parameter.
ESR = 50            # early_stopping_rounds
NITER = 10          # presumably the number of search iterations -- TODO confirm
CV = 3              # presumably the number of cross-validation folds -- TODO confirm

# ---- Evaluation and preprocessing switches ----
SCORE = 'rmse'
handlingnull = False  # selects the sentinel-handling branch when building the Dataset

### FUNCTIONS

In [None]:
def train_lgb(bst_params, fit_params, X_train_vector, Y_train_vector, X_val_vector, Y_val_vector,
              categorical_feature="auto"):
    """Train one LightGBM booster per cross-validation fold.

    Each fold is trained on its own training split and evaluated on both the
    training and the validation split (reported as "train" and "valid").

    Args:
        bst_params: Parameter dict passed to ``lgb.train`` as ``params``.
        fit_params: Extra keyword arguments forwarded verbatim to ``lgb.train``.
        X_train_vector: Indexable collection holding one training feature
            matrix per fold (fold index first).
        Y_train_vector: Training labels, one entry per fold.
        X_val_vector: Validation feature matrices, one entry per fold.
        Y_val_vector: Validation labels, one entry per fold.
        categorical_feature: Forwarded to ``lgb.Dataset``. Defaults to
            ``"auto"``, which is LightGBM's own default.

    Returns:
        List with the trained booster of every fold, in fold order.

    Raises:
        ValueError: If the four inputs do not hold the same number of folds.
    """
    n_folds = len(X_train_vector)
    fold_counts = (n_folds, len(Y_train_vector), len(X_val_vector), len(Y_val_vector))
    if len(set(fold_counts)) != 1:
        # Fail before any training starts instead of midway through the loop.
        raise ValueError(
            "All inputs must contain the same number of folds, got "
            f"(X_train, Y_train, X_val, Y_val) = {fold_counts}"
        )

    models = []
    for fold in range(n_folds):
        print(f"\n----- Fold: ({fold + 1} / {n_folds}) -----\n")
        # The folds are already split by the caller, so the data is used as-is:
        # no re-indexing and no column dropping happens here.
        X_trn, X_val = X_train_vector[fold], X_val_vector[fold]
        y_trn, y_val = Y_train_vector[fold], Y_val_vector[fold]

        train_set = lgb.Dataset(
            X_trn,
            label=y_trn,
            categorical_feature=categorical_feature,
        )
        val_set = lgb.Dataset(
            X_val,
            label=y_val,
            categorical_feature=categorical_feature,
        )

        model = lgb.train(
            bst_params,
            train_set,
            valid_sets=[train_set, val_set],
            valid_names=["train", "valid"],
            **fit_params,
        )
        models.append(model)

        # Drop this fold's references before the next fold is materialised.
        del X_trn, X_val, y_trn, y_val, train_set, val_set
        gc.collect()

    return models

### LOAD DATASET

In [3]:
# Load the training features. train_lgb reads one fold per index along the first axis.
X_train_vector = np.load(f'{INPUT_PATH}/X_train.npy')

In [4]:
# Load the training labels; train_lgb indexes them by fold, like the training features.
Y_train_vector = np.load(f'{INPUT_PATH}/Y_train.npy')

(212665, 1205)

In [None]:
# Load the validation features. train_lgb reads one fold per index along the first axis.
X_val_vector = np.load(f'{INPUT_PATH}/X_val.npy')

In [None]:
# Load the validation labels; train_lgb indexes them by fold, like the validation features.
Y_val_vector = np.load(f'{INPUT_PATH}/Y_val.npy')

In [None]:
# Read the feature listing. index_col=False keeps pandas from using the first column as the index.
# NOTE(review): `features` is not referenced again in the visible cells -- confirm it is still needed.
features = pd.read_csv(f'{FEATURES_PATH}/features.csv', index_col=False)

In [None]:
# Presumably the column positions of the categorical features -- TODO confirm.
# NOTE(review): this list is not referenced again in the visible cells; confirm it reaches the LightGBM Dataset.
categorical_feature = [0,1,2,3,4,5]

In [18]:
### Create a LightGBM Dataset, optionally treating the -9999 sentinel as missing
# NOTE(review): train_features / train_labels are not defined in the visible cells -- confirm their origin.
if handlingnull:
    # lgb.Dataset has no `missing` argument (that belongs to xgboost.DMatrix).
    # LightGBM treats NaN as missing by default, so the sentinel is mapped to NaN.
    train_values = train_features.values.astype(float)
    train_values[train_values == -9999] = np.nan
    lgb_train = lgb.Dataset(train_values, train_labels.values)
else:
    lgb_train = lgb.Dataset(train_features.values, train_labels.values)

### TRAIN MODEL

#### Set hyperparameters

In [None]:
# Keyword arguments forwarded to lgb.train() for every fold.
# NOTE(review): LightGBM >= 4.0 no longer accepts early_stopping_rounds / verbose_eval
# as lgb.train() keywords (it uses the lgb.early_stopping / lgb.log_evaluation
# callbacks instead) -- confirm the installed version.
fit_params = {
    # Upper bound on boosting rounds; early stopping is expected to end training sooner.
    # NOTE(review): model_param also sets 'n_estimators', which presumably overrides
    # this value inside LightGBM -- confirm which limit is intended.
    "num_boost_round": 100_000,
    # Reuse the notebook-level ESR constant instead of repeating the literal.
    "early_stopping_rounds": ESR,
    # Print the evaluation metrics every 100 iterations.
    "verbose_eval": 100,
}

In [22]:
# ======== General Parameters ======= #

# Boosting algorithm. 'gbdt' is LightGBM's traditional gradient boosting decision tree.
boosting = 'gbdt'


# ======== Booster Parameters ======== #

# Learning rate (shrinkage applied to every boosting step).
# Typical final values to be used: 0.01-0.2
# NOTE(review): `eta` is not referenced again in the visible cells.
eta = 0.01


# Candidate values (0.0 to 0.4 in steps of 0.1) for the minimum loss reduction
# required to make a split: a node is split only when the split gives a positive
# reduction in the loss function.
# NOTE(review): `gamma` is not referenced again in the visible cells.
gamma = [tenth / 10.0 for tenth in range(5)]


# Balance of positive and negative weights, useful for unbalanced classes.
# A typical value to consider: sum(negative instances) / sum(positive instances).
# Use scale_pos_weight = 1 for no re-weighting.
# NOTE(review): train_labels is not defined in the visible cells, and this value
# is not referenced again there -- confirm it is still needed.
_positive_total = sum(train_labels.target)
_negative_total = len(train_labels.target) - _positive_total
scale_pos_weight = _negative_total / _positive_total


# ======== Learning Task Parameters ======== #

# Loss function to be minimized. See the LightGBM documentation.
# - options: regression, regression_l1, huber, fair, poisson, quantile,
#   mape, gamma, tweedie, binary, multiclass, multiclassova, cross_entropy,
#   cross_entropy_lambda, lambdarank
# - aliases of the parameter name: objective_type, app, application
objective = 'poisson'


# Metric to be used for validation data.
# - rmse, root square loss, aliases: root_mean_squared_error, l2_root
# - quantile, Quantile regression
# - mape, MAPE loss, aliases: mean_absolute_percentage_error
# - huber, Huber loss
# - fair, Fair loss
# - poisson, negative log-likelihood for Poisson regression
# - gamma, negative log-likelihood for Gamma regression
# - gamma_deviance, residual deviance for Gamma regression
# - tweedie, negative log-likelihood for Tweedie regression
# - ndcg, NDCG, aliases: lambdarank
# - map, MAP, aliases: mean_average_precision
# - auc, AUC
# - binary_logloss, log loss, aliases: binary
# NOTE(review): `metric` is not written into model_param in the visible cells.
metric = 'rmse'

In [None]:
# The following cells assign string keys on model_param, so it has to be a dict.
# A dict written with np.save is stored as a pickled 0-d object array: reading it
# needs allow_pickle=True, and .item() unwraps the array back into the dict.
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project.
model_param = np.load(f'{HYPERPARAM_PATH}/{HYPERPARAM_NAME}.npy', allow_pickle=True).item()

In [24]:
# Round these tuned values to the nearest whole number and store them as Python ints.
for _int_param in ('max_depth', 'min_child_weight'):
    model_param[_int_param] = int(np.round(model_param[_int_param], 0))


In [25]:
# Round num_leaves to the nearest whole number and store it as a Python int.
_rounded_num_leaves = np.round(model_param['num_leaves'], 0)
model_param['num_leaves'] = int(_rounded_num_leaves)


In [26]:
# Settings fixed by this notebook rather than tuned.
model_param['seed'] = SEED
# LightGBM names this parameter 'boosting'; 'booster' is the XGBoost spelling
# and is not a LightGBM alias, so under that key the setting was ignored.
model_param['boosting'] = boosting
model_param['objective'] = objective
model_param['n_estimators'] = N_ESTIMATORS
# NOTE(review): `metric` is defined above but never written into model_param --
# confirm whether the validation metric should be set here as well.

In [27]:
# Number of threads LightGBM may use, taken from the NJOBS constant (-1).
# NOTE(review): LightGBM documents 0 as "default number of OpenMP threads";
# confirm how a negative value is treated.
model_param['num_threads'] = NJOBS

In [None]:
# Display the final parameter dictionary as the cell output, for inspection.
model_param

In [35]:
# Train one booster per fold. model_param is the parameter dict prepared in the
# cells above; no variable named bst_params exists in this notebook.
models = train_lgb(
    model_param, fit_params, X_train_vector, Y_train_vector, X_val_vector, Y_val_vector)

In [None]:
# Persist the per-fold boosters. An explicit path is required here: an empty
# f-string expression is a SyntaxError.
# TODO(review): confirm the intended output directory and file name.
MODELS_FILE = '../../../models/light_gbm/lgbm_regressor_fold_models.pkl'
# The context manager closes the file even if pickling fails.
with open(MODELS_FILE, 'wb') as models_file:
    pickle.dump(models, models_file)