# TRAIN CATBOOST REGRESSOR

In [1]:
import numpy as np
from scipy.stats import uniform as sp_rand
from catboost import CatBoostRegressor
import pandas as pd
from datetime import datetime
from time import time
import gc
import pickle

In [28]:
import matplotlib.pyplot as plt
import seaborn as sns

ModuleNotFoundError: No module named 'matplotlib'

### GLOBAL VARIABLES

In [15]:
INPUT_PATH = '../../../data/train_test'
FEATURES_PATH = '../../../data/features'
FEATURES_NAME = 'features_v001_info'
HYPERPARAM_PATH = '../../../models/catboost/hyperparameters'
HYPERPARAM_NAME = 'best_hyperparam_catboost_regressor_r1'
MODEL_PATH = '../../../models/catboost'
MODEL_NAME = 'catboost_reg_001'
SEED = 47
NITER = 10
N_ESTIMATORS = 1000 # It set the max number of estimators. 
                    # The real number will depend on early sopping round parameter

ESR = 50            # early_stopping_rounds 
CV = 3
SCORE = 'RMSE'
handlingnull = False
NJOBS = -1
USEGPU = True
TYPE_EVAL = 'SLIDING' # Unique posible values : SLIDING or EXPANDING

### FUNCTIONS

In [3]:
def train_catboost(bst_params, X_train, Y_train, X_val, Y_val, categorical_feature=None, type_eval='SLIDING'):
    
    models = []

    if type_eval == 'EXPANDING':

        if categorical_feature is None:
            categorical_feature = np.where(X_train.dtypes != np.float)[0]
       
        
        X_train.fillna(-9999, inplace=True)
        X_val.fillna(-9999, inplace=True)
        
        model = None
        
        if USEGPU:
            model=CatBoostRegressor(**bst_params, task_type="GPU")
        else:
            model=CatBoostRegressor(**bst_params)

        model.fit(X_train, Y_train, cat_features=categorical_feature, eval_set=(X_val, Y_val), plot=True, verbose=10)
        models.append(model)
        gc.collect()

    elif type_eval == 'SLIDING':

        FOLDS = len(X_train)

        for fold in range(FOLDS):
            print(f"\n----- Fold: ({fold + 1} / {FOLDS}) -----\n")
            X_trn, X_vl = X_train[fold], X_val[fold]
            y_trn, y_vl = Y_train[fold], Y_val[fold]
            
            X_trn.fillna(-9999, inplace=True)
            X_vl.fillna(-9999, inplace=True)

            if categorical_feature is None:
                categorical_feature = np.where(X_trn.dtypes != np.float)[0]
            
            
            model = None
            if USEGPU:
                model=CatBoostRegressor(**bst_params, task_type="GPU")
            else:
                model=CatBoostRegressor(**bst_params)
            
            model.fit(X_trn, y_trn, cat_features=categorical_feature, eval_set=(X_vl, y_vl), plot=False, verbose=False)
            


            models.append(model)

            del X_trn, X_vl, y_trn, y_vl
            gc.collect()
    else:
        raise Exception('type_eval should be SLIDING or EXPANDING. The value of type_eval was: {}'.format(type_eval))

    return models

In [4]:
def load_hyperparameters(path=''):
    # ======== Defaul Booster Parameters ======== # 

    # The maximum number of trees that can be built when solving machine learning problems. Fewer may be used.
    iterations=500 

    # used for reducing the gradient step. It affects the overall 
    # time of training: the smaller the value, the more iterations are required for training.
    learning_rate=0.03

    # Depth of the tree. Can be any integer up to 32. Good values in the range 1 - 10.
    depth=6 

    # try different values for the regularizer to find the best possible. Any positive values are allowed.
    l2_leaf_reg=3 

    # For 2-class classification use 'LogLoss' or 'CrossEntropy'. For multiclass use 'MultiClass'.
    loss_function=SCORE

    # The number of splits for numerical features. Allowed values are integers from 1 to 255 inclusively.
    border_count=32 


    
    objective = "Poisson"
    
    model_param = dict()
    

    try:
        model_param = np.load(path, allow_pickle=True).tolist()
        model_param['depth'] = int(np.round(model_param['depth'],0))
        model_param['border_count'] = int(np.round(model_param['border_count'],0))
        model_param['learning_rate'] = boosting
        model_param['iterations'] = int(np.round(model_param['iterations'],0))
        
    except:
        
        model_param['depth'] = depth
        model_param['l2_leaf_reg'] = l2_leaf_reg
        model_param['loss_function'] = loss_function
        model_param['border_count'] = border_count
        model_param['iterations'] = iterations
        model_param['learning_rate'] = learning_rate
        #model_param['n_estimators'] = N_ESTIMATORS
        #model_param['num_threads'] = NJOBS
        print('Hyperparameters not found or inappropriate params. Setting up defaul hyperparameters')
        
    
    model_param['objective'] = objective
    model_param['random_seed'] = SEED
    model_param['thread_count'] = NJOBS
    
    
    return model_param
    

In [26]:
def show_feature_imp(catboost_model, features_list):
    # sorted(zip(clf.feature_importances_, X.columns), reverse=True)
    feature_imp = pd.DataFrame(sorted(zip(catboost_model.feature_importances_, features_list)), columns=['Value','Feature'])

    plt.figure(figsize=(10, 20))
    sns.barplot(x="Value", y="Feature", data=feature_imp.sort_values(by="Value", ascending=False))
    plt.title('CatBoost Features (avg over folds)')
    plt.tight_layout()
    plt.show()


### LOAD DATASET

In [6]:
X_train_vector =  pickle.load(open( f'{INPUT_PATH}/X_train.pkl', "rb" )) # It loads a vector with the folds

In [7]:
Y_train_vector = pickle.load(open( f'{INPUT_PATH}/Y_train.pkl', "rb" )) # It loads a vector with the folds

In [8]:
X_val_vector = pickle.load(open( f'{INPUT_PATH}/X_val.pkl', "rb" ))  # It loads a vector with the folds

In [9]:
Y_val_vector = pickle.load(open( f'{INPUT_PATH}/Y_val.pkl', "rb" )) # It loads a vector with the folds

In [10]:
features = pd.read_csv(f'{INPUT_PATH}/{FEATURES_NAME}.csv', index_col=False)

In [11]:
categorical_feature = [0,1,2,3,4,5]

### TRAIN MODEL

#### Set hyperparameters

In [12]:
model_param = load_hyperparameters(f'{HYPERPARAM_PATH}/{HYPERPARAM_NAME}.npy')
print(model_param)

Hyperparameters not found or inappropriate params. Setting up defaul hyperparameters
{'depth': 6, 'l2_leaf_reg': 3, 'loss_function': 'RMSE', 'border_count': 32, 'iterations': 500, 'learning_rate': 0.03, 'objective': 'Poisson', 'random_seed': 47, 'thread_count': -1}


In [13]:
models = train_catboost(model_param, X_train_vector, Y_train_vector, 
                        X_val_vector, Y_val_vector, categorical_feature=categorical_feature, 
                        type_eval='SLIDING')


----- Fold: (1 / 3) -----


----- Fold: (2 / 3) -----


----- Fold: (3 / 3) -----



### SAVE MODEL

In [16]:
pickle.dump(models, open(f'{MODEL_PATH}/{MODEL_NAME}.model', 'wb'))

### SHOW FEATURES IMPORTANCE

In [None]:
show_feature_imp(models[1], features.feature)