In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import random
import time
import re
import pickle

pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)
pd.set_option('display.float_format', lambda x: '%.3f' % x)
from tqdm import tqdm



#Feature engineering
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score, mean_squared_error
from sklearn.metrics import r2_score

#Modeling
import statsmodels.formula.api as smf
from sklearn.model_selection import KFold, GridSearchCV, RandomizedSearchCV
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Skopt functions
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
import shap

#Outlier Detection
import pyod
from pyod.models.abod import ABOD

import warnings
warnings.filterwarnings("ignore") 

import itertools

In [2]:
data = pd.read_csv('segmentos_junio_23.csv')
df = data[['EFH segmento','diff_Z Coor','Payload','Speed GPS AVG', 'aceleración','Engine Load AVG','Engine Speed AVG', 'Transmission Gear AVG','Pitch AVG', 'Throttle Position AVG','Ambient Air Temperature', 'Atmospheric Pressure','Service Hours','Engine Coolant Temperature AVG', 'Engine Fan Speed AVG','operador', 'equipo_acarreo', 'turno', 'Engine Fuel Rate AVG']]
#Se quita del análisis el Camion CA17 debido a que la señal de esta invertida.
df = df[df['equipo_acarreo'] != 'CA17']  
df.shape

#Data Split for traning and testing
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)
df_train = pd.concat([X_train, y_train], axis=1)
df_test = pd.concat([X_test, y_test], axis=1)


#Remoción de Outliers con librería PYOD
#Se utiliza el método Angle-Based Outlier Detection -ABOD-
clf1 = ABOD(contamination=0.12)
clf1.fit(df_train.drop(['operador','equipo_acarreo', 'turno'], axis=1))
df_train['OD'] = clf1.predict(df_train.drop(['operador','equipo_acarreo', 'turno'], axis=1))
df_train=df_train[df_train['OD']!=1].drop('OD', axis=1)

X_train2 = df_train.iloc[:,:-1]
y_train2 = df_train.iloc[:,-1]

#Volvemos a revisar las correlaciones con el target
#corr = df_train.corr()
#corr['Engine Fuel Rate AVG'].sort_values(ascending=False)


#Se anexan las variables categoricas al dataset
train = X_train2.drop(['operador','equipo_acarreo', 'turno'], axis=1)
test = X_test.drop(['operador','equipo_acarreo','turno'], axis=1)

X_train2[['EFH segmento','diff_Z Coor','Payload','Speed GPS AVG', 'aceleración','Engine Load AVG','Engine Speed AVG', 'Transmission Gear AVG','Pitch AVG', 'Throttle Position AVG','Ambient Air Temperature', 'Atmospheric Pressure','Service Hours','Engine Coolant Temperature AVG', 'Engine Fan Speed AVG']] = train

X_test[['EFH segmento','diff_Z Coor','Payload','Speed GPS AVG', 'aceleración','Engine Load AVG','Engine Speed AVG', 'Transmission Gear AVG','Pitch AVG', 'Throttle Position AVG','Ambient Air Temperature', 'Atmospheric Pressure','Service Hours','Engine Coolant Temperature AVG', 'Engine Fan Speed AVG']] = test

#Se generan las variables dummies
X_train3 = pd.get_dummies(X_train2)
y_train3 = y_train2
X_test3 = pd.get_dummies(X_test)
y_test3 = y_test
X_test3.shape


(94479, 176)

In [3]:
X_test3 = X_test3.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))
X_train3 = X_train3.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

In [4]:
curr_best_params = {
                    10:{'score':10000000},
                    }


In [5]:
### CAMBIAR EN OPEN best_params por otro nombre 

best_params_route = 'best_params_2022_10_29_mean_XGB'
dictionary_route = 'dictionary_2022_10_29_mean_XGB'

In [6]:
dictionary = {}
def tuning(n_iterations):
    
    train_x = X_train3.copy()
    train_y = y_train3.copy()
    test_x = X_test3.copy()
    test_y = y_test3.copy()

#        'n_jobs': [-1], 
    param_grid = {
        'colsample_bytree': [0.2,0.4,0.6,0.8,1.0],  
        'eta': [0.005, 0.007, 0.009,0.01,0.015,0.017,0.02,0.05,0.07,0.09,0.1,0.3, 0.5, 0.7, 1], 
        'max_depth': [6,7,8,9,10], 
        'n_estimators': [5, 10,20,50, 70,100], 
        'max_leaves': [2,5,7,10,15,20,30,35], 
        'seed': [42], 
        'n_jobs': [-1], 
        'reg_alpha': [0.0, 0.1, 0.5, 1, 2 , 5, 10, 50, 100], 
        'reg_lambda': [0.0, 0.1, 0.5, 1, 2 , 5, 10, 50, 100], 
        'subsample': [0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 1.0]
    }

    comb_params = list(itertools.product(*param_grid.values()))

    if n_iterations == -1:
        rand_comb_params = comb_params
    else:
        rand_comb_params = random.sample(comb_params, n_iterations)

    best_params_ = curr_best_params[10]
    best_score_  = best_params_.pop('score',None)

    i=0

    for tup in tqdm(rand_comb_params):
        param = {}
        for idx,j in enumerate(param_grid.keys()):
            param[j] = tup[idx]
        
        param_check = param
        
        key_ = hash(frozenset(param_check.values()))
        
        if key_ in dictionary:
            continue
        
        dictionary[key_] = param_check
        
        model = XGBRegressor(**param)
        model.fit(train_x,train_y)
        y_pred = model.predict(test_x)
        score = loss_func(test_y,y_pred)
        #print(score, mean_squared_error(test_y,y_pred))
        del model

        if score < best_score_:
            best_score_ = score
            best_params_ = param
            
            print('@@@@@@@')
            print(best_score_, best_params_)
            
            aux = best_params_
            aux['score'] = best_score_
            curr_best_params[10] = aux
            del aux
            best_params_file = open(best_params_route, 'wb') 
            pickle.dump(curr_best_params, best_params_file) 
            best_params_file.close()    
            
            dictionary_file = open(dictionary_route, 'wb') 
            pickle.dump(dictionary, dictionary_file) 
            dictionary_file.close()

        i = i+1
        if i%100 == 0:
            aux = best_params_
            aux['score'] = best_score_
            curr_best_params[10] = aux
            del aux
            best_params_file = open(best_params_route, 'wb') 
            pickle.dump(curr_best_params, best_params_file) 
            best_params_file.close()    
            
            dictionary_file = open(dictionary_route, 'wb') 
            pickle.dump(dictionary, dictionary_file) 
            dictionary_file.close() 

    aux = best_params_
    aux['score'] = best_score_
    curr_best_params[10] = aux
    del aux
    best_params_file = open(best_params_route, 'wb') 
    pickle.dump(curr_best_params, best_params_file) 
    best_params_file.close()    
    dictionary_file = open(dictionary_route, 'wb') 
    pickle.dump(dictionary, dictionary_file) 
    dictionary_file.close() 
         
    model = XGBRegressor(**best_params_)
    model.fit(train_x,train_y)
    y_pred = model.predict(test_x)
    score = loss_func(test_y,y_pred)
    print('##################################')
    print(score, mean_squared_error(test_y,y_pred))
    
    return best_score_, best_params_


def loss_func(y_test,y_pred):
    return mean_absolute_error(y_test, y_pred)

In [7]:
best_score_, best_params_ = tuning(2000)

  0%|          | 0/2000 [00:00<?, ?it/s]

@@@@@@@
43.67974418133747 {'colsample_bytree': 1.0, 'eta': 1, 'max_depth': 8, 'n_estimators': 10, 'max_leaves': 15, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 0.0, 'reg_lambda': 0.5, 'subsample': 0.85}


  0%|          | 9/2000 [00:59<5:09:44,  9.33s/it]

@@@@@@@
41.39045349538543 {'colsample_bytree': 0.6, 'eta': 0.07, 'max_depth': 8, 'n_estimators': 70, 'max_leaves': 15, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 2, 'reg_lambda': 0.5, 'subsample': 0.6}


  0%|          | 10/2000 [01:15<6:12:43, 11.24s/it]

@@@@@@@
41.33202270239916 {'colsample_bytree': 0.6, 'eta': 0.09, 'max_depth': 6, 'n_estimators': 100, 'max_leaves': 35, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 5, 'reg_lambda': 0.1, 'subsample': 0.85}


  1%|          | 14/2000 [02:02<7:59:09, 14.48s/it]

@@@@@@@
41.02088671281907 {'colsample_bytree': 1.0, 'eta': 0.05, 'max_depth': 8, 'n_estimators': 100, 'max_leaves': 20, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 1, 'reg_lambda': 5, 'subsample': 0.75}


  4%|▍         | 86/2000 [13:20<6:57:06, 13.08s/it]

@@@@@@@
40.588065939427466 {'colsample_bytree': 1.0, 'eta': 0.3, 'max_depth': 8, 'n_estimators': 70, 'max_leaves': 10, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 0.1, 'reg_lambda': 2, 'subsample': 0.9}


  5%|▌         | 101/2000 [16:16<7:31:05, 14.25s/it]

@@@@@@@
40.37987151694299 {'colsample_bytree': 0.6, 'eta': 0.1, 'max_depth': 9, 'n_estimators': 100, 'max_leaves': 2, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 0.0, 'reg_lambda': 0.1, 'subsample': 0.9}


 54%|█████▍    | 1078/2000 [2:57:21<4:34:16, 17.85s/it]

@@@@@@@
40.36009708450232 {'colsample_bytree': 0.8, 'eta': 0.09, 'max_depth': 8, 'n_estimators': 100, 'max_leaves': 15, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 2, 'reg_lambda': 0.5, 'subsample': 1.0}


 56%|█████▌    | 1110/2000 [3:02:02<2:57:15, 11.95s/it]

@@@@@@@
40.34278113622279 {'colsample_bytree': 0.8, 'eta': 0.1, 'max_depth': 8, 'n_estimators': 100, 'max_leaves': 7, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 0.5, 'reg_lambda': 5, 'subsample': 0.85}


 63%|██████▎   | 1267/2000 [3:25:32<2:07:22, 10.43s/it]

@@@@@@@
40.3154328910725 {'colsample_bytree': 1.0, 'eta': 0.1, 'max_depth': 9, 'n_estimators': 70, 'max_leaves': 20, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 1, 'reg_lambda': 0.5, 'subsample': 1.0}


 67%|██████▋   | 1337/2000 [3:37:13<4:33:06, 24.72s/it]

@@@@@@@
40.11477478044242 {'colsample_bytree': 1.0, 'eta': 0.09, 'max_depth': 9, 'n_estimators': 100, 'max_leaves': 20, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 100, 'reg_lambda': 0.5, 'subsample': 0.85}


 73%|███████▎  | 1458/2000 [3:58:50<2:10:21, 14.43s/it]

@@@@@@@
40.07458155603482 {'colsample_bytree': 0.6, 'eta': 0.09, 'max_depth': 10, 'n_estimators': 100, 'max_leaves': 35, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 50, 'reg_lambda': 5, 'subsample': 1.0}


 88%|████████▊ | 1754/2000 [4:54:10<1:11:56, 17.55s/it]

@@@@@@@
40.06066711820832 {'colsample_bytree': 1.0, 'eta': 0.1, 'max_depth': 10, 'n_estimators': 100, 'max_leaves': 35, 'seed': 42, 'n_jobs': -1, 'reg_alpha': 5, 'reg_lambda': 5, 'subsample': 0.75}


100%|██████████| 2000/2000 [5:34:16<00:00, 10.03s/it]  


Parameters: { "score" } are not used.

##################################
40.06066711820832 3770.9452067200414


In [20]:
best_params_

{'colsample_bytree': 0.8,
 'eta': 0.1,
 'max_depth': 10,
 'n_estimators': 100,
 'max_leaves': 7,
 'seed': 42,
 'n_jobs': -1,
 'reg_alpha': 100,
 'reg_lambda': 0.1,
 'subsample': 0.85,
 'score': 40.0285843449525}