In [1]:
import psycopg2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.impute import KNNImputer
from joblib import parallel_backend
from sklearn.model_selection import train_test_split, RepeatedKFold
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from hummingbird.ml import convert, load
import pickle

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Cargar las variables desde el archivo pickle
with open('modeling.pkl', 'rb') as f:
    variables_cargadas = pickle.load(f)

# Asignar las variables cargadas a las variables locales
resultados_elec_trial = variables_cargadas['resultados_elec_trial']
resultados_oil_trial = variables_cargadas['resultados_oil_trial']
grid_search_elec = variables_cargadas['grid_search_elec']
grid_search_oil = variables_cargadas['grid_search_oil']
# Ahora puedes usar las variables cargadas en tu código
print(resultados_elec_trial)
print(resultados_oil_trial)
print(grid_search_elec)
print(grid_search_oil)

{'Iteración_1': {'RMSE': 0.05406924835065813, 'MAE': 0.034468884952803244, 'R^2': 0.9316417648954414}, 'Iteración_2': {'RMSE': 0.05026681072892937, 'MAE': 0.03232226420633737, 'R^2': 0.942261786587745}, 'Iteración_3': {'RMSE': 0.05133633339760567, 'MAE': 0.03288158828782299, 'R^2': 0.9336877034516011}, 'Iteración_4': {'RMSE': 0.04847113449341672, 'MAE': 0.03269601270090982, 'R^2': 0.9419323976577548}, 'Iteración_5': {'RMSE': 0.05169923641192172, 'MAE': 0.03168475940712653, 'R^2': 0.9340274484408745}}
{'Iteración_1': {'RMSE': 0.09828399346878117, 'MAE': 0.06247625342602924, 'R^2': 0.9120349301587006}, 'Iteración_2': {'RMSE': 0.09493829945598434, 'MAE': 0.06131155943657474, 'R^2': 0.9188613031907596}, 'Iteración_3': {'RMSE': 0.0941440325283101, 'MAE': 0.061094529850298826, 'R^2': 0.917751815444454}, 'Iteración_4': {'RMSE': 0.09554965006341627, 'MAE': 0.061677152724914086, 'R^2': 0.9162467917211456}, 'Iteración_5': {'RMSE': 0.09621071869933114, 'MAE': 0.06176401610605422, 'R^2': 0.9156434

In [8]:
ventas_oil = pd.read_csv('ventas_oil_imputed.csv')
ventas_elec = pd.read_csv('ventas_elec_imputed.csv')

In [6]:
ventas_oil.dtypes


price_amount                     float64
makeid                           float64
manufacturerprice                float64
km                               float64
vehicleyear                      float64
etiqueta_type_id                 float64
provinceid                       float64
horsepower                       float64
maxspeed                         float64
acceleration                     float64
combustible_type_id              float64
body_type_id                     float64
transmision_type_id              float64
doors                            float64
seatingcapacity                  float64
colores_type_id                  float64
dimensionsinmillimeterswidth     float64
dimensionsinmillimetersheight    float64
dimensionsinmillimeterslength    float64
weight                           float64
tankcapacityinliters             float64
trunkcapacityinliters            float64
consumptionurban                 float64
consumptionmixed                 float64
consumptionextra

In [9]:
ventas_oil['makeid'] = ventas_oil['makeid'].astype('category')
ventas_oil['etiqueta_type_id'] = ventas_oil['etiqueta_type_id'].astype('category')
ventas_oil['provinceid'] = ventas_oil['provinceid'].astype('category')
ventas_oil['transmision_type_id'] = ventas_oil['transmision_type_id'].astype('category')
ventas_oil['body_type_id'] = ventas_oil['body_type_id'].astype('category')
ventas_oil['combustible_type_id'] = ventas_oil['combustible_type_id'].astype('category')
ventas_oil['colores_type_id'] = ventas_oil['colores_type_id'].astype('category')
ventas_oil['lPrice'] = np.log10(ventas_oil['price_amount'])

In [9]:
ventas_oil.dtypes

price_amount                      float64
makeid                           category
manufacturerprice                 float64
km                                float64
vehicleyear                       float64
etiqueta_type_id                 category
provinceid                       category
horsepower                        float64
maxspeed                          float64
acceleration                      float64
combustible_type_id              category
body_type_id                     category
transmision_type_id              category
doors                             float64
seatingcapacity                   float64
colores_type_id                  category
dimensionsinmillimeterswidth      float64
dimensionsinmillimetersheight     float64
dimensionsinmillimeterslength     float64
weight                            float64
tankcapacityinliters              float64
trunkcapacityinliters             float64
consumptionurban                  float64
consumptionmixed                  

In [7]:
ventas_elec.dtypes

price_amount                     float64
makeid                           float64
manufacturerprice                float64
km                               float64
vehicleyear                      float64
etiqueta_type_id                 float64
provinceid                       float64
horsepower                       float64
maxspeed                         float64
acceleration                     float64
combustible_type_id              float64
body_type_id                     float64
transmision_type_id              float64
doors                            float64
seatingcapacity                  float64
colores_type_id                  float64
dimensionsinmillimeterswidth     float64
dimensionsinmillimetersheight    float64
dimensionsinmillimeterslength    float64
weight                           float64
tankcapacityinliters             float64
trunkcapacityinliters            float64
consumptionurban                 float64
consumptionmixed                 float64
consumptionextra

In [10]:
ventas_elec['makeid'] = ventas_elec['makeid'].astype('category')
ventas_elec['etiqueta_type_id'] = ventas_elec['etiqueta_type_id'].astype('category')
ventas_elec['provinceid'] = ventas_elec['provinceid'].astype('category')
ventas_elec['transmision_type_id'] = ventas_elec['transmision_type_id'].astype('category')
ventas_elec['body_type_id'] = ventas_elec['body_type_id'].astype('category')
ventas_elec['combustible_type_id'] = ventas_elec['combustible_type_id'].astype('category')
ventas_elec['colores_type_id'] = ventas_elec['colores_type_id'].astype('category')
ventas_elec['lPrice'] = np.log10(ventas_elec['price_amount'])

In [11]:
ventas_elec.dtypes

price_amount                      float64
makeid                           category
manufacturerprice                 float64
km                                float64
vehicleyear                       float64
etiqueta_type_id                 category
provinceid                       category
horsepower                        float64
maxspeed                          float64
acceleration                      float64
combustible_type_id              category
body_type_id                     category
transmision_type_id              category
doors                             float64
seatingcapacity                   float64
colores_type_id                  category
dimensionsinmillimeterswidth      float64
dimensionsinmillimetersheight     float64
dimensionsinmillimeterslength     float64
weight                            float64
tankcapacityinliters              float64
trunkcapacityinliters             float64
consumptionurban                  float64
consumptionmixed                  

In [11]:
# Separamos en training y test 70% para training y 30% para test
ventas_oil_X = ventas_oil.drop(columns=['price_amount', 'lPrice'])
ventas_oil_y = ventas_oil['lPrice']
ventas_oil_X_train, ventas_oil_X_test, ventas_oil_y_train, ventas_oil_y_test = train_test_split(ventas_oil_X, ventas_oil_y, test_size=0.30, random_state=123)

# Separamos en training y test 70% para training y 30% para test
ventas_elec_X = ventas_elec.drop(columns=['price_amount', 'lPrice'])
ventas_elec_y = ventas_elec['lPrice']
ventas_elec_X_train, ventas_elec_X_test, ventas_elec_y_train, ventas_elec_y_test = train_test_split(ventas_elec_X, ventas_elec_y, test_size=0.30, random_state=123)

# El 55% de los datos de training se dividen en un 80% para training y 20% para validación
# No se usa
ventas_oil_X_train_split, ventas_oil_X_val, ventas_oil_y_train_split, ventas_oil_y_val = train_test_split(ventas_oil_X_train, ventas_oil_y_train, test_size=0.2, random_state=456)

# El 80% de los datos de training se dividen en un 80% para training y 20% para validación
# No se usa
ventas_elec_X_train_split, ventas_elec_X_val, ventas_elec_y_train_split, ventas_elec_y_val = train_test_split(ventas_elec_X_train, ventas_elec_y_train, test_size=0.2, random_state=456)


In [12]:
# Crear objetos validacion cruzada para trial, tuning y assessment
rk_trial = RepeatedKFold(n_splits=5, n_repeats=1, random_state=123)
rk_tuning = RepeatedKFold(n_splits=5, n_repeats=1, random_state=456)
rk_assessment = RepeatedKFold(n_splits=5, n_repeats=1, random_state=789)

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

def rsq(y_true, y_pred):
    return r2_score(y_true, y_pred)

numeric_features_oil = ventas_oil_X_train.select_dtypes(include=['int', 'float']).columns
preprocessor_oil = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), ['km']),
        ('scaling', StandardScaler(), numeric_features_oil)
    ])

numeric_features_elec = ventas_elec_X_train.select_dtypes(include=['int', 'float']).columns
preprocessor_elec = ColumnTransformer(
    transformers=[
        ('yeo_johnson', PowerTransformer(method='yeo-johnson'), ['km']),
        ('scaling', StandardScaler(), numeric_features_elec)
    ])



In [13]:
parameters = dict( alpha_1= np.r_[20:200:10],
                   alpha_2= np.r_[1.25:1.5:.01], 
                   lambda_1= np.r_[1:20:.5], 
                   lambda_2=np.r_[0.1:0.4:.005], 
                   threshold_lambda = np.r_[7150:7400:10],
                   fit_intercept=[True, False],
                   compute_score=[True, False],
                   copy_X=[True, False],
                   )

rf_model_trial = linear_model.ARDRegression()
randm_src = RandomizedSearchCV(estimator=rf_model_trial, param_distributions = parameters,
                               cv = 5, n_iter = 10, n_jobs=-1)
randm_src.fit(ventas_oil_X_train, ventas_oil_y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", randm_src.best_estimator_)
print("\n The best score across ALL searched params:\n", randm_src.best_score_)
print("\n The best parameters across ALL searched params:\n", randm_src.best_params_)

 Results from Random Search 

 The best estimator across ALL searched params:
 ARDRegression(alpha_1=90, alpha_2=1.28, compute_score=True, lambda_1=5.5,
              lambda_2=0.27500000000000013, threshold_lambda=7160)

 The best score across ALL searched params:
 0.8134584375207172

 The best parameters across ALL searched params:
 {'threshold_lambda': 7160, 'lambda_2': 0.27500000000000013, 'lambda_1': 5.5, 'fit_intercept': True, 'copy_X': True, 'compute_score': True, 'alpha_2': 1.28, 'alpha_1': 90}


In [38]:
parameters = dict( alpha= np.r_[0.5:5:.25],
                   l1_ratio= np.r_[0:1:.015], 
                   max_iter= np.r_[100:2000:200], 
                   tol=np.r_[0.0001:0.0010:.00005],
                   fit_intercept=[True, False],
                   positive=[True, False],
                   copy_X=[True, False],
                   )

rf_model_trial = linear_model.ElasticNet()
randm_src = RandomizedSearchCV(estimator=rf_model_trial, param_distributions = parameters,
                               cv = 5, n_iter = 10, n_jobs=-1)
randm_src.fit(ventas_oil_X_train, ventas_oil_y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", randm_src.best_estimator_)
print("\n The best score across ALL searched params:\n", randm_src.best_score_)
print("\n The best parameters across ALL searched params:\n", randm_src.best_params_)

 Results from Random Search 

 The best estimator across ALL searched params:
 ElasticNet(alpha=0.5, fit_intercept=False, l1_ratio=0.40499999999999997,
           max_iter=1500, tol=0.00035000000000000005)

 The best score across ALL searched params:
 0.7468697992619635

 The best parameters across ALL searched params:
 {'tol': 0.00035000000000000005, 'positive': False, 'max_iter': 1500, 'l1_ratio': 0.40499999999999997, 'fit_intercept': False, 'copy_X': True, 'alpha': 0.5}


In [14]:
parameters = dict( alpha= np.r_[0.5:5:.25],
                   max_iter= np.r_[100:2000:200], 
                   tol=np.r_[0.0001:0.0010:.00005],
                   fit_intercept=[True, False],
                   positive=[True, False],
                   copy_X=[True, False],
                   )

rf_model_trial = linear_model.Lasso()
randm_src = RandomizedSearchCV(estimator=rf_model_trial, param_distributions = parameters,
                               cv = 5, n_iter = 10, n_jobs=-1)
randm_src.fit(ventas_oil_X_train, ventas_oil_y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", randm_src.best_estimator_)
print("\n The best score across ALL searched params:\n", randm_src.best_score_)
print("\n The best parameters across ALL searched params:\n", randm_src.best_params_)

 Results from Random Search 

 The best estimator across ALL searched params:
 Lasso(alpha=0.75, max_iter=700, tol=0.0009000000000000002)

 The best score across ALL searched params:
 0.7165193634470782

 The best parameters across ALL searched params:
 {'tol': 0.0009000000000000002, 'positive': False, 'max_iter': 700, 'fit_intercept': True, 'copy_X': True, 'alpha': 0.75}


In [59]:
parameters = dict( fit_intercept=[True, False],
                   positive=[True, False],
                   copy_X=[True, False],
                   )

rf_model_trial = linear_model.LinearRegression()
randm_src = RandomizedSearchCV(estimator=rf_model_trial, param_distributions = parameters,
                               cv = 5, n_iter = 10, n_jobs=-1)
randm_src.fit(ventas_oil_X_train, ventas_oil_y_train)

print(" Results from Random Search " )
print("\n The best estimator across ALL searched params:\n", randm_src.best_estimator_)
print("\n The best score across ALL searched params:\n", randm_src.best_score_)
print("\n The best parameters across ALL searched params:\n", randm_src.best_params_)



 Results from Random Search 

 The best estimator across ALL searched params:
 LinearRegression()

 The best score across ALL searched params:
 0.8134584384588667

 The best parameters across ALL searched params:
 {'positive': False, 'fit_intercept': True, 'copy_X': True}


In [15]:
rf_model_trial = linear_model.Lasso(alpha=0.75, max_iter=700, tol=0.0009000000000000002)
rf_pipeline_trial_oil = Pipeline(steps=[('preprocessor', preprocessor_oil), ('model', rf_model_trial)])
rf_pipeline_trial_elec = Pipeline(steps=[('preprocessor', preprocessor_elec), ('model', rf_model_trial)])

## Trial

In [16]:
resultados_oil_trial = {}
index = 0
for train_index, test_index in rk_trial.split(ventas_oil_X_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_trial_train, X_trial_test = ventas_oil_X_train.iloc[train_index], ventas_oil_X_train.iloc[test_index]
    y_trial_train, y_trial_test = ventas_oil_y_train.iloc[train_index], ventas_oil_y_train.iloc[test_index]

    with parallel_backend('threading', n_jobs=8):
        rf_pipeline_trial_oil.fit(X_trial_train, y_trial_train)
    
    y_pred = rf_pipeline_trial_oil.predict(X_trial_test)
    rmse_value = rmse(y_trial_test, y_pred)
    mae_value = mae(y_trial_test, y_pred)
    rsq_value = rsq(y_trial_test, y_pred)
    metricS = {'RMSE': rmse_value, 'MAE': mae_value, 'R^2': rsq_value}
    index += 1
    resultados_oil_trial[f'Iteración_{index}'] = {'RMSE': rmse_value,
                                      'MAE': mae_value,
                                      'R^2': rsq_value}
    

TRAIN: [     0      1      3 ... 109145 109146 109147] TEST: [     2     21     24 ... 109127 109133 109144]
TRAIN: [     0      1      2 ... 109144 109145 109146] TEST: [     8     10     12 ... 109135 109143 109147]
TRAIN: [     0      2      3 ... 109144 109146 109147] TEST: [     1      5      7 ... 109128 109140 109145]
TRAIN: [     0      1      2 ... 109144 109145 109147] TEST: [     3     14     19 ... 109139 109141 109146]
TRAIN: [     1      2      3 ... 109145 109146 109147] TEST: [     0      4      6 ... 109136 109137 109142]


In [17]:
resultados_elec_trial = {}
index = 0
for train_index, test_index in rk_trial.split(ventas_elec_X_train):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_trial_train, X_trial_test = ventas_elec_X_train.iloc[train_index], ventas_elec_X_train.iloc[test_index]
    y_trial_train, y_trial_test = ventas_elec_y_train.iloc[train_index], ventas_elec_y_train.iloc[test_index]

    with parallel_backend('threading', n_jobs=8):
        rf_pipeline_trial_elec.fit(X_trial_train, y_trial_train)
    
    y_pred = rf_pipeline_trial_elec.predict(X_trial_test)
    rmse_value = rmse(y_trial_test, y_pred)
    mae_value = mae(y_trial_test, y_pred)
    rsq_value = rsq(y_trial_test, y_pred)
    metricS = {'RMSE': rmse_value, 'MAE': mae_value, 'R^2': rsq_value}
    index += 1
    resultados_elec_trial[f'Iteración_{index}'] = {'RMSE': rmse_value,
                                      'MAE': mae_value,
                                      'R^2': rsq_value}

TRAIN: [   0    1    2 ... 6831 6832 6834] TEST: [   4    9   10 ... 6819 6826 6833]
TRAIN: [   0    1    2 ... 6832 6833 6834] TEST: [  12   21   24 ... 6824 6825 6827]
TRAIN: [   0    1    2 ... 6831 6832 6833] TEST: [   5    6   18 ... 6823 6830 6834]
TRAIN: [   2    3    4 ... 6832 6833 6834] TEST: [   0    1    7 ... 6822 6829 6831]
TRAIN: [   0    1    4 ... 6831 6833 6834] TEST: [   2    3    8 ... 6820 6828 6832]


In [18]:
for key, value in resultados_oil_trial.items():
    print(key, ": ", value)
    print()

Iteración_1 :  {'RMSE': 0.32886662629543506, 'MAE': 0.24927317744717212, 'R^2': -0.0002260337791308853}

Iteración_2 :  {'RMSE': 0.3312956950834853, 'MAE': 0.25083106061212745, 'R^2': -1.004965366924182e-06}

Iteración_3 :  {'RMSE': 0.3283106685675283, 'MAE': 0.24764534222400522, 'R^2': -1.0833615375238637e-05}

Iteración_4 :  {'RMSE': 0.33324820602542593, 'MAE': 0.25163057458429094, 'R^2': -6.672913963323701e-05}

Iteración_5 :  {'RMSE': 0.3296233329158037, 'MAE': 0.24827040138298753, 'R^2': -1.997723587821021e-05}



In [19]:
for key, value in resultados_elec_trial.items():
    print(key, ": ", value)
    print()

Iteración_1 :  {'RMSE': 0.20409152007170947, 'MAE': 0.1536348982079843, 'R^2': -0.0005723787917437662}

Iteración_2 :  {'RMSE': 0.2036087167286147, 'MAE': 0.15070158030985364, 'R^2': -0.0002307431990324904}

Iteración_3 :  {'RMSE': 0.19929536747235677, 'MAE': 0.15000832050572496, 'R^2': -2.6691140596390994e-05}

Iteración_4 :  {'RMSE': 0.19584892695095285, 'MAE': 0.14528409055370758, 'R^2': -0.0005088363157195186}

Iteración_5 :  {'RMSE': 0.21377389831177512, 'MAE': 0.15833134542604121, 'R^2': -0.0004605818034209008}



## Primer Tuneado

In [15]:
rf_pipeline_tuning_oil = Pipeline(steps=[('preprocessor', preprocessor_oil), ('model', rf_model_trial)])
param_grid = {
    'model__n_estimators': [100,300,500,700,900,1100,1300,1500,1700,1900,2000],
    'model__min_samples_leaf': [5, 10,15,20]
}
grid_search_oil = GridSearchCV(rf_pipeline_tuning_oil, param_grid, cv=5, n_jobs=8)
grid_search_oil.fit(ventas_oil_X_train, ventas_oil_y_train)


In [3]:
print("Resultados de la búsqueda de cuadrícula:")
print(grid_search_oil.cv_results_)

Resultados de la búsqueda de cuadrícula:
{'mean_fit_time': array([  61.41628666,  192.8625731 ,  340.09867926,  477.08272176,
        617.13154702,  746.74504709,  883.02587552, 1011.2526504 ,
       1020.93169017, 1084.22003794, 1160.25186486,   51.89682217,
        155.40727577,  257.29427152,  361.70663595,  465.20602126,
        567.34497781,  668.12488289,  769.02596807,  871.64471874,
        972.04552698, 1020.28417578,   47.70200992,  142.84910889,
        237.35062327,  323.72581053,  414.10549645,  505.33578668,
        592.92587738,  699.54415989,  816.04355459,  907.93048387,
       1037.39619503,   47.19407201,  163.18196521,  290.60280423,
        390.53410487,  508.13529339,  634.87828932,  773.13766356,
        914.95282574, 1041.30811186, 1155.71477156, 1056.78434639]), 'std_fit_time': array([ 0.99217109,  3.5520603 ,  6.22248386,  1.41895185, 12.2149143 ,
        3.54078653,  2.67385557, 33.4368863 , 31.19738976,  6.12600555,
        4.20997637,  0.29567351,  0.356815

In [4]:
print("Mejores parámetros:", grid_search_oil.best_params_)
print("Mejor puntaje:", grid_search_oil.best_score_)
best_tuning_oil_set = grid_search_oil.best_estimator_

Mejores parámetros: {'model__min_samples_leaf': 5, 'model__n_estimators': 1300}
Mejor puntaje: 0.9262131155319284


In [12]:
y_pred_tuning_oil = best_tuning_oil_set.predict(ventas_oil_X_test)
rmse_value = rmse(ventas_oil_y_test, y_pred_tuning_oil)
mae_value = mae(ventas_oil_y_test, y_pred_tuning_oil)
rsq_value = rsq(ventas_oil_y_test, y_pred_tuning_oil)
resultados_oil_tuning = {'RMSE': rmse_value, 'MAE': mae_value, 'R^2': rsq_value}

In [20]:
rf_pipeline_tuniing_elec = Pipeline(steps=[('preprocessor', preprocessor_elec), ('model', rf_model_trial)])
param_grid = {
    'model__n_estimators': [100,300,500,700,900,1100,1300,1500,1700,1900,2000],
    'model__min_samples_leaf': [5, 10,15,20]
}
grid_search_elec = GridSearchCV(rf_pipeline_tuniing_elec, param_grid, cv=5, n_jobs=8)
grid_search_elec.fit(ventas_elec_X_train, ventas_elec_y_train) 

In [22]:
print("Resultados de la búsqueda de cuadrícula:")
print(grid_search_elec.cv_results_)

Resultados de la búsqueda de cuadrícula:
{'mean_fit_time': array([ 3.91085358, 11.50982308, 19.48524132, 27.52988729, 35.97036219,
       45.53812633, 55.50364499, 65.2820343 , 73.93705454, 82.65927238,
       87.31839752,  3.77416234, 11.24598851, 18.9471693 , 26.44847884,
       33.97004452, 41.22141771, 48.3947948 , 55.87044826, 63.57372851,
       71.06564989, 75.00890212,  3.49788909, 10.24072843, 17.2524703 ,
       23.83484664, 30.8661572 , 37.55920324, 44.38638191, 51.18903279,
       58.05206633, 65.02203732, 68.38312101,  3.26791959,  9.65565987,
       16.14593983, 22.53820124, 28.77686262, 35.19601965, 41.75353189,
       48.09806857, 54.69564056, 60.59064531, 56.96886148]), 'std_fit_time': array([0.06518601, 0.17762996, 0.18823992, 0.40061402, 0.87455616,
       0.6354723 , 0.86778968, 0.64796802, 0.85819044, 0.5777983 ,
       0.9588335 , 0.04129734, 0.13944033, 0.26314809, 0.30433796,
       0.47537822, 0.27415647, 0.53048918, 0.73708505, 0.6735125 ,
       1.2289582 , 0

In [5]:
print("Mejores parámetros:", grid_search_elec.best_params_)
best_tuning_elec_set = grid_search_elec.best_estimator_

Mejores parámetros: {'model__min_samples_leaf': 5, 'model__n_estimators': 1500}


In [13]:
y_pred_tuning_elec = best_tuning_elec_set.predict(ventas_elec_X_test)
rmse_value = rmse(ventas_elec_y_test, y_pred_tuning_elec)
mae_value = mae(ventas_elec_y_test, y_pred_tuning_elec)
rsq_value = rsq(ventas_elec_y_test, y_pred_tuning_elec)
resultados_elec_tuning = {'RMSE': rmse_value, 'MAE': mae_value, 'R^2': rsq_value}


In [14]:
print(resultados_oil_tuning)
print(resultados_elec_tuning)

{'RMSE': 0.08709989279491755, 'MAE': 0.05503352492876248, 'R^2': 0.9290470781356323}
{'RMSE': 0.05009927650362308, 'MAE': 0.025694334386295023, 'R^2': 0.9414861883847334}


## Segundo tuneado

In [16]:
rf_pipeline_tuning_oil_nd = Pipeline(steps=[('preprocessor', preprocessor_oil), ('model', rf_model_trial)])
param_grid = {
    'model__n_estimators': [1100,1200,1300,1400,1500],
    'model__min_samples_leaf': [3,4,5,6,7]
}
grid_search_oil_nd = GridSearchCV(rf_pipeline_tuning_oil_nd, param_grid, cv=5, n_jobs=5)
grid_search_oil_nd.fit(ventas_oil_X_train, ventas_oil_y_train)

In [None]:
print("Resultados de la búsqueda de cuadrícula:")
print(grid_search_oil_nd.cv_results_)

In [None]:
print("Mejores parámetros:", grid_search_oil_nd.best_params_)
print("Mejor puntaje:", grid_search_oil_nd.best_score_)
best_tuning_oil_set_nd = grid_search_oil_nd.best_estimator_

In [None]:
rf_pipeline_tuning_elec_nd = Pipeline(steps=[('preprocessor', preprocessor_elec), ('model', rf_model_trial)])
param_grid = {
    'model__n_estimators': [1300,1400,1500,1600,1700],
    'model__min_samples_leaf': [3,4,5,6,7]
}
grid_search_elec_nd = GridSearchCV(rf_pipeline_tuning_elec_nd, param_grid, cv=5, n_jobs=8)
grid_search_elec_nd.fit(ventas_elec_X_train, ventas_elec_y_train)

In [None]:
print("Resultados de la búsqueda de cuadrícula:")
print(grid_search_elec_nd.cv_results_)


In [None]:
print("Mejores parámetros:", grid_search_elec_nd.best_params_)
print("Mejor puntaje:", grid_search_elec_nd.best_score_)
best_tuning_elec_set_nd = grid_search_elec_nd.best_estimator_

In [24]:
# Crear un diccionario con las variables que deseas guardar
variables_a_guardar = {'resultados_elec_trial':resultados_elec_trial,
                       'resultados_oil_trial':resultados_oil_trial,
                       'grid_search_oil':grid_search_oil,
                       'best_tuning_oil_set':best_tuning_oil_set,
                        'grid_search_elec':grid_search_elec,
                        'best_tuning_elec_set':best_tuning_elec_set,
                        'grid_search_oil_nd':grid_search_oil_nd,
                        'best_tuning_oil_set_nd':best_tuning_oil_set_nd,
                        'grid_search_elec_nd':grid_search_elec_nd,
                        'best_tuning_elec_set_nd':best_tuning_elec_set_nd
                       }

# Guardar las variables en un archivo pickle
with open('modeling.pkl', 'wb') as f:
    pickle.dump(variables_a_guardar, f)