In [1]:
import sys
sys.path.append("../../") 

In [2]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.metrics import max_error, mean_squared_error, r2_score
from modelos import *

Preparado de datos

In [12]:
paramsRandom = {
    "random_state": [RANDOM_SEED],
    'n_estimators': np.arange(1,602,100),
    'max_depth': [None, 4, 8, 12, 16, 20, 24, 28],
    'min_samples_split': [2, 4, 6],
    'min_samples_leaf': [1, 2, 4, 6],
    'bootstrap': [True, False]
}
paramsGrid = {
    "random_state": [RANDOM_SEED],
    'n_estimators': [100,600],
    'max_depth': [None, 16, 28],
    'min_samples_split': [2, 6],
    'min_samples_leaf': [1],
    'bootstrap': [True, False]
}

In [4]:
# Para el modelo con todas las variables
X_train, X_test, y_train, y_test = sep_train_test()
# Eliminamos las columnas repetidas y las columnas temporales (visto en el notebook del analisis)
X_train = X_train.drop(columns=["Temperatura", 'wspd', 'anio', 'mes', 'dia', 'hora'])
X_test = X_test.drop(columns=["Temperatura", 'wspd', 'anio', 'mes', 'dia', 'hora'])

Modelos

In [5]:
# Para probar con datos escalados
scaler = ColumnTransformer(
    transformers=[
        # no escalonamos las dummy de dirección de viento
        ("num", StandardScaler(), [c for c in X_train.columns if not c.startswith('wdir_')]),
    ],
    remainder='passthrough'
)
scaler.fit(X_train)

In [14]:
def random_forest(X_train, X_test, y_train, y_test, run_name, exp_info):
    flo = MLFlow("random_forest")

    modelo_basico = RandomForestRegressor(max_depth=10, random_state=RANDOM_SEED)
    cv_metrics = cross_validate(modelo_basico, X_train, y_train, cv=cv_folds(), scoring="neg_root_mean_squared_error", return_train_score=True)
    modelo_basico.fit(X_train, y_train)

    #Predicciones
    pred_modelo_basico = modelo_basico.predict(X_test)
    #print(f"Error cuadratico medio: {mean_squared_error(y_test, pred_modelo_basico)}.\nr2: {r2_score(y_test, pred_modelo_basico)}\nMaximo error {max_error(y_test, pred_modelo_basico)}")

    # metricas TEST
    metricas = calcular_metricas(y_test, pred_modelo_basico)

    # metricas CV
    metricas["CV_TEST_RMSE"] = -1 * cv_metrics['test_score'].mean()
    metricas["CV_TRAIN_RMSE"] = -1 * cv_metrics['train_score'].mean()
    print(metricas)

    flo.persist_model_to_mlflow(X_train, modelo_basico, {"random_state": RANDOM_SEED, "max_depth": 10}, metricas, run_name, exp_info)


In [15]:
def random_search(X_train, X_test, y_train, y_test, run_name, exp_info):
    flo = MLFlow("random_forest")
    modelo_random_search = RandomizedSearchCV(estimator=RandomForestRegressor(),param_distributions=paramsRandom,n_iter=10, scoring='neg_mean_squared_error', cv=cv_folds(), random_state=RANDOM_SEED, return_train_score=True, n_jobs=-1)
    modelo_random_search.fit(X_train, y_train)
    mejor_modelo_randomSearch = modelo_random_search.best_estimator_

    #Predicciones
    #pred_modelo_random_search = mejor_modelo_randomSearch.predict(X_test)
    #print(f"Error cuadratico medio: {mean_squared_error(y_test, pred_modelo_random_search)}.\nr2: {r2_score(y_test, pred_modelo_random_search)}\nMaximo error {max_error(y_test, pred_modelo_random_search)}")

    metricas = calcular_metricas_search(modelo_random_search, X_test, y_test)
    print(metricas)

    flo.persist_model_to_mlflow(X_train, mejor_modelo_randomSearch, modelo_random_search.best_params_, metricas, run_name, exp_info)

In [16]:
def grid_search(X_train, X_test, y_train, y_test, run_name, exp_info):
    flo = MLFlow("random_forest")
    modelo_grid_search = GridSearchCV(estimator=RandomForestRegressor(),param_grid=paramsGrid, scoring='neg_mean_squared_error', cv=cv_folds(), return_train_score=True, n_jobs=-1)
    modelo_grid_search.fit(X_train, y_train)
    mejor_modelo_gridSearch = modelo_grid_search.best_estimator_

    #Predicciones
    #pred_modelo_grid_search = mejor_modelo_gridSearch.predict(X_test)
    #print(f"Error cuadratico medio: {mean_squared_error(y_test, pred_modelo_grid_search)}.\nr2: {r2_score(y_test, pred_modelo_grid_search)}\nMaximo error {max_error(y_test, pred_modelo_grid_search)}")
    
    metricas = calcular_metricas_search(modelo_grid_search, X_test, y_test)
    print(metricas)
    
    flo.persist_model_to_mlflow(X_train, mejor_modelo_gridSearch, modelo_grid_search.best_params_, metricas, run_name, exp_info)



RANDOM FOREST BÁSICO

In [9]:
#Con las variables explicativas sin escalar
random_forest(X_train, X_test, y_train, y_test, run_name="random_forest_basico", exp_info="Random Forest con todos los hiperparámetros predeterminados y sin escalado de las variables explicativas") 

2024/04/16 18:24:18 INFO mlflow.tracking.fluent: Experiment with name 'random_forest' does not exist. Creating a new experiment.


{'TEST_MAX_ERROR': 2.131047869025786, 'TEST_ROOT_MEAN_SQ_ERROR': 0.41389733047781463, 'TEST_MEDIAN_ABS_ERROR': 0.20092473535610111, 'TEST_MEAN_ABS_ERROR': 0.2915292721135993, 'CV_TEST_RMSE': 0.26685812909241785, 'CV_TRAIN_RMSE': 0.1808297932165988}


Registered model 'random_forest_basico' already exists. Creating a new version of this model...
2024/04/16 18:24:31 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: random_forest_basico, version 2
Created version '2' of model 'random_forest_basico'.


In [10]:
#Con las variables explicativas escaladas
random_forest(scaler.transform(X_train), scaler.transform(X_test), y_train, y_test, run_name="esc_random_forest_basico", exp_info="Random Forest con todos los hiperparámetros predeterminados y con escalado de las variables explicativas") 

{'TEST_MAX_ERROR': 2.1142285711014885, 'TEST_ROOT_MEAN_SQ_ERROR': 0.4122866225606347, 'TEST_MEDIAN_ABS_ERROR': 0.19961147319736638, 'TEST_MEAN_ABS_ERROR': 0.2900479224306535, 'CV_TEST_RMSE': 0.2668161297336646, 'CV_TRAIN_RMSE': 0.1807491633275266}


Successfully registered model 'rf_basico'.
2024/04/16 18:24:48 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_basico, version 1
Created version '1' of model 'rf_basico'.


RANDOM FOREST CON RANDOM SEARCH

In [17]:
#Con las variables explicativas sin escalar
random_search(X_train, X_test, y_train, y_test, run_name="rf_con_random_search", exp_info="Random Forest donde los hiperparámetros se escogen mediante random search y sin escalado de las variables explicativas")

{'TEST_MAX_ERROR': 1.9861018162324129, 'TEST_ROOT_MEAN_SQ_ERROR': 0.4109153323030973, 'TEST_MEDIAN_ABS_ERROR': 0.19879180624516402, 'TEST_MEAN_ABS_ERROR': 0.2899906532685502, 'CV_TEST_RMSE': 0.06707166347149676, 'CV_TRAIN_RMSE': 0.027438585316847335}


Successfully registered model 'rf_con_random_search'.
2024/04/16 18:28:11 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_con_random_search, version 1
Created version '1' of model 'rf_con_random_search'.


In [18]:
#Con las variables explicativas escaladas
random_search(scaler.transform(X_train), scaler.transform(X_test), y_train, y_test, run_name="esc_rf_con_random_search", exp_info="Random Forest donde los hiperparámetros se escogen mediante random search y con escalado de las variables explicativas")

{'TEST_MAX_ERROR': 1.9785163130125807, 'TEST_ROOT_MEAN_SQ_ERROR': 0.4094530505161749, 'TEST_MEDIAN_ABS_ERROR': 0.19786944823215677, 'TEST_MEAN_ABS_ERROR': 0.28856151227097776, 'CV_TEST_RMSE': 0.0671882512990779, 'CV_TRAIN_RMSE': 0.027452590512611973}


Registered model 'rf_con_random_search' already exists. Creating a new version of this model...
2024/04/16 18:29:30 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_con_random_search, version 2
Created version '2' of model 'rf_con_random_search'.


RANDOM FOREST CON GRID SEARCH

In [19]:
#Con las variables explicativas sin escalar
grid_search(X_train, X_test, y_train, y_test, run_name="rf_con_grid_search", exp_info="Random Forest donde los hiperparámetros se escogen mediante gridsearch y sin escalado de las variables explicativas")



{'TEST_MAX_ERROR': 2.0364999999999958, 'TEST_ROOT_MEAN_SQ_ERROR': 0.4128106839757627, 'TEST_MEDIAN_ABS_ERROR': 0.20216666666666677, 'TEST_MEAN_ABS_ERROR': 0.2922737420077531, 'CV_TEST_RMSE': 0.055068171083386394, 'CV_TRAIN_RMSE': 0.007698538391517559}


Successfully registered model 'rf_con_grid_search'.
2024/04/16 18:36:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_con_grid_search, version 1
Created version '1' of model 'rf_con_grid_search'.


In [20]:
#Con las variables explicativas escaladas
grid_search(scaler.transform(X_train), scaler.transform(X_test), y_train, y_test, run_name="esc_rf_con_grid_search", exp_info="Random Forest donde los hiperparámetros se escogen mediante gridsearch y con escalado de las variables explicativas")



{'TEST_MAX_ERROR': 2.048166666666666, 'TEST_ROOT_MEAN_SQ_ERROR': 0.4110529069009112, 'TEST_MEDIAN_ABS_ERROR': 0.2017500000000007, 'TEST_MEAN_ABS_ERROR': 0.29055861960636975, 'CV_TEST_RMSE': 0.055256959553960594, 'CV_TRAIN_RMSE': 0.007701519513693722}


Registered model 'rf_con_grid_search' already exists. Creating a new version of this model...
2024/04/16 18:44:49 INFO mlflow.store.model_registry.abstract_store: Waiting up to 300 seconds for model version to finish creation. Model name: rf_con_grid_search, version 2
Created version '2' of model 'rf_con_grid_search'.
