In [1]:
import sys
sys.path.append("../../") # go to parent dir

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV, cross_validate
from sklearn.neural_network import MLPRegressor as MLPRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from modelos import *
from sklearn.compose import ColumnTransformer
from modelos import RANDOM_SEED
from scipy.stats import randint
import random

In [3]:
pd.options.display.max_columns = 50 # para mostrar todas las columnas

In [4]:
RANDOM_STATE = RANDOM_SEED

In [5]:
# cargamos datos
X, y = datos_full()
# Eliminamos las columnas repetidas (visto en el notebook del analisis)
X = X.drop(columns=["Temperatura", 'wspd'])

In [6]:
X.head()

Unnamed: 0,anio,mes,dia,moonphase,hora,temp,dewPt,heat_index,rh,pressure,vis,wc,feels_like,uv_index,wdir_E,wdir_ENE,wdir_ESE,wdir_N,wdir_NE,wdir_NNE,wdir_NNW,wdir_NW,wdir_S,wdir_SE,wdir_SSE,wdir_SSW,wdir_SW,wdir_VAR,wdir_W,wdir_WNW,wdir_WSW,Vviento,PeriodoOlas,Lluvia,Nubosidad
0,2022,1,1,0.96,8,53.5,52.0,53.5,94.0,29.58,3.0,53.5,53.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,22,12,0.0,43
1,2022,1,1,0.96,9,52.0,50.0,52.0,94.0,29.61,3.0,52.0,52.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,21,12,0.0,7
2,2022,1,1,0.96,10,54.0,51.0,54.0,91.0,29.625,3.0,54.0,54.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,20,13,0.0,20
3,2022,1,1,0.96,11,53.0,51.0,53.0,94.0,29.64,3.0,53.0,53.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,21,13,0.1,40
4,2022,1,1,0.96,12,53.0,51.0,53.0,94.0,29.625,3.0,53.0,53.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,21,13,0.0,74


#### Separación en train y test

In [7]:
# Para el modelo con todas las variables
X_train, X_test, y_train, y_test = sep_train_test() #separa en 2022 y 2023
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RANDOM_STATE)

SIN ESCALADO DE VARIABLES
-

**MODELO COTA INFERIOR**

In [8]:
def modelo_cota_inferior(x_tr, x_te, y_tr, y_te, run_name,exp_name="mlperceptron", exp_info="Modelo perceptrón sin modificar hiperparámetros con cv"):
    flo = MLFlow(exp_name)
    lr = MLPRegressor(random_state= RANDOM_STATE)
    cv_metrics = cross_validate(lr, x_tr, y_tr, cv=cv_folds(), scoring="neg_root_mean_squared_error", return_train_score=True)
    lr.fit(x_tr, y_tr)

    # metricas TEST
    metricas = calcular_metricas(y_te, lr.predict(x_te))
    # metricas CV
    metricas["CV_TEST_RMSE"] = -1 * cv_metrics['test_score'].mean()
    metricas["CV_TRAIN_RMSE"] = -1 * cv_metrics['train_score'].mean()
    
    flo.persist_model_to_mlflow(x_tr, lr, {"random_state": RANDOM_STATE}, metricas, run_name, exp_info)


In [9]:
modelo_cota_inferior(X_train, X_test, y_train, y_test, "perceptron-cota-inferior")



OPTIMIZANDO HIPERPARÁMETROS CON **GRIDSEARCH**

In [10]:
params = {
    'hidden_layer_sizes': [(50,), (100,), (50,50)],
    'activation': ['relu', 'logistic'],
    'solver': ['adam', 'lbfgs'],
    'alpha': [0.0001, 0.01, 0.01, 0.1]
}

In [11]:
def modelo_grid_search(x_tr, x_te, y_tr, y_te,run_name, exp_info, exp_name="mlperceptron"):
    flo = MLFlow(exp_name)
    modelo = MLPRegressor(random_state= RANDOM_STATE)
    mlpSearch = GridSearchCV(modelo , params, cv=cv_folds(), return_train_score=True, scoring="neg_root_mean_squared_error", n_jobs = -1)
    mlpSearch.fit(x_tr, y_tr)
    flo.persist_model_to_mlflow(x_tr, mlpSearch.best_estimator_, mlpSearch.best_params_, calcular_metricas_search(mlpSearch, x_te, y_te), run_name, "Mejor modelo conseguido con GridSearchCV sin escalado de variables")

In [12]:
modelo_grid_search(X_train, X_test, y_train, y_test, "mlp-grid-search-all-columns", "Modelo perceptrón con GridSearchCV con todas las columnas, incluidas las de anio, mes, dia y hora.")



Quitamos las variables anio, mes, dia y hora

In [13]:
X_train = X_train.drop(columns = ["anio", "mes", "dia", "hora"])
X_test = X_test.drop(columns = ["anio", "mes", "dia", "hora"])

In [14]:
modelo_grid_search(X_train, X_test, y_train, y_test, "mlp-grid-search-less-columns", "Modelo perceptrón con GridSearchCV quitando las columnas de anio, mes, dia y hora")



In [15]:
params_random_search = {
    "hidden_layer_sizes": [(random.randint(20, 100),),
        (random.randint(20, 100), random.randint(20, 100)),
        (random.randint(20, 100), random.randint(20, 100), random.randint(20, 100))],
    "alpha": np.arange(0.0001, 1.0, 0.01),
    "batch_size" : range(100, X_train.shape[0]),
    "activation" : ["relu", "logistic"]
}

In [16]:
def modelo_random_search(x_tr, x_te, y_tr, y_te,run_name, exp_info, exp_name="mlperceptron"):
    flo = MLFlow(exp_name)
    modelo = MLPRegressor(random_state= RANDOM_STATE)
    mlpSearch = RandomizedSearchCV(modelo , param_distributions = params_random_search, cv=cv_folds(), return_train_score=True, scoring="neg_root_mean_squared_error", n_jobs = -1, n_iter = 50, random_state = RANDOM_STATE)
    mlpSearch.fit(x_tr, y_tr)
    flo.persist_model_to_mlflow(x_tr, mlpSearch.best_estimator_, mlpSearch.best_params_, calcular_metricas_search(mlpSearch, x_te, y_te), run_name, "Mejor modelo conseguido con RandomizedSearchCV sin escalado de variables")

In [17]:
modelo_random_search(X_train, X_test, y_train, y_test, "mlp-random-search", "Modelo perceptrón con RandomSearch")



ESCALANDO VARIABLES
-

In [18]:
X_train

Unnamed: 0,moonphase,temp,dewPt,heat_index,rh,pressure,vis,wc,wspd,feels_like,uv_index,wdir_E,wdir_ENE,wdir_ESE,wdir_N,wdir_NE,wdir_NNE,wdir_NNW,wdir_NW,wdir_S,wdir_SE,wdir_SSE,wdir_SSW,wdir_SW,wdir_VAR,wdir_W,wdir_WNW,wdir_WSW,Vviento,PeriodoOlas,Temperatura,Lluvia,Nubosidad
0,0.96,53.5,52.0,53.5,94.0,29.580,3.0,53.5,20.5,53.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,22,12,12,0.0,43
1,0.96,52.0,50.0,52.0,94.0,29.610,3.0,52.0,19.5,52.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,21,12,11,0.0,7
2,0.96,54.0,51.0,54.0,91.0,29.625,3.0,54.0,18.5,54.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,20,13,11,0.0,20
3,0.96,53.0,51.0,53.0,94.0,29.640,3.0,53.0,19.0,53.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,21,13,11,0.1,40
4,0.96,53.0,51.0,53.0,94.0,29.625,3.0,53.0,24.5,53.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,21,13,11,0.0,74
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4674,0.29,49.0,48.0,49.0,97.0,29.170,5.5,47.5,9.0,47.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,10,11,10,0.3,100
4675,0.29,48.0,46.0,48.0,93.0,29.185,4.0,44.5,9.0,44.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,20,11,10,0.4,100
4676,0.29,46.5,44.5,46.5,93.0,29.200,4.5,44.5,6.0,44.5,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,4,11,8,0.4,100
4677,0.29,45.0,43.0,45.0,93.0,29.230,3.0,40.0,9.0,40.0,0.0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,11,11,8,0.0,100


In [19]:
# Para probar con datos escalados
scaler_x = ColumnTransformer(
    transformers=[
        # no escalonamos las dummy de dirección de viento
        ("num", StandardScaler(), [c for c in X_train.columns if not c.startswith('wdir_')]),
    ],
    remainder='passthrough'
)
scaler_x.fit(X_train)

In [20]:
X_train_escalado = scaler_x.transform(X_train)
X_test_escalado = scaler_x.transform(X_test)

In [21]:
# columnas_numericas = [c for c in X_train.columns if not c.startswith('wdir_')]
# columnas_dir_viento = [c for c in X_train.columns if c.startswith('wdir_')]
# nuevo_orden_columnas = columnas_numericas + columnas_dir_viento

In [22]:
# Para probar con datos escalados
scaler_y = StandardScaler()

y_train_2d = y_train.values.reshape(-1, 1)
y_train_2d_scaled= scaler_y.fit_transform(y_train_2d)
y_train_1d_scaled  = y_train_2d_scaled.ravel()

y_test_2d = y_test.values.reshape(-1, 1)
y_test_2d_scaled = scaler_y.transform(y_test_2d)
y_test_1d_scaled  = y_test_2d_scaled.ravel()

**MODELO COTA INFERIOR**

In [23]:
def modelo_cota_inferior_escalado(x_tr, x_te, y_tr, y_te, run_name,exp_name="mlperceptron", exp_info="Modelo perceptrón escalado sin modificar hiperparámetros con cv"):
    flo = MLFlow(exp_name)
    lr = MLPRegressor(random_state= RANDOM_STATE)
    cv_metrics = cross_validate(lr, x_tr, y_tr, cv=cv_folds(), scoring="neg_root_mean_squared_error", return_train_score=True)
    lr.fit(x_tr, y_tr)

    predicciones = lr.predict(x_te)
    pred_lr_2d = predicciones.reshape(-1, 1)
    pred_lr_2d_escala_normal = scaler_y.inverse_transform(pred_lr_2d)
    # metricas TEST
    metricas = calcular_metricas(y_te, pred_lr_2d_escala_normal)
    # metricas CV
    metricas["CV_TEST_RMSE"] = -1 * cv_metrics['test_score'].mean()
    metricas["CV_TRAIN_RMSE"] = -1 * cv_metrics['train_score'].mean()
    
    flo.persist_model_to_mlflow(x_tr, lr, {"random_state": RANDOM_STATE}, metricas, run_name, exp_info)


In [24]:
modelo_cota_inferior_escalado(X_train_escalado, X_test_escalado, y_train_1d_scaled, y_test, "perceptron-cota-inferior-escalado")



**GRID SEARCH**

In [29]:
def calcular_metricas_search_escalando_y(search, X_test, y_test, scaler_y):

    predicciones = search.best_estimator_.predict(X_test)
    #Invertimos la escala de las predicciones
    predicciones_best_model_2d = predicciones.reshape(-1, 1)
    predicciones_2d = scaler_y.inverse_transform(predicciones_best_model_2d)

    # metricas TEST
    metricas = calcular_metricas(y_test, predicciones_2d)
    # metricas CV
    ind = search.best_index_
    metricas["CV_TEST_RMSE"] = -1 * search.cv_results_["mean_test_score"][ind]
    metricas["CV_TRAIN_RMSE"] = -1 * search.cv_results_["mean_train_score"][ind]
    return metricas

In [30]:
def modelo_grid_search_escalado(x_tr, x_te, y_tr, y_te,run_name, exp_info, exp_name="mlperceptron"):
    flo = MLFlow(exp_name)
    modelo = MLPRegressor(random_state= RANDOM_STATE)
    mlpSearch = GridSearchCV(modelo , params, cv=cv_folds(), return_train_score=True, scoring="neg_root_mean_squared_error", n_jobs = -1)
    mlpSearch.fit(x_tr, y_tr)
    flo.persist_model_to_mlflow(x_tr, mlpSearch.best_estimator_, mlpSearch.best_params_, calcular_metricas_search_escalando_y(mlpSearch, x_te, y_te, scaler_y), run_name, "Mejor modelo conseguido con GridSearchCV con escalado de variables")

In [31]:
modelo_grid_search_escalado(X_train_escalado, X_test_escalado, y_train_1d_scaled, y_test, "esc-mlp-grid-search","Modelo GridSearchCv escalando las variables")

**RANDOM SEARCH**

In [32]:
def modelo_random_search_escalado(x_tr, x_te, y_tr, y_te,run_name, exp_info, exp_name="mlperceptron"):
    flo = MLFlow(exp_name)
    modelo = MLPRegressor(random_state= RANDOM_STATE)
    mlpSearch = RandomizedSearchCV(modelo , param_distributions = params_random_search, cv=cv_folds(), return_train_score=True, scoring="neg_root_mean_squared_error", n_jobs = -1, n_iter = 50, random_state = RANDOM_STATE)
    mlpSearch.fit(x_tr, y_tr)
    flo.persist_model_to_mlflow(x_tr, mlpSearch.best_estimator_, mlpSearch.best_params_, calcular_metricas_search_escalando_y(mlpSearch, x_te, y_te, scaler_y), run_name, "Mejor modelo conseguido con RandomizedSearchCV con escalado de variables")

In [33]:
modelo_random_search_escalado(X_train_escalado, X_test_escalado, y_train_1d_scaled, y_test, "esc-mlp-random-search","Modelo RandomSearchCv escalando las variables")



In [34]:
modelo_random_search_escalado(X_train_escalado, X_test_escalado, y_train_1d_scaled, y_test, "esc-mlp-random-search_v2","Modelo RandomSearchCv escalando las variables")



# FIN
-