In [6]:
# librerías
import pandas as pd
import numpy as np
import os
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import warnings
from sklearn.linear_model import Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

In [None]:
# initial configuration
warnings.filterwarnings('ignore')
processed_data_path = '../data/processed/dataset_final.csv'

# Load the processed dataset. When the file is missing we fall back to an
# empty frame so the rest of the cell can simply branch on `df_final.empty`.
try:
    df_final = pd.read_csv(processed_data_path, index_col=0, parse_dates=True)
except FileNotFoundError:
    print(f'No se encontró el archivo en la ruta: {processed_data_path}')
    df_final = pd.DataFrame()
else:
    print('Se cargó el dataset procesado correctamente.')
    print(f'Dimensiones del dataset: {df_final.shape}')

if df_final.empty:
    print('El dataset procesado está vacío. No se pueden reconstruir los sets de entrenamiento y prueba.')
else:
    # Rebuild the train/test sets from the processed frame.
    # Original author's note: the target column was renamed during preprocessing.
    target_col = 'target'
    X = df_final.drop(columns=[target_col])
    y = df_final[target_col]

    # Positional split without shuffling: the first 70% of the rows are used
    # for training and the remaining rows for testing.
    train_size = int(len(X) * 0.70)
    X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
    y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

    print('Sets de entrenamiento y prueba reconstruidos:')
    print(f'X_train: {X_train.shape}, y_train: {y_train.shape}')
    print(f'X_test: {X_test.shape}, y_test: {y_test.shape}')

    # Experiment tracking: make sure the results directory exists and that the
    # CSV log has a header row before any model appends to it.
    log_path = '../results/experiment_logs.csv'
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    if os.path.exists(log_path):
        print(f'El archivo de logs ya existe en: {log_path}')
    else:
        header = [
            'timestamp',
            'model_name',
            'model_version',
            'mae',
            'rmse',
            'r2',
            'parameters'
        ]
        pd.DataFrame(columns=header).to_csv(log_path, index=False)
        print(f'Se creó el archivo de logs en: {log_path}')

In [None]:
# helper used to train, evaluate and log every model
log_path = '../results/experiment_logs.csv'

def train_and_log_model(model, model_name, params_dict, X_train, y_train, X_test, y_test):
    """Fit ``model`` with the given parameters, score it on the test split and log the run.

    The estimator is configured through ``model.set_params(**params_dict)`` and
    fitted in place, so the object returned on success is the same instance
    that was passed in. One row with the timestamp, model name, model version,
    MAE, RMSE, R2 and the stringified parameters is appended to the CSV file
    at the module-level ``log_path``; the file is created if it does not exist.

    Args:
        model: Estimator exposing ``set_params``, ``fit`` and ``predict``.
        model_name: Label stored in the log and used as the version prefix.
        params_dict: Parameters applied to the estimator before fitting.
        X_train, y_train: Training features and target.
        X_test, y_test: Held-out features and target used for the metrics.

    Returns:
        ``(model, mae)`` on success. Any exception raised while training,
        scoring or logging is reported with ``print`` and ``(None, np.inf)``
        is returned instead, so a parameter sweep can continue.
    """
    try:
        # train the model
        model.set_params(**params_dict)
        model.fit(X_train, y_train)

        # predict on the held-out split
        y_pred = model.predict(X_test)

        # compute the metrics
        mae = mean_absolute_error(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        r2 = r2_score(y_test, y_pred)

        # Version label: "<name>_<param>_<value>_...". Joining the name together
        # with the parameter parts avoids a trailing underscore when
        # params_dict is empty.
        version_parts = [model_name] + [f"{k}_{v}" for k, v in params_dict.items()]
        model_version = "_".join(version_parts)

        # row to be logged
        log_entry = {
            'timestamp': pd.Timestamp.now(),
            'model_name': model_name,
            'model_version': model_version,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'parameters': str(params_dict)
        }
        new_row = pd.DataFrame([log_entry])

        # Append to the existing log, or start a new one when the file is missing.
        if os.path.exists(log_path):
            log_df = pd.concat([pd.read_csv(log_path), new_row], ignore_index=True)
        else:
            log_df = new_row
        log_df.to_csv(log_path, index=False)

        # The original message referenced an undefined name (`version`), which
        # raised NameError after the row had been written and turned every
        # successful run into a (None, np.inf) result.
        print(f'Log guardado para el modelo {model_name} versión {model_version}.')
        return model, mae
    except Exception as e:
        # Deliberate best-effort: report the failure and keep the sweep running.
        print(f'Error al entrenar y registrar el modelo: {e}')
    return None, np.inf
    

### Entrenamiento de Modelos

#### Ridge/Lasso Regression (Lineales)

In [None]:
if 'X_train' not in locals() or X_train.empty:
    print('No se pueden entrenar modelos porque los sets de entrenamiento están vacíos.')
else:
    # regularization strengths swept for both Ridge and Lasso
    params_grid = {
        'alpha': [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]
    }

    ridge_model = Ridge(random_state=42)
    lasso_model = Lasso(random_state=42, max_iter=10000)

    # Ridge is swept first, then Lasso. Each estimator instance is reused
    # (re-parameterized and refitted) for every alpha in the grid.
    for estimator, label in (
        (ridge_model, 'Ridge_Regression'),
        (lasso_model, 'Lasso_Regression'),
    ):
        for alpha in params_grid['alpha']:
            params = dict(alpha=alpha)
            train_and_log_model(
                model=estimator,
                model_name=label,
                params_dict=params,
                X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
            )
    print('Entrenamientocompletado.')

#### Random Forest Regression, XGBoost (XGB Regression) y LightGBM (LGBM Regression)

In [None]:
if 'X_train' not in locals() or X_train.empty:
    print('No se pueden entrenar modelos porque los sets de entrenamiento están vacíos.')
else:
    # Search grid for the tree-based models.
    # NOTE(review): only 'n_estimators' is iterated below; 'max_depth' and
    # 'min_samples_split' are declared here but never used in this cell.
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20],
        'min_samples_split': [2, 5]
    }

    rf_model = RandomForestRegressor(random_state=42, n_jobs=-1)
    xgb_model = XGBRegressor(objective='reg:squarederror', random_state=42, n_jobs=-1)
    lgbm_model = LGBMRegressor(random_state=42, n_jobs=-1, verbose=-1)

    # Training order: Random Forest, then XGBoost, then LightGBM. Each
    # estimator instance is reused for every n_estimators value.
    for estimator, label in (
        (rf_model, 'Random_Forest_Regression'),
        (xgb_model, 'XGB_Regression'),
        (lgbm_model, 'LGBM_Regression'),
    ):
        for n in param_grid['n_estimators']:
            params = dict(n_estimators=n)
            train_and_log_model(
                model=estimator,
                model_name=label,
                params_dict=params,
                X_train=X_train, y_train=y_train,
                X_test=X_test, y_test=y_test,
            )
    print(f'Entrenamiento de modelos completados. Los resultados están registrados en el archivo de logs en: {log_path}')