# Armado y Entrenamiento de Modelos de Regresion para Laptime_s

En este notebook vamos a armar y entrenar los modelos en base al set creado en EDA.ipynb

In [1]:
# Agrego reloader para no tener que cerrar y abrir vs code
%load_ext autoreload
%autoreload 2

In [2]:
from src import metrics
from src import plots
from src import preprocessing
from src import models

import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.base import clone

## Carga y Limpieza de Datos:

In [3]:
df = pd.read_csv("data/processed/monaco_2025_colapinto_alllaps.csv")

# Target
y = df["LapTime_s"].to_numpy()

# Features legales
LEGAL_FEATURES_NUM = ["LapNumber", "Stint", "TyreLife"]
LEGAL_FEATURES_CAT = ["Session", "Compound"]

LEGAL_FEATURES_NUM = [c for c in LEGAL_FEATURES_NUM if c in df.columns]
LEGAL_FEATURES_CAT = [c for c in LEGAL_FEATURES_CAT if c in df.columns]

X = df[LEGAL_FEATURES_NUM + LEGAL_FEATURES_CAT].copy()



In [4]:
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), LEGAL_FEATURES_NUM),
        ("cat", OneHotEncoder(handle_unknown="ignore"), LEGAL_FEATURES_CAT),
    ]
)


Analisis de NaNs en los datos

In [5]:
FEATURES = LEGAL_FEATURES_NUM + LEGAL_FEATURES_CAT

missing_counts = df[FEATURES].isna().sum()
missing_percent = df[FEATURES].isna().mean() * 100

na_summary = pd.DataFrame({
    "missing_count": missing_counts,
    "missing_%": missing_percent.round(2)
}).sort_values("missing_%", ascending=False)

na_summary


Unnamed: 0,missing_count,missing_%
LapNumber,0,0.0
Stint,0,0.0
TyreLife,0,0.0
Session,0,0.0
Compound,0,0.0


In [6]:
for col in FEATURES:
    na_mask = df[col].isna()
    if na_mask.any():
        print(f"\nColumna: {col}")
        print(df.loc[na_mask, "Session"].value_counts())


In [7]:
cols_context = ["Session", "LapNumber", "Stint", "Compound", "TyreLife"]

for col in FEATURES:
    na_mask = df[col].isna()
    if na_mask.any():
        print(f"\n=== Ejemplos de filas con NaN en {col} ===")
        display(df.loc[na_mask, cols_context].head(10))
    else:
        print(f"\nNo hay NaN en la columna {col}")



No hay NaN en la columna LapNumber

No hay NaN en la columna Stint

No hay NaN en la columna TyreLife

No hay NaN en la columna Session

No hay NaN en la columna Compound


## Definición y Entrenamiento de los Modelos:

Hacemos CrossValidation con K-Fold para poder tener una mejor evaluacion.

In [8]:
def evaluate_model_cv(name, regressor, X, y, n_splits=5, random_state=42):
    """
    Entrena y evalúa un modelo de regresión usando KFold CV.
    Devuelve un dict con métricas promedio.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    
    mae_scores = []
    rmse_scores = []
    r2_scores = []
    
    for fold, (train_idx, test_idx) in enumerate(kf.split(X), start=1):
        X_train = X.iloc[train_idx]
        X_test  = X.iloc[test_idx]
        y_train = y[train_idx]
        y_test  = y[test_idx]
        
        # Nuevo pipeline para este fold
        reg = clone(regressor)
        model = Pipeline(steps=[
            ("preprocess", preprocessor),
            ("regressor", reg),
        ])
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        
        mae = metrics.MAE(y_test, y_pred)
        rmse = metrics.RMSE(y_test, y_pred)
        r2 = metrics.R2(y_test, y_pred)
        
        mae_scores.append(mae)
        rmse_scores.append(rmse)
        r2_scores.append(r2)
        
        print(f"[{name}] Fold {fold}: MAE={mae:.3f}, RMSE={rmse:.3f}, R2={r2:.3f}")
    
    mae_mean, mae_std = np.mean(mae_scores), np.std(mae_scores)
    rmse_mean, rmse_std = np.mean(rmse_scores), np.std(rmse_scores)
    r2_mean, r2_std = np.mean(r2_scores), np.std(r2_scores)
    
    print(f"\n[{name}] === Promedio {n_splits} folds ===")
    print(f"MAE  medio: {mae_mean:.3f} ± {mae_std:.3f}")
    print(f"RMSE medio: {rmse_mean:.3f} ± {rmse_std:.3f}")
    print(f"R2   medio: {r2_mean:.3f} ± {r2_std:.3f}\n")
    
    return {
        "model": name,
        "MAE_mean": mae_mean,
        "MAE_std": mae_std,
        "RMSE_mean": rmse_mean,
        "RMSE_std": rmse_std,
        "R2_mean": r2_mean,
        "R2_std": r2_std,
    }

Defino 4 Primeros modelos para seleccionar uno como Baseline y poder realizar Feature Engineering.

Entreno un RandomForest, un GradientBoosting, un Riedge y un MLP

In [9]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.neural_network import MLPRegressor

models = {
    "RandomForest": RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    ),
    "GradientBoosting": GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42
    ),
    "Ridge": Ridge(alpha=1.0, random_state=42),

    "MLP": MLPRegressor(
        hidden_layer_sizes=(64, 32),
        activation="relu",
        solver="adam",
        learning_rate_init=1e-3,
        max_iter=500,
        random_state=42
    ),
}


In [10]:
import pandas as pd

results = []

for name, reg in models.items():
    res = evaluate_model_cv(name, reg, X, y, n_splits=5, random_state=42)
    results.append(res)

results_df = pd.DataFrame(results)



[RandomForest] Fold 1: MAE=0.968, RMSE=1.479, R2=0.699
[RandomForest] Fold 2: MAE=0.543, RMSE=0.697, R2=0.900
[RandomForest] Fold 3: MAE=0.699, RMSE=0.949, R2=0.846
[RandomForest] Fold 4: MAE=0.989, RMSE=1.434, R2=0.254
[RandomForest] Fold 5: MAE=0.713, RMSE=0.911, R2=0.755

[RandomForest] === Promedio 5 folds ===
MAE  medio: 0.782 ± 0.171
RMSE medio: 1.094 ± 0.308
R2   medio: 0.691 ± 0.229

[GradientBoosting] Fold 1: MAE=0.926, RMSE=1.339, R2=0.753
[GradientBoosting] Fold 2: MAE=0.495, RMSE=0.623, R2=0.920
[GradientBoosting] Fold 3: MAE=0.820, RMSE=1.156, R2=0.771
[GradientBoosting] Fold 4: MAE=0.837, RMSE=1.235, R2=0.446
[GradientBoosting] Fold 5: MAE=0.663, RMSE=0.840, R2=0.792

[GradientBoosting] === Promedio 5 folds ===
MAE  medio: 0.748 ± 0.152
RMSE medio: 1.039 ± 0.266
R2   medio: 0.736 ± 0.156

[Ridge] Fold 1: MAE=1.262, RMSE=1.736, R2=0.585
[Ridge] Fold 2: MAE=0.816, RMSE=1.120, R2=0.742
[Ridge] Fold 3: MAE=0.965, RMSE=1.257, R2=0.729
[Ridge] Fold 4: MAE=0.981, RMSE=1.312, R2=



[MLP] Fold 1: MAE=3.336, RMSE=3.836, R2=-1.027




[MLP] Fold 2: MAE=2.412, RMSE=2.903, R2=-0.735




[MLP] Fold 3: MAE=2.616, RMSE=3.207, R2=-0.761




[MLP] Fold 4: MAE=2.676, RMSE=3.420, R2=-3.246
[MLP] Fold 5: MAE=3.015, RMSE=3.647, R2=-2.929

[MLP] === Promedio 5 folds ===
MAE  medio: 2.811 ± 0.326
RMSE medio: 3.402 ± 0.327
R2   medio: -1.740 ± 1.110





Resumen de las metricas de los Modelos evaluados por Cross-Validation:

In [11]:
results_df

Unnamed: 0,model,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std
0,RandomForest,0.782488,0.171365,1.094089,0.308264,0.690563,0.22922
1,GradientBoosting,0.74829,0.152101,1.038674,0.266258,0.73641,0.156408
2,Ridge,0.993046,0.146645,1.324708,0.215247,0.601347,0.132682
3,MLP,2.811092,0.326383,3.402389,0.327269,-1.739624,1.109842


## Seleccion de Modelo:

GradientBoosting es el mejor basicamente en todas las metricas o muy similares a las del RF. La diferencia no es enorme, pero si tengo que elegir uno, el GB gana.


- Tiende a generalizar un poco mejor,
- Suele ser más sensible a pequeños cambios de features (bueno para ver el efecto del feature engineering).
- Es buen candidato para tunear despues ya que puedo jugar con n_estimators, learning_rate, max_depth, etc.

Con respecto al resto de modelos
- RandomForest: Está muy cerca en performance. Lo usaría como segundo modelo de comparación.
- Ridge: Me puede servir para ver cuánto aportan las relaciones no lineales y el feature engineering. Si con nuevas features Ridge mejora mucho, se que estoy agregando información “linealmente útil”.
- MLP: Lo descartaría.Con pocos datos y sin tuning claramente está haciendo overfitting o underfitting y con errores gigantes.

## Feature Engineering:

Features a crear:
(Armar lista y explicar cada una)

Uso add_basic_features() de Preproccessing.py para agregar las features nuevas.

In [12]:
df = pd.read_csv("data/processed/monaco_2025_colapinto_alllaps.csv")

# Aplico feature engineering v1
df_fe = preprocessing.add_basic_features(df)

# Target
y = df_fe["LapTime_s"].to_numpy()


Redefino las columnas a utilizar:

In [13]:
LEGAL_FEATURES_NUM = [
    "LapNumber",
    "Stint",
    "TyreLife",
    "lap_norm_session",
    "stint_len",
    "stint_lap_index",
    "stint_lap_norm",
    "tyrelife_norm_stint",
    "is_race",
    "compound_order",
]

LEGAL_FEATURES_CAT = [
    "Session",
    "Compound",
]

LEGAL_FEATURES_NUM = [c for c in LEGAL_FEATURES_NUM if c in df_fe.columns]
LEGAL_FEATURES_CAT = [c for c in LEGAL_FEATURES_CAT if c in df_fe.columns]

FEATURES = LEGAL_FEATURES_NUM + LEGAL_FEATURES_CAT
X = df_fe[FEATURES].copy()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), LEGAL_FEATURES_NUM),
        ("cat", OneHotEncoder(handle_unknown="ignore"), LEGAL_FEATURES_CAT),
    ]
)

print(X.columns)

Index(['LapNumber', 'Stint', 'TyreLife', 'lap_norm_session', 'stint_len',
       'stint_lap_index', 'stint_lap_norm', 'tyrelife_norm_stint', 'is_race',
       'compound_order', 'Session', 'Compound'],
      dtype='object')


In [14]:
# Definimos el modelo base de Gradient Boosting
gb_fe = GradientBoostingRegressor(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=3,
    random_state=42
)

# Evaluamos con KFold usando las nuevas features
gb_fe_results = evaluate_model_cv(
    name="GradientBoosting_FE_v1",
    regressor=gb_fe,
    X=X,
    y=y,
    n_splits=5,
    random_state=42
)




[GradientBoosting_FE_v1] Fold 1: MAE=0.872, RMSE=1.337, R2=0.754
[GradientBoosting_FE_v1] Fold 2: MAE=0.489, RMSE=0.760, R2=0.881
[GradientBoosting_FE_v1] Fold 3: MAE=0.817, RMSE=1.229, R2=0.741
[GradientBoosting_FE_v1] Fold 4: MAE=0.933, RMSE=1.464, R2=0.222
[GradientBoosting_FE_v1] Fold 5: MAE=0.616, RMSE=0.749, R2=0.834

[GradientBoosting_FE_v1] === Promedio 5 folds ===
MAE  medio: 0.745 ± 0.167
RMSE medio: 1.108 ± 0.298
R2   medio: 0.686 ± 0.238



#### Resumen de los Resultados del FE_v1:

Todas las metricas empeoraron. Por qué puede haber pasado: 
- Dataset chico + más features = más riesgo de sobreajuste / ruido
- Metimos features muy derivadas de las mismas cosas
- Hay features que, desde la lógica del simulador, usan “info del futuro” (Bastante trampa)

Vamos a hacer una segunda version Feature Engineering v2 bastante mas conservadora.

## Feature Engineering v2:

Elimino estas features “con futuro” del DataFrame y de LEGAL_FEATURES_NUM:
- stint_len
- stint_lap_index
- stint_lap_norm
- tyrelife_norm_stint

Me quedo con las que conceptualmente sí tienen sentido para el simulador y no duplican demasiado:

- lap_norm_session → fase de la sesión (principio/medio/fin).
- is_race → modo práctica vs carrera (puede ser útil).
- compound_order → codifica “blando vs duro” de forma ordenada.

In [15]:
LEGAL_FEATURES_NUM = [
    "LapNumber",
    "Stint",
    "TyreLife",
    "lap_norm_session",
    "is_race",
    "compound_order",
]

LEGAL_FEATURES_CAT = [
    "Session",
    "Compound",
]


In [16]:
LEGAL_FEATURES_NUM = [c for c in LEGAL_FEATURES_NUM if c in df_fe.columns]
LEGAL_FEATURES_CAT = [c for c in LEGAL_FEATURES_CAT if c in df_fe.columns]

FEATURES = LEGAL_FEATURES_NUM + LEGAL_FEATURES_CAT
X = df_fe[FEATURES].copy()

preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), LEGAL_FEATURES_NUM),
        ("cat", OneHotEncoder(handle_unknown="ignore"), LEGAL_FEATURES_CAT),
    ]
)

#printeo las columnas de x
print(X.columns)

Index(['LapNumber', 'Stint', 'TyreLife', 'lap_norm_session', 'is_race',
       'compound_order', 'Session', 'Compound'],
      dtype='object')


In [17]:
# Evaluamos con KFold usando las nuevas features
gb_fe_results = evaluate_model_cv(
    name="GradientBoosting_FE_v2",
    regressor=gb_fe,
    X=X,
    y=y,
    n_splits=5,
    random_state=42
)

[GradientBoosting_FE_v2] Fold 1: MAE=0.894, RMSE=1.343, R2=0.751
[GradientBoosting_FE_v2] Fold 2: MAE=0.581, RMSE=0.918, R2=0.827
[GradientBoosting_FE_v2] Fold 3: MAE=0.804, RMSE=1.188, R2=0.758
[GradientBoosting_FE_v2] Fold 4: MAE=0.861, RMSE=1.279, R2=0.406
[GradientBoosting_FE_v2] Fold 5: MAE=0.735, RMSE=0.919, R2=0.750

[GradientBoosting_FE_v2] === Promedio 5 folds ===
MAE  medio: 0.775 ± 0.111
RMSE medio: 1.129 ± 0.179
R2   medio: 0.699 ± 0.149



#### Resumen de los Resultados para FE V2:

- En comparacion con el modelo de GB sin FE se mejora un poco el MAE y el RMSE, manteniendo valor practicamente igual de R2
- FE_v2 no rompe nada y ayuda un poco
- Las nuevas features son conceptualmente correctas para el simulador
- El modelo sigue explicando ~75% de la varianza de LapTime (baseline sólido para comparar estrategias “gruesas” en el simulador y, más adelante, ver si el aprendizaje de (PCA/AE) aporta algo extra.)



Pasos a seguir:
- Congelar este setup como baseline oficial: Features: LapNumber, Stint, TyreLife, lap_norm_session, is_race, compound_order, Session, Compound. Modelo: GradientBoosting con los hiperparámetros actuales. Guardar estos resultados (tabla y configuración) para la parte del informe/paper (“Baseline clásico”).
-  Hacer tuning ligero de hiperparámetros del GB (n_estimators, max_depth, learning_rate) usando el mismo K-fold.

Mas adelante:
- armar el pipeline de PCA + clustering sobre las vueltas
- diseñar el autoencoder para aprender el espacio latente y ver si aparecen clusters por piloto/compuesto/estado de pista.

## Finetunning para GB en FE V2:

In [18]:
from src import preprocessing  

df = pd.read_csv("data/processed/monaco_2025_colapinto_alllaps.csv")

if "TrackStatus" in df.columns:
    df = df[df["TrackStatus"] == 1].copy()

if "LapTime_s" not in df.columns:
    df["LapTime"] = pd.to_timedelta(df["LapTime"])
    df["LapTime_s"] = df["LapTime"].dt.total_seconds()

df_fe2 = preprocessing.add_basic_features(df)

if "Position" in df_fe2.columns:
    df_fe2 = df_fe2.drop(columns=["Position"])

print(df_fe2.columns)
print("Total vueltas (green, FE_v2):", len(df_fe2))

LEGAL_FEATURES_NUM_V2 = [
    "LapNumber",
    "Stint",
    "TyreLife",
    "lap_norm_session",
    "is_race",
    "compound_order",
]

LEGAL_FEATURES_CAT = [
    "Session",
    "Compound",
]

LEGAL_FEATURES_NUM_V2 = [c for c in LEGAL_FEATURES_NUM_V2 if c in df_fe2.columns]
LEGAL_FEATURES_CAT     = [c for c in LEGAL_FEATURES_CAT     if c in df_fe2.columns]

FEATURES_V2 = LEGAL_FEATURES_NUM_V2 + LEGAL_FEATURES_CAT

X = df_fe2[FEATURES_V2].copy()
y = df_fe2["LapTime_s"].to_numpy()


Index(['Time', 'Driver', 'DriverNumber', 'LapTime', 'LapNumber', 'Stint',
       'PitOutTime', 'PitInTime', 'Sector1Time', 'Sector2Time', 'Sector3Time',
       'Sector1SessionTime', 'Sector2SessionTime', 'Sector3SessionTime',
       'SpeedI1', 'SpeedI2', 'SpeedFL', 'SpeedST', 'IsPersonalBest',
       'Compound', 'TyreLife', 'FreshTyre', 'Team', 'LapStartTime',
       'LapStartDate', 'TrackStatus', 'Deleted', 'DeletedReason',
       'FastF1Generated', 'IsAccurate', 'LapTime_s', 'Session',
       'lap_norm_session', 'stint_len', 'stint_lap_index', 'stint_lap_norm',
       'tyrelife_norm_stint', 'is_race', 'compound_order'],
      dtype='object')
Total vueltas (green, FE_v2): 100


In [19]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

preprocessor_v2 = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), LEGAL_FEATURES_NUM_V2),
        ("cat", OneHotEncoder(handle_unknown="ignore"), LEGAL_FEATURES_CAT),
    ]
)


In [20]:
import optuna
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline

cv = KFold(n_splits=5, shuffle=True, random_state=42)

def objective(trial: optuna.Trial) -> float:
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 150, 800),
        "learning_rate": trial.suggest_float("learning_rate", 0.005, 0.12, log=True),
        "max_depth": trial.suggest_int("max_depth", 2, 5),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 15),
        "min_samples_leaf": trial.suggest_int("min_samples_leaf", 1, 10),
    }

    gb = GradientBoostingRegressor(random_state=42, **params)

    pipe = Pipeline(steps=[
        ("preprocess", preprocessor_v2),
        ("model", gb),
    ])

    scores = cross_val_score(
        pipe,
        X,
        y,
        scoring="neg_mean_absolute_error",
        cv=cv,
        n_jobs=-1,
    )
    mae_cv = -scores.mean()
    trial.set_user_attr("mae_cv", mae_cv)

    return mae_cv


In [21]:
study = optuna.create_study(
    direction="minimize",
    study_name="gb_fe_v2_kfold_mae",
)

print("=== Optuna: empezando tuning (por ejemplo, 50 trials) ===")
study.optimize(objective, n_trials=50, show_progress_bar=True)

best_trial = study.best_trial

print("\n=== Mejor trial FE_v2 ===")
print("MAE CV:", best_trial.value)
for k, v in best_trial.params.items():
    print(f"  {k}: {v}")


[I 2025-12-12 00:10:53,030] A new study created in memory with name: gb_fe_v2_kfold_mae


=== Optuna: empezando tuning (por ejemplo, 50 trials) ===


  0%|          | 0/50 [00:00<?, ?it/s]

[I 2025-12-12 00:11:04,806] Trial 0 finished with value: 0.8373127267187094 and parameters: {'n_estimators': 499, 'learning_rate': 0.0538353108995053, 'max_depth': 4, 'subsample': 0.8534820375920666, 'min_samples_split': 4, 'min_samples_leaf': 10}. Best is trial 0 with value: 0.8373127267187094.
[I 2025-12-12 00:11:09,268] Trial 1 finished with value: 0.7526051033809606 and parameters: {'n_estimators': 777, 'learning_rate': 0.03944250658900388, 'max_depth': 5, 'subsample': 0.6961931459764078, 'min_samples_split': 10, 'min_samples_leaf': 4}. Best is trial 1 with value: 0.7526051033809606.
[I 2025-12-12 00:11:10,632] Trial 2 finished with value: 0.9219745225991076 and parameters: {'n_estimators': 555, 'learning_rate': 0.009570832728060787, 'max_depth': 4, 'subsample': 0.7064110146403608, 'min_samples_split': 2, 'min_samples_leaf': 9}. Best is trial 1 with value: 0.7526051033809606.
[I 2025-12-12 00:11:10,978] Trial 3 finished with value: 0.8294771143808359 and parameters: {'n_estimators'

In [22]:
from sklearn.ensemble import GradientBoostingRegressor

best_gb = GradientBoostingRegressor(
    random_state=42,
    **best_trial.params,
)

gb_tuned_results = evaluate_model_cv(
    name="GradientBoosting_FE_v2_tuned",
    regressor=best_gb,
    X=X,
    y=y,
    n_splits=5,
    random_state=42,
)



[GradientBoosting_FE_v2_tuned] Fold 1: MAE=0.907, RMSE=1.199, R2=0.770
[GradientBoosting_FE_v2_tuned] Fold 2: MAE=0.662, RMSE=0.922, R2=0.764
[GradientBoosting_FE_v2_tuned] Fold 3: MAE=0.403, RMSE=0.489, R2=0.948
[GradientBoosting_FE_v2_tuned] Fold 4: MAE=0.622, RMSE=0.984, R2=0.700
[GradientBoosting_FE_v2_tuned] Fold 5: MAE=0.776, RMSE=1.068, R2=0.642

[GradientBoosting_FE_v2_tuned] === Promedio 5 folds ===
MAE  medio: 0.674 ± 0.168
RMSE medio: 0.932 ± 0.240
R2   medio: 0.765 ± 0.103



## Entrenamiento de Modelos mas Avanzados:

Estos modelos resultaron peores que nuestro GradientBoosting.

Definimos y enrtenamos XGBoost , LightGBM y HistGB

In [23]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd


In [24]:
from sklearn.metrics import make_scorer

def rmse(y_true, y_pred):
    return mean_squared_error(y_true, y_pred, squared=False)

rmse_scorer = make_scorer(rmse, greater_is_better=False)


In [25]:
from sklearn.ensemble import GradientBoostingRegressor

models_to_compare = {
    "GB_SinMejoras": GradientBoostingRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=3,
        random_state=42    
    ),


    "XGBoost": XGBRegressor(
        n_estimators=300,
        learning_rate=0.05,
        max_depth=4,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=1.0,
        reg_alpha=0.0,
        objective="reg:squarederror",
        tree_method="hist",
        random_state=42,
    ),
    "LightGBM": LGBMRegressor(
        n_estimators=300,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=0.0,
        random_state=42,
    ),
    "HistGB": HistGradientBoostingRegressor(
        max_depth=4,
        learning_rate=0.05,
        max_iter=300,
        random_state=42,
    ),
}


In [26]:
from sklearn.model_selection import cross_validate
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd

results = []

for name, reg in models_to_compare.items():
    print(f"Evaluando {name}...")

    pipe = Pipeline(steps=[
        ("preprocess", preprocessor_v2),
        ("model", reg),
    ])

    scoring = {
        "mae": "neg_mean_absolute_error",
        "mse": "neg_mean_squared_error",
        "r2": "r2",
    }

    cv_results = cross_validate(
        pipe,
        X,
        y,
        scoring=scoring,
        cv=cv,
        n_jobs=-1,
        return_train_score=False,
    )

    mae_scores = -cv_results["test_mae"]               # pasa a positivo
    rmse_scores = np.sqrt(-cv_results["test_mse"])     # sqrt del MSE
    r2_scores = cv_results["test_r2"]

    results.append({
        "model": name,
        "MAE_mean": mae_scores.mean(),
        "MAE_std": mae_scores.std(),
        "RMSE_mean": rmse_scores.mean(),
        "RMSE_std": rmse_scores.std(),
        "R2_mean": r2_scores.mean(),
        "R2_std": r2_scores.std(),
    })

results_df = pd.DataFrame(results).sort_values("MAE_mean")
results_df


Evaluando GB_SinMejoras...
Evaluando XGBoost...
Evaluando LightGBM...
Evaluando HistGB...


Unnamed: 0,model,MAE_mean,MAE_std,RMSE_mean,RMSE_std,R2_mean,R2_std
0,GB_SinMejoras,0.749175,0.170127,1.044313,0.267163,0.709666,0.129235
1,XGBoost,0.755003,0.198441,1.042852,0.316071,0.698617,0.148942
3,HistGB,1.143113,0.16097,1.426712,0.243508,0.461683,0.217836
2,LightGBM,1.207765,0.17583,1.478283,0.249308,0.430027,0.205903
