#  Entrenamiento y selección de hiperparámetros

In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

## 1. Cargar dataset y filtrar 2023

In [2]:
import pandas as pd

# Cargar datos
df = pd.read_csv("../data/sales_train_enriched.csv", parse_dates=["date"])

# Filtrar solo año 2023
df = df[df["date"].dt.year == 2023].copy()

print("Registros totales 2023:", len(df))
print("Rango de fechas:", df["date"].min(), "→", df["date"].max())


Registros totales 2023: 1239531
Rango de fechas: 2023-01-01 00:00:00 → 2023-12-31 00:00:00


## 2. Split temporal

In [3]:
# Ordenar por fecha (muy importante en series temporales)
df = df.sort_values("date")

# Split 80/20 cronológico
split_idx = int(len(df) * 0.8)

train = df.iloc[:split_idx]
val   = df.iloc[split_idx:]

# Definir X, y
X_train = train.drop(columns=["sales", "date"])
y_train = train["sales"]

X_val = val.drop(columns=["sales", "date"])
y_val = val["sales"]

print("Tamaño entrenamiento:", len(train))
print("Tamaño validación:", len(val))
print("Última fecha train:", train["date"].max())
print("Primera fecha val:", val["date"].min())


Tamaño entrenamiento: 991624
Tamaño validación: 247907
Última fecha train: 2023-10-21 00:00:00
Primera fecha val: 2023-10-21 00:00:00


## 3. Selección de features a usar

Seleccionamos las columnas que realmente creemos que pueden aportar información al modelo

In [4]:
features = [
    "warehouse", "availability", "price_log", "orders_log", "max_discount",
    "L1_category_name_en", "L2_category_name_en", "L3_category_name_en", "L4_category_name_en",
    "holiday", "shops_closed", "winter_school_holidays", "school_holidays", 
    "year", "day_of_week", "day_of_year", "year_month", "cos_day", "sin_day", 
    "sales_rolling_7d", "sales_rolling_28d"
]

In [5]:
target = "sales_log"

In [6]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

train_dicts = X_train.to_dict(orient="records")
X_train_encoded = dv.fit_transform(train_dicts).astype("float32")

val_dicts = X_val.to_dict(orient="records")
X_val_encoded = dv.transform(val_dicts).astype("float32")


In [7]:
y_train = X_train[target].values
y_val = X_val[target].values

In [None]:
import mlflow
import os
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("retail_nb_experiment")

with mlflow.start_run(run_name="rf_baseline"):
    # Modelo
    rf = RandomForestRegressor(
        n_estimators=10,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train_encoded, y_train)

    # Predicciones
    y_pred = rf.predict(X_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)

    # Log params, metrics, model
    mlflow.log_param("n_estimators", 10)
    mlflow.log_param("max_depth", 10)
    mlflow.log_metric("rmse", rmse)

    # Save model locally
    os.makedirs("outputs", exist_ok=True)
    model_path = "outputs/rf_model"
    mlflow.sklearn.save_model(rf, model_path)

    # Log artifacts manually
    mlflow.log_artifacts(model_path, artifact_path="model")

print(f"RMSE on validation: {rmse:.4f}")


🏃 View run rf_baseline at: http://localhost:5000/#/experiments/4/runs/31c0ace0aa6e4bd1b3bbbe760ce50088
🧪 View experiment at: http://localhost:5000/#/experiments/4


MlflowException: Path 'outputs/rf_model' already exists and is not empty

In [9]:
import os
import lightgbm as lgb

with mlflow.start_run(run_name="lgbm_baseline"):
    train_set = lgb.Dataset(X_train_encoded, label=y_train)
    val_set = lgb.Dataset(X_val_encoded, label=y_val, reference=train_set)

    params = {
        "objective": "regression",
        "metric": "rmse",
        "boosting_type": "gbdt",
        "num_leaves": 31,
        "learning_rate": 0.1,
        "feature_fraction": 0.9,
        "bagging_fraction": 0.8,
        "bagging_freq": 5,
        "verbose": -1,
        "seed": 42
    }

    callbacks = [
        lgb.early_stopping(stopping_rounds=20),
        lgb.log_evaluation(period=50)
    ]

    model = lgb.train(
        params,
        train_set,
        num_boost_round=200,
        valid_sets=[train_set, val_set],
        callbacks=callbacks
    )

    # Predicciones
    y_pred = model.predict(X_val_encoded, num_iteration=model.best_iteration)
    rmse = root_mean_squared_error(y_val, y_pred)

    # Log params y métricas
    mlflow.log_params(params)
    mlflow.log_metric("rmse", rmse)

    # Guardar modelo local y loguearlo como artifact
    os.makedirs("outputs_lgbm", exist_ok=True)
    model.save_model("outputs_lgbm/lgbm_model.txt")
    mlflow.log_artifact("outputs_lgbm/lgbm_model.txt", artifact_path="lgbm_model")

print(f"✅ RMSE on validation: {rmse:.4f}")


Training until validation scores don't improve for 20 rounds
[50]	training's rmse: 0.0304644	valid_1's rmse: 0.0356869
[100]	training's rmse: 0.0202031	valid_1's rmse: 0.0242816
[150]	training's rmse: 0.018392	valid_1's rmse: 0.0226674
[200]	training's rmse: 0.0174329	valid_1's rmse: 0.0222356
Did not meet early stopping. Best iteration is:
[200]	training's rmse: 0.0174329	valid_1's rmse: 0.0222356
🏃 View run lgbm_baseline at: http://localhost:5000/#/experiments/4/runs/2e5ee8ec799d46aa817580cb3a0e1d95
🧪 View experiment at: http://localhost:5000/#/experiments/4
✅ RMSE on validation: 0.0222


In [10]:
import os
import mlflow
import xgboost as xgb
from sklearn.metrics import root_mean_squared_error

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("retail_nb_experiment")

with mlflow.start_run(run_name="xgb_baseline"):
    # Convertir a DMatrix (estructura interna de XGBoost)
    dtrain = xgb.DMatrix(X_train_encoded, label=y_train)
    dval = xgb.DMatrix(X_val_encoded, label=y_val)

    params = {
        "objective": "reg:squarederror",
        "eval_metric": "rmse",
        "max_depth": 6,
        "eta": 0.1,
        "subsample": 0.8,
        "colsample_bytree": 0.8,
        "seed": 42
    }

    evals = [(dtrain, "train"), (dval, "val")]

    # Entrenar con early stopping
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=500,
        evals=evals,
        early_stopping_rounds=20,
        verbose_eval=50
    )

    # Predicciones
    y_pred = model.predict(dval, iteration_range=(0, model.best_iteration))
    rmse = root_mean_squared_error(y_val, y_pred)

    # Log params y métricas
    mlflow.log_params(params)
    mlflow.log_metric("rmse", rmse)

    # Guardar modelo local y subirlo a MLflow
    os.makedirs("outputs_xgb", exist_ok=True)
    model.save_model("outputs_xgb/xgb_model.json")
    mlflow.log_artifact("outputs_xgb/xgb_model.json", artifact_path="xgb_model")

print(f"✅ RMSE on validation: {rmse:.4f}")


[0]	train-rmse:1.05897	val-rmse:1.05428
[50]	train-rmse:0.02231	val-rmse:0.02522
[100]	train-rmse:0.01851	val-rmse:0.02165
[150]	train-rmse:0.01769	val-rmse:0.02121
[200]	train-rmse:0.01696	val-rmse:0.02098
[249]	train-rmse:0.01633	val-rmse:0.02096
🏃 View run xgb_baseline at: http://localhost:5000/#/experiments/4/runs/24d62de0af7a430caf81d0663c7ab980
🧪 View experiment at: http://localhost:5000/#/experiments/4
✅ RMSE on validation: 0.0209


In [None]:
import os
import mlflow
from catboost import CatBoostRegressor
from sklearn.metrics import root_mean_squared_error

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("retail_nb_experiment")

with mlflow.start_run(run_name="catboost_baseline"):
    model = CatBoostRegressor(
        iterations=300,
        depth=8,
        learning_rate=0.08,
        loss_function="RMSE",
        random_seed=42,
        verbose=100,
        thread_count=-1
    )
    # Nota: usamos X_train_encoded / X_val_encoded (ya vectorizados)
    model.fit(X_train_encoded, y_train, eval_set=(X_val_encoded, y_val), use_best_model=True)

    y_pred = model.predict(X_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_params({
        "iterations": 300,
        "depth": 8,
        "learning_rate": 0.08,
        "loss_function": "RMSE"
    })

    os.makedirs("outputs_cat", exist_ok=True)
    model.save_model("outputs_cat/cat_model.cbm")  # formato nativo CatBoost
    mlflow.log_artifact("outputs_cat/cat_model.cbm", artifact_path="catboost_model")

print(f"✅ CatBoost RMSE on validation: {rmse:.4f}")
