#  Entrenamiento y selección de hiperparámetros

In [1]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

## 1. Cargar dataset y filtrar 2023

In [2]:
import pandas as pd

# Cargar datos
df = pd.read_csv("../data/sales_train_enriched.csv", parse_dates=["date"])

# Filtrar solo año 2023
df = df[df["date"].dt.year == 2023].copy()

print("Registros totales 2023:", len(df))
print("Rango de fechas:", df["date"].min(), "→", df["date"].max())


Registros totales 2023: 1239531
Rango de fechas: 2023-01-01 00:00:00 → 2023-12-31 00:00:00


## 2. Split temporal

In [3]:
# Ordenar por fecha (muy importante en series temporales)
df = df.sort_values("date")

# Split 80/20 cronológico
split_idx = int(len(df) * 0.8)

train = df.iloc[:split_idx]
val   = df.iloc[split_idx:]

# Definir X, y
X_train = train.drop(columns=["sales", "date"])
y_train = train["sales"]

X_val = val.drop(columns=["sales", "date"])
y_val = val["sales"]

print("Tamaño entrenamiento:", len(train))
print("Tamaño validación:", len(val))
print("Última fecha train:", train["date"].max())
print("Primera fecha val:", val["date"].min())


Tamaño entrenamiento: 991624
Tamaño validación: 247907
Última fecha train: 2023-10-21 00:00:00
Primera fecha val: 2023-10-21 00:00:00


## 3. Selección de features a usar

Seleccionamos las columnas que realmente creemos que pueden aportar información al modelo

In [4]:
features = [
    "warehouse", "availability", "price_log", "orders_log", "max_discount",
    "L1_category_name_en", "L2_category_name_en", "L3_category_name_en", "L4_category_name_en",
    "holiday", "shops_closed", "winter_school_holidays", "school_holidays", 
    "year", "day_of_week", "day_of_year", "year_month", "cos_day", "sin_day", 
    "sales_rolling_7d", "sales_rolling_28d"
]

In [5]:
target = "sales_log"

In [6]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=True)

train_dicts = X_train.to_dict(orient="records")
X_train_encoded = dv.fit_transform(train_dicts).astype("float32")

val_dicts = X_val.to_dict(orient="records")
X_val_encoded = dv.transform(val_dicts).astype("float32")


In [7]:
y_train = X_train[target].values
y_val = X_val[target].values

In [None]:
import mlflow
import os
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("retail_nb_experiment")

with mlflow.start_run(run_name="rf_baseline"):
    # Modelo
    rf = RandomForestRegressor(
        n_estimators=10,
        max_depth=10,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_train_encoded, y_train)

    # Predicciones
    y_pred = rf.predict(X_val_encoded)
    rmse = root_mean_squared_error(y_val, y_pred)

    # Log params, metrics, model
    mlflow.log_param("n_estimators", 10)
    mlflow.log_param("max_depth", 10)
    mlflow.log_metric("rmse", rmse)

    # Save model locally
    os.makedirs("outputs", exist_ok=True)
    model_path = "outputs/rf_model"
    mlflow.sklearn.save_model(rf, model_path)

    # Log artifacts manually
    mlflow.log_artifacts(model_path, artifact_path="model")

print(f"RMSE on validation: {rmse:.4f}")


🏃 View run rf_baseline at: http://localhost:5000/#/experiments/4/runs/b65c970f707846b49607baaf29c31c48
🧪 View experiment at: http://localhost:5000/#/experiments/4
RMSE on validation: 0.0007
