#  Entrenamiento y selección de hiperparámetros

In [2]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

## 1. Cargar dataset y filtrar 2023

In [3]:
import pandas as pd

# Cargar datos
df = pd.read_csv("../data/sales_train_enriched.csv", parse_dates=["date"])

# Filtrar solo año 2023
df = df[df["date"].dt.year == 2023].copy()

print("Registros totales 2023:", len(df))
print("Rango de fechas:", df["date"].min(), "→", df["date"].max())


Registros totales 2023: 1239531
Rango de fechas: 2023-01-01 00:00:00 → 2023-12-31 00:00:00


## 2. Split temporal

In [4]:
# Ordenar por fecha (muy importante en series temporales)
df = df.sort_values("date")

# Split 80/20 cronológico
split_idx = int(len(df) * 0.8)

train = df.iloc[:split_idx]
val   = df.iloc[split_idx:]

# Definir X, y
X_train = train.drop(columns=["sales", "date"])
y_train = train["sales"]

X_val = val.drop(columns=["sales", "date"])
y_val = val["sales"]

print("Tamaño entrenamiento:", len(train))
print("Tamaño validación:", len(val))
print("Última fecha train:", train["date"].max())
print("Primera fecha val:", val["date"].min())


Tamaño entrenamiento: 991624
Tamaño validación: 247907
Última fecha train: 2023-10-21 00:00:00
Primera fecha val: 2023-10-21 00:00:00


## 3. Selección de features a usar

Seleccionamos las columnas que realmente creemos que pueden aportar información al modelo

In [9]:
features = [
    "unique_id", "date", "warehouse", 
    "availability", "price_log", "orders_log", "max_discount", 
    "holiday", "shops_closed", "winter_school_holidays", "school_holidays", 
    "year", "day_of_week", "day_of_year", "year_month", "cos_day", "sin_day", 
    "sales_rolling_7d", "sales_rolling_28d"
]


In [10]:
target = "sales_log"

In [None]:
from sklearn.feature_extraction import DictVectorizer

dv = DictVectorizer(sparse=False)
train_dicts = X_train.to_dict(orient="records")
X_train_encoded = dv.fit_transform(train_dicts)
X_val_encoded = dv.transform(X_val.to_dict(orient="records"))
