#  Entrenamiento y selección de hiperparámetros

In [2]:
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor

from hyperopt import fmin, tpe, hp, Trials, STATUS_OK

## 1. Cargar dataset y filtrar 2023

In [3]:
import pandas as pd

# Cargar datos
df = pd.read_csv("../data/sales_train_enriched.csv", parse_dates=["date"])

# Filtrar solo año 2023
df = df[df["date"].dt.year == 2023].copy()

print("Registros totales 2023:", len(df))
print("Rango de fechas:", df["date"].min(), "→", df["date"].max())


Registros totales 2023: 1239531
Rango de fechas: 2023-01-01 00:00:00 → 2023-12-31 00:00:00


## 2. Split temporal

In [4]:
# Ordenar por fecha (muy importante en series temporales)
df = df.sort_values("date")

# Split 80/20 cronológico
split_idx = int(len(df) * 0.8)

train = df.iloc[:split_idx]
val   = df.iloc[split_idx:]

# Definir X, y
X_train = train.drop(columns=["sales", "date"])
y_train = train["sales"]

X_val = val.drop(columns=["sales", "date"])
y_val = val["sales"]

print("Tamaño entrenamiento:", len(train))
print("Tamaño validación:", len(val))
print("Última fecha train:", train["date"].max())
print("Primera fecha val:", val["date"].min())


Tamaño entrenamiento: 991624
Tamaño validación: 247907
Última fecha train: 2023-10-21 00:00:00
Primera fecha val: 2023-10-21 00:00:00


## 3. Baseline con RandomForest

In [7]:
df.head(2).T

Unnamed: 0,935580,12469
unique_id,2914,40
date,2023-01-01 00:00:00,2023-01-01 00:00:00
warehouse,Budapest_1,Prague_1
total_orders,3973.0,6696.0
sales,18.73,136.89
sell_price_main,7780.2,44.5
availability,1.0,0.71
type_0_discount,0.0,0.0
type_1_discount,0.0,0.0
type_2_discount,0.0,0.0


In [8]:
print("Número total de columnas:", df.shape[1])
print("Columnas disponibles:")
print(df.columns.tolist())


Número total de columnas: 37
Columnas disponibles:
['unique_id', 'date', 'warehouse', 'total_orders', 'sales', 'sell_price_main', 'availability', 'type_0_discount', 'type_1_discount', 'type_2_discount', 'type_3_discount', 'type_4_discount', 'type_5_discount', 'type_6_discount', 'sales_log', 'price_log', 'orders_log', 'max_discount', 'product_unique_id', 'name', 'L1_category_name_en', 'L2_category_name_en', 'L3_category_name_en', 'L4_category_name_en', 'holiday_name', 'holiday', 'shops_closed', 'winter_school_holidays', 'school_holidays', 'year', 'day_of_week', 'day_of_year', 'year_month', 'cos_day', 'sin_day', 'sales_rolling_7d', 'sales_rolling_28d']


In [5]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("retail_experiment")

with mlflow.start_run(run_name="rf_baseline"):
    model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val, y_pred, squared=False)

    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("rmse", rmse)

    mlflow.sklearn.log_model(model, "model")
    print(f"RandomForest RMSE: {rmse:.4f}")


2025/09/01 23:33:04 INFO mlflow.tracking.fluent: Experiment with name 'retail_experiment' does not exist. Creating a new experiment.


🏃 View run rf_baseline at: http://localhost:5000/#/experiments/3/runs/c9e2062edc974456859d76904be46191
🧪 View experiment at: http://localhost:5000/#/experiments/3


ValueError: could not convert string to float: 'Budapest_1'