# 03_model_zoo_and_submission

Este notebook está pensado para **probar varios modelos** (baseline + modelos ML) sobre el dataset **mensual ya procesado** y producir:

- Tabla comparativa de RMSE en validación (mes 33)
- Guardado de modelos (`.joblib`)
- Generación de `submission.csv` con `ID` correcto (merge con `test.csv` original de Kaggle)


In [1]:
# Imports
import numpy as np
import pandas as pd
from pathlib import Path

import joblib

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error


## 0) Paths y carga del dataset procesado

In [2]:
# Paths del proyecto
PROJECT_ROOT = Path(".").resolve()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
ARTIFACTS = PROJECT_ROOT / "artifacts"
ARTIFACTS.mkdir(parents=True, exist_ok=True)

data_path = DATA_PROCESSED / "dataset_monthly.csv.gz"
print("Loading:", data_path)

# Carga
full = pd.read_csv(data_path)
print("full shape:", full.shape)
full.head()

Loading: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/data/processed/dataset_monthly.csv.gz
full shape: (18129480, 11)


Unnamed: 0,date_block_num,shop_id,item_id,item_cnt_month,item_price_mean,item_category_id,item_cnt_month_lag1,item_cnt_month_lag2,item_cnt_month_lag3,item_cnt_month_lag6,item_cnt_month_lag12
0,0,0,32,6.0,221.0,40,0.0,0.0,0.0,0.0,0.0
1,0,0,33,3.0,347.0,37,0.0,0.0,0.0,0.0,0.0
2,0,0,35,1.0,247.0,40,0.0,0.0,0.0,0.0,0.0
3,0,0,43,1.0,221.0,40,0.0,0.0,0.0,0.0,0.0
4,0,0,51,2.0,128.5,57,0.0,0.0,0.0,0.0,0.0


## 1) Definir target, features y split temporal

In [3]:
TARGET = "item_cnt_month"

# Ajusta si tu dataset tiene más/menos columnas
feature_cols = [
    "date_block_num", "shop_id", "item_id",
    "item_price_mean", "item_category_id",
    "item_cnt_month_lag1", "item_cnt_month_lag2", "item_cnt_month_lag3",
    "item_cnt_month_lag6", "item_cnt_month_lag12",
]

# Split temporal recomendado (Kaggle original)
train_df = full[full["date_block_num"] <= 32].copy()
valid_df = full[full["date_block_num"] == 33].copy()
test_df  = full[full["date_block_num"] == 34].copy()

X_train, y_train = train_df[feature_cols], train_df[TARGET]
X_valid, y_valid = valid_df[feature_cols], valid_df[TARGET]
X_test = test_df[feature_cols]

print("Train:", X_train.shape, "Valid:", X_valid.shape, "Test:", X_test.shape)

Train: (17388360, 10) Valid: (526920, 10) Test: (214200, 10)


### (Opcional) Submuestreo para prototipar rápido

Si tu compu se tarda demasiado, activa esto para entrenar **más rápido**.  
Luego desactívalo para entrenar el mejor modelo en full.


In [4]:
USE_SUBSAMPLE = False   # <-- cambia a True si necesitas velocidad
SUBSAMPLE_FRAC = 0.15        # 15% del train para prototipo

if USE_SUBSAMPLE:
    train_idx = X_train.sample(frac=SUBSAMPLE_FRAC, random_state=123).index
    X_train_sub = X_train.loc[train_idx].copy()
    y_train_sub = y_train.loc[train_idx].copy()
    print("Subsample train:", X_train_sub.shape)
else:
    X_train_sub, y_train_sub = X_train, y_train

## 2) Métrica y baselines

In [5]:
def rmse(y_true, y_pred):
    # Evitamos usar mean_squared_error(..., squared=False) por compatibilidad
    return float(np.sqrt(mean_squared_error(y_true, y_pred)))

def clip_preds(pred):
    # Consistencia con Kaggle
    return np.clip(pred, 0, 20)

# Baseline: pred = lag1 (validación completa)
pred_base = valid_df["item_cnt_month_lag1"].values
rmse_base = rmse(y_valid.values, pred_base)
print("Baseline (pred=lag1) RMSE valid (mes 33):", rmse_base)

# Baseline comparable a Kaggle: SOLO pares de test
test_pairs = test_df[["shop_id","item_id"]].drop_duplicates()
valid_tp = valid_df.merge(test_pairs, on=["shop_id","item_id"], how="inner")
rmse_base_tp = rmse(valid_tp[TARGET].values, valid_tp["item_cnt_month_lag1"].values)
print("Baseline (pred=lag1) RMSE valid (solo test pairs):", rmse_base_tp)
print("N test pairs:", len(test_pairs))

Baseline (pred=lag1) RMSE valid (mes 33): 0.7941131236984387
Baseline (pred=lag1) RMSE valid (solo test pairs): 1.084811531906458
N test pairs: 214200


## 3) Preparación de variables categóricas

In [6]:
cat_cols = ["shop_id","item_id","item_category_id"]

def to_category(df, cols):
    df2 = df.copy()
    for c in cols:
        if c in df2.columns:
            df2[c] = df2[c].astype("category")
    return df2

X_train_sub_cat = to_category(X_train_sub, cat_cols)
X_valid_cat = to_category(X_valid, cat_cols)
X_test_cat  = to_category(X_test, cat_cols)

## 4) Model Zoo: Ridge + LightGBM + (opcional) CatBoost / XGBoost

In [7]:
results = []

def log_result(name, rmse_valid, extra=None):
    row = {"model": name, "rmse_valid": rmse_valid}
    if extra:
        row.update(extra)
    results.append(row)
    print(f"{name:>18} | RMSE valid: {rmse_valid:.6f}")

# 4.1 Ridge
ridge = Ridge(alpha=1.0, random_state=123)
ridge.fit(X_train_sub, y_train_sub)
pred_valid = ridge.predict(X_valid)
pred_valid = clip_preds(pred_valid)
log_result("Ridge", rmse(y_valid, pred_valid))

joblib.dump(ridge, ARTIFACTS / "ridge.joblib")

             Ridge | RMSE valid: 0.664110


['/Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/artifacts/ridge.joblib']

In [8]:
# 4.2 LightGBM
try:
    import lightgbm as lgb

    lgb_train = lgb.Dataset(X_train_sub_cat, label=y_train_sub, categorical_feature=cat_cols, free_raw_data=False)
    lgb_valid = lgb.Dataset(X_valid_cat, label=y_valid, categorical_feature=cat_cols, free_raw_data=False)

    params = dict(
        objective="regression",
        metric="rmse",
        learning_rate=0.05,
        num_leaves=127,
        min_data_in_leaf=100,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=1,
        lambda_l2=5.0,
        verbose=-1,
        seed=123,
    )

    model_lgb = lgb.train(
        params,
        lgb_train,
        num_boost_round=5000,
        valid_sets=[lgb_train, lgb_valid],
        valid_names=["train","valid"],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(100)],
    )

    pred_valid = model_lgb.predict(X_valid_cat, num_iteration=model_lgb.best_iteration)
    pred_valid = clip_preds(pred_valid)
    log_result("LightGBM", rmse(y_valid, pred_valid), {"best_iter": int(model_lgb.best_iteration)})

    joblib.dump(model_lgb, ARTIFACTS / "lightgbm.joblib")

except Exception as e:
    print("LightGBM no disponible o falló import/entrenamiento:", repr(e))

Training until validation scores don't improve for 200 rounds
[100]	train's rmse: 0.466318	valid's rmse: 0.504215
[200]	train's rmse: 0.436295	valid's rmse: 0.502333
[300]	train's rmse: 0.424037	valid's rmse: 0.500101
[400]	train's rmse: 0.415005	valid's rmse: 0.499013
[500]	train's rmse: 0.407956	valid's rmse: 0.498219
[600]	train's rmse: 0.401818	valid's rmse: 0.497386
[700]	train's rmse: 0.396896	valid's rmse: 0.497175
[800]	train's rmse: 0.392488	valid's rmse: 0.496826
[900]	train's rmse: 0.388084	valid's rmse: 0.496207
[1000]	train's rmse: 0.384274	valid's rmse: 0.495896
[1100]	train's rmse: 0.380848	valid's rmse: 0.495665
[1200]	train's rmse: 0.377659	valid's rmse: 0.495489
[1300]	train's rmse: 0.37448	valid's rmse: 0.495284
[1400]	train's rmse: 0.371393	valid's rmse: 0.495182
[1500]	train's rmse: 0.368453	valid's rmse: 0.494984
[1600]	train's rmse: 0.365879	valid's rmse: 0.494909
[1700]	train's rmse: 0.363398	valid's rmse: 0.494722
[1800]	train's rmse: 0.360935	valid's rmse: 0.4

**Early stopping.**  
El mensaje `Training until validation scores don't improve for 200 rounds` indica que se activó un criterio de *early stopping*: si el RMSE de validación no mejora durante 200 rondas consecutivas, el entrenamiento se detiene para evitar sobreajuste. Esto evita continuar agregando árboles que reduzcan el error en entrenamiento pero no aporten mejoras reales en validación.

**Mejor iteración y modelo final.**  
- `Early stopping, best iteration is: [1681]` significa que la mejor performance en validación se alcanzó en la iteración 1681.
- En esa iteración, el log muestra `valid's rmse ≈ 0.494699` (mínimo observado durante el entrenamiento).
- El notebook resume el desempeño final como `LightGBM | RMSE valid: 0.494536`.

**Lectura del resultado.**  
El RMSE de validación (0.4945) representa el tamaño típico del error de predicción en la muestra no vista durante el entrenamiento: valores más bajos implican mejor capacidad predictiva. La diferencia entre RMSE de entrenamiento (0.3638) y validación (~0.4945) es esperable; el *early stopping* ayuda a elegir el punto con mejor generalización.

**Conclusión operativa.**  
Con `best_iteration = 1681` y `RMSE valid ≈ 0.4945`, este modelo se considera el candidato principal en esta corrida (y sirve como referencia para comparar con alternativas como CatBoost).

In [10]:
# 4.3 CatBoost
try:
    from catboost import CatBoostRegressor

    cat_features_idx = [X_train_sub.columns.get_loc(c) for c in cat_cols if c in X_train_sub.columns]

    cb = CatBoostRegressor(
        loss_function="RMSE",
        depth=10,
        learning_rate=0.05,
        iterations=700,
        random_seed=123,
        verbose=200,
        eval_metric="RMSE",
        od_type="Iter",
        od_wait=200,
    )

    cb.fit(
        X_train_sub, y_train_sub,
        cat_features=cat_features_idx,
        eval_set=(X_valid, y_valid),
        use_best_model=True,
    )

    pred_valid = cb.predict(X_valid)
    pred_valid = clip_preds(pred_valid)
    log_result("CatBoost", rmse(y_valid, pred_valid), {"best_iter": int(cb.get_best_iteration())})

    joblib.dump(cb, ARTIFACTS / "catboost.joblib")

except Exception as e:
    print("CatBoost no disponible:", repr(e))
    print("Tip: instala con `uv pip install catboost` (o `pip install catboost`)")


0:	learn: 0.9441086	test: 0.7582783	best: 0.7582783 (0)	total: 8.96s	remaining: 1h 44m 20s
200:	learn: 0.4936552	test: 0.5242953	best: 0.5240156 (177)	total: 23m 28s	remaining: 58m 16s
400:	learn: 0.4500938	test: 0.5237910	best: 0.5229368 (340)	total: 48m 49s	remaining: 36m 24s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.5229368185
bestIteration = 340

Shrink model to first 341 iterations.
          CatBoost | RMSE valid: 0.522886


**Early stopping / overfitting detector.**  
El mensaje `Stopped by overfitting detector (200 iterations wait)` indica que CatBoost detuvo el entrenamiento porque el RMSE de validación no mejoró durante 200 iteraciones consecutivas. Esto evita seguir agregando árboles que solo reduzcan el error en entrenamiento pero empeoren (o no mejoren) la generalización.

**Mejor iteración y modelo final.**  
- `bestIteration = 340` significa que la mejor performance en validación se alcanzó alrededor de la iteración 340.  
- `bestTest ≈ 0.52294` es el mejor RMSE observado en validación.
- `Shrink model to first 341 iterations` confirma que el modelo final se recorta para quedarse con el número óptimo de iteraciones (evitando árboles “extra” que no aportaron mejoras en validación).

In [12]:
# 4.4 XGBoost
    import xgboost as xgb
    import inspect

    xgb_model = xgb.XGBRegressor(
        n_estimators=700,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=5.0,
        objective="reg:squarederror",
        random_state=123,
        tree_method="hist",
    )

    fit_sig = inspect.signature(xgb.XGBRegressor.fit)
    fit_kwargs = dict(
        eval_set=[(X_valid, y_valid)],
        verbose=200,
    )

    # Opción A: parámetro early_stopping_rounds en .fit (si existe en tu versión)
    if "early_stopping_rounds" in fit_sig.parameters:
        fit_kwargs["early_stopping_rounds"] = 200
        xgb_model.fit(X_train_sub, y_train_sub, **fit_kwargs)

    # Opción B: callbacks (si tu versión no acepta early_stopping_rounds)
    else:
        # Algunas versiones esperan callbacks; usamos EarlyStopping.
        fit_kwargs["callbacks"] = [xgb.callback.EarlyStopping(rounds=200, save_best=True)]
        xgb_model.fit(X_train_sub, y_train_sub, **fit_kwargs)

    pred_valid = xgb_model.predict(X_valid)
    pred_valid = clip_preds(pred_valid)
    best_iter = getattr(xgb_model, "best_iteration", None)
    log_result("XGBoost", rmse(y_valid, pred_valid), {"best_iter": None if best_iter is None else int(best_iter)})

    joblib.dump(xgb_model, ARTIFACTS / "xgboost.joblib")

except Exception as e:
    print("XGBoost no disponible:", repr(e))

XGBoost no disponible: TypeError("XGBModel.fit() got an unexpected keyword argument 'callbacks'")


## 5) Comparación de modelos

In [13]:
results_df = pd.DataFrame(results).sort_values("rmse_valid").reset_index(drop=True)
results_df

Unnamed: 0,model,rmse_valid,best_iter
0,LightGBM,0.494536,1681.0
1,CatBoost,0.522886,340.0
2,Ridge,0.66411,


In [14]:
results_path = ARTIFACTS / "model_comparison.csv"
results_df.to_csv(results_path, index=False)
print("Saved:", results_path)

Saved: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/artifacts/model_comparison.csv


## 6) Entrenar el mejor modelo en Train+Valid y generar submission

In [15]:
best_name = results_df.loc[0, "model"]
print("Best (by valid RMSE):", best_name)

X_trainval = pd.concat([X_train, X_valid], ignore_index=True)
y_trainval = pd.concat([y_train, y_valid], ignore_index=True)
X_trainval_cat = to_category(X_trainval, cat_cols)

final_model = None

if best_name == "LightGBM":
    import lightgbm as lgb
    lgb_trainval = lgb.Dataset(X_trainval_cat, label=y_trainval, categorical_feature=cat_cols, free_raw_data=False)

    best_iter = int(results_df.loc[results_df["model"]=="LightGBM","best_iter"].iloc[0])

    final_model = lgb.train(
        params,
        lgb_trainval,
        num_boost_round=best_iter,
        callbacks=[lgb.log_evaluation(200)],
    )
    joblib.dump(final_model, ARTIFACTS / "final_lightgbm.joblib")
    pred_test = final_model.predict(X_test_cat, num_iteration=final_model.current_iteration())

elif best_name == "CatBoost":
    from catboost import CatBoostRegressor
    cat_features_idx = [X_trainval.columns.get_loc(c) for c in cat_cols if c in X_trainval.columns]
    best_iter = int(results_df.loc[results_df["model"]=="CatBoost","best_iter"].iloc[0])

    cb_final = CatBoostRegressor(
        loss_function="RMSE",
        depth=10,
        learning_rate=0.05,
        iterations=best_iter,
        random_seed=123,
        verbose=200,
    )
    cb_final.fit(X_trainval, y_trainval, cat_features=cat_features_idx)
    final_model = cb_final
    joblib.dump(final_model, ARTIFACTS / "final_catboost.joblib")
    pred_test = final_model.predict(X_test)

elif best_name == "XGBoost":
    import xgboost as xgb
    best_iter = int(results_df.loc[results_df["model"]=="XGBoost","best_iter"].iloc[0])

    xgb_final = xgb.XGBRegressor(
        n_estimators=best_iter,
        learning_rate=0.05,
        max_depth=10,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_lambda=5.0,
        objective="reg:squarederror",
        random_state=123,
        tree_method="hist",
    )
    xgb_final.fit(X_trainval, y_trainval, verbose=False)
    final_model = xgb_final
    joblib.dump(final_model, ARTIFACTS / "final_xgboost.joblib")
    pred_test = final_model.predict(X_test)

else:
    ridge_final = Ridge(alpha=1.0, random_state=123)
    ridge_final.fit(X_trainval, y_trainval)
    final_model = ridge_final
    joblib.dump(final_model, ARTIFACTS / "final_ridge.joblib")
    pred_test = final_model.predict(X_test)

pred_test = clip_preds(pred_test)
pred_test[:10], pred_test.min(), pred_test.max(), len(pred_test)

Best (by valid RMSE): LightGBM


(array([0.14676166, 0.        , 0.21480397, 0.00215574, 0.        ,
        0.03186812, 0.        , 0.        , 0.        , 0.        ]),
 np.float64(0.0),
 np.float64(3.6765069887824215),
 214200)

### 6.1 Armar `submission.csv` con `ID` correcto

In [16]:
test_raw_path = DATA_RAW / "test.csv"
test_raw = pd.read_csv(test_raw_path)
print("test_raw shape:", test_raw.shape)
print("columns:", list(test_raw.columns))

assert {"ID","shop_id","item_id"}.issubset(test_raw.columns), "test.csv no tiene las columnas esperadas"
assert len(test_raw) == 214200, "test.csv debería tener 214200 filas"

pred_df = test_df[["shop_id","item_id"]].copy()
pred_df["item_cnt_month"] = pred_test

sub = test_raw[["ID","shop_id","item_id"]].merge(
    pred_df,
    on=["shop_id","item_id"],
    how="left",
    validate="one_to_one",
)

assert sub["item_cnt_month"].isna().mean() == 0.0, "Hay pares sin predicción (NaNs). Revisa tu test_df/pred_df."
sub = sub.sort_values("ID").reset_index(drop=True)

sub_path = ARTIFACTS / "submission_best.csv"
sub[["ID","item_cnt_month"]].to_csv(sub_path, index=False)

print("Saved:", sub_path)
sub.head()

test_raw shape: (214200, 3)
columns: ['ID', 'shop_id', 'item_id']
Saved: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/artifacts/submission_best.csv


Unnamed: 0,ID,shop_id,item_id,item_cnt_month
0,0,5,5037,0.146762
1,1,5,5320,0.0
2,2,5,5233,0.214804
3,3,5,5232,0.002156
4,4,5,5268,0.0
