# 03 — XGBoost only (Model + Submission)

Este notebook entrena **XGBoost** para *Predict Future Sales* con:
- split temporal: **train = meses ≤ 32**, **valid = mes 33**
- métrica: **RMSE** (con clipping final a **[0, 20]**)
- **early stopping** robusto (compatibilidad entre versiones de `xgboost`)
- re-entrenamiento final con **train+valid** usando el mejor número de árboles
- generación de **submission** con `ID` correcto (merge con `data/raw/test.csv`)


In [5]:
# Imports + Paths

import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from pathlib import Path
import joblib

from sklearn.metrics import mean_squared_error

import os

# Project root. It can be overridden with the FUTURE_SALES_ROOT environment variable so
# the notebook is not tied to a single machine; when the variable is unset the original
# local checkout path is used, so existing runs behave exactly as before.
PROJECT_ROOT = Path(
    os.environ.get(
        "FUTURE_SALES_ROOT",
        "/Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales",
    )
).expanduser()

# Input folders (read-only) and the output folder for models / submissions.
RAW_DIR = PROJECT_ROOT / "data" / "raw"
PROC_DIR = PROJECT_ROOT / "data" / "processed"
ARTIFACTS = PROJECT_ROOT / "artifacts"
ARTIFACTS.mkdir(parents=True, exist_ok=True)  # created on demand; no error if present

# Files consumed by the following cells.
TEST_RAW = RAW_DIR / "test.csv"
DATASET_MONTHLY = PROC_DIR / "dataset_monthly.csv.gz"

# Echo the resolved locations (and whether they exist) so a wrong root is spotted early.
print("PROJECT_ROOT:", PROJECT_ROOT)
print("RAW_DIR:", RAW_DIR, "exists:", RAW_DIR.exists())
print("PROC_DIR:", PROC_DIR, "exists:", PROC_DIR.exists())
print("TEST_RAW:", TEST_RAW, "exists:", TEST_RAW.exists())
print("DATASET_MONTHLY:", DATASET_MONTHLY, "exists:", DATASET_MONTHLY.exists())

PROJECT_ROOT: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales
RAW_DIR: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/data/raw exists: True
PROC_DIR: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/data/processed exists: True
TEST_RAW: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/data/raw/test.csv exists: True
DATASET_MONTHLY: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/data/processed/dataset_monthly.csv.gz exists: True


In [6]:
# Cargamos dataset_monthly + checks

# Fail early, with an actionable message, when the processed dataset is not on disk.
if not DATASET_MONTHLY.exists():
    raise FileNotFoundError(
        f"No existe {DATASET_MONTHLY}. Revisa PROJECT_ROOT y tu estructura de carpetas."
    )

df = pd.read_csv(DATASET_MONTHLY, compression="gzip")
print("df shape:", df.shape)

# Schema checks: the three key columns first, then the target column.
required = {"shop_id", "item_id", "date_block_num"}
missing = required - set(df.columns)
if missing:
    raise ValueError(f"Faltan columnas {missing} en dataset_monthly.")

if "item_cnt_month" not in df:
    raise ValueError(
        "Falta 'item_cnt_month' en dataset_monthly. Este archivo debe traer el target para meses 0..33."
    )

# Force the key columns to plain integers.
for c in ("shop_id", "item_id", "date_block_num"):
    df[c] = df[c].astype(int)

block_nums = df["date_block_num"]
print("date_block_num min/max:", int(block_nums.min()), int(block_nums.max()))
print("cols (primeras 40):", list(df.columns)[:40])

# Labelled months (0..33) form the training frame.
train_df = df[block_nums <= 33].copy()

# Month 34 is the prediction month: take it from the dataset when present (without the
# target column); otherwise fall back to the (shop_id, item_id) pairs listed in test.csv.
is_test_month = block_nums == 34
if is_test_month.any():
    test_df = df[is_test_month].copy().drop(columns=["item_cnt_month"], errors="ignore")
else:
    # NOTE(review): this fallback frame only has shop_id / item_id / date_block_num, so
    # it will not contain the other feature columns selected later — confirm it is usable.
    if not TEST_RAW.exists():
        raise FileNotFoundError(f"No existe {TEST_RAW}.")
    test_raw = pd.read_csv(TEST_RAW)
    test_df = test_raw[["shop_id", "item_id"]].copy()
    test_df["date_block_num"] = 34

print("train_df:", train_df.shape)
print("test_df :", test_df.shape)


# Invariants the rest of the notebook relies on.
assert train_df["date_block_num"].max() == 33, "train_df debe llegar a mes 33"
assert {"shop_id", "item_id"}.issubset(test_df.columns), "test_df debe tener shop_id e item_id"

df shape: (18129480, 11)
date_block_num min/max: 0 34
cols (primeras 40): ['date_block_num', 'shop_id', 'item_id', 'item_cnt_month', 'item_price_mean', 'item_category_id', 'item_cnt_month_lag1', 'item_cnt_month_lag2', 'item_cnt_month_lag3', 'item_cnt_month_lag6', 'item_cnt_month_lag12']
train_df: (17915280, 11)
test_df : (214200, 10)


In [7]:
# Realizamos el split temporal (train <=32, valid==33)

# Temporal hold-out: months up to 32 are used for fitting, month 33 for validation.
split_month = train_df["date_block_num"]
train_sub = train_df.loc[split_month <= 32].copy()
valid_df = train_df.loc[split_month == 33].copy()

# Targets as float arrays.
y_train = train_sub["item_cnt_month"].astype(float).to_numpy()
y_valid = valid_df["item_cnt_month"].astype(float).to_numpy()

# Feature set = every numeric column except the target.
# NOTE(review): this keeps `item_price_mean`, which looks like a same-month aggregate —
# confirm it is populated for month 34 and does not leak the target.
drop_cols = {"item_cnt_month"}
candidate_cols = []
feature_cols = []
for col in train_sub.columns:
    if col in drop_cols:
        continue
    candidate_cols.append(col)
    if pd.api.types.is_numeric_dtype(train_sub[col]):
        feature_cols.append(col)


X_train = train_sub.loc[:, feature_cols]
X_valid = valid_df.loc[:, feature_cols]

print("X_train:", X_train.shape, "X_valid:", X_valid.shape)
print("n_features:", len(feature_cols))
print("Ejemplo features:", feature_cols[:20])

X_train: (17388360, 10) X_valid: (526920, 10)
n_features: 10
Ejemplo features: ['date_block_num', 'shop_id', 'item_id', 'item_price_mean', 'item_category_id', 'item_cnt_month_lag1', 'item_cnt_month_lag2', 'item_cnt_month_lag3', 'item_cnt_month_lag6', 'item_cnt_month_lag12']


In [11]:
# Helpers (RMSE + clip)

import numpy as np
from sklearn.metrics import mean_squared_error


def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred` as a Python float."""
    mse = mean_squared_error(y_true, y_pred)
    root = np.sqrt(mse)
    return float(root)


def clip_preds(pred, lower=0, upper=20):
    """Clip predictions to an inclusive range (by default the competition's [0, 20]).

    Parameters
    ----------
    pred : array-like or scalar
        Raw model predictions.
    lower, upper : scalar, optional
        Inclusive bounds. The defaults (0 and 20) are the rule of this competition.

    Returns
    -------
    numpy.ndarray or NumPy scalar
        `pred` with values below `lower` replaced by `lower` and values above
        `upper` replaced by `upper`.

    Raises
    ------
    ValueError
        If `lower` is greater than `upper`.
    """
    # np.clip does not reject inverted bounds, so guard explicitly instead of
    # returning a meaningless constant array.
    if lower > upper:
        raise ValueError(f"lower ({lower}) must not exceed upper ({upper})")
    return np.clip(pred, lower, upper)


# Marker output: confirms that this helper cell was executed.
print("OK helpers")

OK helpers


In [12]:
# XGBoost via xgb.train (con early stopping)

import xgboost as xgb

# Training budget: an upper bound on boosting rounds plus the early-stopping patience.
num_boost_round = 5000
early_stopping_rounds = 200

# Booster hyper-parameters (reused later for the final fit).
params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "eta": 0.03,
    "max_depth": 10,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 5.0,
    "seed": 123,
    "tree_method": "hist",
}

# Wrap both splits in DMatrix containers.
dtrain, dvalid = (
    xgb.DMatrix(features, label=target)
    for features, target in ((X_train, y_train), (X_valid, y_valid))
)

# The validation set goes last in the evaluation list.
watchlist = list(zip((dtrain, dvalid), ("train", "valid")))

booster = xgb.train(
    params,
    dtrain,
    num_boost_round=num_boost_round,
    evals=watchlist,
    early_stopping_rounds=early_stopping_rounds,
    verbose_eval=200,
)

# Score month 33 with the trees up to and including the best iteration
# (best_iteration is 0-based, hence the +1), clipping predictions before computing RMSE.
best_round_count = booster.best_iteration + 1
raw_valid_pred = booster.predict(dvalid, iteration_range=(0, best_round_count))
pred_valid = clip_preds(raw_valid_pred)
valid_rmse = rmse(y_valid, pred_valid)

print("VALID RMSE (mes 33):", valid_rmse)
print("best_iteration:", booster.best_iteration)

[0]	train-rmse:0.95444	valid-rmse:0.76603
[200]	train-rmse:0.54004	valid-rmse:0.49211
[400]	train-rmse:0.50272	valid-rmse:0.48634
[600]	train-rmse:0.48418	valid-rmse:0.48080
[800]	train-rmse:0.47090	valid-rmse:0.47959
[1000]	train-rmse:0.45995	valid-rmse:0.47875
[1200]	train-rmse:0.45097	valid-rmse:0.47820
[1400]	train-rmse:0.44310	valid-rmse:0.47780
[1480]	train-rmse:0.44031	valid-rmse:0.47807
VALID RMSE (mes 33): 0.47730566432540356
best_iteration: 1280


**Watchlist.**  
La lista `watchlist = [(dtrain, "train"), (dvalid, "valid")]` le indica a XGBoost que reporte el RMSE en ambos conjuntos durante el entrenamiento. Con `verbose_eval=200` imprime el RMSE cada 200 iteraciones.

**Early stopping.**  
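Aunque se permite entrenar hasta `num_boost_round = 5000`, se activa `early_stopping_rounds = 200`, lo que detiene el proceso si el RMSE de validación no mejora durante 200 iteraciones consecutivas. Esto limita el sobreajuste y selecciona automáticamente la mejor iteración; como el mes 33 se usa tanto para detener el entrenamiento como para evaluar, el RMSE de validación resultante es ligeramente optimista.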

**Evolución del RMSE.**  
En el log, el RMSE de validación cae rápidamente al inicio (por ejemplo, de ~0.766 a ~0.492 en 200 iteraciones) y luego mejora de forma más gradual, hasta estabilizarse en torno a 0.478.

**Mejor iteración y predicción final.**  
- El modelo reporta `best_iteration: 1280`, que es la iteración donde se obtuvo el menor RMSE de validación.
- Por eso, al predecir se usa `iteration_range=(0, booster.best_iteration + 1)` para generar predicciones usando únicamente los árboles hasta el punto óptimo (evitando árboles posteriores que no mejoraron validación).

**Métrica final.**  
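El resultado final es `VALID RMSE (mes 33): 0.4773`, calculado sobre las predicciones recortadas a [0, 20] y usando solo los árboles hasta `best_iteration`; por eso puede diferir ligeramente del `valid-rmse` que imprime el log, que se calcula con predicciones sin recortar.  
Este RMSE resume el error típico de predicción en la muestra de validación: valores menores indican mejor desempeño.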

**Conclusión operativa (comparación).**  
En esta corrida, XGBoost logra un RMSE de validación (~0.4773), que es mejor (menor) que LightGBM (~0.4945) y CatBoost (~0.5229). Bajo el mismo esquema de validación, XGBoost queda como el modelo con mejor generalización.

In [15]:
# FINAL TRAIN (train+valid)
import xgboost as xgb
import numpy as np

# Retrain on train+valid using the tree count selected by early stopping.
# `best_iteration` is 0-based, so the number of boosting rounds is best_iteration + 1,
# which matches the iteration_range=(0, best_iteration + 1) used when scoring validation.
best_iter = int(booster.best_iteration)
final_rounds = best_iter + 1

# Stack the two splits with pandas instead of np.vstack: the result keeps the column
# names and dtypes, so the DMatrix (and the booster trained on it) carries the same
# feature names as the named test DataFrame rather than anonymous positional features.
# Row order (train rows first, then valid rows) is the same as with np.vstack.
X_trainval = pd.concat([X_train, X_valid], axis=0, ignore_index=True)
y_trainval = np.concatenate([y_train, y_valid])
assert len(X_trainval) == len(y_trainval), "X_trainval e y_trainval deben tener la misma longitud"

dtrainval = xgb.DMatrix(X_trainval, label=y_trainval)

# Same hyper-parameters as the early-stopping run. No `evals` is passed because there is
# no held-out data left to monitor, so nothing is logged during this fit.
booster_final = xgb.train(
    params=params,
    dtrain=dtrainval,
    num_boost_round=final_rounds,
)

print("Final model trained with rounds:", final_rounds)

Final model trained with rounds: 1281


In [16]:
# PREDICT TEST + SUBMISSION
import pandas as pd
from pathlib import Path

# 1) Read the ORIGINAL test.csv (the file that carries the submission ID), using the
#    path constant defined in the configuration cell.
test_raw = pd.read_csv(TEST_RAW)  # columns used below: ID, shop_id, item_id


# 2) Attach the ID to every feature row and order the rows by ID.
#    validate="one_to_one" makes pandas raise if (shop_id, item_id) is duplicated on
#    either side of the merge.
test_df2 = test_df.merge(
    test_raw[["ID", "shop_id", "item_id"]],
    on=["shop_id", "item_id"],
    how="left",
    validate="one_to_one",
)

assert test_df2["ID"].notna().all(), "Hay pares (shop_id,item_id) sin ID: revisa el merge"
# The left merge only proves that every test_df row has an ID; also require that every
# row of test.csv is covered, otherwise the submission would be missing IDs.
assert len(test_df2) == len(test_raw), (
    f"test_df cubre {len(test_df2)} filas pero test.csv tiene {len(test_raw)}: faltan IDs"
)
test_df2["ID"] = test_df2["ID"].astype(int)
test_df2 = test_df2.sort_values("ID").reset_index(drop=True)

# 3) Predict. Check the feature columns up front so a test frame without features
#    fails with a clear message instead of a bare KeyError.
missing_features = [col for col in feature_cols if col not in test_df2.columns]
if missing_features:
    raise ValueError(f"Faltan columnas de features en test_df: {missing_features}")

X_test2 = test_df2[feature_cols]
dtest = xgb.DMatrix(X_test2)

# Same clipping helper as in validation, so both paths apply the identical [0, 20] rule.
pred_test = clip_preds(booster_final.predict(dtest))

submission = pd.DataFrame({"ID": test_df2["ID"], "item_cnt_month": pred_test})

sub_path = ARTIFACTS / "submission_xgb.csv"
submission.to_csv(sub_path, index=False)

print("Saved:", sub_path)
print(submission.head())
print("len:", len(submission), "ID min/max:", submission["ID"].min(), submission["ID"].max())

Saved: /Users/andrespadronquintana/Desktop/METODOS_GRAN_ESCALA/TAREAS/tarea1_future_sales/artifacts/submission_xgb.csv
   ID  item_cnt_month
0   0        0.009959
1   1        0.002693
2   2        0.043323
3   3        0.007192
4   4        0.000000
len: 214200 ID min/max: 0 214199
