Gradient Boosting Regressor con lógica del profe

In [21]:
import pandas as pd, numpy as np
from pathlib import Path
from sklearn.ensemble import GradientBoostingRegressor

# ── Paths ─────────────────────────
# Working directory holding the inputs; the submission is written here too.
BASE = Path(r"C:\Maestria\Labo 3")
VENTAS = BASE.joinpath("sell-in.txt")            # sell-in history
LISTA = BASE.joinpath("780_a_predecir.txt")      # product ids to predict
OUT = BASE.joinpath("submission_t780_gb_profesor.csv")

# ── The professor's "magic" SKUs ──
# In the prediction step these products lean more heavily on the seed model.
MAGICOS = [
    20002, 20003, 20006, 20010, 20011, 20018,
    20019, 20021, 20026, 20028, 20035, 20039,
    20042, 20044, 20045, 20046, 20049, 20051,
    20052, 20053, 20055, 20008, 20001, 20017,
    20086, 20180, 20193, 20320, 20532, 20612,
    20637, 20807, 20838,
]

# ── "Seed" coefficients ───────────
# Linear seed model: prediction = INTC + sum(COEF[k] * lag_k), k = 0..11.
INTC = 0.441467
COEF = [
    -0.001339,  # lag0
    0.236558,   # lag1
    0.178208,   # lag2
    -0.060031,  # lag3
    -0.161875,  # lag4
    -0.007775,  # lag5
    0.151936,   # lag6
    0.043933,   # lag7
    0.142839,   # lag8
    0.103804,   # lag9
    0.119211,   # lag10
    0.073671,   # lag11
]
# ──────────────────────────────────

# 1) Data
# Sell-in history; sep=None makes the python engine sniff the delimiter.
df = pd.read_csv(VENTAS, sep=None, engine="python")

# Product ids to predict: one integer per line. Blank lines and a header
# line starting with "product" are skipped. The file is opened in a `with`
# block so the handle is closed deterministically.
with open(LISTA) as fh:
    tokens = (line.strip() for line in fh)
    sku = [int(tok) for tok in tokens
           if tok and not tok.lower().startswith("product")]

# Keep only the requested products. `.copy()` yields an independent frame so
# the column assignments below do not write into a slice of the raw data
# (avoids pandas' SettingWithCopyWarning).
df = df[df.product_id.isin(sku)].copy()

# `periodo` is a YYYYMM code; normalise it to a 6-character string and parse
# it into a timestamp (first day of the month).
df["periodo"] = df["periodo"].astype(str).str.zfill(6)
df["fecha"] = pd.to_datetime(df["periodo"], format="%Y%m")

# Monthly matrix: one row per month (ascending), one column per product_id,
# values are the summed `tn`; product/month pairs with no rows become 0.
mensual = (df.groupby(["product_id", "fecha"])["tn"]
             .sum().unstack(level=0).fillna(0).sort_index())

# 2) Lag features and target
# One row per (product, month) for every month that has 11 earlier months
# available. lag0 is the month itself, lag11 the value 11 months back.
# `clase` is the value two months ahead, or NaN when that month is past the
# end of the series.
Xcols = [f"lag{k}" for k in range(12)]

records = []
for pid, series in mensual.items():
    values = series.to_numpy()
    months = series.index
    n_months = len(values)
    for pos in range(11, n_months):
        record = {"product_id": pid, "fecha": months[pos]}
        for k, col in enumerate(Xcols):
            record[col] = values[pos - k]
        target_pos = pos + 2
        record["clase"] = values[target_pos] if target_pos < n_months else np.nan
        records.append(record)

feat = pd.DataFrame(records)

# 3) Gradient boosting model trained on the 2018-12 rows of every SKU
# Only rows with no missing value are kept, i.e. rows whose target is known.
is_train_month = feat["fecha"] == "2018-12-01"
gb_train = feat.loc[is_train_month].dropna()

gb_params = {
    "n_estimators": 300,
    "learning_rate": 0.05,
    "max_depth": 3,
    "subsample": 0.8,
    "random_state": 42,
}
gb = GradientBoostingRegressor(**gb_params)
gb.fit(gb_train[Xcols], gb_train["clase"])

# 4) Prediction from the latest month (blend of GB and the seed model)
last = mensual.index.max()  # most recent month in the history

magic_skus = set(MAGICOS)  # set membership instead of a list scan per SKU

# Requested SKUs that actually have a column in `mensual`, de-duplicated
# while keeping the request order. A SKU with no sell-in rows has no column,
# and indexing `mensual[pid]` for it would raise KeyError.
known = list(dict.fromkeys(pid for pid in sku if pid in mensual.columns))

gb_by_pid = {}
lags_by_pid = {}
if known and len(mensual) >= 12:
    # Rows = SKUs, columns = lag0..lag11 (lag0 is the latest month). The
    # frame carries the same column names the model was fitted with, so a
    # single batched predict() runs without the feature-name warning.
    recent = mensual[known].iloc[-12:].iloc[::-1].T
    recent.columns = Xcols
    gb_by_pid = dict(zip(known, gb.predict(recent)))
    lags_by_pid = dict(zip(known, recent.to_numpy()))

pred = []
for pid in sku:
    tn = np.nan
    if pid in gb_by_pid:
        # The professor's seed: a linear model over the 12 lags.
        seed_pred = INTC + np.dot(COEF, lags_by_pid[pid])
        # Blend rule: "magic" SKUs trust the seed more. Weights are literals
        # (not 1 - w) so the floating-point result matches the 0.8/0.2 and
        # 0.5/0.5 blends exactly.
        w_seed, w_gb = (0.8, 0.2) if pid in magic_skus else (0.5, 0.5)
        tn = w_seed * seed_pred + w_gb * gb_by_pid[pid]
    # Fallback when no blended value is available: mean of the last (up to)
    # 12 months, or 0.0 for a SKU with no sell-in history at all.
    if np.isnan(tn):
        tn = mensual[pid].tail(12).mean() if pid in mensual.columns else 0.0
    pred.append((pid, round(tn, 5)))

pd.DataFrame(pred, columns=["product_id", "tn"]).to_csv(
    OUT, index=False, float_format="%.5f"
)
print("✅ CSV GB+profesor guardado →", OUT.name)




✅ CSV GB+profesor guardado → submission_t780_gb_profesor.csv




Este es, hasta ahora, el mejor resultado.