# PIPELINE VERSIÓN 1

In [None]:
# --- 1) Generic feature engineering ---
def make_features(df_long, cluster_df, lags=(1, 3, 6, 12)):
    """Build the per-product feature table used by the LightGBM models.

    Merges ``cluster_df`` into ``df_long`` on ``product_code``, sorts the rows
    by ``product_code`` and ``date``, and adds the following columns:

    * ``lag_k``: demand of the same product ``k`` rows earlier.
    * ``rollmean_k``: mean of the ``k`` demands preceding the current row
      (the current row is excluded through ``shift(1)``).
    * ``month_2`` .. ``month_12``: 0/1 month indicators (January is the
      implicit baseline), the same column set the backtest wrappers build.
    * ``flag``: 1 where ``demand > 0`` and 0 otherwise; only created when the
      merged frame does not already carry a ``flag`` column.

    Args:
        df_long: long-format frame with at least ``product_code``, ``date``
            and ``demand`` columns.
        cluster_df: frame keyed by ``product_code``; the feature list built
            below expects it to provide a ``cluster`` column.
        lags: window sizes used for both the lag and rolling-mean features.

    Returns:
        The merged, sorted frame with the feature columns appended. The first
        rows of every product keep NaN in the lag / rolling columns because
        no history exists for them yet.
    """
    df = df_long.copy().merge(cluster_df, on='product_code')
    df = df.sort_values(['product_code', 'date'])

    # Lags and rolling means, computed per product so that history never
    # leaks from one product into another.
    demand_by_product = df.groupby('product_code')['demand']
    for k in lags:
        df[f'lag_{k}'] = demand_by_product.shift(k)
        df[f'rollmean_{k}'] = demand_by_product.transform(
            lambda s, k=k: s.shift(1).rolling(k).mean()
        )

    # Month dummies. Columns are created explicitly (instead of via
    # get_dummies) so all eleven exist even when a month is absent from the data.
    # NOTE(review): assumes 'date' is datetime-like or parseable by
    # pd.to_datetime - confirm against the loader.
    month = pd.to_datetime(df['date']).dt.month
    for m in range(2, 13):
        df[f'month_{m}'] = (month == m).astype(int)

    # Occurrence flag used as the target of the two-stage classifier.
    if 'flag' not in df.columns:
        df['flag'] = (df['demand'] > 0).astype(int)

    return df


df_feat = make_features(df_long, cluster_df)

# Input columns for the models, in the order they appear in df_feat.
feature_cols = [c for c in df_feat.columns
                if c.startswith(('lag_', 'rollmean_', 'month_'))
                or c == 'cluster']

In [None]:
# --- 2) Pure LightGBM ---
# Single regressor trained directly on demand from the engineered feature columns.
X = df_feat[feature_cols]
y = df_feat['demand']
# Random 80/20 hold-out with a fixed seed so the split is reproducible.
# NOTE(review): shuffle=True mixes rows from all dates; because the inputs are
# lags / rolling means of the same demand series, confirm that a random split
# (rather than a time-ordered one) is acceptable for this quick check.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size = 0.2, random_state = 0, shuffle = True)

# reg_pure is read as a global by lgbm_pure_wrapper further down.
reg_pure = lgb.LGBMRegressor(n_estimators = 500, random_state = 0)
reg_pure.fit(X_tr, y_tr)

In [None]:
# Quick hold-out check of the pure regressor
y_pred = reg_pure.predict(X_val)
mae_pure = mean_absolute_error(y_val, y_pred)
rmse_pure = np.sqrt(mean_squared_error(y_val, y_pred))
print('LightGBM Puro - MAE:', mae_pure, 'RMSE:', rmse_pure)

In [None]:
# --- 3) LightGBM Two-Stage ---
# Stage 1: classifier that predicts the occurrence flag.
f = df_feat['flag']
# Same test_size / random_state / shuffle as the pure-regressor split; the
# full two-stage validation later scores clf on X_val, so both splits are
# expected to select the same rows.
X_trf, X_valf, f_tr, f_val = train_test_split(X, f, test_size = 0.2, random_state = 0, shuffle = True)
clf = lgb.LGBMClassifier(n_estimators = 200, random_state = 0)
clf.fit(X_trf, f_tr)
# clf.score() reports mean accuracy (share of correctly classified rows), not
# precision, so the label says so.
print('LightGBM Two-Stage - Clasif Accuracy:', clf.score(X_valf, f_val))

In [None]:
# Stage 2: regressor fitted only on the training rows whose flag is 1 (y > 0)
mask = f_tr == 1
positive_idx = X_trf.index[mask]
reg2 = lgb.LGBMRegressor(n_estimators=500, random_state=0)
# Targets are looked up in df_feat by index label so they line up with the filtered rows.
reg2.fit(X_trf[mask], df_feat.loc[positive_idx, 'demand'])

In [None]:
# Validate the complete two-stage pipeline on the hold-out rows
X_test = X_val.copy()
flags = clf.predict(X_test)
positive_pred = reg2.predict(X_test)
# Keep the regressor output only where the classifier predicts demand; 0 elsewhere.
yhat2 = np.where(flags == 1, positive_pred, 0.0)
mae_2stage = mean_absolute_error(y_val, yhat2)
rmse_2stage = np.sqrt(mean_squared_error(y_val, yhat2))
print('LightGBM Two-Stage - MAE:', mae_2stage, 'RMSE:', rmse_2stage)

In [None]:
# --- 4) Wrappers for backtest_long ---
def _one_step_features(hist, cluster, columns, lags=(1, 3, 6, 12)):
    """Return the one-row feature frame for the step right after ``hist``.

    Args:
        hist: list of past demand values, oldest first.
        cluster: cluster label for the series; ``None`` falls back to 0.
        columns: feature names in the order the model was trained with.
        lags: window sizes for the lag / rolling-mean features.

    Returns:
        A single-row DataFrame whose columns are exactly ``columns``.

    Raises:
        KeyError: if ``columns`` names a feature this builder does not create.
    """
    import pandas as _pd

    feats = {}
    for k in lags:
        has_history = len(hist) >= k
        # lag_k for the next step is the k-th most recent observation.
        feats[f'lag_{k}'] = hist[-k] if has_history else 0.0
        # Training uses shift(1).rolling(k): the k values immediately before
        # the target row. For the next step those are the last k observations.
        feats[f'rollmean_{k}'] = sum(hist[-k:]) / k if has_history else 0.0
    # The wrapper receives no dates, so every month indicator stays at 0.
    for m in range(2, 13):
        feats[f'month_{m}'] = 0
    # An explicit cluster=None from the caller is treated like a missing one.
    feats['cluster'] = 0 if cluster is None else cluster
    # Select by name so the column order matches the fitted model; the dict
    # order above does not follow the training column order.
    return _pd.DataFrame([feats]).loc[:, list(columns)]


def lgbm_pure_wrapper(series, alpha=None, h=3, **kw):
    """Forecast ``h`` steps with the pure LightGBM regressor.

    A single one-step-ahead prediction is made from the tail of ``series``
    and repeated ``h`` times (flat forecast).

    Args:
        series: iterable of past demand values, oldest first.
        alpha: accepted for signature compatibility with the other
            forecasters; not used.
        h: forecast horizon.
        **kw: optional ``cluster`` label for the series.

    Returns:
        numpy array of length ``h``.
    """
    Xp = _one_step_features(list(series), kw.get('cluster'), reg_pure.feature_name_)
    return reg_pure.predict(Xp).repeat(h)


def lgbm_2phase_wrapper(series, alpha=None, h=3, **kw):
    """Forecast ``h`` steps recursively with the two-stage LightGBM model.

    At each step the classifier decides whether demand occurs; if so the
    regressor supplies the quantity, otherwise the forecast is 0. Each
    forecast is appended to the history and feeds the next step's features.

    Args:
        series: iterable of past demand values, oldest first.
        alpha: accepted for signature compatibility with the other
            forecasters; not used.
        h: forecast horizon.
        **kw: optional ``cluster`` label for the series.

    Returns:
        numpy array of length ``h``.
    """
    hist = list(series)
    cluster = kw.get('cluster')
    preds = []
    for _ in range(h):
        # clf and reg2 were fitted on the same columns, so one frame serves both.
        Xp = _one_step_features(hist, cluster, clf.feature_name_)
        yhat = reg2.predict(Xp)[0] if clf.predict(Xp)[0] == 1 else 0.0
        preds.append(yhat)
        hist.append(yhat)
    return np.array(preds)

# --- 5) Cluster-aware backtest evaluation ---
methods_ext = [
    ('Croston', croston_forecast),
    ('SBA', sba_forecast),
    ('SBJ', sbj_forecast),
    ('TSB', tsb_wrapper),
    ('GBM-puro', lgbm_pure_wrapper),
    ('GBM-2fases', lgbm_2phase_wrapper),
]

# alpha is forwarded to every method; the LightGBM wrappers accept it but never read it.
# NOTE(review): cluster=None is passed as-is - confirm backtest_long supplies the
# per-product cluster to the wrappers.
backtest_kwargs = {'h': 3, 'alpha': 0.1, 'cluster': None}

frames = []
for name, func in methods_ext:
    res = backtest_long(df_long, func, **backtest_kwargs)
    res['model'] = name  # tag every result row with the method that produced it
    frames.append(res)

df_compare = pd.concat(frames, ignore_index=True)
metric_cols = ['MAE', 'MAPE', 'RMSE']
print(df_compare.groupby('model')[metric_cols].mean())

Con esto cubrimos la fase 1:

Montamos un GBM puro y un GBM dos-fases.

Los validamos rápidamente contra un split de hold-out para verificar que funcionan.

Creamos los wrappers que permiten integrarlos en tu pipeline de backtest cluster-aware.

Ejecutamos backtest_long para comparar MAE/MAPE/RMSE contra Croston/SBA/SBJ/TSB.

Cuando lo ejecutes, revisa la tabla final y las distribuciones de error. A partir de ahí decidimos si afinamos parámetros o pasamos a la siguiente fase (LSTM/GRU).