**ENTRENAMIENTO EVALUACION**

In [11]:
import os, pickle, warnings, itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.metrics import mean_absolute_error, mean_squared_error

ventas_dia = pd.read_csv("data/processed/ventas_diarias.csv",
                         parse_dates=["InvoiceDate"], index_col="InvoiceDate")["TotalPrice"].asfreq("D")

In [12]:
def create_features(series, lags=(1,7,14), ma_windows=(7,14)):
    s = series.asfreq("D").copy()
    df = pd.DataFrame({"y": s})
    for L in lags:
        df[f"lag{L}"] = s.shift(L)
    for W in ma_windows:
        df[f"ma{W}"] = s.rolling(W, min_periods=W).mean().shift(1)
    df["weekday"] = df.index.weekday
    df["is_weekend"] = (df["weekday"] >= 5).astype(int)
    return df.dropna()

df_feat = create_features(ventas_dia)
df_feat.to_csv("data/processed/ventas_con_features.csv")
print("Guardado features:", "data/processed/ventas_con_features.csv", "| shape:", df_feat.shape)
df_feat.head()

Guardado features: data/processed/ventas_con_features.csv | shape: (72, 8)


Unnamed: 0_level_0,y,lag1,lag7,lag14,ma7,ma14,weekday,is_weekend
InvoiceDate,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2011-01-18,92427.28,26239.46,61971.89,15478.52,21922.35,20717.274286,1,0
2011-01-19,13440.73,92427.28,21828.84,31243.29,26273.12,26213.614286,2,0
2011-01-20,16354.84,13440.73,18262.53,32429.45,25074.818571,24942.002857,3,0
2011-01-21,23577.42,16354.84,17911.67,22656.7,24802.291429,23793.816429,4,0
2011-01-22,0.0,23577.42,0.0,0.0,25611.684286,23859.582143,5,1


In [13]:
cut = pd.to_datetime("2011-03-01")
X = df_feat.drop(columns="y"); y = df_feat["y"]
X_train, X_test = X.loc[X.index < cut], X.loc[X.index >= cut]
y_train, y_test = y.loc[y.index < cut], y.loc[y.index >= cut]

Path("data/train").mkdir(parents=True, exist_ok=True)
Path("data/test").mkdir(parents=True, exist_ok=True)
X_train.to_csv("data/train/X_train.csv")
y_train.to_csv("data/train/y_train.csv")
X_test.to_csv("data/test/X_test.csv")
y_test.to_csv("data/test/y_test.csv")

print("Shapes:", X_train.shape, X_test.shape)

Shapes: (42, 7) (30, 7)


In [14]:
naive = y_test.shift(1).reindex(y_test.index)
mae_naive = mean_absolute_error(y_test[1:], naive[1:])
rmse_naive = np.sqrt(mean_squared_error(y_test[1:], naive[1:]))

# MA(14) usando histórico (train+test) en modo walk-forward simple
full = pd.concat([y_train, y_test])   # <-- en lugar de .append()
ma14 = full.rolling(14, min_periods=14).mean().shift(1).reindex(y_test.index)

mae_ma14 = mean_absolute_error(y_test.dropna(), ma14.dropna())
rmse_ma14 = np.sqrt(mean_squared_error(y_test.dropna(), ma14.dropna()))

print(f"Naive -> MAE {mae_naive:.2f} | RMSE {rmse_naive:.2f}")
print(f"MA(14) -> MAE {mae_ma14:.2f} | RMSE {rmse_ma14:.2f}")

Naive -> MAE 11304.17 | RMSE 14225.45
MA(14) -> MAE 8364.61 | RMSE 11177.92


In [15]:
from statsmodels.tsa.holtwinters import ExponentialSmoothing
os.makedirs("models", exist_ok=True)

ets = ExponentialSmoothing(y_train.asfreq("D"), trend=None, seasonal="add", seasonal_periods=7).fit(optimized=True)
pred_ets = ets.forecast(len(y_test))
mae_ets = mean_absolute_error(y_test, pred_ets); rmse_ets = np.sqrt(mean_squared_error(y_test, pred_ets))
with open("models/trained_model_001.pkl", "wb") as f: pickle.dump(ets, f)
print(f"ETS -> MAE {mae_ets:.2f} | RMSE {rmse_ets:.2f} | Guardado en models/trained_model_001.pkl")

ETS -> MAE 5989.71 | RMSE 8667.53 | Guardado en models/trained_model_001.pkl


In [16]:
import statsmodels.api as sm
warnings.filterwarnings("ignore")

p,q = [0,1,2,3], [0,1,2]
P,D,Q,s = [0,1], 1, [1], 7
best = None

for order in itertools.product(p,[0],q):
    for seas in itertools.product(P,[D],Q,[s]):
        try:
            m = sm.tsa.statespace.SARIMAX(y_train, order=order, seasonal_order=seas,
                                          enforce_stationarity=False, enforce_invertibility=False).fit(disp=False)
            fc = m.forecast(len(y_test))
            mae = mean_absolute_error(y_test, fc); rmse = np.sqrt(mean_squared_error(y_test, fc))
            if (best is None) or (mae < best["mae"]):
                best = {"order":order, "seasonal":seas, "mae":mae, "rmse":rmse, "model":m, "pred":fc}
        except: pass

best_model = best["model"]; pred_sarima = best["pred"]
with open("models/trained_model_002.pkl", "wb") as f: pickle.dump(best_model, f)
print("Best SARIMA:", best["order"], "x", best["seasonal"], "| MAE", round(best["mae"],2), "| RMSE", round(best["rmse"],2))

Best SARIMA: (2, 0, 0) x (0, 1, 1, 7) | MAE 5216.89 | RMSE 7599.49


In [17]:
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor

# Ridge
ridge = Ridge(alpha=1.0).fit(X_train, y_train)
pred_ridge = ridge.predict(X_test)
mae_ridge = mean_absolute_error(y_test, pred_ridge); rmse_ridge = np.sqrt(mean_squared_error(y_test, pred_ridge))
with open("models/trained_model_003.pkl", "wb") as f: pickle.dump(ridge, f)

# RF (ejemplo rápido; sustituye por tu CV final)
rf = RandomForestRegressor(n_estimators=500, max_depth=10, min_samples_leaf=1, random_state=42)
rf.fit(X_train, y_train)
pred_rf = pd.Series(rf.predict(X_test), index=X_test.index)
mae_rf = mean_absolute_error(y_test, pred_rf); rmse_rf = np.sqrt(mean_squared_error(y_test, pred_rf))
with open("models/trained_model_004.pkl", "wb") as f: pickle.dump(rf, f)

print(f"Ridge -> MAE {mae_ridge:.2f} | RMSE {rmse_ridge:.2f}")
print(f"RF    -> MAE {mae_rf:.2f} | RMSE {rmse_rf:.2f}")

Ridge -> MAE 5900.64 | RMSE 7980.85
RF    -> MAE 4557.63 | RMSE 8285.26


In [23]:
#PRUEBA DE SARIMA + RANDOM FOREST

ws = np.linspace(0,1,41)
best_w, best_mae = None, 1e18
for w in ws:
    blend = w*pd.Series(pred_sarima, index=y_test.index).values + (1-w)*pred_rf.values
    mae = mean_absolute_error(y_test, blend)
    if mae < best_mae:
        best_mae, best_w = mae, w

print(f"Ensemble SARIMA+RF -> best w={best_w:.2f} | MAE={best_mae:.2f}")

Ensemble SARIMA+RF -> best w=0.12 | MAE=4508.51


In [19]:
from sklearn.model_selection import TimeSeriesSplit
from xgboost import XGBRegressor

In [20]:
os.makedirs("models", exist_ok=True)

tscv = TimeSeriesSplit(n_splits=3)
param_grid = [
    {"n_estimators": 300, "max_depth": 3, "learning_rate": 0.1, "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 500, "max_depth": 3, "learning_rate": 0.05, "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 500, "max_depth": 5, "learning_rate": 0.05, "subsample": 0.9, "colsample_bytree": 0.8},
]

best_cfg, best_mae = None, np.inf
for cfg in param_grid:
    maes = []
    for tr_idx, va_idx in tscv.split(X_train):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        xgb = XGBRegressor(
            random_state=42,
            n_jobs=-1,
            **cfg
        )
        xgb.fit(X_tr, y_tr)
        pred = xgb.predict(X_va)
        maes.append(mean_absolute_error(y_va, pred))
    mae_cv = float(np.mean(maes))
    print(f"CV cfg {cfg} -> MAE val: {mae_cv:.2f}")
    if mae_cv < best_mae:
        best_mae, best_cfg = mae_cv, cfg

print(f"Mejor configuración XGB por CV temporal: {best_cfg} | MAE val: {best_mae:.2f}")

xgb_best = XGBRegressor(random_state=42, n_jobs=-1, **best_cfg)
xgb_best.fit(X_train, y_train)
pred_xgb = xgb_best.predict(X_test)

mae_xgb  = mean_absolute_error(y_test, pred_xgb)
rmse_xgb = np.sqrt(mean_squared_error(y_test, pred_xgb))
print(f"\nXGBoost -> MAE: {mae_xgb:.2f} | RMSE: {rmse_xgb:.2f}")

# Guardar
with open("models/trained_model_005.pkl", "wb") as f:
    pickle.dump(xgb_best, f)
print("Guardado: models/trained_model_005.pkl")

CV cfg {'n_estimators': 300, 'max_depth': 3, 'learning_rate': 0.1, 'subsample': 0.8, 'colsample_bytree': 0.8} -> MAE val: 4041.17
CV cfg {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8} -> MAE val: 3768.77
CV cfg {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.05, 'subsample': 0.9, 'colsample_bytree': 0.8} -> MAE val: 4132.94
Mejor configuración XGB por CV temporal: {'n_estimators': 500, 'max_depth': 3, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8} | MAE val: 3768.77

XGBoost -> MAE: 5593.46 | RMSE: 8512.94
Guardado: models/trained_model_005.pkl


In [22]:
from lightgbm import LGBMRegressor
os.makedirs("models", exist_ok=True)

tscv = TimeSeriesSplit(n_splits=3)
param_grid = [
    {"n_estimators": 500, "num_leaves": 31, "max_depth": -1, "learning_rate": 0.05, "subsample": 0.8, "colsample_bytree": 0.8},
    {"n_estimators": 1000, "num_leaves": 31, "max_depth": -1, "learning_rate": 0.05, "subsample": 1.0, "colsample_bytree": 1.0},
    {"n_estimators": 1000, "num_leaves": 63, "max_depth": 10, "learning_rate": 0.1, "subsample": 0.9, "colsample_bytree": 0.8},
]

best_cfg, best_mae = None, np.inf
for cfg in param_grid:
    maes = []
    for tr_idx, va_idx in tscv.split(X_train):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        lgbm = LGBMRegressor(
            random_state=42,
            n_jobs=-1,
            **cfg
        )
        lgbm.fit(X_tr, y_tr)
        pred = lgbm.predict(X_va)
        maes.append(mean_absolute_error(y_va, pred))
    mae_cv = float(np.mean(maes))
    print(f"CV cfg {cfg} -> MAE val: {mae_cv:.2f}")
    if mae_cv < best_mae:
        best_mae, best_cfg = mae_cv, cfg

print(f"Mejor configuración LGBM por CV temporal: {best_cfg} | MAE val: {best_mae:.2f}")

lgbm_best = LGBMRegressor(random_state=42, n_jobs=-1, **best_cfg)
lgbm_best.fit(X_train, y_train)
pred_lgbm = lgbm_best.predict(X_test)

mae_lgbm  = mean_absolute_error(y_test, pred_lgbm)
rmse_lgbm = np.sqrt(mean_squared_error(y_test, pred_lgbm))
print(f"\nLightGBM -> MAE: {mae_lgbm:.2f} | RMSE: {rmse_lgbm:.2f}")

with open("models/trained_model_006.pkl", "wb") as f:
    pickle.dump(lgbm_best, f)
print("Guardado: models/trained_model_006.pkl")

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 12, number of used features: 0
[LightGBM] [Info] Start training from score 20587.557617
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 22, number of used features: 0
[LightGBM] [Info] Start training from score 18471.834550
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 32, number of used features: 0
[LightGBM] [Info] Start training from score 17396.407829
CV cfg {'n_estimators': 500, 'num_leaves': 31, 'max_depth': -1, 'learning_rate': 0.05, 'subsample': 0.8, 'colsample_bytree': 0.8} -> MAE val: 7031.55
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 12, number of used features: 0
[LightGBM] [Info] Start training from score 20587.557617
[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 22, number of used features: 0
[LightGBM] [Info] Star