In [5]:
import os
os.environ["LOKY_MAX_CPU_COUNT"] = "4"  # reemplaza 4 por la cantidad de cores que quieras usar

import pandas as pd
import numpy as np
import joblib

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, roc_auc_score, f1_score, accuracy_score
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.base import clone
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier


import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=DeprecationWarning)


In [6]:
# =========================
# 0) Load data
# =========================
parquet_std = r"C:\PROY_FINAL_ML\data\processed\cba_clean_std.parquet"
parquet_clean = r"C:\PROY_FINAL_ML\data\processed\cba_clean.parquet"

# Read the `_std` parquet when it exists on disk, otherwise the `_clean` one.
path_in = parquet_clean
if os.path.exists(parquet_std):
    path_in = parquet_std
df = pd.read_parquet(path_in)

# Resolve which column names this dataset actually carries (first match wins;
# the trailing value is the fallback when no candidate is present).
col_producto = next((c for c in ("producto_std",) if c in df.columns), "producto")
col_present = next((c for c in ("presentacion", "medida") if c in df.columns), None)
col_cadena = next((c for c in ("cadena",) if c in df.columns), None)

# Parse the month column as datetime, then order rows by month, product and
# store, renumbering the index from 0.
df["anio_mes"] = pd.to_datetime(df["anio_mes"])
df = df.sort_values(
    by=["anio_mes", col_producto, "establecimiento"],
    ignore_index=True,
)

In [7]:
# =========================
# 1) Feature engineering
# =========================
# Every time series is identified by the pair (product, store); these keys are
# used to group rows before computing per-series features.
grp_keys = [col_producto, "establecimiento"]

def add_time_features(g, lags=(1, 3, 6), alert_threshold=0.05):
    """Add lag, rolling and next-step target columns to one price series.

    Parameters
    ----------
    g : pandas.DataFrame
        Rows of a single series. Must contain the columns ``anio_mes`` and
        ``precio``. The input is not modified; a sorted copy is returned.
    lags : iterable of int, default (1, 3, 6)
        Row offsets for which a ``lag_<L>`` column is created.
    alert_threshold : float, default 0.05
        Minimum relative change to the next row that sets ``alert_next`` to 1.

    Returns
    -------
    pandas.DataFrame
        Copy of ``g`` sorted by ``anio_mes`` with the extra columns
        ``lag_<L>``, ``MA_3``, ``MA_6``, ``VOL_3``, ``VOL_6``, ``y_next``,
        ``pct_change_next`` and ``alert_next``.

    Notes
    -----
    All shifts and windows are positional (row based).
    NOTE(review): this presumably expects one row per month with no gaps in
    each series - confirm against the data.
    """
    # Stable sort: rows sharing the same month keep their incoming order.
    g = g.sort_values("anio_mes", kind="stable").copy()
    precio = g["precio"]

    # Lags: price observed L rows earlier.
    for L in lags:
        g[f"lag_{L}"] = precio.shift(L)

    # Rolling means and standard deviations; windows include the current row.
    g["MA_3"] = precio.rolling(window=3, min_periods=1).mean()
    g["MA_6"] = precio.rolling(window=6, min_periods=1).mean()
    g["VOL_3"] = precio.rolling(window=3, min_periods=2).std()
    g["VOL_6"] = precio.rolling(window=6, min_periods=2).std()

    # Regression target: price in the following row.
    g["y_next"] = precio.shift(-1)

    # Relative change to the following row and the derived binary alert.
    pct = (g["y_next"] - precio) / precio
    g["pct_change_next"] = pct

    # A NaN change means the label is unknown; keep it as <NA> instead of
    # letting the comparison silently turn it into 0 ("no alert").
    alert = (pct >= alert_threshold).astype("Int64")
    alert[pct.isna()] = pd.NA
    g["alert_next"] = alert
    return g

# Compute the per-series features one group at a time. Iterating the groups
# hands each full sub-frame (grouping columns included) to the helper, so this
# does not depend on `groupby.apply` operating on the grouping columns, which
# recent pandas versions deprecate. `sort_index()` then restores the row order
# the frame had before grouping, so later positional splits do not depend on
# how pandas orders the concatenated groups.
df = pd.concat(
    [add_time_features(g) for _, g in df.groupby(grp_keys, sort=False)]
).sort_index()

# Monthly median price per product (across stores).
med_mensual = (
    df.groupby([col_producto, "anio_mes"])["precio"]
      .median()
      .rename("mediana_prod_mes")
      .reset_index()
)
df = df.merge(med_mensual, on=[col_producto, "anio_mes"], how="left")
df["ratio_precio_mediana"] = df["precio"] / df["mediana_prod_mes"]

# Calendar features.
df["year"] = df["anio_mes"].dt.year
df["month"] = df["anio_mes"].dt.month

# Drop rows without a target (the last row of each series has no y_next).
df_model = df.dropna(subset=["y_next"]).copy()

In [8]:
# =========================
# 2) Train / Valid / Test sets split strictly by time
# =========================
# Rule: every train month < every valid month < every test month.
dates_sorted = np.sort(df_model["anio_mes"].unique())
n = len(dates_sorted)

# With fewer than 6 distinct months use 50% / 25% / 25%, otherwise 60% / 20% / 20%.
frac_cut1, frac_cut2 = (0.5, 0.75) if n < 6 else (0.6, 0.8)
cut1 = dates_sorted[int(n * frac_cut1)]
cut2 = dates_sorted[int(n * frac_cut2)]

# cut1 is the first validation month, cut2 the first test month.
is_train = df_model["anio_mes"] < cut1
is_test = df_model["anio_mes"] >= cut2
train = df_model[is_train]
valid = df_model[(df_model["anio_mes"] >= cut1) & (df_model["anio_mes"] < cut2)]
test = df_model[is_test]

print("Cortes temporales:")
print("Train hasta:", train["anio_mes"].max())
print("Valid:", valid["anio_mes"].min(), "→", valid["anio_mes"].max())
print("Test desde:", test["anio_mes"].min())

Cortes temporales:
Train hasta: 2021-10-01 00:00:00
Valid: 2021-11-01 00:00:00 → 2022-01-01 00:00:00
Test desde: 2022-02-01 00:00:00


In [9]:
# =========================
# 3) Baselines (lag 1, 3 and 6)
# =========================
# Create the lag columns on df_model only when they are missing.
for L in (3, 6):
    col = f"lag_{L}"
    if col in df_model.columns:
        continue
    df_model[col] = df_model.groupby(grp_keys)["precio"].shift(L)

# Work on copies so that assigning columns does not raise SettingWithCopyWarning.
train_copy, valid_copy, test_copy = (part.copy() for part in (train, valid, test))

# Copy the lag columns from df_model into every subset, aligned on the index.
for part_copy in (train_copy, valid_copy, test_copy):
    for L in (3, 6):
        col = f"lag_{L}"
        part_copy[col] = df_model.loc[part_copy.index, col]

# Funciones métricas seguras
def safe_rmse(y_true, y_pred):
    """Root mean squared error over the positions where both inputs are non-NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of float
        Observed and predicted values of equal length. Lists, tuples, NumPy
        arrays and pandas Series are accepted; they are converted to float
        arrays before masking.

    Returns
    -------
    float
        The RMSE of the valid pairs, or ``nan`` when no position has both
        values present (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    # Avoid the "mean of empty slice" RuntimeWarning; the result is NaN anyway.
    if not mask.any():
        return float("nan")
    diff = y_true[mask] - y_pred[mask]
    return float(np.sqrt(np.mean(diff ** 2)))

def safe_mae(y_true, y_pred):
    """Mean absolute error over the positions where both inputs are non-NaN.

    Parameters
    ----------
    y_true, y_pred : array-like of float
        Observed and predicted values of equal length. Lists, tuples, NumPy
        arrays and pandas Series are accepted; they are converted to float
        arrays before masking.

    Returns
    -------
    float
        The MAE of the valid pairs, or ``nan`` when no position has both
        values present (including empty input).
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    mask = ~(np.isnan(y_true) | np.isnan(y_pred))
    # Avoid the "mean of empty slice" RuntimeWarning; the result is NaN anyway.
    if not mask.any():
        return float("nan")
    return float(np.mean(np.abs(y_true[mask] - y_pred[mask])))

# Evaluate the naive baseline (current price) and the lag-3 / lag-6 baselines
# against y_next on every subset.
# NOTE(review): `lag_L` is the price L rows before the current row, while
# `y_next` is one row ahead, so the "lag 3" / "lag 6" baselines sit L+1 rows
# behind the target, whereas the "lag 1" baseline (current price) sits exactly
# 1 row behind. Confirm whether that offset is intended.
partitions = {"TRAIN": train_copy, "VALID": valid_copy, "TEST": test_copy}

for part_name, part in partitions.items():
    y_true = part["y_next"].values

    # Naive baseline: next price equals the current price.
    y_hat_lv = part["precio"].values
    rmse_lv = safe_rmse(y_true, y_hat_lv)
    mae_lv = safe_mae(y_true, y_hat_lv)
    print(f"[{part_name}] Baseline LV (lag 1) → RMSE: {rmse_lv:.4f} | MAE: {mae_lv:.4f}")

    # Lagged-price baselines, scored only on rows where the lag exists.
    for L in [3, 6]:
        col = f"lag_{L}"
        mask = part[col].notna()
        if not mask.any():
            print(f"[{part_name}] Baseline Estacional lag {L} → no disponible")
            continue
        y_hat = part.loc[mask, col].values
        y_true_masked = part.loc[mask, "y_next"].values
        rmse_lag = safe_rmse(y_true_masked, y_hat)
        mae_lag = safe_mae(y_true_masked, y_hat)
        print(f"[{part_name}] Baseline Estacional lag {L} → RMSE: {rmse_lag:.4f} | MAE: {mae_lag:.4f}")

print("Rango TRAIN:", train["anio_mes"].min(), "a", train["anio_mes"].max())
print("Cantidad meses únicos en TRAIN:", train["anio_mes"].nunique())

[TRAIN] Baseline LV (lag 1) → RMSE: 0.2440 | MAE: 0.0690
[TRAIN] Baseline Estacional lag 3 → RMSE: 0.3678 | MAE: 0.1378
[TRAIN] Baseline Estacional lag 6 → RMSE: 0.4325 | MAE: 0.1724
[VALID] Baseline LV (lag 1) → RMSE: 0.2629 | MAE: 0.0695
[VALID] Baseline Estacional lag 3 → RMSE: 0.3875 | MAE: 0.1330
[VALID] Baseline Estacional lag 6 → RMSE: 0.4243 | MAE: 0.1615
[TEST] Baseline LV (lag 1) → RMSE: 0.2529 | MAE: 0.0636
[TEST] Baseline Estacional lag 3 → RMSE: 0.4220 | MAE: 0.1483
[TEST] Baseline Estacional lag 6 → RMSE: 0.4651 | MAE: 0.1718
Rango TRAIN: 2021-01-01 00:00:00 a 2021-10-01 00:00:00
Cantidad meses únicos en TRAIN: 10


In [10]:
# =========================
# 4) Feature preparation and scikit-learn pipelines
# =========================
# Numeric and categorical feature names.
num_feats = ["precio", "lag_1", "lag_3", "lag_6", "MA_3", "MA_6", "VOL_3", "VOL_6",
             "ratio_precio_mediana", "mediana_prod_mes", "year", "month"]
cat_feats = [col_producto, "establecimiento"]
# Optional categorical columns are added only when the dataset has them.
if col_present is not None:
    cat_feats.append(col_present)
if col_cadena is not None:
    cat_feats.append(col_cadena)

# Preprocessing: median-impute + standardize numeric columns, mode-impute +
# one-hot encode categorical columns; any other column is dropped.
preproc_reg = ColumnTransformer(
    transformers=[
        ("num", Pipeline(steps=[
            ("imp", SimpleImputer(strategy="median")),
            ("sc", StandardScaler())
        ]), num_feats),
        ("cat", Pipeline(steps=[
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore", sparse_output=False))
        ]), cat_feats)
    ],
    remainder="drop"
)

# Same preprocessing recipe for classification, but as an independent
# (unfitted) copy: Pipeline fits its steps in place, so sharing one object
# would make the regression and classification pipelines overwrite each
# other's fitted transformer.
preproc_clf = clone(preproc_reg)

# Final design matrices and regression targets.
X_train, y_train = train[num_feats + cat_feats], train["y_next"]
X_valid, y_valid = valid[num_feats + cat_feats], valid["y_next"]
X_test,  y_test  = test[num_feats + cat_feats],  test["y_next"]

# Classification targets (alert); missing labels are treated as 0.
y_train_cls = train["alert_next"].fillna(0).astype(int)
y_valid_cls = valid["alert_next"].fillna(0).astype(int)
y_test_cls  = test["alert_next"].fillna(0).astype(int)

In [11]:
# =========================
# 5) REGRESSION models
# =========================
models_reg = {
    "Ridge": Ridge(alpha=1.0, random_state=42),
    # Disabled alternative, kept for reference:
    # "RFReg": RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1, max_depth=None),
    "HGBReg": HistGradientBoostingRegressor(max_iter=300, random_state=42),
}

# Fit each model on train and report RMSE / MAE / R2 on valid and test.
for name, base in models_reg.items():
    pipe = Pipeline(steps=[("pre", preproc_reg), ("model", base)])
    pipe.fit(X_train, y_train)
    pred_val = pipe.predict(X_valid)
    pred_tst = pipe.predict(X_test)

    # The VALID line is preceded by a blank line; labels are padded so the
    # arrows line up in the printed report.
    report_rows = (
        ("\n", "VALID", y_valid, pred_val),
        ("", "TEST ", y_test, pred_tst),
    )
    for prefix, label, y_ref, y_hat in report_rows:
        rmse_part = np.sqrt(mean_squared_error(y_ref, y_hat))
        mae_part = mean_absolute_error(y_ref, y_hat)
        r2_part = r2_score(y_ref, y_hat)
        print(f"{prefix}[REG:{name}] {label} → RMSE:{rmse_part:.4f} | MAE:{mae_part:.4f} | R2:{r2_part:.3f}")


[REG:Ridge] VALID → RMSE:0.6233 | MAE:0.2383 | R2:0.927
[REG:Ridge] TEST  → RMSE:0.9126 | MAE:0.4252 | R2:0.839

[REG:HGBReg] VALID → RMSE:0.2881 | MAE:0.1025 | R2:0.984
[REG:HGBReg] TEST  → RMSE:0.7383 | MAE:0.2370 | R2:0.894


In [12]:
# =========================
# 6) CLASSIFICATION models (alert)
# =========================
models_clf = {
    "LogReg": LogisticRegression(max_iter=200, n_jobs=None, class_weight="balanced"),
    # Disabled alternative, kept for reference:
    # "RFCls": RandomForestClassifier(n_estimators=400, random_state=42, n_jobs=-1, class_weight="balanced"),
    "HGBCls": HistGradientBoostingClassifier(max_iter=300, random_state=42),
}

# Fit each classifier on train and report accuracy / F1 / ROC-AUC on valid and test.
for name, base in models_clf.items():
    pipe = Pipeline(steps=[("pre", preproc_clf), ("model", base)])
    pipe.fit(X_train, y_train_cls)
    p_val = pipe.predict(X_valid)
    p_tst = pipe.predict(X_test)

    # ROC-AUC uses the positive-class probability when the model exposes it,
    # otherwise it falls back to the hard predictions.
    if hasattr(pipe["model"], "predict_proba"):
        proba_val = pipe.predict_proba(X_valid)[:, 1]
        proba_tst = pipe.predict_proba(X_test)[:, 1]
    else:
        proba_val, proba_tst = p_val, p_tst

    report_rows = (
        ("\n", "VALID", y_valid_cls, p_val, proba_val),
        ("", "TEST ", y_test_cls, p_tst, proba_tst),
    )
    for prefix, label, y_ref, y_hat, y_score in report_rows:
        acc_part = accuracy_score(y_ref, y_hat)
        f1_part = f1_score(y_ref, y_hat)
        auc_part = roc_auc_score(y_ref, y_score)
        print(f"{prefix}[CLF:{name}] {label} → Acc:{acc_part:.3f} | F1:{f1_part:.3f} | ROC-AUC:{auc_part:.3f}")


[CLF:LogReg] VALID → Acc:0.759 | F1:0.375 | ROC-AUC:0.837
[CLF:LogReg] TEST  → Acc:0.772 | F1:0.317 | ROC-AUC:0.808

[CLF:HGBCls] VALID → Acc:0.914 | F1:0.402 | ROC-AUC:0.912
[CLF:HGBCls] TEST  → Acc:0.928 | F1:0.379 | ROC-AUC:0.897


In [13]:
# =========================
# 7) Hyperparameter tuning con validación temporal para Ridge
# =========================
# Definimos funciones métricas consistentes
def rmse(y_true, y_pred):
    """Return the square root of sklearn's mean squared error of the inputs."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)

def mae(y_true, y_pred):
    """Return sklearn's mean absolute error of the inputs."""
    abs_error = mean_absolute_error(y_true, y_pred)
    return abs_error
    
# Forward-chaining cross-validation splitter (adjust n_splits if needed).
# It splits by row position, so the training rows must be in time order.
tscv = TimeSeriesSplit(n_splits=3)

# Ridge behind the shared preprocessing step.
pipe_ridge = Pipeline(steps=[
    ("pre", preproc_reg),
    ("model", Ridge()),
])

# Candidate regularization strengths.
param_grid_ridge = {"model__alpha": [0.1, 1.0, 10.0, 100.0]}

# Grid search scored with negative MSE (sklearn maximizes the score).
grid_search_ridge = GridSearchCV(
    estimator=pipe_ridge,
    param_grid=param_grid_ridge,
    scoring="neg_mean_squared_error",
    cv=tscv,
    n_jobs=-1,
    verbose=2,
)

print("Iniciando búsqueda de hiperparámetros para Ridge con validación temporal...")
grid_search_ridge.fit(X_train, y_train)

print("Mejor alpha:", grid_search_ridge.best_params_["model__alpha"])
print("Mejor score CV (neg MSE):", grid_search_ridge.best_score_)

# Score the refitted best pipeline on the validation and test sets.
best_ridge = grid_search_ridge.best_estimator_
y_valid_pred = best_ridge.predict(X_valid)
y_test_pred = best_ridge.predict(X_test)

for label, y_ref, y_hat in (("VALID", y_valid, y_valid_pred), ("TEST ", y_test, y_test_pred)):
    print(f"[RIDGE] {label} RMSE: {rmse(y_ref, y_hat):.4f} | MAE: {mae(y_ref, y_hat):.4f}")

Iniciando búsqueda de hiperparámetros para Ridge con validación temporal...
Fitting 3 folds for each of 4 candidates, totalling 12 fits
Mejor alpha: 100.0
Mejor score CV (neg MSE): -0.1869884310312663
[RIDGE] VALID RMSE: 0.4396 | MAE: 0.2075
[RIDGE] TEST  RMSE: 0.8030 | MAE: 0.3348


In [14]:
# =========================
# 8) Hyperparameter tuning con validación temporal para HistGradientBoostingRegressor
# =========================
from sklearn.ensemble import HistGradientBoostingRegressor

# Gradient boosting regressor behind the shared preprocessing step.
pipe_hgb = Pipeline(steps=[
    ("pre", preproc_reg),
    ("model", HistGradientBoostingRegressor(random_state=42, max_iter=200)),
])

# Small hyperparameter grid to keep the search fast.
param_grid_hgb = {
    "model__max_depth": [3, 5, None],
    "model__min_samples_leaf": [20, 50],
    "model__learning_rate": [0.01, 0.1],
}

# Grid search with the time-ordered splitter, scored with negative MSE.
grid_search_hgb = GridSearchCV(
    estimator=pipe_hgb,
    param_grid=param_grid_hgb,
    scoring="neg_mean_squared_error",
    cv=tscv,
    n_jobs=-1,
    verbose=2,
)

print("Iniciando búsqueda de hiperparámetros para HistGradientBoostingRegressor con validación temporal...")
grid_search_hgb.fit(X_train, y_train)

print("Mejores parámetros:", grid_search_hgb.best_params_)
print("Mejor score CV (neg MSE):", grid_search_hgb.best_score_)

# Score the refitted best pipeline on the validation and test sets.
best_hgb = grid_search_hgb.best_estimator_
y_valid_pred_hgb = best_hgb.predict(X_valid)
y_test_pred_hgb = best_hgb.predict(X_test)

for label, y_ref, y_hat in (("VALID", y_valid, y_valid_pred_hgb), ("TEST ", y_test, y_test_pred_hgb)):
    print(f"[HGB] {label} → RMSE: {rmse(y_ref, y_hat):.4f} | MAE: {mae(y_ref, y_hat):.4f}")

Iniciando búsqueda de hiperparámetros para HistGradientBoostingRegressor con validación temporal...
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Mejores parámetros: {'model__learning_rate': 0.1, 'model__max_depth': 5, 'model__min_samples_leaf': 50}
Mejor score CV (neg MSE): -0.12521218083560745
[HGB] VALID → RMSE: 0.2687 | MAE: 0.1009
[HGB] TEST  → RMSE: 0.7374 | MAE: 0.2372


In [18]:
# =========================
# Persist models, preprocessors, datasets and feature lists
# =========================
MODELS_DIR = "C:/PROY_FINAL_ML/models"
PROCESSED_DIR = "C:/PROY_FINAL_ML/data/processed"

# joblib.dump does not create missing folders; make sure both exist.
os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(PROCESSED_DIR, exist_ok=True)

# --- Regression models (best pipelines found by the grid searches) ---
joblib.dump(best_ridge, f"{MODELS_DIR}/best_ridge.pkl")
joblib.dump(best_hgb, f"{MODELS_DIR}/best_hgb.pkl")

# --- Classification model ---
# `pipe` is whatever pipeline was fitted last in the notebook. Verify it really
# is the HistGradientBoosting classifier before writing it under that name, so
# an out-of-order run cannot silently save the wrong model.
if not isinstance(pipe.named_steps["model"], HistGradientBoostingClassifier):
    raise RuntimeError(
        "`pipe` is not the HistGradientBoostingClassifier pipeline; "
        "re-run the classification cell before saving."
    )
joblib.dump(pipe, f"{MODELS_DIR}/best_hgb_clf.pkl")

# --- Preprocessors ---
joblib.dump(preproc_reg, f"{MODELS_DIR}/preproc_reg.pkl")
joblib.dump(preproc_clf, f"{MODELS_DIR}/preproc_clf.pkl")

# --- Datasets (features, regression targets, classification targets) ---
datasets_to_save = {
    "X_train": X_train, "X_valid": X_valid, "X_test": X_test,
    "y_train": y_train, "y_valid": y_valid, "y_test": y_test,
    "y_train_cls": y_train_cls, "y_valid_cls": y_valid_cls, "y_test_cls": y_test_cls,
}
for dataset_name, dataset in datasets_to_save.items():
    joblib.dump(dataset, f"{PROCESSED_DIR}/{dataset_name}.pkl")

# --- Feature lists ---
# Save the lists the models were actually trained with. Re-declaring them here
# with hard-coded column names could disagree with the columns resolved when
# the data was loaded.
joblib.dump(list(num_feats), f"{PROCESSED_DIR}/num_feats.pkl")
joblib.dump(list(cat_feats), f"{PROCESSED_DIR}/cat_feats.pkl")

print("✅ Datasets y features guardados en 'data/processed'.")


print("✅ Todos los modelos y preprocesadores se han guardado correctamente en 'models'.")


✅ Datasets y features guardados en 'data/processed'.
✅ Todos los modelos y preprocesadores se han guardado correctamente en 'models'.
