AutoGluon - Predicción de ventas (tn) por producto para febrero 2020

In [1]:
# 📦 1. Importar librerías
import pandas as pd

In [2]:
# 💬 Instalar AutoGluon si es necesario
%pip install autogluon.timeseries

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame




  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# 📄 2. Cargar datasets desde carpeta local
df_sellin = pd.read_csv(r"C:\Maestria\Labo 3\sell-in.txt", sep="\t")
df_productos = pd.read_csv(r"C:\Maestria\Labo 3\tb_productos.txt", sep="\t")


In [5]:
# 📄 Leer lista de productos a predecir
with open(r"C:\Maestria\Labo 3\780_a_predecir.TXT", "r") as f:
    product_ids = [int(line.strip()) for line in f if line.strip().isdigit()]


In [6]:
# 🧹 3. Preprocesamiento
# Convertir periodo a datetime
df_sellin['timestamp'] = pd.to_datetime(df_sellin['periodo'], format='%Y%m')

In [7]:
# Filtrar hasta dic 2019 y productos requeridos
df_filtered = df_sellin[
    (df_sellin['timestamp'] <= '2019-12-01') &
    (df_sellin['product_id'].isin(product_ids))
]

In [8]:
# Agregar tn por periodo, cliente y producto
df_grouped = df_filtered.groupby(['timestamp', 'customer_id', 'product_id'], as_index=False)['tn'].sum()

In [9]:
# Agregar tn total por periodo y producto
df_monthly_product = df_grouped.groupby(['timestamp', 'product_id'], as_index=False)['tn'].sum()

In [10]:
# Agregar columna 'item_id' para AutoGluon
df_monthly_product['item_id'] = df_monthly_product['product_id']

In [11]:
# ⏰ 4. Crear TimeSeriesDataFrame
ts_data = TimeSeriesDataFrame.from_data_frame(
    df_monthly_product,
    id_column='item_id',
    timestamp_column='timestamp'
)

In [12]:
# Completar valores faltantes
ts_data = ts_data.fill_missing_values()

In [13]:
# ⚙️ 5. Definir y entrenar predictor
predictor = TimeSeriesPredictor(
    prediction_length=2,
    target='tn',
    freq='MS'  # Frecuencia mensual (Month Start), 
)

predictor.fit(ts_data, num_val_windows=2, time_limit=60*60)

Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to 'c:\Users\Equipo\Downloads\AutogluonModels\ag-20250709_210932'
AutoGluon Version:  1.3.1
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          8
GPU Count:          0
Memory Avail:       1.86 GB / 11.65 GB (16.0%)
Disk Space Avail:   340.67 GB / 476.18 GB (71.5%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 2,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency 'MS'.
Provided train_data has 22375 rows (NaN fraction=0.1%), 780 time series. Median 

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x280d3529840>

In [14]:
# 🔮 6. Generar predicción
forecast = predictor.predict(ts_data)

data with frequency 'IRREG' has been resampled to frequency 'MS'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [15]:
# Extraer predicción media y filtrar febrero 2020
forecast_mean = forecast['mean'].reset_index()
print(forecast_mean.columns)

Index(['item_id', 'timestamp', 'mean'], dtype='object')


In [16]:
# Tomar solo item_id y la predicción 'mean'
resultado = forecast['mean'].reset_index()[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']

# Filtrar solo febrero 2020
resultado = forecast['mean'].reset_index()
resultado = resultado[resultado['timestamp'] == '2020-02-01']

# Renombrar columnas
resultado = resultado[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']


In [17]:
# 💾 7. Guardar archivo
resultado.to_csv(r"C:\Maestria\Labo 3\PREDICCION_CON_AUTOGLUON.csv", index=False)
resultado.head()


Unnamed: 0,product_id,tn
1,20001,1306.214976
3,20002,1071.362823
5,20003,693.191672
7,20004,523.211746
9,20005,519.866436


DE ACA PARA ABAJO NO SIRVE NADAAAAAAAAAAAAAAAAAAAAAA

In [3]:
"""
WeightedEnsemble (AutoGluon) + semilla lineal del profe
"""

import pandas as pd, numpy as np
from pathlib import Path
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

# ── RUTAS ───────────────────────────────────────────
BASE      = Path(r"C:\Maestria\Labo 3")
VENTAS    = BASE / "sell-in.txt"
LISTA     = BASE / "780_a_predecir.txt"
OUT       = BASE / "submission_t780_ag_seed.csv"
MODEL_DIR = BASE / "AutogluonModels" / "ag-latest"   # ← ajustá si tu carpeta es otra
# ── SKU mágicos & coeficientes ──────────────────────
MAGICOS = [20002,20003,20006,20010,20011,20018,20019,20021,20026,20028,
           20035,20039,20042,20044,20045,20046,20049,20051,20052,20053,
           20055,20008,20001,20017,20086,20180,20193,20320,20532,20612,
           20637,20807,20838]

B0  = 0.441467
B   = np.array([-0.001339, 0.236558, 0.178208, -0.060031,
                -0.161875, -0.007775, 0.151936, 0.043933,
                 0.142839, 0.103804, 0.119211, 0.073671])   # lag0…lag11
# ────────────────────────────────────────────────────

# 1) datos
df = pd.read_csv(VENTAS, sep=None, engine="python")
sku780 = [int(l.strip()) for l in open(LISTA) if l.strip() and not l.lower().startswith("product")]
df = df[df.product_id.isin(sku780)]

df["periodo"]   = df["periodo"].astype(str).str.zfill(6)
df["timestamp"] = pd.to_datetime(df.periodo, format="%Y%m")

# TimeSeriesDataFrame para AutoGluon
ts_df = (df.rename(columns={"product_id":"item_id", "tn":"target"})
           .loc[:, ["timestamp", "item_id", "target"]])

ts_data = (TimeSeriesDataFrame.from_data_frame(
              ts_df, id_column="item_id", timestamp_column="timestamp")
           .convert_frequency("MS")
           .fill_missing_values())

# 2) cargar (o volver a entrenar) WeightedEnsemble
try:
    predictor = TimeSeriesPredictor.load(MODEL_DIR)
except FileNotFoundError:
    print("⚠️  Carpeta de modelos no encontrada. Entrenando nuevamente (30 min)…")
    predictor = TimeSeriesPredictor(
        target="target", prediction_length=1, freq="MS")
    predictor.fit(ts_data, time_limit=1800, refit_full=True)

# 3) predicción AutoGluon (WeightedEnsemble) para t+1
ag_pred = (predictor.predict(ts_data)["mean"]
             .reset_index()
             .groupby("item_id", as_index=False)
             .last()
             .rename(columns={"item_id":"product_id", "mean":"tn_ag"}))

# 4) semilla del profe con los 12 lags del último mes
mensual = (df.groupby(["product_id","timestamp"])["target"]
             .sum().unstack(level=0).fillna(0).sort_index())
last = mensual.index.max()

seed_rows = []
for pid in sku780:
    serie = mensual[pid]
    if len(serie) >= 12:
        lags = np.array([serie.iloc[-1 - l] for l in range(12)])
        seed_val = B0 + B.dot(lags)
    else:
        seed_val = np.nan
    seed_rows.append((pid, seed_val))
seed_df = pd.DataFrame(seed_rows, columns=["product_id","tn_seed"])

# 5) promedio 12 m (fallback)
prom12 = (mensual.tail(12).mean()
            .reset_index()
            .rename(columns={0:"tn_prom"}))

# 6) mezclar predicciones
combo = (prom12.merge(ag_pred,  on="product_id", how="left")
                 .merge(seed_df, on="product_id", how="left"))

def final(row):
    ag, sd, pr = row.tn_ag, row.tn_seed, row.tn_prom
    pid = row.product_id
    if not np.isnan(sd) and not np.isnan(ag):
        w = 0.85 if pid in MAGICOS else 0.55   # pesos empíricamente buenos
        return w*sd + (1-w)*ag
    if not np.isnan(sd):
        return sd
    if not np.isnan(ag):
        return ag
    return pr

combo["tn"] = combo.apply(final, axis=1).round(5)

# 7) guardar CSV
combo[["product_id","tn"]].to_csv(OUT, index=False, float_format="%.5f")
print("✅  Archivo listo:", OUT)


Trying to fill missing values in an unsorted dataframe. It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`
Beginning AutoGluon training... Time limit = 1800s
AutoGluon will save models to 'c:\Maestria\Labo 3\AutogluonModels\ag-20250709_221900'
AutoGluon Version:  1.3.1
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          8
GPU Count:          0
Memory Avail:       2.50 GB / 11.65 GB (21.5%)
Disk Space Avail:   339.55 GB / 476.18 GB (71.3%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 1,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': True,
 'skip_model_selection': False,
 'target': 'target',
 'time_limit': 1800,
 'verbosity': 2}



⚠️  Carpeta de modelos no encontrada. Entrenando nuevamente (30 min)…


Provided train_data has 22375 rows, 780 time series. Median time series length is 36 (min=4, max=36). 
	Removing 24 short time series from train_data. Only series with length >= 6 will be used for training.
	After filtering, train_data has 22264 rows, 756 time series. Median time series length is 36 (min=6, max=36). 

Provided data contains following columns:
	target: 'target'

AutoGluon will gauge predictive performance using evaluation metric: 'WQL'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.

Starting training. Start time is 2025-07-09 19:19:00
Models that will be trained: ['SeasonalNaive', 'RecursiveTabular', 'DirectTabular', 'NPTS', 'DynamicOptimizedTheta', 'AutoETS', 'ChronosZeroShot[bolt_base]', 'ChronosFineTuned[bolt_small]', 'TemporalFusionTransformer', 'DeepAR', 'PatchTST', 'TiDE']
Training timeseries model SeasonalNaive. Training for up to 138.4s of the 1799.4s of remaining time.


KeyError: 'Column not found: target'

In [4]:
import pandas as pd, numpy as np
from pathlib import Path
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

# ─── Rutas ───────────────────────────────
BASE   = Path(r"C:\Maestria\Labo 3")
VENTAS = BASE / "sell-in.txt"
LISTA  = BASE / "780_a_predecir.txt"

# ①  Poné aquí la carpeta que se creó (¡ajustala si el nombre cambia!)
MODEL_DIR = Path(r"C:\Maestria\Labo 3\AutogluonModels\ag-20250709_221900")

OUT    = BASE / "submission_t780_ag_seed.csv"

# ─── Semilla del profe ───────────────────
MAGICOS = [20002,20003,20006,20010,20011,20018,20019,20021,20026,20028,
           20035,20039,20042,20044,20045,20046,20049,20051,20052,20053,
           20055,20008,20001,20017,20086,20180,20193,20320,20532,20612,
           20637,20807,20838]
B0 = 0.441467
B  = np.array([-0.001339, 0.236558, 0.178208, -0.060031,
               -0.161875, -0.007775, 0.151936, 0.043933,
                0.142839, 0.103804, 0.119211, 0.073671])

# ─── Cargar datos una sola vez ───────────
df = pd.read_csv(VENTAS, sep=None, engine="python")
sku780 = [int(l.strip()) for l in open(LISTA) if l.strip() and not l.lower().startswith("product")]
df = df[df.product_id.isin(sku780)]
df["periodo"]   = df["periodo"].astype(str).str.zfill(6)
df["timestamp"] = pd.to_datetime(df.periodo, format="%Y%m")

# Tabla mensual (para seed y promedio)
mensual = (df.groupby(["product_id","timestamp"])["tn"]
             .sum().unstack(level=0).fillna(0).sort_index())
last = mensual.index.max()

# ─── 2. Cargar el WeightedEnsemble ya entrenado ───
print("🔄  Cargando ensemble desde:", MODEL_DIR)
predictor = TimeSeriesPredictor.load(MODEL_DIR)

# ─── 3. Predicción AutoGluon para t+1 ───
ts_df = (df.rename(columns={"product_id":"item_id", "tn":"target"})
           .loc[:, ["timestamp","item_id","target"]])
ts_data = TimeSeriesDataFrame.from_data_frame(
    ts_df, id_column="item_id", timestamp_column="timestamp"
).convert_frequency("MS").fill_missing_values()

ag_pred = (predictor.predict(ts_data)["mean"]
             .reset_index()
             .groupby("item_id", as_index=False)
             .last()
             .rename(columns={"item_id":"product_id","mean":"tn_ag"}))

# ─── 4. Semilla del profe ───
seed_vals = []
for pid in sku780:
    s = mensual[pid]
    if len(s) >= 12:
        lags = np.array([s.iloc[-1-l] for l in range(12)])
        seed_pred = B0 + B.dot(lags)
    else:
        seed_pred = np.nan
    seed_vals.append((pid, seed_pred))
seed_df = pd.DataFrame(seed_vals, columns=["product_id","tn_seed"])

# ─── 5. Promedio 12 m (fallback) ───
prom12 = mensual.tail(12).mean().reset_index().rename(columns={0:"tn_prom"})

# ─── 6. Blend final ───
df_all = (prom12.merge(ag_pred, on="product_id", how="left")
                  .merge(seed_df, on="product_id", how="left"))

def blend(row):
    ag, sd, pr = row.tn_ag, row.tn_seed, row.tn_prom
    pid = row.product_id
    if not np.isnan(sd) and not np.isnan(ag):
        w = 0.85 if pid in MAGICOS else 0.55
        return w*sd + (1-w)*ag
    if not np.isnan(sd): return sd
    if not np.isnan(ag): return ag
    return pr

df_all["tn"] = df_all.apply(blend, axis=1).round(5)

# ─── 7. Guardar CSV ───
df_all[["product_id","tn"]].to_csv(OUT, index=False, float_format="%.5f")
print("✅  Predicción lista →", OUT)


Loading predictor from path C:\Maestria\Labo 3\AutogluonModels\ag-20250709_221900


🔄  Cargando ensemble desde: C:\Maestria\Labo 3\AutogluonModels\ag-20250709_221900


Trying to fill missing values in an unsorted dataframe. It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble_FULL


✅  Predicción lista → C:\Maestria\Labo 3\submission_t780_ag_seed.csv


esto de aca abajo no sirvio para nada

In [2]:
from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame
import pandas as pd, numpy as np
from pathlib import Path

# ─── rutas ─────────────────────────────────────
BASE = Path(r"C:\Maestria\Labo 3")
VENTAS = BASE / "sell-in.txt"
LISTA  = BASE / "780_a_predecir.txt"
OUT    = BASE / "PREDICCION_CON_AUTOGLUON2.csv"
# ───────────────────────────────────────────────

# 1) datos
df = pd.read_csv(VENTAS, sep=None, engine="python")
sku = [int(l.strip()) for l in open(LISTA) if l.strip() and not l.lower().startswith("product")]
df = df[df.product_id.isin(sku)]

df["periodo"] = df["periodo"].astype(str).str.zfill(6)
df["timestamp"] = pd.to_datetime(df.periodo, format="%Y%m")

ts_df = (df.rename(columns={"product_id":"item_id", "tn":"target"})
           .loc[:, ["timestamp", "item_id", "target"]])

ts_data = (TimeSeriesDataFrame.from_data_frame(
               ts_df, id_column="item_id", timestamp_column="timestamp")
           .convert_frequency("MS")
           .fill_missing_values())

# 2) predictor sin random_seed en el init
predictor = TimeSeriesPredictor(
    target="target",
    prediction_length=1,
    freq="MS"
)

# 3) fit con refit_full y seed reproducible
predictor.fit(
    ts_data,
    time_limit=3600,       # 1 hora como antes
    refit_full=True,       # re-entrena con todo el set
    random_seed=123        # seed opcional (aquí sí está permitido)
)

# 4) predicción mes futuro
forecast = predictor.predict(ts_data)
resultado = (forecast["mean"]
             .reset_index()
             .groupby("item_id", as_index=False)
             .last()                         # solo la última fecha
             .rename(columns={"item_id":"product_id", "mean":"tn"}))
resultado["tn"] = resultado["tn"].round(5)

# 5) guardar CSV
resultado.to_csv(OUT, index=False, float_format="%.5f")
print("✅  Archivo guardado en:", OUT)


Trying to fill missing values in an unsorted dataframe. It is highly recommended to call `ts_df.sort_index()` before calling `ts_df.fill_missing_values()`
Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to 'c:\Maestria\Labo 3\AutogluonModels\ag-20250709_215017'
AutoGluon Version:  1.3.1
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.26100
CPU Count:          8
GPU Count:          0
Memory Avail:       3.17 GB / 11.65 GB (27.2%)
Disk Space Avail:   339.74 GB / 476.18 GB (71.3%)

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 1,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': True,
 'skip_model_selection': False,
 'target': 'target',
 'time_limit': 3600,
 'verbosity': 2}

P

✅  Archivo guardado en: C:\Maestria\Labo 3\PREDICCION_CON_AUTOGLUON2.csv


In [None]:
# Eliminar columna 'timestamp'
df = df.drop(columns=['timestamp'])

# Guardar archivo con solo dos columnas
df.to_csv(r"C:\Maestria\Labo 3\PREDICCION_CON_AUTOGLUON2.csv", index=False)
