In [1]:
# 📦 1. Importar librerías
import pandas as pd

In [2]:
# 💬 Instalar AutoGluon si es necesario
%pip install autogluon.timeseries

from autogluon.timeseries import TimeSeriesPredictor, TimeSeriesDataFrame

Collecting autogluon.timeseries
  Downloading autogluon.timeseries-1.3.1-py3-none-any.whl.metadata (12 kB)
Collecting lightning<2.7,>=2.2 (from autogluon.timeseries)
  Downloading lightning-2.5.2-py3-none-any.whl.metadata (38 kB)
Collecting transformers<4.50,>=4.38.0 (from transformers[sentencepiece]<4.50,>=4.38.0->autogluon.timeseries)
  Downloading transformers-4.49.0-py3-none-any.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Collecting gluonts<0.17,>=0.15.0 (from autogluon.timeseries)
  Downloading gluonts-0.16.2-py3-none-any.whl.metadata (9.8 kB)
Collecting statsforecast<2.0.2,>=1.7.0 (from autogluon.timeseries)
  Downloading statsforecast-2.0.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (29 kB)
Collecting mlforecast<0.14,>0.13 (from autogluon.timeseries)
  Downloading mlforecast-0.13.6-py3-none-any.whl.metadata (12 kB)
Collecting utilsforecast<0.2.11

In [3]:
# 📄 2. Cargar datasets desde carpeta local
import pandas as pd

BASE = "/kaggle/input/labo3-sales-data"

# i) Carga sin parsear fechas
df_sellin = pd.read_csv(
    f"{BASE}/sell-in.txt",
    sep="\t",
)
df_productos = pd.read_csv(f"{BASE}/tb_productos.txt", sep="\t")


In [4]:
# 📄 Leer lista de productos a predecir
with open(f"{BASE}/780_a_predecir.txt", "r") as f:
    product_ids = [
        int(line.strip())
        for line in f
        if line.strip().isdigit()
    ]


In [5]:
# 🧹 3. Preprocesamiento
# Convertir periodo a datetime
df_sellin['timestamp'] = pd.to_datetime(df_sellin['periodo'], format='%Y%m')

In [6]:
# Filtrar hasta dic 2019 y productos requeridos
df_filtered = df_sellin[
    (df_sellin['timestamp'] <= '2019-12-01') &
    (df_sellin['product_id'].isin(product_ids))
]

In [7]:
# Agregar tn por periodo, cliente y producto
df_grouped = df_filtered.groupby(['timestamp', 'customer_id', 'product_id'], as_index=False)['tn'].sum()

In [8]:
# Agregar tn total por periodo y producto
df_monthly_product = df_grouped.groupby(['timestamp', 'product_id'], as_index=False)['tn'].sum()

In [9]:
# Agregar columna 'item_id' para AutoGluon
df_monthly_product['item_id'] = df_monthly_product['product_id']

In [10]:
# ⏰ 4. Crear TimeSeriesDataFrame

!pip install autogluon.timeseries

from autogluon.timeseries import TimeSeriesDataFrame

ts_data = TimeSeriesDataFrame.from_data_frame(
    df_monthly_product,
    id_column='item_id',
    timestamp_column='timestamp'
)



In [11]:
# Completar valores faltantes
ts_data = ts_data.fill_missing_values()

In [12]:
!pip install autogluon.timeseries



In [13]:
from autogluon.timeseries import TimeSeriesPredictor

# 1) Creamos el predictor sin presets en __init__
predictor = TimeSeriesPredictor(
    prediction_length=2,
    target='tn',
    freq='MS',
    path='modelo_autogluon_guardado' 
)

# 2) Entrenamos indicando presets en fit()
predictor.fit(
    ts_data,
    presets='best_quality',   # aquí van los presets
    num_val_windows=5,
    time_limit=3600,      
    random_seed=777
)



Beginning AutoGluon training... Time limit = 3600s
AutoGluon will save models to '/kaggle/working/modelo_autogluon_guardado'
AutoGluon Version:  1.3.1
Python Version:     3.11.13
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Sun Nov 10 10:07:59 UTC 2024
CPU Count:          4
GPU Count:          1
Memory Avail:       29.52 GB / 31.35 GB (94.2%)
Disk Space Avail:   19.50 GB / 19.52 GB (99.9%)
Setting presets to: best_quality

Fitting with arguments:
{'enable_ensemble': True,
 'eval_metric': WQL,
 'freq': 'MS',
 'hyperparameters': 'default',
 'known_covariates_names': [],
 'num_val_windows': 5,
 'prediction_length': 2,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 777,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'tn',
 'time_limit': 3600,
 'verbosity': 2}

train_data with frequency 'IRREG' has been resampled to frequency 'MS'.
Provided train_data has 223

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/821M [00:00<?, ?B/s]

	-0.1979       = Validation score (-WQL)
	28.61   s     = Training runtime
	1.38    s     = Validation (prediction) runtime
Training timeseries model ChronosFineTuned[bolt_small]. Training for up to 576.7s of the 3460.2s of remaining time.
	Skipping covariate_regressor since the dataset contains no covariates or static features.


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/191M [00:00<?, ?B/s]

	Saving fine-tuned model to /kaggle/working/modelo_autogluon_guardado/models/ChronosFineTuned[bolt_small]/W0/fine-tuned-ckpt
	Skipping covariate_regressor since the dataset contains no covariates or static features.
	Saving fine-tuned model to /kaggle/working/modelo_autogluon_guardado/models/ChronosFineTuned[bolt_small]/W1/fine-tuned-ckpt
	Skipping covariate_regressor since the dataset contains no covariates or static features.
	Saving fine-tuned model to /kaggle/working/modelo_autogluon_guardado/models/ChronosFineTuned[bolt_small]/W2/fine-tuned-ckpt
	Skipping covariate_regressor since the dataset contains no covariates or static features.
	Saving fine-tuned model to /kaggle/working/modelo_autogluon_guardado/models/ChronosFineTuned[bolt_small]/W3/fine-tuned-ckpt
	Skipping covariate_regressor since the dataset contains no covariates or static features.
	Saving fine-tuned model to /kaggle/working/modelo_autogluon_guardado/models/ChronosFineTuned[bolt_small]/W4/fine-tuned-ckpt
	-0.1930   

<autogluon.timeseries.predictor.TimeSeriesPredictor at 0x7956220f8910>

In [14]:
# 💾 Guardar el modelo entrenado para no perderlo
predictor.save()


In [15]:
# 📊 Leaderboard de modelos entrenados
leaderboard_df = predictor.leaderboard(ts_data, silent=True)

# Mostrar top 10 modelos
print("🏆 Top 10 modelos:")
display(leaderboard_df.head(10))

# 💾 Guardar leaderboard completo en CSV
leaderboard_df.to_csv("leaderboard_autogluon.csv", index=False)


data with frequency 'IRREG' has been resampled to frequency 'MS'.
Additional data provided, testing on additional data. Resulting leaderboard will be sorted according to test score (`score_test`).


🏆 Top 10 modelos:


Unnamed: 0,model,score_test,score_val,pred_time_test,pred_time_val,fit_time_marginal,fit_order
0,ChronosFineTuned[bolt_small],-0.183669,-0.193032,0.780935,0.084163,317.422849,8
1,TemporalFusionTransformer,-0.18564,-0.190655,0.552336,0.335686,369.365963,9
2,WeightedEnsemble,-0.18695,-0.17822,15.425096,10.557218,7.025072,13
3,DeepAR,-0.194393,-0.192342,0.594094,0.385137,268.658817,10
4,PatchTST,-0.209173,-0.188821,0.536519,0.364714,162.487847,11
5,ChronosZeroShot[bolt_base],-0.211922,-0.19792,1.512402,1.375603,28.605322,7
6,AutoETS,-0.215147,-0.207196,8.306672,7.969134,28.243108,6
7,DynamicOptimizedTheta,-0.221798,-0.202014,3.238974,1.178175,8.412431,5
8,DirectTabular,-0.233621,-0.230933,0.097489,0.105938,38.48774,3
9,TiDE,-0.233941,-0.227058,1.009471,0.773645,586.600043,12


In [16]:
# 🔮 6. Generar predicción
forecast = predictor.predict(ts_data)

data with frequency 'IRREG' has been resampled to frequency 'MS'.
Model not specified in predict, will default to the model with the best validation score: WeightedEnsemble


In [17]:
# Extraer predicción media y filtrar febrero 2020
forecast_mean = forecast['mean'].reset_index()
print(forecast_mean.columns)

Index(['item_id', 'timestamp', 'mean'], dtype='object')


In [18]:
# Tomar solo item_id y la predicción 'mean'
resultado = forecast['mean'].reset_index()[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']

# Filtrar solo febrero 2020
resultado = forecast['mean'].reset_index()
resultado = resultado[resultado['timestamp'] == '2020-02-01']

# Renombrar columnas
resultado = resultado[['item_id', 'mean']]
resultado.columns = ['product_id', 'tn']


In [19]:
# 💾 7. Guardar archivo
#resultado.to_csv("PREDICCION_CON_AUTOGLUON_2.csv", index=False)

#resultado.head()


In [20]:
# 🔁 Obtener modelos individuales, excluyendo el WeightedEnsemble
modelos = predictor.leaderboard(silent=True)['model'].tolist()
modelos_disponibles = [m for m in modelos if "Ensemble" not in m]

# Selección de los tres mejores
modelos_top3 = modelos_disponibles[:3]
print("Modelos seleccionados para ensemble:", modelos_top3)

Modelos seleccionados para ensemble: ['PatchTST', 'TemporalFusionTransformer', 'DeepAR']


In [21]:
# 🎯 Predicciones de los tres mejores modelos
pred1 = predictor.predict(ts_data, model=modelos_top3[0])
pred2 = predictor.predict(ts_data, model=modelos_top3[1])
pred3 = predictor.predict(ts_data, model=modelos_top3[2])


data with frequency 'IRREG' has been resampled to frequency 'MS'.
data with frequency 'IRREG' has been resampled to frequency 'MS'.
data with frequency 'IRREG' has been resampled to frequency 'MS'.


In [22]:
# 🤝 Ensemble promedio
pred_final = (pred1 + pred2 + pred3) / 3


In [23]:
# 👇 pred_final ya es un DataFrame con timestamp como índice
df_pred = pred_final.reset_index()  # ✅ convierte índices en columnas

# Filtramos solo febrero 2020
df_febrero = df_pred[df_pred['timestamp'] == '2020-02-01']

# Seleccionamos columnas y renombramos
df_feb = df_febrero[['item_id', 'mean']].copy()
df_feb.columns = ['product_id', 'tn']


In [24]:
# 💾 Guardar CSV para Kaggle
df_feb.to_csv("submission_ensemble_feb2020.csv", index=False)
df_feb.head()


Unnamed: 0,product_id,tn
1,20001,1256.390381
3,20002,985.61027
5,20003,680.051086
7,20004,512.511861
9,20005,514.550883
