## Import

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from autogluon.timeseries import TimeSeriesDataFrame, TimeSeriesPredictor


## Read data

In [4]:
# Load the featured training table; the path is relative to the notebook's folder.
featured_path = "../data/featured/train_featured.parquet"
df = pd.read_parquet(featured_path)

## Train/Test split

In [5]:
# Forecast horizon in days; the same number of trailing days is held out for testing.
prediction_length = 15

# Global cutoff: the latest timestamp in the data minus the horizon.
horizon = pd.Timedelta(days=prediction_length)
max_date = df["timestamp"].max()
cutoff = max_date - horizon

print(f"Cutoff for train/test split: {cutoff.date()} (last {prediction_length} days held out)")

Cutoff for train/test split: 2017-07-31 (last 15 days held out)


In [6]:
# Rows at or before the cutoff form the training set; strictly later rows are held out.
timestamps = df["timestamp"]
train_df = df.loc[timestamps.le(cutoff)].copy()
test_df = df.loc[timestamps.gt(cutoff)].copy()

## Create TimeSeriesDF

In [7]:
# Wrap the full (unsplit) frame as a TimeSeriesDataFrame keyed by item_id / timestamp.
# NOTE(review): no later cell visible in this notebook reads `ts_df` — confirm it is still needed.
ts_columns = {"id_column": "item_id", "timestamp_column": "timestamp"}
ts_df = TimeSeriesDataFrame.from_data_frame(df, **ts_columns)

In [8]:
# Convert both halves of the split with identical column settings.
ts_train, ts_test = (
    TimeSeriesDataFrame.from_data_frame(
        split_frame,
        id_column="item_id",
        timestamp_column="timestamp",
    )
    for split_frame in (train_df, test_df)
)

## Train

In [None]:
# Only one model is trained: a single Chronos configuration with model_path "bolt_small".
chronos_only = {
    "Chronos": [
        {"model_path": "bolt_small", "ag_args": {"name_suffix": ""}},
    ]
}

# Build the predictor first, then fit it; `fit` is given a 300-second budget
# and ensembling is switched off.
predictor = TimeSeriesPredictor(
    path="../models/bolt_small",
    target="sales",
    freq="D",
    eval_metric="MASE",
    prediction_length=prediction_length,
)
predictor = predictor.fit(
    train_data=ts_train,
    hyperparameters=chronos_only,
    enable_ensemble=False,
    time_limit=300,
)

Beginning AutoGluon training... Time limit = 300s
AutoGluon will save models to '/opt/work/favorita-forecast/notebooks/AutogluonModels/ag-20250728_080707'
AutoGluon Version:  1.3.1
Python Version:     3.11.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Tue Nov 5 00:21:55 UTC 2024
CPU Count:          16
GPU Count:          1
Memory Avail:       0.87 GB / 15.22 GB (5.7%)
Disk Space Avail:   924.75 GB / 1006.85 GB (91.8%)

Fitting with arguments:
{'enable_ensemble': False,
 'eval_metric': MASE,
 'freq': 'D',
 'hyperparameters': {'Chronos': [{'ag_args': {'name_suffix': ''},
                                  'model_path': 'bolt_small'}]},
 'known_covariates_names': [],
 'num_val_windows': 1,
 'prediction_length': 15,
 'quantile_levels': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9],
 'random_seed': 123,
 'refit_every_n_windows': 1,
 'refit_full': False,
 'skip_model_selection': False,
 'target': 'sales',
 'time_limit': 300,
 'verbosity': 2}

train_data wi

## Predict

In [10]:
# Forecast from the training history only, so the held-out rows in `test_df` stay unseen.
forecast_ts = predictor.predict(data=ts_train)

data with frequency 'IRREG' has been resampled to frequency 'D'.
Model not specified in predict, will default to the model with the best validation score: Chronos[bolt_small]


## Visualize

In [11]:
# Flatten each time-series frame back to a plain DataFrame with the index levels as columns.
obs_train_df, obs_test_df, pred_df = (
    ts_frame.to_data_frame().reset_index()
    for ts_frame in (ts_train, ts_test, forecast_ts)
)

In [13]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# Path to the single PDF; one page is written per forecasted series.
out_pdf = "../visualization/all_series_forecasts.pdf"

# Ensure the output directory exists: PdfPages opens the file directly and
# fails if the parent folder is missing.
out_dir = os.path.dirname(out_pdf)
if out_dir:
    os.makedirs(out_dir, exist_ok=True)

# Plot only the trailing 200 days, ending at the latest observed or forecast date.
last_date    = max(test_df["timestamp"].max(), pred_df["timestamp"].max())
window_start = last_date - pd.Timedelta(days=200)

# The window filter and the sort do not depend on the series, so do them once
# here instead of re-scanning the full frames for every series in the loop.
train_recent, test_recent, pred_recent = (
    frame[frame["timestamp"] >= window_start].sort_values("timestamp")
    for frame in (train_df, test_df, pred_df)
)

with PdfPages(out_pdf) as pdf:
    count = 0
    for series_id in pred_df["item_id"].unique():
        # 1) Slice each pre-windowed frame for this series (row order stays sorted)
        df_tr = train_recent[train_recent["item_id"] == series_id]
        df_te = test_recent[test_recent["item_id"] == series_id]
        df_pr = pred_recent[pred_recent["item_id"] == series_id]
        if df_pr.empty:
            continue  # no forecast, skip

        # 2) Make the figure
        fig, ax = plt.subplots(figsize=(8, 3))
        ax.plot(df_tr["timestamp"], df_tr["sales"], label="Train")
        ax.plot(df_te["timestamp"], df_te["sales"], label="Test")
        ax.plot(df_pr["timestamp"], df_pr["mean"], linestyle="--", label="Forecast")

        # optional: quantile ribbon, drawn only when both quantile columns are present
        if {"0.1", "0.9"}.issubset(df_pr.columns):
            ax.fill_between(df_pr["timestamp"],
                            df_pr["0.1"], df_pr["0.9"],
                            color="gray", alpha=0.3)

        # vertical line at forecast start (last training timestamp inside the window)
        if not df_tr.empty:
            ax.axvline(df_tr["timestamp"].max(), color="black", linestyle=":")

        ax.set_title(series_id)
        ax.set_xlabel("Date")
        ax.set_ylabel("Sales")
        ax.legend(loc="upper left", fontsize="small")
        # Lay out this specific figure rather than pyplot's implicit "current" one.
        fig.tight_layout()

        # 3) Save this page to the PDF, then close the figure so memory
        #    does not grow across the loop.
        pdf.savefig(fig)
        plt.close(fig)
        count += 1

print(f"Saved {count} plots into {out_pdf}")


Saved 1729 plots into ../visualization/all_series_forecasts.pdf
