# Running GIFT-Eval with tsagentkit (DEPRECATED - v1.x only)

**⚠️ This notebook uses tsagentkit v1.x API which has been removed in v2.0.**

For v2.0 ensemble forecasting, use the updated `tsagentkit_quick.py` script instead:
```bash
python tsagentkit_quick.py --help
```

This notebook is kept for reference but requires tsagentkit v1.x to run.

### Installation

This notebook assumes Python 3.11 with `tsagentkit` v1.x and the GIFT-Eval benchmark dependencies installed.

In [None]:
# Optional when running outside the project environment.
# Pinned below 2.0 because this notebook relies on the v1.x API:
# %pip install "tsagentkit[tsfm,gift-eval]<2"

## Import dependencies and setup

In [None]:
import csv
import inspect
import logging
import os
import warnings
from pathlib import Path

import numpy as np

warnings.simplefilter(action='ignore', category=FutureWarning)
logging.getLogger("gluonts").setLevel(logging.ERROR)

import pandas as pd
from gluonts.ev.metrics import (
    MAE,
    MAPE,
    MASE,
    MSE,
    MSIS,
    ND,
    NRMSE,
    RMSE,
    SMAPE,
    MeanWeightedSumQuantileLoss,
)
from gluonts.model import evaluate_model
from gluonts.time_feature import get_seasonality

from tsagentkit.gift_eval.data import Dataset
from tsagentkit.gift_eval.predictor import QUANTILES, TSAgentKitPredictor

## Configuration

In [None]:
# --- Filesystem layout -------------------------------------------------------
# Benchmark data is read from `storage_path`; this notebook writes its result
# CSV underneath `output_dir`, which is created here if it does not exist yet.
storage_path = Path("./data/gift-eval")
output_dir = Path("./results/tsagentkit")
output_dir.mkdir(parents=True, exist_ok=True)

# --- Predictor settings ------------------------------------------------------
# `model_name` is the label written to the "model" column of the results CSV;
# `mode`, `preload_adapters` and `batch_size` are passed to TSAgentKitPredictor.
model_name, mode = "tsagentkit", "standard"
preload_adapters = ["chronos"]
batch_size = 512  # starting value; the evaluation cell halves it on out-of-memory errors

# --- Run switches ------------------------------------------------------------
# debug=True truncates the sorted dataset list to its first five entries.
debug = False

# download_data=True fetches the GIFT-Eval datasets before evaluation.
# Data source: https://github.com/SalesforceAIResearch/gift-eval
# Expected data structure: {storage_path}/{dataset_name}/{term}/
download_data = False

# Dataset names evaluated on the "short" term. A name is either a bare dataset
# ("hospital") or "<dataset>/<frequency>" ("electricity/15T"). One family per line.
short_datasets = """
m4_yearly m4_quarterly m4_monthly m4_weekly m4_daily m4_hourly
electricity/15T electricity/H electricity/D electricity/W
solar/10T solar/H solar/D solar/W
hospital covid_deaths
us_births/D us_births/M us_births/W
saugeenday/D saugeenday/M saugeenday/W
temperature_rain_with_missing
kdd_cup_2018_with_missing/H kdd_cup_2018_with_missing/D
car_parts_with_missing
restaurant
hierarchical_sales/D hierarchical_sales/W
LOOP_SEATTLE/5T LOOP_SEATTLE/H LOOP_SEATTLE/D
SZ_TAXI/15T SZ_TAXI/H
M_DENSE/H M_DENSE/D
ett1/15T ett1/H ett1/D ett1/W
ett2/15T ett2/H ett2/D ett2/W
jena_weather/10T jena_weather/H jena_weather/D
bitbrains_fast_storage/5T bitbrains_fast_storage/H
bitbrains_rnd/5T bitbrains_rnd/H
bizitobs_application bizitobs_service
bizitobs_l2c/5T bizitobs_l2c/H
""".split()

# Names that are additionally evaluated on the "medium" and "long" terms.
med_long_datasets = """
electricity/15T electricity/H
solar/10T solar/H
kdd_cup_2018_with_missing/H
LOOP_SEATTLE/5T LOOP_SEATTLE/H
SZ_TAXI/15T
M_DENSE/H
ett1/15T ett1/H
ett2/15T ett2/H
jena_weather/10T jena_weather/H
bitbrains_fast_storage/5T
bitbrains_rnd/5T
bizitobs_application bizitobs_service
bizitobs_l2c/5T bizitobs_l2c/H
""".split()

# Per-dataset metadata, keyed by the lower-cased "pretty" dataset name.
# "domain" and "num_variates" are copied into every result row. "frequency" is
# only consulted for dataset names that carry no "/<frequency>" suffix, where it
# becomes the frequency segment of the result key.
# NOTE(review): some frequency values (e.g. hospital -> "H", covid_deaths -> "W")
# look inconsistent with the upstream GIFT-Eval dataset properties; confirm that
# the resulting keys match the keys used in the baseline results CSV.
dataset_properties_map = {
    key: {"frequency": frequency, "domain": domain, "num_variates": num_variates}
    for key, frequency, domain, num_variates in (
        ("m4_yearly", "Y", "finance", 1),
        ("m4_quarterly", "Q", "finance", 1),
        ("m4_monthly", "M", "finance", 1),
        ("m4_weekly", "W", "finance", 1),
        ("m4_daily", "D", "finance", 1),
        ("m4_hourly", "H", "finance", 1),
        ("electricity", "H", "energy", 370),
        ("solar", "10T", "energy", 137),
        ("hospital", "H", "health", 1),
        ("covid_deaths", "W", "health", 1),
        ("us_births", "D", "demographics", 1),
        ("saugeen", "D", "environment", 1),
        ("temperature_rain", "D", "environment", 1),
        ("kdd_cup_2018", "H", "traffic", 1),
        ("car_parts", "M", "business", 1),
        ("restaurant", "W", "business", 1),
        ("hierarchical_sales", "W", "business", 1),
        ("loop_seattle", "5T", "traffic", 323),
        ("sz_taxi", "15T", "traffic", 10215),
        ("m_dense", "H", "traffic", 963),
        ("ett1", "H", "energy", 7),
        ("ett2", "H", "energy", 7),
        ("jena_weather", "10T", "environment", 21),
        ("bitbrains_fast_storage", "5T", "it", 1),
        ("bitbrains_rnd", "5T", "it", 1),
        ("bizitobs_application", "H", "it", 1),
        ("bizitobs_service", "H", "it", 1),
        ("bizitobs_l2c", "5T", "it", 1),
    )
}

# Every distinct dataset name from both lists, in sorted order. Python string
# sorting is case-sensitive, so the upper-case names come first.
all_datasets = sorted({*short_datasets, *med_long_datasets})

# Debug runs keep only the first five names of that sorted list.
if debug:
    all_datasets = all_datasets[:5]

# Maps the lower-cased on-disk dataset name to the shorter key used in
# `dataset_properties_map` and in result keys.
pretty_names = dict(
    saugeenday="saugeen",
    temperature_rain_with_missing="temperature_rain",
    kdd_cup_2018_with_missing="kdd_cup_2018",
    car_parts_with_missing="car_parts",
)

# The download helper is imported only when a download was actually requested.
if download_data:
    from tsagentkit.gift_eval.eval import GIFTEval
    GIFTEval.download_data(storage_path=storage_path)

# Echo the effective configuration, one setting per line.
print(
    f"storage_path={storage_path.resolve()}",
    f"output_dir={output_dir.resolve()}",
    f"Debug mode: {debug}",
    f"Download data: {download_data}",
    f"Total datasets: {len(all_datasets)}",
    sep="\n",
)

## Define evaluation metrics

In [None]:
# Metrics handed to `evaluate_model`. The evaluation cell reads one result
# column per entry, so the entries are kept in the same order as the
# "eval_metrics/..." columns of the results CSV header.
metrics = [
    # Squared error, once on the mean forecast and once on the 0.5 quantile.
    MSE(forecast_type="mean"), MSE(forecast_type=0.5),
    # Absolute and scaled/percentage errors.
    MAE(), MASE(), MAPE(), SMAPE(),
    # Interval score.
    MSIS(),
    # Root and normalized errors.
    RMSE(), NRMSE(), ND(),
    # Quantile loss over the predictor's quantile levels (reported as CRPS).
    MeanWeightedSumQuantileLoss(quantile_levels=QUANTILES),
]

## Run evaluation

In [None]:
csv_file_path = output_dir / "all_results.csv"
header = [
    "dataset",
    "model",
    "eval_metrics/MSE[mean]",
    "eval_metrics/MSE[0.5]",
    "eval_metrics/MAE[0.5]",
    "eval_metrics/MASE[0.5]",
    "eval_metrics/MAPE[0.5]",
    "eval_metrics/sMAPE[0.5]",
    "eval_metrics/MSIS",
    "eval_metrics/RMSE[mean]",
    "eval_metrics/NRMSE[mean]",
    "eval_metrics/ND[0.5]",
    "eval_metrics/mean_weighted_sum_quantile_loss",
    "domain",
    "num_variates",
]

# Column names of the `evaluate_model` result, in header order: each metric
# column of `header` is "eval_metrics/" followed by the result column name.
metric_names = [
    column.split("/", 1)[1] for column in header if column.startswith("eval_metrics/")
]

# Completed rows as (ds_config, row) tuples; the normalization cell reads this list.
results = []

# Older/newer `evaluate_model` signatures differ; only pass `batch_size` if accepted.
evaluate_model_params = set(inspect.signature(evaluate_model).parameters)
supports_eval_batch_size = "batch_size" in evaluate_model_params


def _dataset_key_and_freq(ds_name):
    """Split a dataset name into its result key and frequency segment.

    The part before the first "/" is lower-cased and mapped through
    `pretty_names`. The frequency is the segment after the "/" when present,
    otherwise the "frequency" entry of `dataset_properties_map`.

    Raises:
        KeyError: the name has no "/" and its key is not in `dataset_properties_map`.
    """
    parts = ds_name.split("/")
    base = parts[0].lower()
    ds_key = pretty_names.get(base, base)
    if len(parts) > 1:
        return ds_key, parts[1]
    return ds_key, dataset_properties_map[ds_key]["frequency"]


def _load_dataset(ds_name, term):
    """Load one dataset/term, converted to univariate when it has several target dims.

    The dataset is first opened without conversion to read `target_dim`. When it
    is already one-dimensional that instance is returned as-is instead of
    loading the same data a second time.
    """
    probe = Dataset(
        name=ds_name, term=term, to_univariate=False, storage_path=storage_path
    )
    if probe.target_dim == 1:
        return probe
    return Dataset(
        name=ds_name, term=term, to_univariate=True, storage_path=storage_path
    )


def _evaluate_with_oom_backoff(predictor, dataset, start_batch_size):
    """Run `evaluate_model`, halving the batch size after each out-of-memory error.

    Args:
        predictor: predictor whose `batch_size` attribute is updated on every attempt.
        dataset: dataset providing `freq` and `test_data`.
        start_batch_size: batch size of the first attempt; must be >= 1.

    Returns:
        The result object returned by `evaluate_model`.

    Raises:
        ValueError: `start_batch_size` is below 1 (no attempt would be possible).
        RuntimeError: an out-of-memory error persisted down to batch size 1, or
            any other RuntimeError raised by the evaluation (re-raised unchanged).
    """
    if start_batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {start_batch_size}")

    season_length = get_seasonality(dataset.freq)
    current_batch_size = start_batch_size
    while True:
        predictor.batch_size = current_batch_size
        eval_kwargs = dict(
            test_data=dataset.test_data,
            metrics=metrics,
            axis=None,
            mask_invalid_label=True,
            allow_nan_forecast=False,
            seasonality=season_length,
        )
        if supports_eval_batch_size:
            eval_kwargs["batch_size"] = current_batch_size

        try:
            return evaluate_model(predictor, **eval_kwargs)
        except RuntimeError as e:
            # Only memory exhaustion is retried; everything else propagates.
            if "out of memory" not in str(e).lower():
                raise
            print(f"CUDA out of memory with batch_size={current_batch_size}, halving...")
            current_batch_size //= 2
            if current_batch_size < 1:
                raise RuntimeError("Batch size reduced below 1; cannot proceed.") from e


# Create predictor
predictor = TSAgentKitPredictor(
    mode=mode,
    batch_size=batch_size,
    preload_adapters=preload_adapters,
)

try:
    # Rows are written and flushed as soon as each configuration finishes, so an
    # interrupted or crashed run still leaves every completed result on disk.
    with open(csv_file_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)

        for ds_name in all_datasets:
            print(f"Processing dataset: {ds_name}")

            # Key, frequency and properties do not depend on the term.
            ds_key, ds_freq = _dataset_key_and_freq(ds_name)
            properties = dataset_properties_map[ds_key]

            # Medium/long terms only exist for the names in `med_long_datasets`.
            if ds_name in med_long_datasets:
                terms = ["short", "medium", "long"]
            else:
                terms = ["short"]

            for term in terms:
                ds_config = f"{ds_key}/{ds_freq}/{term}"
                dataset = _load_dataset(ds_name, term)

                # Update predictor parameters for this dataset
                predictor.h = dataset.prediction_length
                predictor.freq = dataset.freq

                # Every configuration starts again from the configured batch size.
                res = _evaluate_with_oom_backoff(predictor, dataset, batch_size)

                result_row = [
                    ds_config,
                    model_name,
                    *(res[name][0] for name in metric_names),
                    properties["domain"],
                    properties["num_variates"],
                ]

                results.append((ds_config, result_row))
                writer.writerow(result_row)
                csvfile.flush()

                mase_value = res["MASE[0.5]"][0]
                crps_value = res["mean_weighted_sum_quantile_loss"][0]
                print(f"  {ds_config}: MASE={mase_value:.6f}, CRPS={crps_value:.6f}")
finally:
    # Release the predictor whether the run succeeded or not.
    predictor.close()

print(f"\nResults written to {csv_file_path}")

## Compute normalized scores vs Seasonal Naive baseline

In [None]:
baseline_path = Path("./results/seasonal_naive/all_results.csv")

if baseline_path.exists():
    seasonal_naive = pd.read_csv(baseline_path).sort_values("dataset")
    df = pd.DataFrame([row for _, row in results], columns=header).sort_values("dataset")

    baseline_by_dataset = seasonal_naive.set_index("dataset")
    aligned = baseline_by_dataset.loc[df["dataset"]]

    df["normalized MASE"] = (
        df["eval_metrics/MASE[0.5]"].to_numpy() /
        aligned["eval_metrics/MASE[0.5]"].to_numpy()
    )
    df["normalized CRPS"] = (
        df["eval_metrics/mean_weighted_sum_quantile_loss"].to_numpy() /
        aligned["eval_metrics/mean_weighted_sum_quantile_loss"].to_numpy()
    )

    mase = float(np.exp(np.mean(np.log(df["normalized MASE"].to_numpy())))
    crps = float(np.exp(np.mean(np.log(df["normalized CRPS"].to_numpy())))

    print(f"Final GIFT-Eval performance of {model_name}:")
    print(f"MASE = {mase}")
    print(f"CRPS = {crps}")
else:
    print(
        "Normalized MASE/CRPS not computed "
        "(missing ./results/seasonal_naive/all_results.csv)."
    )

## View results

In [None]:
# Read the results CSV back from disk, report its row count, and display the
# last ten rows as the cell output.
eval_df = pd.read_csv(csv_file_path)
print(f"Rows: {eval_df.shape[0]}")
eval_df.iloc[-10:]