# Running GIFT-Eval with tsagentkit

Credence-style evaluation notebook adapted for tsagentkit. The structure mirrors
`notebooks/Credence.ipynb` so prediction and metric computation are comparable,
while the prediction interface is implemented through `TSAgentKitPredictor`.


### tsagentkit model note

`Credence.ipynb` directly calls a hosted API per forecasting window. tsagentkit uses
a local GluonTS-compatible predictor (`TSAgentKitPredictor`) instead, so we use a
thin adapter class while keeping the same dataset loop and metric pipeline.


### Installation

This notebook assumes Python 3.11 with `tsagentkit` and benchmark dependencies installed.


In [None]:
# Optional when running outside the project environment:
# %pip install "tsagentkit[tsfm,gift-eval]"


Import the required dependencies.


In [None]:
import csv
import json
import os
import random
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
from gluonts.ev.metrics import (
    MAE,
    MAPE,
    MASE,
    MSE,
    MSIS,
    ND,
    NRMSE,
    RMSE,
    SMAPE,
    MeanWeightedSumQuantileLoss,
)
from gluonts.model import evaluate_model

from tsagentkit.gift_eval.data import Dataset, MED_LONG_DATASETS, SHORT_DATASETS
from tsagentkit.gift_eval.dataset_properties import DATASET_PROPERTIES
from tsagentkit.gift_eval.eval import GIFTEval
from tsagentkit.gift_eval.predictor import QUANTILES, TSAgentKitPredictor

warnings.filterwarnings("ignore")


Prepare the configuration.


In [None]:
# Path configurations
# storage_path: root directory handed to GIFTEval.download_data(...) below.
# out_dir: directory created below to hold this notebook's outputs.
storage_path = Path("./data/gift-eval")
out_dir = Path("./results/tsagentkit")

# Model configuration
# model_name is only a label; mode, preload_adapters and batch_size are plain
# settings consumed later in the notebook.
model_name = "tsagentkit"
mode = "standard"
preload_adapters = ["chronos"]
batch_size = 512

# Auxiliary configurations
# seed: value used for seeding the random number generators.
# download_data: flip to True to fetch the benchmark data into storage_path.
seed = 0
download_data = False

if download_data:
    GIFTEval.download_data(storage_path=storage_path)

# Create the output directory up front (no error if it already exists) and
# echo the absolute locations so the run is easy to trace.
out_dir.mkdir(parents=True, exist_ok=True)
print(f"storage_path={storage_path.resolve()}")
print(f"out_dir={out_dir.resolve()}")


Experiment configurations.


In [None]:
# Dataset names are kept as single space-separated strings; every consumer
# calls .split() to get the individual names back.
short_datasets = " ".join(SHORT_DATASETS)
med_long_datasets = " ".join(MED_LONG_DATASETS)
# Per-dataset metadata, re-exported under a notebook-local name.
dataset_properties = DATASET_PROPERTIES

# Sanity check: first five short-term dataset names, then the number of
# short and medium/long dataset names. As a bare trailing expression this is
# presumably rendered as the cell output when run in a notebook.
short_datasets.split()[:5], len(short_datasets.split()), len(med_long_datasets.split())


In [None]:
# Auxiliary functions
def set_seed(seed: int) -> None:
    """Seed the stdlib and NumPy global random generators with ``seed``.

    Also stores ``str(seed)`` in the ``PYTHONHASHSEED`` environment variable.
    """
    os.environ["PYTHONHASHSEED"] = str(seed)
    for seed_generator in (random.seed, np.random.seed):
        seed_generator(seed)


class TSAgentKitGiftAdapter:
    """Pass-through wrapper around a ``TSAgentKitPredictor``.

    Exists so the ``evaluate_model(...)`` call reads the same way as in the
    Credence notebook; it adds no behavior of its own.
    """

    def __init__(self, predictor: TSAgentKitPredictor):
        # Public on purpose: the wrapped predictor stays reachable as
        # ``adapter.predictor``.
        self.predictor = predictor

    def predict(self, test_data, **kwargs):
        """Forward ``test_data`` and any keyword arguments to the wrapped
        predictor's ``predict`` and return its result unchanged."""
        forecasts = self.predictor.predict(test_data, **kwargs)
        return forecasts


Experiment wrapper.


In [None]:
# Column layout of a results row: two identifier columns, one column per
# reported metric (all sharing the "eval_metrics/" prefix), then two dataset
# metadata columns. The order here is the order in which row values are laid
# out when a results row is written.
base_row = [
    "dataset",
    "model",
    *(
        f"eval_metrics/{metric_column}"
        for metric_column in (
            "MSE[mean]",
            "MSE[0.5]",
            "MAE[mean]",
            "MAE[0.5]",
            "MASE[0.5]",
            "MAPE[0.5]",
            "sMAPE[0.5]",
            "MSIS",
            "RMSE[mean]",
            "NRMSE[mean]",
            "ND[0.5]",
            "mean_weighted_sum_quantile_loss",
        )
    ),
    "domain",
    "num_variates",
]


def _geometric_mean_vs_baseline(df_res, baseline_path, verbose=True):
    """Return the geometric means of MASE and CRPS relative to a baseline CSV.

    Each row of ``df_res`` is matched by its ``dataset`` value against the CSV
    at ``baseline_path``; the per-row ratio result/baseline is formed for
    ``eval_metrics/MASE[0.5]`` and
    ``eval_metrics/mean_weighted_sum_quantile_loss`` and reduced with a
    geometric mean.

    Returns ``(nan, nan)`` when the baseline file does not exist or ``df_res``
    has no rows. When the baseline lacks some evaluated configuration, the
    affected ratios are NaN and so are the returned scores (instead of a
    ``KeyError`` raised after the whole evaluation has already run).
    """
    nan = float("nan")
    if not baseline_path.exists() or len(df_res) == 0:
        return nan, nan

    # A baseline file with repeated rows for one configuration would break the
    # one-to-one alignment below, so keep only the last row per dataset.
    baseline = (
        pd.read_csv(baseline_path)
        .drop_duplicates(subset="dataset", keep="last")
        .set_index("dataset")
    )
    evaluated = df_res.sort_values("dataset")

    missing = sorted(set(evaluated["dataset"]) - set(baseline.index))
    if missing and verbose:
        print(
            f"Baseline {baseline_path} has no entry for {len(missing)} "
            f"evaluated configuration(s): {missing}"
        )

    # reindex (unlike .loc) yields NaN rows for labels absent from the baseline.
    aligned = baseline.reindex(evaluated["dataset"].tolist())

    def geometric_mean_ratio(column):
        # Force float arrays: the result frame may carry object-dtype columns,
        # on which np.log fails.
        ratio = (
            evaluated[column].to_numpy(dtype=float)
            / aligned[column].to_numpy(dtype=float)
        )
        return float(np.exp(np.mean(np.log(ratio))))

    return (
        geometric_mean_ratio("eval_metrics/MASE[0.5]"),
        geometric_mean_ratio("eval_metrics/mean_weighted_sum_quantile_loss"),
    )


def run_gift_eval(zs: bool = False, save: bool = False, verbose: bool = True):
    """Evaluate the tsagentkit predictor on every configured dataset/term.

    For each dataset name in the union of ``short_datasets`` and
    ``med_long_datasets`` the "short" term is evaluated; "medium" and "long"
    are evaluated only for names listed in ``med_long_datasets``. One results
    row (laid out as ``base_row``) is produced per ``<key>/<freq>/<term>``.

    Args:
        zs: When True, skip the hard-coded set of dataset names in
            ``excluded`` below.
        save: When True, append every new row to ``out_dir/all_results.csv``
            (creating it with a header if needed) and reuse rows already in
            that file instead of re-evaluating them.
        verbose: When True, print progress and per-configuration scores.

    Returns:
        ``(mase, crps, df_res)``: the geometric means of MASE and CRPS
        normalized by ``./results/seasonal_naive/all_results.csv`` (NaN when
        they cannot be computed) and the DataFrame of all result rows.
    """
    import inspect

    set_seed(seed)

    # Match official notebook behavior: evaluate the union of short + med/long datasets.
    # Built once as a set so the per-dataset membership test below is O(1).
    med_long_names = set(med_long_datasets.split())
    all_datasets = sorted(set(short_datasets.split()) | med_long_names)

    metrics = [
        MSE(forecast_type="mean"),
        MSE(forecast_type=0.5),
        MAE(forecast_type="mean"),
        MAE(forecast_type=0.5),
        MASE(),
        MAPE(),
        SMAPE(),
        MSIS(),
        RMSE(),
        NRMSE(),
        ND(),
        MeanWeightedSumQuantileLoss(quantile_levels=QUANTILES),
    ]

    # Result columns to read back from evaluate_model, derived from the CSV
    # header so the row values can never drift out of step with base_row.
    metric_prefix = "eval_metrics/"
    metric_names = [
        column[len(metric_prefix):]
        for column in base_row
        if column.startswith(metric_prefix)
    ]

    csv_file_path = out_dir / "all_results.csv"

    # Dataset keys whose on-disk name differs from the key used in
    # dataset_properties and in the reported configuration string.
    pretty_names = {
        "saugeenday": "saugeen",
        "temperature_rain_with_missing": "temperature_rain",
        "kdd_cup_2018_with_missing": "kdd_cup_2018",
        "car_parts_with_missing": "car_parts",
    }

    if save:
        if not csv_file_path.exists():
            with csv_file_path.open("a", newline="") as csvfile:
                csv.writer(csvfile).writerow(base_row)
        # Rows already on disk let an interrupted run resume where it stopped.
        df_res_done = pd.read_csv(csv_file_path)
        done_datasets = set(df_res_done["dataset"].tolist())
    else:
        df_res_done = pd.DataFrame(columns=base_row)
        done_datasets = set()

    df_res = pd.DataFrame(columns=base_row)

    # Optional zero-shot subset used in official notebooks for certain pretraining comparisons.
    if zs:
        excluded = {
            "solar/H",
            "m4_monthly",
            "m4_weekly",
            "m4_daily",
            "m4_hourly",
            "electricity/15T",
            "electricity/H",
            "electricity/W",
            "kdd_cup_2018_with_missing/D",
            "kdd_cup_2018_with_missing/H",
            "temperature_rain_with_missing",
        }
    else:
        excluded = set()

    predictor = TSAgentKitPredictor(
        mode=mode,
        batch_size=batch_size,
        preload_adapters=preload_adapters,
    )
    model = TSAgentKitGiftAdapter(predictor)

    # Only forward batch_size when this evaluate_model signature accepts it.
    supports_eval_batch_size = (
        "batch_size" in inspect.signature(evaluate_model).parameters
    )

    try:
        for ds_name in all_datasets:
            if ds_name in excluded:
                continue

            set_seed(seed)

            # Key and frequency depend only on the dataset name, not the term.
            if "/" in ds_name:
                ds_key, ds_freq = ds_name.split("/", maxsplit=1)
                ds_key = pretty_names.get(ds_key.lower(), ds_key.lower())
            else:
                ds_key = pretty_names.get(ds_name.lower(), ds_name.lower())
                ds_freq = str(dataset_properties[ds_key]["frequency"])

            if ds_name in med_long_names:
                terms = ("short", "medium", "long")
            else:
                terms = ("short",)

            for term in terms:
                ds_config = f"{ds_key}/{ds_freq}/{term}"

                if ds_config in done_datasets:
                    # Reuse the stored row(s) rather than re-evaluating.
                    df_res = pd.concat(
                        [df_res, df_res_done.loc[df_res_done["dataset"] == ds_config]],
                        ignore_index=True,
                    )
                    continue

                # Load once without flattening just to read target_dim, then
                # reload with to_univariate enabled when target_dim is not 1.
                temp_dataset = Dataset(
                    name=ds_name,
                    term=term,
                    to_univariate=False,
                    storage_path=storage_path,
                )
                to_univariate = temp_dataset.target_dim != 1
                dataset = Dataset(
                    name=ds_name,
                    term=term,
                    to_univariate=to_univariate,
                    storage_path=storage_path,
                )

                # The single predictor instance is re-targeted per configuration.
                predictor.h = dataset.prediction_length
                predictor.freq = dataset.freq

                if verbose:
                    print(
                        f"Dataset: {ds_name}, term={term}, "
                        f"freq={dataset.freq}, H={dataset.prediction_length}"
                    )

                eval_kwargs = dict(
                    test_data=dataset.test_data,
                    metrics=metrics,
                    axis=None,
                    mask_invalid_label=True,
                    allow_nan_forecast=False,
                )
                if supports_eval_batch_size:
                    eval_kwargs["batch_size"] = batch_size

                res = evaluate_model(model, **eval_kwargs)

                metric_values = {
                    name: float(res[name].iloc[0]) for name in metric_names
                }
                row = [
                    ds_config,
                    model_name,
                    *metric_values.values(),
                    dataset_properties[ds_key]["domain"],
                    dataset_properties[ds_key]["num_variates"],
                ]

                if save:
                    # Append immediately so a crash loses at most the
                    # configuration currently being evaluated.
                    with csv_file_path.open("a", newline="") as csvfile:
                        csv.writer(csvfile).writerow(row)
                    if verbose:
                        print(f"Results for {ds_config} written to {csv_file_path}")

                df_res.loc[len(df_res)] = row

                if verbose:
                    print(
                        f"MASE={metric_values['MASE[0.5]']:.6f} | "
                        f"CRPS={metric_values['mean_weighted_sum_quantile_loss']:.6f}"
                    )
    finally:
        predictor.close()

    mase, crps = _geometric_mean_vs_baseline(
        df_res,
        Path("./results/seasonal_naive/all_results.csv"),
        verbose=verbose,
    )
    return mase, crps, df_res


Start the experiment.


In [None]:
# Run the full benchmark, persisting rows to CSV so the run can be resumed.
mase, crps, df_res = run_gift_eval(verbose=True, save=True)

if np.isfinite(mase) and np.isfinite(crps):
    print(f"Final GIFT-Eval performance of {model_name}:\nMASE = {mase}, CRPS = {crps}")
else:
    # Non-finite scores have several possible causes (no baseline file, no
    # result rows, or a non-finite normalized metric), so name them all rather
    # than blaming a missing file only.
    print(
        "Evaluation finished. Normalized MASE/CRPS not computed "
        "(./results/seasonal_naive/all_results.csv is missing or incomplete, "
        "or a normalized metric was not finite)."
    )


In [None]:
# Reload the results CSV from out_dir to inspect what was persisted.
results_csv = out_dir / "all_results.csv"
eval_df = pd.read_csv(results_csv)
print(f"Rows: {len(eval_df)}")
# Last ten rows of the file; as a bare trailing expression this is presumably
# rendered as the cell output when run in a notebook.
eval_df.tail(10)
