# tsagentkit (GIFT-Eval)

Official-style single notebook for running tsagentkit on GIFT-Eval and exporting `all_results.csv`.

## Requirements and Installation

This notebook assumes a Python 3.11 environment with tsagentkit and GIFT-Eval dependencies installed.

In [None]:
# Optional when not already in a local dev environment:
# %pip install "tsagentkit[tsfm,gift-eval]" pandas numpy

## Dataset Setup

In [None]:
from __future__ import annotations

import logging
from pathlib import Path

import pandas as pd

from tsagentkit.gift_eval.data import DATASETS_WITH_TERMS, FULL_MATRIX_SIZE
from tsagentkit.gift_eval.eval import GIFTEval, RESULT_COLUMNS
from tsagentkit.gift_eval.predictor import TSAgentKitPredictor
from tsagentkit.gift_eval.score import compute_aggregate_scores, compute_normalized_scores

# Timestamped, INFO-level logging for the progress messages this notebook emits.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger("tsagentkit_gifteval_notebook")

In [None]:
# Runtime configuration
# -- Filesystem locations: both are handed to GIFTEval; all_results.csv is
#    expected under output_path.
storage_path = Path("./data/gift-eval")
output_path = Path("./results/tsagentkit")

# -- Predictor / evaluation settings
model_name, mode = "tsagentkit", "standard"
batch_size = 512
preload_adapters = ["chronos"]
# When True, an existing all_results.csv is removed before a new run.
overwrite_results = True

# Submission metadata (for config.json in the benchmark repo)
submission_meta = dict(
    model=model_name,
    model_type="agentic",
    model_dtype="float32",
    model_link="https://github.com/your-org/tsagentkit",
    code_link="https://github.com/your-org/tsagentkit/blob/main/benchmarks/gift_eval/notebooks/tsagentkit.ipynb",
    org="YourOrg",
    testdata_leakage="No",
    replication_code_available="Yes",
)

# Echo the absolute locations so a wrong working directory is easy to spot.
print(f"Storage path: {storage_path.resolve()}")
print(f"Output path: {output_path.resolve()}")

In [None]:
# Set to True to call GIFTEval.download_data for storage_path before evaluating
# (presumably fetches the benchmark datasets — confirm against the package docs).
download_data = False
if download_data:
    GIFTEval.download_data(storage_path=storage_path)

# Create the results directory (and any missing parents); a no-op if it exists.
output_path.mkdir(parents=True, exist_ok=True)
print("Ready.")

## Model Implementation

In [None]:
def build_predictor(batch_size: int = 512) -> TSAgentKitPredictor:
    """Create the predictor that is shared across all benchmark combinations.

    ``mode`` and ``preload_adapters`` are taken from the notebook-level
    configuration; only ``batch_size`` is supplied by the caller.

    Args:
        batch_size: Forwarded unchanged to ``TSAgentKitPredictor``.

    Returns:
        A freshly constructed ``TSAgentKitPredictor``.
    """
    predictor_kwargs = {
        "mode": mode,
        "batch_size": batch_size,
        "preload_adapters": preload_adapters,
    }
    return TSAgentKitPredictor(**predictor_kwargs)

## Evaluation

### Defining the evaluator

Each benchmark row is defined by a `(dataset_name, term)` combination.

In [None]:
def evaluate_tsagentkit(
    predictor: TSAgentKitPredictor,
    dataset_name: str,
    term: str,
    output_path: Path,
    storage_path: Path,
    mode: str,
    preload_adapters: list[str],
    batch_size: int,
    model_name: str,
) -> pd.DataFrame:
    """Evaluate ``predictor`` on a single ``(dataset_name, term)`` combination.

    A ``GIFTEval`` harness is built for the combination and the predictor is
    run through ``evaluate_predictor`` with ``overwrite=False``.

    Args:
        predictor: Predictor instance to evaluate.
        dataset_name: Name of the benchmark dataset.
        term: Forecast term for the dataset (the notebook uses "short",
            "medium" and "long").
        output_path: Forwarded to ``GIFTEval`` as ``output_path``.
        storage_path: Forwarded to ``GIFTEval`` as ``storage_path``.
        mode: Forwarded to ``GIFTEval`` as ``mode``.
        preload_adapters: Forwarded to ``GIFTEval`` as ``preload_adapters``.
        batch_size: Forwarded to ``evaluate_predictor``.
        model_name: Forwarded to ``evaluate_predictor``.

    Returns:
        The DataFrame returned by ``GIFTEval.evaluate_predictor``.
    """
    harness = GIFTEval(
        dataset_name=dataset_name,
        term=term,
        output_path=output_path,
        storage_path=storage_path,
        mode=mode,
        preload_adapters=preload_adapters,
    )
    results = harness.evaluate_predictor(
        predictor=predictor,
        batch_size=batch_size,
        overwrite=False,
        model_name=model_name,
    )
    return results

### Performing evaluation

In [None]:
# Small subset of (dataset_name, term) pairs for a quick end-to-end run.
smoke_combinations = [
    ("m4_weekly", "short"),
    ("bizitobs_l2c/H", "short"),
    ("bizitobs_l2c/H", "medium"),
    ("bizitobs_l2c/H", "long"),
]

# Flip to True to evaluate every entry of DATASETS_WITH_TERMS instead.
use_full_matrix = False
combinations = list(DATASETS_WITH_TERMS) if use_full_matrix else smoke_combinations

results_csv = output_path / "all_results.csv"

# Validate the data location BEFORE touching existing results, so that a
# misconfigured storage_path cannot cost a previously written all_results.csv.
if not storage_path.exists():
    raise FileNotFoundError(
        f"Dataset path not found: {storage_path}. Set download_data=True first."
    )

# Start from a clean results file when overwriting was requested; a missing
# file is not an error.
if overwrite_results:
    results_csv.unlink(missing_ok=True)

# One predictor is built up front and reused for every combination.
predictor = build_predictor(batch_size=batch_size)
try:
    for idx, (dataset_name, term) in enumerate(combinations, start=1):
        logger.info("[%d/%d] %s/%s", idx, len(combinations), dataset_name, term)
        df = evaluate_tsagentkit(
            predictor=predictor,
            dataset_name=dataset_name,
            term=term,
            output_path=output_path,
            storage_path=storage_path,
            mode=mode,
            preload_adapters=preload_adapters,
            batch_size=batch_size,
            model_name=model_name,
        )
        # Report the metrics from the last row of the returned frame
        # (presumably the row for the combination just evaluated — confirm).
        row = df.iloc[-1]
        logger.info(
            "MASE=%.4f | sMAPE=%.4f | CRPS=%.4f",
            float(row["eval_metrics/MASE[0.5]"]),
            float(row["eval_metrics/sMAPE[0.5]"]),
            float(row["eval_metrics/mean_weighted_sum_quantile_loss"]),
        )
finally:
    # Always close the predictor, even if an evaluation raised part-way through.
    predictor.close()

print(f"Saved: {results_csv}")

The complete list of `(dataset_name, term)` combinations is available in `DATASETS_WITH_TERMS`:

In [None]:
# Show the first three entries together with the total number of combinations.
DATASETS_WITH_TERMS[:3], len(DATASETS_WITH_TERMS)

In [None]:
# Load all_results.csv from the output directory, report the row count,
# and display the last ten rows.
eval_df = pd.read_csv(results_csv)
print(f"Rows: {len(eval_df)}")
eval_df.tail(10)

## Submission Readiness Checks

In [None]:
# Schema check: the column names and their order must match RESULT_COLUMNS.
observed_columns = list(eval_df.columns)
assert observed_columns == RESULT_COLUMNS, "Unexpected result columns."

# Each value in the "dataset" column may appear only once.
has_duplicate_datasets = eval_df["dataset"].duplicated().any()
assert not has_duplicate_datasets, "Duplicate dataset rows detected."

# The row-count check only applies when the full matrix was evaluated.
if use_full_matrix:
    row_count = len(eval_df)
    assert row_count == FULL_MATRIX_SIZE, (
        f"Expected {FULL_MATRIX_SIZE} rows for full matrix; got {row_count}"
    )

print("Result schema checks passed.")

## Reproducibility statement

This notebook uses tsagentkit's packaged GIFT-Eval integration and writes standard
`all_results.csv` rows. It also supports optional normalized score computation against
a seasonal-naive baseline file when available.

In [None]:
# Aggregate scores are always computed; normalized scores are added only when
# the seasonal-naive baseline results file exists on disk.
baseline_file = Path("./results/seasonal_naive/all_results.csv")

summary = {}
summary["aggregate"] = compute_aggregate_scores(eval_df)

if baseline_file.exists():
    baseline_df = pd.read_csv(baseline_file)
    summary["normalized"] = compute_normalized_scores(eval_df, baseline_df)

# Trailing expression so the notebook displays the summary.
summary

## Changelog

- 2026-02-11: Remove backtest mode parameter (multi-model is now default)
- 2026-02-11: Second-pass polish for official-style readability, submission metadata, and schema checks.