# ChronoTick 2: Zero-Shot TSFM Benchmark

Automated benchmark of time series foundation models on clock drift prediction.
Runs all models across 4 machines, with and without covariates, at multiple
context lengths and horizons. Just click **Run All**.

## Models
| Model | Origin | Params | Covariates | Probabilistic |
|-------|--------|--------|------------|---------------|
| Chronos-2 Small | Amazon | 28M | Yes | Yes |
| Chronos-2 Base | Amazon | 120M | Yes | Yes |
| TimesFM 2.5 | Google | 200M | Yes | Yes |
| Granite TTM | IBM | 1-5M | No | No (point only) |
| Toto | Datadog | 151M | Yes | Yes (samples) |
| Moirai 1.1 Small | Salesforce | 14M | Yes | Yes (samples) |

## Dependency Groups
Models are ordered by dependency compatibility. Results are saved per-model,
so later installs that override earlier packages don't affect completed runs.

- **Group A** (compatible): Chronos-2, Granite TTM, TimesFM 2.5
- **Group B** (exact-pinned deps): Toto
- **Group C** (needs old torch): Moirai

In [None]:
# === Environment Setup ===
import os
import subprocess
import sys

# Treat the session as Colab when either the /content directory exists or the
# COLAB_GPU environment variable is present.
IN_COLAB = os.path.exists("/content") or "COLAB_GPU" in os.environ

if not IN_COLAB:
    # Local run: no explicit data directory (reported as "(default)" below)
    # and results are written relative to the working directory.
    DATA_DIR = None
    RESULTS_DIR = "../results/zero_shot"
else:
    REPO_DIR = "/content/sensor-collector"
    REPO_URL = "https://github.com/JaimeCernuda/sensor-collector.git"

    # Refresh an existing checkout, otherwise clone the repository.
    git_cmd = (
        ["git", "-C", REPO_DIR, "pull", "-q"]
        if os.path.exists(REPO_DIR)
        else ["git", "clone", "-q", REPO_URL, REPO_DIR]
    )
    subprocess.run(git_cmd, check=True)

    # Editable install of the tick2 package from the checkout.
    subprocess.run(["pip", "install", "-q", "-e", f"{REPO_DIR}/tick2/"], check=True)

    # A pip install run through subprocess does not always update sys.path in
    # the running kernel, so put the source directory on the path explicitly.
    tick2_src = f"{REPO_DIR}/tick2/src"
    if tick2_src not in sys.path:
        sys.path.insert(0, tick2_src)

    # Prefer the CSVs shipped in the repository; fall back to Google Drive when
    # the 24h snapshot directory is missing from the checkout.
    DATA_DIR = f"{REPO_DIR}/sensors/data"
    if not os.path.isdir(f"{DATA_DIR}/24h_snapshot"):
        from google.colab import drive
        drive.mount("/content/drive")
        DATA_DIR = "/content/drive/MyDrive/chronotick2/data"

    RESULTS_DIR = "/content/results/zero_shot"

print(f"Environment: {'Colab' if IN_COLAB else 'Local'}")
print(f"Data dir:    {DATA_DIR or '(default)'}")
print(f"Results dir: {RESULTS_DIR}")

In [None]:
# === Model Definitions & Helpers ===
import importlib
from pathlib import Path

import pandas as pd

# Models ordered by dependency compatibility group.
# Group A models share compatible deps and run first.
# Before Group B/C, conflicting packages are uninstalled so pip can resolve
# the new model's pinned versions cleanly.
#
# Dep conflicts (why we need the cleanup cycle):
#   granite-tsfm pins transformers==4.56  vs  toto-ts pins transformers==4.52
#   chronos needs scikit-learn>=1.6       vs  toto-ts pins scikit-learn==1.5
#   toto-ts pins torch==2.7              vs  uni2ts needs torch<2.5
MODEL_CONFIGS = [
    # --- Group A: compatible (chronos + granite + timesfm) ---
    {
        "name": "chronos2-small",
        "cleanup": [],
        "install": ['pip install -q "chronos-forecasting[extras]>=2.2"'],
        "verify": "chronos",
        "group": "A",
    },
    {
        "name": "chronos2-base",
        "cleanup": [],
        "install": [],  # same package as chronos2-small
        "verify": "chronos",
        "group": "A",
    },
    {
        "name": "granite-ttm",
        "cleanup": [],
        "install": ['pip install -q "granite-tsfm>=0.3.3"'],
        "verify": "tsfm_public",
        "group": "A",
    },
    {
        "name": "timesfm-2.5",
        "cleanup": [],
        "install": [
            "git clone -q https://github.com/google-research/timesfm /content/timesfm 2>/dev/null || true",
            'pip install -q -e "/content/timesfm[torch]"',
        ],
        "verify": "timesfm",
        "group": "A",
    },
    # --- Group B: toto (exact-pinned deps) ---
    # Uninstall granite + chronos first so pip can resolve toto's pins.
    # toto-ts uses pkg_resources which was removed in setuptools>=82.
    {
        "name": "toto",
        "cleanup": [
            "pip uninstall -y -q chronos-forecasting granite-tsfm 2>/dev/null; true",
        ],
        "install": [
            'pip install -q "setuptools<81"',
            "pip install -q toto-ts",
        ],
        "verify": "toto",
        "group": "B",
    },
    # --- Group C: moirai (needs torch<2.5, may fail on modern Colab) ---
    # Uninstall toto first to free its version pins
    {
        "name": "moirai-1.1-small",
        "cleanup": [
            "pip uninstall -y -q toto-ts 2>/dev/null; true",
        ],
        "install": ["pip install -q uni2ts"],
        "verify": "uni2ts",
        "group": "C",
    },
]


def _can_import(module_name: str) -> bool:
    """Report whether ``module_name`` can be imported in the running kernel.

    Args:
        module_name: Dotted module name passed to ``importlib.import_module``.

    Returns:
        True when the import succeeds. False when it fails for any reason:
        a missing package (``ImportError``) or a package that is present but
        raises something else while being imported.
    """
    try:
        importlib.import_module(module_name)
    except ImportError:
        # Covers ModuleNotFoundError too (it is a subclass of ImportError).
        return False
    except Exception as exc:
        # A package can be installed yet broken (e.g. after conflicting
        # version pins were swapped underneath it) and raise a non-import
        # error at import time. That still means "not usable", and letting it
        # propagate would abort the whole benchmark loop.
        print(f"  [WARN] importing {module_name} raised {type(exc).__name__}: {exc}")
        return False
    return True


def install_model_deps(cfg: dict) -> bool:
    """Make a model's dependencies available, removing conflicting packages first.

    Flow: check whether the module already imports -> (Colab only) run the
    cleanup commands -> run the install commands -> verify the import.

    Local (non-Colab) runs never modify the environment: the model is reported
    ready only if its module already imports, otherwise it is skipped.

    Args:
        cfg: One entry of ``MODEL_CONFIGS``. Reads ``verify`` (module name to
            import), and the optional ``cleanup`` / ``install`` lists of shell
            commands.

    Returns:
        True if the model's module is importable and the model can be run,
        False if it was skipped or installation failed (including a timeout).
    """
    verify = cfg["verify"]
    cleanup_cmds = cfg.get("cleanup") or []
    install_cmds = cfg.get("install") or []

    # Drop stale finder caches so a package installed by an earlier subprocess
    # call is visible to the first importability check.
    importlib.invalidate_caches()
    importable = _can_import(verify)

    # No conflicting packages to remove and the module already imports: done.
    if importable and not cleanup_cmds:
        print(f"  [{verify}] already available")
        return True

    # Local runs: never pip-uninstall or pip-install into the user's
    # environment; just report what is there.
    if not IN_COLAB:
        if importable:
            print(f"  [{verify}] already available")
            return True
        print(f"  [SKIP] {verify} not installed (install manually for local runs)")
        return False

    # Nothing to clean and nothing to install (e.g. chronos2-base shares the
    # chronos package): the module is not importable and there is no way to
    # obtain it from here.
    if not install_cmds and not cleanup_cmds:
        print(f"  [FAIL] {verify} not importable and no install commands configured")
        return False

    # Cleanup is best-effort: uninstall packages whose version pins conflict
    # with this model. Commands are trusted strings from MODEL_CONFIGS and use
    # shell syntax, hence shell=True.
    for cmd in cleanup_cmds:
        print(f"  $ {cmd}")
        try:
            subprocess.run(cmd, shell=True, capture_output=True, text=True, timeout=120)
        except subprocess.TimeoutExpired:
            print("  [WARN] cleanup command timed out after 120s; continuing")

    # Install: stop at the first command that fails or times out.
    for cmd in install_cmds:
        print(f"  $ {cmd}")
        try:
            result = subprocess.run(
                cmd, shell=True, capture_output=True, text=True, timeout=600
            )
        except subprocess.TimeoutExpired:
            print("  [FAIL] install command timed out after 600s")
            return False
        if result.returncode != 0:
            # Show only the last few stderr lines to keep the cell output short.
            tail = result.stderr.strip().split("\n")[-3:]
            print("  [FAIL]", "\n        ".join(tail))
            return False

    # Newly installed packages are only discoverable after the caches are reset.
    importlib.invalidate_caches()
    if _can_import(verify):
        print(f"  [{verify}] ready")
        return True

    print(f"  [FAIL] {verify} not importable after install")
    return False


def load_model_results(out_dir: Path, model_name: str) -> pd.DataFrame | None:
    """Load previously saved per-model results, or None."""
    csv_path = out_dir / f"{model_name}.csv"
    return pd.read_csv(csv_path) if csv_path.exists() else None


def save_model_results(out_dir: Path, model_name: str, df: pd.DataFrame) -> None:
    """Write one model's results to ``<out_dir>/<model_name>.csv``.

    The file is what ``load_model_results`` picks up on a later run, which is
    how the notebook resumes without repeating finished models.

    Args:
        out_dir: Target directory; created (with parents) when missing.
        model_name: Model identifier used as the file stem.
        df: Results to store; written without the index column.
    """
    destination = out_dir.joinpath(f"{model_name}.csv")
    out_dir.mkdir(parents=True, exist_ok=True)
    df.to_csv(destination, index=False)
    print(f"  Saved: {destination}")

In [None]:
# === Imports, Config & Data ===
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch

from tick2.benchmark.runner import BenchmarkConfig, run_benchmark, results_to_dataframe
from tick2.benchmark.reporting import format_summary, results_to_latex, save_results
from tick2.data.preprocessing import TARGET_COL, get_feature_cols, load_all
from tick2.models.registry import get_model, list_models
from tick2.utils.gpu import clear_gpu_memory

sns.set_theme(style="whitegrid", font_scale=1.1)

# --- GPU ---
# Use CUDA when torch reports an available device, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    print(f"GPU:  {torch.cuda.get_device_name(0)}")
    # The device-properties field is `total_memory` (in bytes); `total_mem`
    # does not exist and raised AttributeError on every GPU runtime.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"VRAM: {gpu_props.total_memory / 1024**3:.1f} GB")
else:
    print("Running on CPU (Granite TTM only)")

# --- Benchmark config ---
# Sweep settings handed to run_benchmark. The exact meaning of each field is
# defined by BenchmarkConfig in tick2.benchmark.runner (not shown here).
config = BenchmarkConfig(
    context_lengths=[512, 1024],
    horizons=[60, 120],
    n_samples=25,
    seed=42,
    use_covariates=[False, True],
    quantile_alpha=0.2,
)

# --- Load sensor data ---
# DATA_DIR is None on local runs; in that case None is passed through and
# load_all is left to pick its own location.
data_dir = Path(DATA_DIR) if DATA_DIR else None
datasets = load_all(data_dir=data_dir, snapshot="24h_snapshot")
# Each value unpacks to a (DataFrame, second element) pair; only the frame is
# used for this summary line.
for name, (df, _cats) in datasets.items():
    print(f"  {name:16s}: {len(df):6d} rows, {len(get_feature_cols(df)):3d} features")

# --- Results directory ---
results_dir = Path(RESULTS_DIR)
results_dir.mkdir(parents=True, exist_ok=True)
print(f"\nResults: {results_dir}")
print(f"Models:  {list_models()}")

## Run All Benchmarks

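For each model: install deps, load, benchmark across all machines/configs,
save a per-model CSV, then unload. On resume, any model that already has a CSV
in the results directory is loaded from that file instead of being re-run
(delete the CSV to force a fresh run). On Colab T4, expect ~20-40 min per model.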

In [None]:
completed = {}  # model_name -> DataFrame of that model's benchmark rows
skipped = []  # models whose install failed or whose benchmark raised
banner = "=" * 60

for cfg in MODEL_CONFIGS:
    model_name = cfg["name"]
    print(f"\n{banner}")
    print(f" {model_name}  [Group {cfg['group']}]")
    print(banner)

    # Resume: a per-model CSV from an earlier run replaces the whole benchmark.
    if (cached := load_model_results(results_dir, model_name)) is not None:
        print(f"  [CACHED] {len(cached)} rows from previous run")
        completed[model_name] = cached
        continue

    # Dependencies must be in place before the model can be constructed.
    if not install_model_deps(cfg):
        skipped.append(model_name)
        continue

    # Benchmark. `model` is bound before the try so the `del` in the finally
    # clause is valid even when get_model itself raises.
    model = None
    try:
        clear_gpu_memory()
        model = get_model(model_name)
        model.load(device=device)
        print(f"  Loaded on {device}, ~{model.memory_footprint_mb():.0f} MB")

        outcomes = run_benchmark(model, datasets, config, progress=True)
        frame = results_to_dataframe(outcomes)

        # Persist first so a later failure does not lose the finished run.
        save_model_results(results_dir, model_name, frame)
        completed[model_name] = frame
        print(f"  Done: {len(outcomes)} configs, mean MAE = {frame['mae'].mean():.4f}")

    except Exception as exc:
        # One failing model must not stop the remaining ones: report and move on.
        print(f"  [FAIL] {exc}")
        import traceback
        traceback.print_exc()
        skipped.append(model_name)

    finally:
        # Drop the model reference, then call the GPU cleanup helper, before
        # the next model is loaded.
        del model
        clear_gpu_memory()

# --- Summary ---
print(f"\n{banner}")
print(f"  Completed: {list(completed)}")
if skipped:
    print(f"  Skipped:   {skipped}")
print(banner)

## Results

In [None]:
# Merge the per-model frames into one table. `results_df` is always defined
# (empty when nothing ran) so the following cells can test `.empty`.
if not completed:
    results_df = pd.DataFrame()
    print("No results collected. Check install/runtime failures above.")
else:
    results_df = pd.concat(completed.values(), ignore_index=True)
    print(format_summary(results_df))
    display(results_df)

## Visualizations

In [None]:
if not results_df.empty:
    df = results_df.copy()
    # Rows benchmarked without covariates; they feed charts 1 and 4.
    uni_df = df[~df["with_covariates"]]

    # --- 1. MAE by Model and Machine (univariate) ---
    # pivot_table aggregates with its default (mean) over the remaining columns.
    fig, ax = plt.subplots(figsize=(12, 5))
    if not uni_df.empty:
        mae_by_machine = uni_df.pivot_table(values="mae", index="model", columns="machine")
        mae_by_machine.plot(kind="bar", ax=ax)
        ax.set_ylabel("MAE (ppm)")
        ax.set_title("Zero-Shot MAE by Model and Machine (Univariate)")
        ax.legend(title="Machine")
        plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # --- 2. Covariate Effect ---
    # Keep only models that have at least one covariate run, and draw the chart
    # only when both covariate settings are present among their rows.
    rows_with_cov = df[df["with_covariates"]]
    covariate_models = rows_with_cov["model"].unique()
    cov_df = df[df["model"].isin(covariate_models)]
    if not cov_df.empty and len(cov_df["with_covariates"].unique()) > 1:
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.barplot(data=cov_df, x="model", y="mae", hue="with_covariates", ax=ax)
        ax.set_ylabel("MAE (ppm)")
        ax.set_title("Covariate Effect: Univariate vs. Multivariate")
        ax.legend(title="With Covariates")
        plt.xticks(rotation=45)
        plt.tight_layout()
        plt.show()

    # --- 3. Inference Latency ---
    # Mean inference time per model over all rows, sorted ascending.
    fig, ax = plt.subplots(figsize=(10, 5))
    mean_latency = df.groupby("model")["inference_ms"].mean()
    mean_latency.sort_values().plot(kind="barh", ax=ax, color="steelblue")
    ax.set_xlabel("Mean Inference Time (ms)")
    ax.set_title("Inference Latency by Model")
    plt.tight_layout()
    plt.show()

    # --- 4. Context Length Sensitivity ---
    # A line chart needs at least two distinct context lengths.
    if len(uni_df["context_length"].unique()) > 1:
        fig, ax = plt.subplots(figsize=(10, 5))
        sns.lineplot(
            data=uni_df,
            x="context_length",
            y="mae",
            hue="model",
            style="model",
            markers=True,
            ax=ax,
        )
        ax.set_xlabel("Context Length (timesteps)")
        ax.set_ylabel("MAE (ppm)")
        ax.set_title("MAE vs. Context Length")
        plt.tight_layout()
        plt.show()

## Export Results

In [None]:
# Write the combined results through save_results, then echo the two returned
# paths and the LaTeX rendering of the table.
if not results_df.empty:
    exported_paths = save_results(results_df, results_dir, prefix="zero_shot")
    csv_path, latex_path = exported_paths
    print(f"CSV:   {csv_path}")
    print(f"LaTeX: {latex_path}")
    latex_table = results_to_latex(results_df)
    print(f"\n{latex_table}")