# ChronoTick 2: Moirai 1.1 Fine-Tuning

Fine-tune Salesforce Moirai 1.1 Small (14M) using wide_multivariate mode.
All channels (target + sensors) are treated as joint targets during training;
at inference, only the target channel prediction is extracted.

## Experiments
- E1: Univariate FT (drift only)
- E2: Multivariate FT (drift + top-20 SHAP features)
- E3: Per-machine vs combined training

## Training Mode
Set `TRAINING_MODE` to "combined" or "per_machine".

In [None]:
# === Environment Setup ===
import os, subprocess, sys

# Colab is detected by its GPU environment variable or by the /content workspace.
IN_COLAB = "COLAB_GPU" in os.environ or os.path.exists("/content")

if IN_COLAB:
    REPO_DIR = "/content/sensor-collector"
    REPO_URL = "https://github.com/JaimeCernuda/sensor-collector.git"

    # The token is optional: without it the repo is accessed anonymously and
    # later pushes are skipped.
    GITHUB_TOKEN = None
    try:
        from google.colab import userdata
        GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
    except Exception:
        print("WARNING: GITHUB_TOKEN not available")

    # NOTE(review): the token is embedded in the remote URL, so it appears in the
    # argv of the git commands below; a CalledProcessError raised by check=True
    # would print that argv (token included) in the traceback.
    auth_url = f"https://{GITHUB_TOKEN}@github.com/JaimeCernuda/sensor-collector.git" if GITHUB_TOKEN else REPO_URL

    if os.path.exists(REPO_DIR):
        # Existing clone: re-point the remote and hard-reset to origin/main
        # (any local changes in the clone are discarded).
        subprocess.run(["git", "-C", REPO_DIR, "remote", "set-url", "origin", auth_url], check=True)
        subprocess.run(["git", "-C", REPO_DIR, "fetch", "-q", "origin"], check=True)
        subprocess.run(["git", "-C", REPO_DIR, "reset", "--hard", "origin/main"], check=True)
    else:
        subprocess.run(["git", "clone", "-q", auth_url, REPO_DIR], check=True)

    # Identity used for the result commits made by this notebook.
    subprocess.run(["git", "-C", REPO_DIR, "config", "user.name", "Colab Runner"], check=True)
    subprocess.run(["git", "-C", REPO_DIR, "config", "user.email", "colab@chronotick.dev"], check=True)

    # Install with the interpreter that runs this kernel, so the package lands in
    # the environment that will actually import it.
    subprocess.run([sys.executable, "-m", "pip", "install", "-q", "-e", f"{REPO_DIR}/tick2/"], check=True)
    tick2_src = f"{REPO_DIR}/tick2/src"
    if tick2_src not in sys.path:
        sys.path.insert(0, tick2_src)

    # Always mount Drive - needed for checkpoint persistence (models too large for git)
    from google.colab import drive
    drive.mount("/content/drive")

    # Data: prefer repo copy, fall back to Drive
    DATA_DIR = f"{REPO_DIR}/sensors/data"
    if not os.path.isdir(f"{DATA_DIR}/24h_snapshot"):
        DATA_DIR = "/content/drive/MyDrive/chronotick2/data"

    RESULTS_DIR = f"{REPO_DIR}/tick2/notebooks/output/03"
else:
    GITHUB_TOKEN = None
    DATA_DIR = None
    # When executed as a script, anchor outputs next to this file; in an
    # interactive kernel (no __file__) fall back to the current directory.
    # The original passed the literal string "__file__" to dirname(), which
    # always produced "" and silently ignored the file's real location.
    _base_dir = os.path.dirname(os.path.abspath(__file__)) if "__file__" in globals() else "."
    RESULTS_DIR = os.path.join(_base_dir, "output", "03")

# Maps a torch device name to the per-device results sub-directory name.
DEVICE_DIR_MAP = {"cuda": "gpu", "cpu": "cpu"}

def checkpoint_push(label):
    """Commit the notebook-03 outputs and push them when a token is available.

    Best-effort: does nothing outside Colab, returns quietly when there is
    nothing new to commit, and never raises - failures are reported as a
    ``[CHECKPOINT WARNING]`` line so a long training run is not interrupted.

    Reads the module-level ``REPO_DIR``, ``GITHUB_TOKEN`` and ``device_label``;
    ``device_label`` is assigned in the config cell, so that cell must have run
    before the first call.

    Args:
        label: Short stage name embedded in the commit message and log lines.
    """
    if not IN_COLAB:
        return

    git = ["git", "-C", REPO_DIR]
    outputs = "tick2/notebooks/output/03/"
    try:
        subprocess.run([*git, "add", outputs], check=True, capture_output=True)
        status = subprocess.run([*git, "status", "--porcelain", outputs], capture_output=True, text=True)
        if not status.stdout.strip():
            return  # nothing staged under the output directory
        subprocess.run(
            [*git, "commit", "-m", f"results: notebook 03c moirai {label} ({device_label})"],
            check=True,
            capture_output=True,
        )
        if GITHUB_TOKEN:
            subprocess.run([*git, "push"], check=True, capture_output=True, timeout=60)
            print(f"  [CHECKPOINT] Pushed {label}")
        else:
            # Mirror the final-push cell: make the unpushed commit visible.
            print(f"  [CHECKPOINT] Committed {label} locally (GITHUB_TOKEN not set, not pushed)")
    except subprocess.CalledProcessError as e:
        # stderr was captured, so str(e) alone only shows the exit status;
        # append git's own message to make the warning actionable.
        raw = e.stderr
        detail = raw.decode(errors="replace") if isinstance(raw, bytes) else (raw or "")
        detail = detail.strip()
        if GITHUB_TOKEN:
            # Never echo the token if git happens to print the remote URL.
            detail = detail.replace(GITHUB_TOKEN, "***")
        print(f"  [CHECKPOINT WARNING] {e}" + (f": {detail}" if detail else ""))
    except Exception as e:
        print(f"  [CHECKPOINT WARNING] {e}")

# Report which runtime was detected above.
print("Environment: {}".format("Colab" if IN_COLAB else "Local"))

In [None]:
# === Install Moirai Dependencies ===
if IN_COLAB:
    # Run pip through the kernel's own interpreter so the packages are installed
    # into the environment that performs the import below.
    pip_install = [sys.executable, "-m", "pip", "install", "-q"]

    # uni2ts pins torch<2.5; install with --no-deps to keep CUDA torch
    subprocess.run([*pip_install, "uni2ts", "--no-deps"], check=True)
    # Because of --no-deps above, uni2ts' runtime requirements are listed by hand.
    subprocess.run(
        [
            *pip_install,
            "einops>=0.7",
            "gluonts>=0.14",
            "jaxtyping",
            "hydra-core",
            "python-dotenv",
            "lightning",
            "safetensors",
            "huggingface_hub",
        ],
        check=True,
    )

# Imported here (after the install step) as a smoke test that uni2ts is usable.
from uni2ts.model.moirai import MoiraiForecast, MoiraiModule
print("uni2ts ready")

In [None]:
# === Imports, Config & Training Mode ===
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from pathlib import Path

from tick2.data.preprocessing import TARGET_COL, load_all
from tick2.finetuning.base import FineTuneConfig
from tick2.finetuning.data_prep import prepare_datasets
from tick2.finetuning.moirai_ft import finetune_moirai, load_finetuned_moirai
from tick2.finetuning.evaluate import evaluate_finetuned, load_zero_shot_baselines, compare_ft_vs_zero_shot
from tick2.utils.gpu import clear_gpu_memory

sns.set_theme(style="whitegrid", font_scale=1.1)

# --- User-configurable knobs ---
TRAINING_MODE = "combined"   # "combined" or "per_machine"
DEVICE_OVERRIDE = None       # None = auto-detect, "cuda", or "cpu"
FORCE_RETRAIN = False        # Set True to retrain even if cached CSV exists

# An explicit override wins; otherwise use CUDA whenever torch can see a GPU.
if DEVICE_OVERRIDE:
    device = DEVICE_OVERRIDE
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Unknown device names fall through unchanged as their own directory label.
device_label = DEVICE_DIR_MAP.get(device, device)

config = FineTuneConfig(
    context_length=1000,
    prediction_length=96,
    max_covariates=20,
    seed=42,
)

if device != "cuda":
    print("Running on CPU")
else:
    props = torch.cuda.get_device_properties(0)
    # Read the memory size under either attribute name, defaulting to 0.
    vram = getattr(props, "total_memory", getattr(props, "total_mem", 0))
    print(f"GPU:  {torch.cuda.get_device_name(0)}")
    print(f"VRAM: {vram / 1024**3:.1f} GB")

print(f"Device: {device}, Mode: {TRAINING_MODE}")

In [None]:
# === Load & Prepare Data ===
# DATA_DIR is None for local runs; in that case prepare_datasets receives
# data_dir=None.
if DATA_DIR:
    data_dir = Path(DATA_DIR)
else:
    data_dir = None

prepared = prepare_datasets(config, data_dir=data_dir)

# One summary line per prepared dataset: split sizes and feature count.
for name, p in prepared.items():
    print(
        f"  {name:16s}: train={len(p.split.train)}, val={len(p.split.val)}, "
        f"test={len(p.split.test)}, features={len(p.feature_cols)}"
    )

In [None]:
# === Fine-Tune Moirai 1.1 ===
from tick2.utils.colab import save_checkpoint_to_drive, load_checkpoint_from_drive, setup_training_log

# Artifact locations for this run, all rooted at RESULTS_DIR.
output_base = Path(RESULTS_DIR)
ft_output_dir = output_base.joinpath("moirai_ft", TRAINING_MODE)
device_results_dir = output_base.joinpath(device_label)

# Persist training logs to disk (epoch losses, early stopping, errors)
log_path = setup_training_log(ft_output_dir)
print(f"Training log: {log_path}")

# Evaluation CSV written by the evaluation cell; its presence means the whole
# train + evaluate cycle already completed for this mode and device.
cached_path = device_results_dir / f"moirai-1.1-ft_{TRAINING_MODE}.csv"

# Stays None unless training actually runs in this session (the loss-curve
# plot is skipped in that case).
ft_results = None

drive_model_name = f"moirai_ft/{TRAINING_MODE}/best"
# NOTE(review): "combined" is hard-coded here even when TRAINING_MODE is
# "per_machine", while the evaluation cell loads from ft_output_dir itself in
# that mode - confirm the intended per-machine checkpoint location.
ckpt_local = ft_output_dir.joinpath("combined", "best")

# Resume after a disconnect: only consult Drive when nothing usable is on local
# disk and a retrain has not been forced.
if not (cached_path.exists() or FORCE_RETRAIN or ckpt_local.exists()):
    resumed = load_checkpoint_from_drive(
        model_name=drive_model_name,
        local_path=str(ckpt_local),
    )
    if resumed:
        print(f"[RESUMED] Loaded checkpoint from Drive: {resumed}")

if FORCE_RETRAIN or not (cached_path.exists() or ckpt_local.exists()):
    # Nothing reusable (or retrain forced): run fine-tuning from scratch.
    clear_gpu_memory()
    ft_results = finetune_moirai(
        prepared=prepared,
        config=config,
        output_dir=str(ft_output_dir),
        training_mode=TRAINING_MODE,
        patch_size=32,
        max_epochs=20,
        learning_rate=1e-4,
        batch_size=32,
        early_stopping_patience=5,
        device=device,
    )
    for r in ft_results:
        print(f"  {r.machine}: {r.training_time_s:.1f}s, best_epoch={r.best_epoch}")

    # Save checkpoint to Drive for persistence
    save_checkpoint_to_drive(
        local_path=ckpt_local,
        model_name=drive_model_name,
    )

    checkpoint_push("finetuning")
elif cached_path.exists():
    print(f"[CACHED] {cached_path}")
else:
    print(f"[CACHED] Checkpoint exists at {ckpt_local}, skipping training")

In [None]:
# === Evaluate Fine-Tuned Model ===
from tick2.models.moirai import MoiraiWrapper

if FORCE_RETRAIN or not cached_path.exists():
    # No reusable CSV (or retrain forced): load the fine-tuned checkpoint and
    # evaluate it.
    if TRAINING_MODE == "combined":
        ckpt_path = ft_output_dir / "combined" / "best"
    else:
        ckpt_path = ft_output_dir

    ft_model = load_finetuned_moirai(
        str(ckpt_path),
        context_length=config.context_length,
        prediction_length=config.prediction_length,
        n_covariates=config.max_covariates,
    )

    # Build a wrapper and inject the fine-tuned weights directly into it,
    # unwrapping a `.module` attribute when the loaded object exposes one.
    ft_wrapper = MoiraiWrapper(model_name="moirai-1.1-ft", max_covariates=config.max_covariates)
    ft_wrapper._model = getattr(ft_model, "module", ft_model)
    ft_wrapper._device = device

    ft_eval_df = evaluate_finetuned(
        model=ft_wrapper,
        prepared=prepared,
        config=config,
        training_mode=f"ft_{TRAINING_MODE}",
    )

    # Cache the evaluation so later runs can skip straight to the analysis.
    device_results_dir.mkdir(parents=True, exist_ok=True)
    ft_eval_df.to_csv(cached_path, index=False)
    checkpoint_push("evaluation")
else:
    ft_eval_df = pd.read_csv(cached_path)
    print(f"Loaded cached evaluation: {len(ft_eval_df)} rows")

print(f"Mean MAE: {ft_eval_df['mae'].mean():.4f}")
print(f"Rows:     {len(ft_eval_df)}")
display(ft_eval_df)

In [None]:
# === Load Zero-Shot Baselines ===
# output_base already ends in ".../output/03", so its parent is the shared
# "output" directory. The previous expression appended another "output"
# component and therefore looked in ".../output/output/02".
# Presumably notebook 02 writes to the sibling ".../output/02" - verify against
# that notebook. The old location is still honoured if it is the one that exists.
zs_dir = output_base.parent / "02"
_legacy_zs_dir = output_base.parent / "output" / "02"
if not zs_dir.is_dir() and _legacy_zs_dir.is_dir():
    zs_dir = _legacy_zs_dir

zs_results = load_zero_shot_baselines(zs_dir, model_name="moirai-1.1-small")
print(f"Zero-shot baselines: {len(zs_results)} rows")
if not zs_results.empty:
    print(f"  Mean MAE (ZS): {zs_results['mae'].mean():.4f}")
else:
    print("  No zero-shot results found. Run notebook 02 first.")

In [None]:
# === Comparison: Fine-Tuned vs Zero-Shot ===
if zs_results.empty:
    # Without baselines the "comparison" is just the fine-tuned results.
    comparison_df = ft_eval_df.copy()
    print("Skipping comparison (no zero-shot baselines available).")
else:
    comparison_df = compare_ft_vs_zero_shot(ft_eval_df, zs_results)

    # Pair each fine-tuned row with the zero-shot row for the same
    # machine / context / horizon / covariate setting.
    merge_cols = ["machine", "context_length", "horizon", "with_covariates"]
    ft_sub = ft_eval_df[[*merge_cols, "mae"]].rename(columns={"mae": "mae_ft"})
    zs_sub = zs_results[[*merge_cols, "mae"]].rename(columns={"mae": "mae_zs"})
    merged = pd.merge(ft_sub, zs_sub, on=merge_cols, how="inner")

    # Positive values mean the fine-tuned MAE is lower than the zero-shot MAE.
    merged["improvement_pct"] = 100 * (1 - merged["mae_ft"] / merged["mae_zs"])

    print("=== Improvement Summary ===")
    print(f"  Overall mean improvement: {merged['improvement_pct'].mean():.1f}%")
    print()
    print("  Per machine:")
    for machine, group in merged.groupby("machine"):
        gain = group["improvement_pct"].mean()
        ft_mean = group["mae_ft"].mean()
        zs_mean = group["mae_zs"].mean()
        print(f"    {machine:16s}: {gain:+.1f}%  (FT MAE={ft_mean:.4f}, ZS MAE={zs_mean:.4f})")
    print()
    display(merged)

In [None]:
# === Visualizations ===
results_dir = Path(RESULTS_DIR)
results_dir.mkdir(parents=True, exist_ok=True)

# --- 1. MAE Comparison: Fine-Tuned vs Zero-Shot ---
if not zs_results.empty:
    fig, ax = plt.subplots(figsize=(10, 5))

    # Long-format rows: one per (machine, mode); the zero-shot row is added
    # only for machines that appear in the baseline results.
    zs_machines = zs_results["machine"].values
    plot_data = []
    for machine in sorted(ft_eval_df["machine"].unique()):
        ft_mae = ft_eval_df.loc[ft_eval_df["machine"] == machine, "mae"].mean()
        plot_data.append({"machine": machine, "MAE": ft_mae, "mode": "Fine-Tuned"})
        if machine in zs_machines:
            zs_mae = zs_results.loc[zs_results["machine"] == machine, "mae"].mean()
            plot_data.append({"machine": machine, "MAE": zs_mae, "mode": "Zero-Shot"})

    bar_df = pd.DataFrame(plot_data)
    sns.barplot(data=bar_df, x="machine", y="MAE", hue="mode", ax=ax)
    ax.set_ylabel("MAE (ppm)")
    ax.set_title(f"Moirai 1.1: Fine-Tuned ({TRAINING_MODE}) vs Zero-Shot")
    ax.legend(title="Mode")
    plt.xticks(rotation=45)
    plt.tight_layout()
    fig.savefig(results_dir / f"moirai_ft_vs_zs_{TRAINING_MODE}.png", dpi=150, bbox_inches="tight")
    plt.show()

# --- 2. Training Loss Curves ---
if ft_results is None:
    # ft_results is only populated when training ran in this session.
    print("Training loss curves not available (loaded from cache).")
else:
    n_runs = len(ft_results)
    # squeeze=False keeps `axes` two-dimensional even for a single run.
    fig, axes = plt.subplots(1, n_runs, figsize=(6 * n_runs, 4), squeeze=False)
    for ax, r in zip(axes[0], ft_results):
        epochs = range(1, len(r.train_loss) + 1)
        ax.plot(epochs, r.train_loss, label="Train Loss", marker="o", markersize=3)
        ax.plot(epochs, r.val_loss, label="Val Loss", marker="s", markersize=3)
        # The marker is drawn at best_epoch + 1 on the 1-based epoch axis.
        ax.axvline(r.best_epoch + 1, color="red", linestyle="--", alpha=0.7, label=f"Best Epoch ({r.best_epoch + 1})")
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Loss")
        ax.set_title(f"{r.machine} ({r.training_time_s:.0f}s)")
        ax.legend(fontsize=8)
    fig.suptitle(f"Moirai 1.1 Fine-Tuning Loss ({TRAINING_MODE})", fontsize=13)
    plt.tight_layout()
    fig.savefig(results_dir / f"moirai_ft_loss_{TRAINING_MODE}.png", dpi=150, bbox_inches="tight")
    plt.show()

print(f"Saved figures to: {results_dir}")

In [None]:
# === Export Results ===
results_dir = Path(RESULTS_DIR)
device_results_dir = results_dir / device_label
# parents=True also creates results_dir itself, which the LaTeX file needs.
device_results_dir.mkdir(parents=True, exist_ok=True)

# Save combined comparison CSV (only meaningful when baselines were found)
if not zs_results.empty:
    comparison_csv = device_results_dir / f"moirai_ft_vs_zs_{TRAINING_MODE}.csv"
    comparison_df.to_csv(comparison_csv, index=False)
    print(f"Comparison CSV: {comparison_csv}")

# Per-machine / per-covariate-mode means of the evaluation metrics.
summary = ft_eval_df.groupby(["machine", "with_covariates"]).agg(
    mae=("mae", "mean"),
    rmse=("rmse", "mean"),
    coverage=("coverage", "mean"),
    inference_ms=("inference_ms", "mean"),
).round(4).reset_index()

# Save LaTeX table. Render first and write second: opening the file before
# calling to_latex() would leave an empty .tex behind if rendering fails.
# The encoding is explicit so the output does not depend on the platform locale.
latex_path = results_dir / f"moirai_ft_{TRAINING_MODE}.tex"
latex_table = summary.to_latex(index=False, float_format="%.4f")
latex_path.write_text(latex_table, encoding="utf-8")
print(f"LaTeX table:    {latex_path}")
print()
print(summary.to_string(index=False))

In [None]:
# === Final Push ===
if not IN_COLAB:
    print(f"Local run. Outputs saved to: {Path(RESULTS_DIR)}")
    print("Run 'git add tick2/notebooks/output/03/ && git commit && git push' to share.")
else:
    # The git commands below use repo-relative paths, so run them from the repo root.
    os.chdir(REPO_DIR)

    # Stage all outputs
    subprocess.run(["git", "add", "tick2/notebooks/output/03/"], check=True)

    # Empty porcelain output for the output directory means nothing to commit.
    status = subprocess.run(
        ["git", "status", "--porcelain", "tick2/notebooks/output/03/"],
        capture_output=True, text=True,
    )
    if not status.stdout.strip():
        print("No new outputs to commit.")
    else:
        commit_message = f"results: notebook 03c moirai fine-tuning final ({TRAINING_MODE}, {device_label})"
        subprocess.run(["git", "commit", "-m", commit_message], check=True)
        if not GITHUB_TOKEN:
            print("Committed locally but GITHUB_TOKEN not set.")
        else:
            subprocess.run(["git", "push"], check=True)
            print("Pushed final outputs to GitHub.")