# ResPSANN Compute-Parity Experiments (Colab Runner)

This notebook orchestrates the experiments described in `plan.txt` using the datasets summarised in `data_descriptions.txt`. Execute it inside Google Colab (GPU runtime recommended).


## Run Checklist
- Prefer Google Colab with a GPU runtime (recommended) before running any experiments.
- Let the setup cell install the latest published `psann` package via `pip`; no repository clone is required.
- Upload or mount the dataset directory so that `DATA_ROOT` points to it (defaults to `<working dir>/datasets`).
- Adjust `GLOBAL_CONFIG` and the experiment toggles before launching training to stay within the Colab budget.
- Keep the heavy training cells disabled until you are ready to execute them in Colab.


In [48]:
import os
import sys
from pathlib import Path

COLAB = "google.colab" in sys.modules

DEFAULT_PROJECT_ROOT = Path("/content") if COLAB else Path.cwd()
PROJECT_ROOT = Path(os.getenv("PSANN_PROJECT_ROOT", DEFAULT_PROJECT_ROOT)).resolve()

DATA_ROOT = Path(os.getenv("PSANN_DATA_ROOT", PROJECT_ROOT / "datasets")).resolve()
RESULTS_ROOT = Path(os.getenv("PSANN_RESULTS_ROOT", PROJECT_ROOT / "colab_results")).resolve()
FIGURE_ROOT = RESULTS_ROOT / "figures"

RESULTS_ROOT.mkdir(parents=True, exist_ok=True)
FIGURE_ROOT.mkdir(parents=True, exist_ok=True)

if not DATA_ROOT.exists():
    print(f"[WARN] DATA_ROOT {DATA_ROOT} does not exist yet. Upload datasets or update PSANN_DATA_ROOT.")

print(f"Colab runtime         : {COLAB}")
print(f"Project root          : {PROJECT_ROOT}")
print(f"Dataset root          : {DATA_ROOT}")
print(f"Results directory     : {RESULTS_ROOT}")


Colab runtime         : True
Project root          : /content
Dataset root          : /content/datasets
Results directory     : /content/colab_results


In [49]:
# --- Robust extraction for your datasets.zip layout ---
import zipfile
import shutil
import re
from pathlib import Path, PureWindowsPath

# Fallbacks if not already defined in your notebook
try:
    PROJECT_ROOT
except NameError:
    PROJECT_ROOT = Path("/content")
try:
    DATA_ROOT
except NameError:
    DATA_ROOT = PROJECT_ROOT / "datasets"

zip_path = PROJECT_ROOT / "datasets.zip"

# Canonical names your code expects
EXPECTED_FOLDERS = [
    "Industrial Data from the Electric Arc Furnace",
    "Beijing Air Quality",
    "Human Activity Recognition",
    "Jena Climate 2009-2016",  # we'll normalize any en/em/Unicode minus to a hyphen
    "Kaggle Rossmann Store Sales",
]

def _normalize_name(s: str) -> str:
    # unify hyphen-like chars, collapse whitespace, lowercase
    s = (s.replace("\u2013", "-")  # en dash
           .replace("\u2014", "-")  # em dash
           .replace("\u2212", "-")  # minus
           .replace("\xa0", " "))   # non-breaking space
    s = re.sub(r"\s+", " ", s).strip().lower()
    return s

def _safe_mkdir(path: Path) -> None:
    """
    Create directory `path`, removing any FILE that blocks directory creation
    at this path or any ancestor.
    """
    try:
        path.mkdir(parents=True, exist_ok=True)
        return
    except NotADirectoryError:
        # Find any ancestor that's a file and remove it.
        # Include the path itself first, then walk upward.
        for ancestor in [path, *path.parents]:
            try:
                if ancestor.exists() and ancestor.is_file():
                    ancestor.unlink()
            except Exception:
                # If we can't remove, re-raise later when mkdir fails again
                pass
        # Try once more after clearing blockers
        path.mkdir(parents=True, exist_ok=True)

def _canonicalize_top_level_dirs(root: Path, expected_names: list[str]) -> None:
    """
    If top-level dirs exist with dash/space variants, rename them
    to the canonical EXPECTED_FOLDERS names so downstream code works.
    """
    if not root.exists():
        return
    # Map normalized->actual path for current top-level dirs
    current = { _normalize_name(p.name): p for p in root.iterdir() if p.is_dir() }
    for exp in expected_names:
        canonical = root / exp
        if canonical.exists():
            continue
        norm = _normalize_name(exp)
        if norm in current and current[norm].exists():
            src = current[norm]
            # Avoid rename conflict: if a file with target name exists, remove it
            if canonical.exists() and canonical.is_file():
                canonical.unlink()
            print(f"[rename] {src.name} -> {exp}")
            src.rename(canonical)

def extract_datasets(zip_path: Path, target_root: Path) -> None:
    scratch_root = target_root.parent / "_datasets_unpack_tmp"
    if scratch_root.exists():
        shutil.rmtree(scratch_root)

    with zipfile.ZipFile(zip_path, "r") as zf:
        for entry in zf.infolist():
            # Normalize path parts from the ZIP (handles both / and \ separators)
            parts = PureWindowsPath(entry.filename).parts
            if not parts or parts[0].lower() != "datasets":
                continue
            rel_parts = parts[1:]
            if not rel_parts:
                continue

            dest = scratch_root.joinpath(*rel_parts)

            # Treat as directory if zip marks it so OR the path text ends with a slash/backslash
            is_dir_entry = entry.is_dir() or entry.filename.endswith("/") or entry.filename.endswith("\\")
            if is_dir_entry:
                _safe_mkdir(dest)
            else:
                # Ensure parents exist, removing any file that blocks dir creation
                _safe_mkdir(dest.parent)
                # Extract the file
                with zf.open(entry, "r") as src, open(dest, "wb") as dst:
                    shutil.copyfileobj(src, dst)

    # Replace/refresh target_root
    if target_root.exists():
        shutil.rmtree(target_root)
    scratch_root.rename(target_root)

    # Canonicalize top-level dir names (e.g., Jena dash variants)
    _canonicalize_top_level_dirs(target_root, EXPECTED_FOLDERS)

def datasets_ready(root: Path) -> bool:
    if not root.exists():
        return False
    # Accept either exact or normalized matches for robustness
    have = { _normalize_name(p.name) for p in root.iterdir() if p.is_dir() }
    need = { _normalize_name(n) for n in EXPECTED_FOLDERS }
    return need.issubset(have)

# --- Clean up stray "datasets\..." artefacts before extraction (from Windows zips) ---
for leftover in PROJECT_ROOT.iterdir():
    if "\\" in leftover.name and leftover.name.lower().startswith("datasets"):
        if leftover.is_dir():
            shutil.rmtree(leftover)
        else:
            leftover.unlink()

# --- Run ---
if datasets_ready(DATA_ROOT):
    print(f"Datasets already present at {DATA_ROOT}")
elif zip_path.exists():
    print(f"Extracting {zip_path} (normalising Windows/Unicode paths)…")
    extract_datasets(zip_path, DATA_ROOT)
    if datasets_ready(DATA_ROOT):
        print(f"Extraction complete. DATA_ROOT now available at {DATA_ROOT}")
        # Optional: quick sanity peek
        for p in sorted(DATA_ROOT.iterdir()):
            if p.is_dir():
                print(" -", p.name)
    else:
        print("[WARN] Extraction finished but expected folders are still missing.")
else:
    print(f"Archive {zip_path} not found. Upload datasets.zip or mount the datasets directory.")


Datasets already present at /content/datasets


In [50]:
import subprocess
import sys


def install_dependencies():
    base_packages = [
        "psann",
        "pandas>=2.0",
        "numpy>=1.24",
        "scikit-learn>=1.3",
        "torch>=2.1",
        "torchvision>=0.16",
        "torchaudio>=2.1",
        "lightgbm>=4.0",
        "xgboost>=1.7",
        "catboost>=1.2",
        "shap>=0.44",
        "matplotlib>=3.7",
        "seaborn>=0.13",
        "plotly>=5.18",
        "imbalanced-learn>=0.12",
        "tqdm>=4.66",
        "einops>=0.7",
        "rich>=13.7",
    ]
    print("Installing psann and supporting packages...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q"] + base_packages)


if COLAB:
    install_dependencies()
else:
    print("Skipping dependency installation because we are not inside Colab.")


Installing psann and supporting packages...


In [51]:
# Core dependencies used across the notebook
import itertools
import json
import math
import random
import time
from dataclasses import dataclass, field
from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Literal

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from tqdm.auto import tqdm


In [52]:
@dataclass
class TrainConfig:
    epochs: int
    batch_size: int
    learning_rate: float
    weight_decay: float = 0.0
    max_minutes: Optional[float] = None
    early_stopping: bool = True
    patience: int = 10
    gradient_clip: Optional[float] = None
    scheduler: Optional[str] = None
    scheduler_params: Optional[Dict[str, Any]] = None
    warmup_steps: int = 0
    max_batches_per_epoch: Optional[int] = None


@dataclass
class ModelSpec:
    name: str
    builder: Callable[[Tuple[int, ...], int, Dict[str, Any]], nn.Module]
    train_config: TrainConfig
    task_type: Literal["regression", "classification", "multitask"]
    input_kind: Literal["tabular", "sequence"]
    group: str = "baseline"
    extra: Dict[str, Any] = field(default_factory=dict)
    param_target: Optional[int] = None
    notes: str = ""


@dataclass
class DatasetBundle:
    name: str
    task_type: Literal["regression", "classification", "multitask"]
    input_kind: Literal["tabular", "sequence"]
    feature_names: List[str]
    target_names: List[str]
    train: Dict[str, np.ndarray]
    val: Dict[str, np.ndarray]
    test: Dict[str, np.ndarray]
    metadata: Dict[str, Any] = field(default_factory=dict)

    def summary(self) -> Dict[str, Any]:
        info = {
            "name": self.name,
            "task_type": self.task_type,
            "input_kind": self.input_kind,
            "n_train": len(self.train["X"]),
            "n_val": len(self.val["X"]),
            "n_test": len(self.test["X"]),
            "input_shape": tuple(self.train["X"].shape[1:]),
            "target_shape": tuple(self.train["y"].shape[1:]) if self.train["y"].ndim > 1 else (),
        }
        info.update({f"meta_{k}": v for k, v in self.metadata.items() if isinstance(v, (int, float, str))})
        return info


@dataclass
class ExperimentResult:
    dataset: str
    task: str
    model: str
    group: str
    split: str
    metrics: Dict[str, float]
    params: int
    train_wall_seconds: float
    notes: str = ""


class ResultLogger:
    def __init__(self) -> None:
        self._rows: List[ExperimentResult] = []

    def append(self, row: ExperimentResult) -> None:
        self._rows.append(row)

    def to_frame(self) -> pd.DataFrame:
        records = []
        for row in self._rows:
            rec = {
                "dataset": row.dataset,
                "task": row.task,
                "model": row.model,
                "group": row.group,
                "split": row.split,
                "params": row.params,
                "train_wall_seconds": row.train_wall_seconds,
                "notes": row.notes,
            }
            rec.update(row.metrics)
            records.append(rec)
        return pd.DataFrame(records)


RESULT_LOGGER = ResultLogger()


In [53]:
def load_jena_climate(data_root: Path) -> pd.DataFrame:
    path = data_root / "Jena Climate 2009-2016" / "jena_climate_2009_2016.csv"
    if not path.exists():
        raise FileNotFoundError(f"Jena climate CSV not found at {path}")
    df = pd.read_csv(path)
    df["datetime"] = pd.to_datetime(df["Date Time"], dayfirst=True)
    df = df.drop(columns=["Date Time"])
    numeric_cols = [col for col in df.columns if col != "datetime"]
    df[numeric_cols] = df[numeric_cols].astype(np.float32)
    df = df.sort_values("datetime").reset_index(drop=True)
    return df


def prepare_jena_bundle(
    df: pd.DataFrame,
    target: str = "T (degC)",
    context_steps: int = 72,
    horizon_steps: int = 36,
    resample_factor: int = 1,
) -> DatasetBundle:
    df = df.copy()
    if resample_factor > 1:
        df = df.iloc[::resample_factor].reset_index(drop=True)
    df = add_calendar_features(df, "datetime")
    feature_cols = [c for c in df.columns if c not in ("datetime", target)]
    df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")

    values = df[feature_cols].to_numpy(dtype=np.float32)
    target_values = df[target].to_numpy(dtype=np.float32)
    timestamps = df["datetime"].to_numpy()

    windows = []
    targets = []
    ts_list = []
    for idx in range(context_steps, len(df) - horizon_steps):
        window = values[idx - context_steps : idx]
        target_value = target_values[idx + horizon_steps]
        windows.append(window)
        targets.append(target_value)
        ts_list.append(timestamps[idx])
    X = np.stack(windows)
    y = np.asarray(targets, dtype=np.float32)[:, None]
    ts = np.asarray(ts_list)

    df_windows = pd.DataFrame({"datetime": ts})
    train_df, val_df, test_df = train_val_test_split_by_time(
        df_windows, "datetime", "2015-01-01", "2016-01-01"
    )
    train_idx = train_df.index.to_numpy()
    val_idx = val_df.index.to_numpy()
    test_idx = test_df.index.to_numpy()

    target_slug = (
        target.lower()
        .replace(" ", "")
        .replace("(", "")
        .replace(")", "")
        .replace("/", "")
    )
    bundle_name = f"Jena_{target_slug}_{context_steps}ctx_{horizon_steps}h"

    bundle = DatasetBundle(
        name=bundle_name,
        task_type="regression",
        input_kind="sequence",
        feature_names=feature_cols,
        target_names=[target],
        train={"X": X[train_idx], "y": y[train_idx]},
        val={"X": X[val_idx], "y": y[val_idx]},
        test={"X": X[test_idx], "y": y[test_idx]},
        metadata={
            "context_steps": context_steps,
            "horizon_steps": horizon_steps,
            "resample_factor": resample_factor,
        },
    )
    return bundle


In [54]:
def load_har_engineered(data_root: Path):
    base = data_root / "Human Activity Recognition" / "UCI HAR Dataset"
    X_train = pd.read_csv(base / "train" / "X_train.txt", sep='\s+', header=None)
    y_train = pd.read_csv(base / "train" / "y_train.txt", header=None).squeeze("columns")
    subject_train = pd.read_csv(base / "train" / "subject_train.txt", header=None).squeeze("columns")

    X_test = pd.read_csv(base / "test" / "X_test.txt", sep='\s+', header=None)
    y_test = pd.read_csv(base / "test" / "y_test.txt", header=None).squeeze("columns")
    subject_test = pd.read_csv(base / "test" / "subject_test.txt", header=None).squeeze("columns")

    y_train = y_train.values.astype(int) - 1
    y_test = y_test.values.astype(int) - 1

    features = (base / "features.txt").read_text().strip().splitlines()
    feature_names = [line.split()[1] for line in features]

    X_train.columns = feature_names
    X_test.columns = feature_names

    train_df = X_train.copy()
    test_df = X_test.copy()
    train_df["label"] = y_train
    train_df["subject"] = subject_train.values
    test_df["label"] = y_test
    test_df["subject"] = subject_test.values

    return train_df, test_df, feature_names


def prepare_har_engineered_bundle(
    train_df: pd.DataFrame,
    test_df: pd.DataFrame,
    feature_names: List[str],
    val_fraction: float = 0.15,
) -> DatasetBundle:
    from sklearn.model_selection import StratifiedShuffleSplit

    X = train_df[feature_names].to_numpy(dtype=np.float32)
    y = train_df["label"].to_numpy(dtype=np.int64)
    splitter = StratifiedShuffleSplit(n_splits=1, test_size=val_fraction, random_state=GLOBAL_CONFIG["seed"])
    train_idx, val_idx = next(splitter.split(X, y))

    X_train = X[train_idx]
    y_train = y[train_idx][:, None]
    X_val = X[val_idx]
    y_val = y[val_idx][:, None]

    X_test = test_df[feature_names].to_numpy(dtype=np.float32)
    y_test = test_df["label"].to_numpy(dtype=np.int64)[:, None]

    bundle = DatasetBundle(
        name="HAR_engineered",
        task_type="classification",
        input_kind="tabular",
        feature_names=feature_names,
        target_names=["activity"],
        train={"X": X_train, "y": y_train},
        val={"X": X_val, "y": y_val},
        test={"X": X_test, "y": y_test},
        metadata={
            "n_classes": 6,
            "label_mapping": {
                0: "WALKING",
                1: "WALKING_UPSTAIRS",
                2: "WALKING_DOWNSTAIRS",
                3: "SITTING",
                4: "STANDING",
                5: "LAYING",
            },
        },
    )
    return bundle


def load_har_raw_sequences(data_root: Path):
    base = data_root / "Human Activity Recognition" / "UCI HAR Dataset"
    axes = [
        "body_acc_x",
        "body_acc_y",
        "body_acc_z",
        "body_gyro_x",
        "body_gyro_y",
        "body_gyro_z",
        "total_acc_x",
        "total_acc_y",
        "total_acc_z",
    ]

    def load_split(split: str):
        signals = []
        for axis in axes:
            path = base / split / "Inertial Signals" / f"{axis}_{split}.txt"
            arr = np.loadtxt(path)
            signals.append(arr[:, :, None])
        X = np.concatenate(signals, axis=2).astype(np.float32)
        y = np.loadtxt(base / split / f"y_{split}.txt").astype(int) - 1
        return X, y

    X_train, y_train = load_split("train")
    X_test, y_test = load_split("test")
    return X_train, y_train, X_test, y_test, axes


def prepare_har_raw_bundle(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    val_fraction: float = 0.15,
) -> DatasetBundle:
    from sklearn.model_selection import StratifiedShuffleSplit

    splitter = StratifiedShuffleSplit(n_splits=1, test_size=val_fraction, random_state=GLOBAL_CONFIG["seed"])
    train_idx, val_idx = next(splitter.split(X_train, y_train))
    bundle = DatasetBundle(
        name="HAR_raw_sequence",
        task_type="classification",
        input_kind="sequence",
        feature_names=[f"axis_{i}" for i in range(X_train.shape[2])],
        target_names=["activity"],
        train={"X": X_train[train_idx], "y": y_train[train_idx][:, None]},
        val={"X": X_train[val_idx], "y": y_train[val_idx][:, None]},
        test={"X": X_test, "y": y_test[:, None]},
        metadata={
            "sequence_length": X_train.shape[1],
            "n_channels": X_train.shape[2],
            "n_classes": 6,
        },
    )
    return bundle


  X_train = pd.read_csv(base / "train" / "X_train.txt", sep='\s+', header=None)
  X_test = pd.read_csv(base / "test" / "X_test.txt", sep='\s+', header=None)


In [55]:
def load_rossmann_frames(data_root: Path):
    base = data_root / "Kaggle Rossmann Store Sales" / "rossmann-store-sales"
    train_path = base / "train.csv"
    test_path = base / "test.csv"
    store_path = base / "store.csv"
    train = pd.read_csv(train_path, parse_dates=["Date"])
    test = pd.read_csv(test_path, parse_dates=["Date"])
    store = pd.read_csv(store_path)
    return train, test, store


def is_promo2_active(row: pd.Series) -> int:
    if not row.get("Promo2", 0):
        return 0
    month = row["Date"].month
    if isinstance(row["PromoInterval"], str) and row["PromoInterval"]:
        month_map = {name: idx for idx, name in enumerate(["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"], start=1)}
        promo_months = [month_map.get(m.strip(), 0) for m in row["PromoInterval"].split(",")]
        return int(month in promo_months)
    return 0


def preprocess_rossmann(train: pd.DataFrame, store: pd.DataFrame) -> Tuple[pd.DataFrame, List[str], str]:
    df = train.merge(store, on="Store", how="left")
    df = df[df["Open"] != 0].copy()

    median_distance = df["CompetitionDistance"].median()
    df["CompetitionDistance"] = df["CompetitionDistance"].fillna(median_distance)
    df["CompetitionOpenSinceYear"] = df["CompetitionOpenSinceYear"].fillna(df["CompetitionOpenSinceYear"].median())
    df["CompetitionOpenSinceMonth"] = df["CompetitionOpenSinceMonth"].fillna(df["CompetitionOpenSinceMonth"].median())
    df["Promo2SinceWeek"] = df["Promo2SinceWeek"].fillna(0)
    df["Promo2SinceYear"] = df["Promo2SinceYear"].fillna(0)
    df["PromoInterval"] = df["PromoInterval"].fillna("")

    df["Date"] = pd.to_datetime(df["Date"])
    df["Year"] = df["Date"].dt.year
    df["Month"] = df["Date"].dt.month
    df["Day"] = df["Date"].dt.day
    df["WeekOfYear"] = df["Date"].dt.isocalendar().week.astype(int)
    df["DayOfWeek"] = df["Date"].dt.dayofweek

    df["IsPromo2Month"] = df.apply(is_promo2_active, axis=1)

    state_holiday_map = {"0": "None", "a": "PublicHoliday", "b": "EasterHoliday", "c": "Christmas"}
    df["StateHoliday"] = df["StateHoliday"].replace(state_holiday_map)

    categorical_cols = ["StoreType", "Assortment", "StateHoliday", "PromoInterval"]
    df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

    df["CustomersLag7"] = df.groupby("Store")["Customers"].shift(7)
    df["SalesLag7"] = df.groupby("Store")["Sales"].shift(7)
    df["SalesMA14"] = df.groupby("Store")["Sales"].transform(lambda s: s.rolling(14, min_periods=1).mean())
    df["PromoMovingAvg"] = df.groupby("Store")["Promo"].transform(lambda s: s.rolling(30, min_periods=1).mean())

    df = df.dropna().reset_index(drop=True)

    feature_cols = [c for c in df.columns if c not in ("Sales", "Date")]
    target_col = "Sales"
    return df, feature_cols, target_col


def prepare_rossmann_bundle(df: pd.DataFrame, feature_cols: List[str], target_col: str) -> DatasetBundle:
    df = df.sort_values("Date").reset_index(drop=True)
    unique_dates = np.sort(df["Date"].unique())
    if unique_dates.size < 3:
        raise ValueError("Rossmann dataset requires at least three distinct dates to form train/val/test splits.")

    train_cut_idx = max(1, int(0.8 * unique_dates.size))
    val_cut_idx = max(train_cut_idx + 1, int(0.9 * unique_dates.size))
    if val_cut_idx >= unique_dates.size:
        val_cut_idx = unique_dates.size - 1
    train_end = unique_dates[train_cut_idx]
    val_end = unique_dates[val_cut_idx]

    train_mask = df["Date"] < train_end
    val_mask = (df["Date"] >= train_end) & (df["Date"] < val_end)
    test_mask = df["Date"] >= val_end

    train_df = df[train_mask].copy()
    val_df = df[val_mask].copy()
    test_df = df[test_mask].copy()

    if train_df.empty or val_df.empty or test_df.empty:
        raise ValueError("Rossmann split produced an empty partition; adjust quantiles or check input data.")

    X_train = train_df[feature_cols].to_numpy(dtype=np.float32)
    y_train = train_df[target_col].to_numpy(dtype=np.float32)[:, None]
    X_val = val_df[feature_cols].to_numpy(dtype=np.float32)
    y_val = val_df[target_col].to_numpy(dtype=np.float32)[:, None]
    X_test = test_df[feature_cols].to_numpy(dtype=np.float32)
    y_test = test_df[target_col].to_numpy(dtype=np.float32)[:, None]

    feature_mean = X_train.mean(axis=0, keepdims=True)
    feature_std = X_train.std(axis=0, keepdims=True)
    feature_std = np.where(feature_std < 1e-6, 1.0, feature_std)

    target_mean = y_train.mean(axis=0, keepdims=True)
    target_std = y_train.std(axis=0, keepdims=True)
    target_std = np.where(target_std < 1e-6, 1.0, target_std)

    def _normalize(arr: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
        return ((arr - mean) / std).astype(np.float32)

    X_train = _normalize(X_train, feature_mean, feature_std)
    X_val = _normalize(X_val, feature_mean, feature_std)
    X_test = _normalize(X_test, feature_mean, feature_std)

    y_train_norm = _normalize(y_train, target_mean, target_std)
    y_val_norm = _normalize(y_val, target_mean, target_std)
    y_test_norm = _normalize(y_test, target_mean, target_std)

    bundle = DatasetBundle(
        name="Rossmann_sales",
        task_type="regression",
        input_kind="tabular",
        feature_names=feature_cols,
        target_names=[target_col],
        train={"X": X_train, "y": y_train_norm},
        val={"X": X_val, "y": y_val_norm},
        test={"X": X_test, "y": y_test_norm},
        metadata={
            "train_range": [str(train_df["Date"].min()), str(train_df["Date"].max())],
            "val_range": [str(val_df["Date"].min()), str(val_df["Date"].max())],
            "test_range": [str(test_df["Date"].min()), str(test_df["Date"].max())],
            "feature_scaler": {
                "mean": feature_mean.flatten().astype(np.float32).tolist(),
                "std": feature_std.flatten().astype(np.float32).tolist(),
            },
            "target_scaler": {
                "mean": target_mean.flatten().astype(np.float32).tolist(),
                "std": target_std.flatten().astype(np.float32).tolist(),
            },
        },
    )
    return bundle

In [56]:
def load_beijing_stations(data_root: Path) -> Dict[str, pd.DataFrame]:
    base = data_root / "Beijing Air Quality"
    if not base.exists():
        raise FileNotFoundError(f"Beijing Air Quality directory not found at {base}")
    stations: Dict[str, pd.DataFrame] = {}
    for csv_path in base.glob("PRSA_Data_*.csv"):
        station_name = csv_path.stem.replace("PRSA_Data_", "")
        print(f"Loading Beijing station {station_name}...")
        df = pd.read_csv(csv_path)
        df["datetime"] = pd.to_datetime(
            df[["year", "month", "day", "hour"]].rename(columns=str)
        )
        df = df.sort_values("datetime").reset_index(drop=True)
        if "No" in df.columns:
            df = df.drop(columns=["No"])
        stations[station_name] = df
    return stations


def preprocess_beijing_station(df: pd.DataFrame, target_col: str = "PM2.5") -> Tuple[pd.DataFrame, pd.DataFrame]:
    df = df.copy()
    pollutant_cols = ["PM2.5", "PM10", "SO2", "NO2", "CO", "O3"]
    meteorology_cols = ["PRES", "DEWP", "TEMP", "RAIN", "WSPM"]
    for col in pollutant_cols + meteorology_cols:
        if col in df.columns:
            df[col] = pd.to_numeric(df[col], errors="coerce")
    mask = df[pollutant_cols + meteorology_cols].isna()
    df[pollutant_cols + meteorology_cols] = df[pollutant_cols + meteorology_cols].interpolate(limit=6, limit_direction="both")
    df[pollutant_cols + meteorology_cols] = df[pollutant_cols + meteorology_cols].ffill().bfill()

    calendar = pd.DataFrame(
        {
            "hour": df["datetime"].dt.hour,
            "dow": df["datetime"].dt.dayofweek,
            "month": df["datetime"].dt.month,
        }
    )
    calendar["hour_sin"] = np.sin(2 * np.pi * calendar["hour"] / 24.0)
    calendar["hour_cos"] = np.cos(2 * np.pi * calendar["hour"] / 24.0)
    calendar["dow_sin"] = np.sin(2 * np.pi * calendar["dow"] / 7.0)
    calendar["dow_cos"] = np.cos(2 * np.pi * calendar["dow"] / 7.0)
    calendar["month_sin"] = np.sin(2 * np.pi * calendar["month"] / 12.0)
    calendar["month_cos"] = np.cos(2 * np.pi * calendar["month"] / 12.0)

    feature_frame = pd.concat(
        [df[["datetime", target_col]], df[pollutant_cols + meteorology_cols], calendar],
        axis=1,
    )
    mask_frame = mask.astype(np.float32)
    mask_frame.columns = [f"{col}_mask" for col in mask_frame.columns]
    feature_frame = pd.concat([feature_frame, mask_frame], axis=1)
    return feature_frame, mask_frame


def build_temporal_windows(frame, target_col, feature_cols, context, horizon, drop_na=True):
    """
    Returns (windows, targets, indices) where:
      - windows: list/array of shape (n_windows, context, n_features)
      - targets: list/array of target values aligned at idx+horizon
      - indices: original indices of the window end (optional)
    This robustly handles scalar/array targets and missing values.
    """
    values = frame[feature_cols].to_numpy(dtype=np.float32)
    targets = frame[target_col].to_numpy(dtype=np.float32)
    windows = []
    target_list = []
    idxs = []

    n = len(values)
    for idx in range(context, n - horizon):
        window = values[idx - context : idx]
        target = targets[idx + horizon]

        if drop_na:
            # Use pd.isna then np.any so this works if `target` is scalar or array-like
            if np.any(pd.isna(window)) or np.any(pd.isna(target)):
                continue

        windows.append(window)
        target_list.append(target)
        idxs.append(idx)

    X = np.stack(windows).astype(np.float32) if windows else np.empty((0, context, values.shape[1]), dtype=np.float32)
    y = np.array(target_list, dtype=np.float32)
    return X, y, np.array(idxs)


def assemble_beijing_cross_station_bundle(
    stations: Dict[str, pd.DataFrame],
    train_stations: List[str],
    val_station: str,
    test_station: str,
    target: str = "PM2.5",
    context: int = 24,
    horizon: int = 6,
) -> DatasetBundle:
    feature_frames: Dict[str, pd.DataFrame] = {}
    feature_cols: Optional[List[str]] = None
    for name, df in stations.items():
        features, _ = preprocess_beijing_station(df, target_col=target)
        feature_frames[name] = features
        if feature_cols is None:
            feature_cols = [col for col in features.columns if col not in ("datetime", target)]
    assert feature_cols is not None

    def collect(names: List[str]) -> Tuple[np.ndarray, np.ndarray]:
        arrays = []
        targets = []
        for station_name in names:
            frame = feature_frames[station_name]
            X, y, _ = build_temporal_windows(frame, target, feature_cols, context, horizon)
            arrays.append(X)
            targets.append(y)
        if arrays:
            X_all = np.concatenate(arrays, axis=0).astype(np.float32)
            y_all = np.concatenate(targets, axis=0)
            if y_all.ndim == 1:
                y_all = y_all[:, None]
            else:
                y_all = y_all.reshape(y_all.shape[0], -1)
            y_all = y_all.astype(np.float32)
        else:
            X_all = np.empty((0, context, len(feature_cols)), dtype=np.float32)
            y_all = np.empty((0, 1), dtype=np.float32)
        return X_all, y_all

    X_train, y_train = collect(train_stations)
    X_val, y_val = collect([val_station])
    X_test, y_test = collect([test_station])

    bundle = DatasetBundle(
        name=f"Beijing_PM25_{context}h_ctx_{horizon}h_horizon",
        task_type="regression",
        input_kind="sequence",
        feature_names=feature_cols,
        target_names=[target],
        train={"X": X_train, "y": y_train},
        val={"X": X_val, "y": y_val},
        test={"X": X_test, "y": y_test},
        metadata={
            "context_hours": context,
            "horizon_hours": horizon,
            "train_stations": train_stations,
            "val_station": val_station,
            "test_station": test_station,
        },
    )
    return bundle


In [57]:
SEED = int(os.getenv("PSANN_GLOBAL_SEED", "2025"))
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {DEVICE}")
if DEVICE.type == "cuda":
    print(f"CUDA device name: {torch.cuda.get_device_name(0)}")

GLOBAL_CONFIG: Dict[str, Any] = {
    "seed": SEED,
    "device": DEVICE,
    "default_epochs": 100,
    "default_lr": 1e-3,
    "default_weight_decay": 0.0,
    "default_batch_size": 256,
    "max_time_minutes": 5.0,
    "num_workers": 2 if DEVICE.type == "cuda" else 0,
    "label_smoothing": 0.05,
    "results_root": RESULTS_ROOT,
    "figure_root": FIGURE_ROOT,
}


Using device: cuda
CUDA device name: NVIDIA L4


In [58]:
from sklearn.metrics import accuracy_score, f1_score, log_loss


def rmse(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return float(np.sqrt(np.mean((y_true - y_pred) ** 2)))


def mae(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    return float(np.mean(np.abs(y_true - y_pred)))


def smape(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    denom = (np.abs(y_true) + np.abs(y_pred) + 1e-8) / 2.0
    return float(np.mean(np.abs(y_true - y_pred) / denom))


def r2_score_np(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    ss_res = np.sum((y_true - y_pred) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    return float(1 - ss_res / ss_tot) if ss_tot != 0 else float('nan')


def mase(y_true: np.ndarray, y_pred: np.ndarray, seasonal_period: int = 1) -> float:
    if len(y_true) <= seasonal_period:
        return float('nan')
    naive = np.mean(np.abs(np.diff(y_true, n=seasonal_period)))
    return float(np.mean(np.abs(y_true - y_pred)) / (naive + 1e-8))


def expected_calibration_error(probs: np.ndarray, y_true: np.ndarray, n_bins: int = 15) -> float:
    confidences = probs.max(axis=1)
    predictions = probs.argmax(axis=1)
    bin_edges = np.linspace(0.0, 1.0, n_bins + 1)
    ece = 0.0
    for i in range(n_bins):
        mask = (confidences >= bin_edges[i]) & (confidences < bin_edges[i + 1])
        if not np.any(mask):
            continue
        bin_acc = np.mean(predictions[mask] == y_true[mask])
        bin_conf = np.mean(confidences[mask])
        ece += np.abs(bin_acc - bin_conf) * np.mean(mask)
    return float(ece)


def classification_metrics(y_true: np.ndarray, logits: np.ndarray, average: str = 'macro') -> Dict[str, float]:
    probs = torch.softmax(torch.from_numpy(logits), dim=-1).numpy()
    preds = probs.argmax(axis=1)
    metrics = {
        'accuracy': float(accuracy_score(y_true, preds)),
        'f1_macro': float(f1_score(y_true, preds, average=average)),
        'nll': float(log_loss(y_true, probs, labels=list(range(probs.shape[1])))),
    }
    metrics['ece'] = expected_calibration_error(probs, y_true, n_bins=15)
    return metrics


def _prepare_regression_arrays(y_true: np.ndarray, y_pred: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)
    if y_true.ndim > 2:
        y_true = y_true.reshape(y_true.shape[0], -1)
    if y_pred.ndim > 2:
        y_pred = y_pred.reshape(y_pred.shape[0], -1)
    if y_true.ndim == 1:
        y_true = y_true[:, None]
    if y_pred.ndim == 1:
        y_pred = y_pred[:, None]
    if y_true.shape != y_pred.shape:
        raise ValueError(f'Regression metric shape mismatch: {y_true.shape} vs {y_pred.shape}')
    return y_true.astype(np.float64), y_pred.astype(np.float64)


def regression_metrics(y_true: np.ndarray, y_pred: np.ndarray, seasonal_period: int = 1) -> Dict[str, float]:
    y_true, y_pred = _prepare_regression_arrays(y_true, y_pred)
    rmse_vals = []
    mae_vals = []
    smape_vals = []
    r2_vals = []
    mase_vals = []
    for i in range(y_true.shape[1]):
        yt = y_true[:, i]
        yp = y_pred[:, i]
        rmse_vals.append(rmse(yt, yp))
        mae_vals.append(mae(yt, yp))
        smape_vals.append(smape(yt, yp))
        r2_vals.append(r2_score_np(yt, yp))
        mase_vals.append(mase(yt, yp, seasonal_period=seasonal_period))
    return {
        'rmse': float(np.mean(rmse_vals)),
        'mae': float(np.mean(mae_vals)),
        'smape': float(np.mean(smape_vals)),
        'r2': float(np.mean(r2_vals)),
        'mase': float(np.mean(mase_vals)),
    }


In [59]:
def build_dataloader(
    X: np.ndarray,
    y: np.ndarray,
    batch_size: int,
    shuffle: bool,
    task_type: Literal["regression", "classification", "multitask"] = "regression",
    drop_last: bool = False,
) -> DataLoader:
    X_tensor = torch.from_numpy(X).float()
    if task_type == "classification":
        y_tensor = torch.from_numpy(y.squeeze()).long()
    else:
        y_tensor = torch.from_numpy(y.astype(np.float32))
    dataset = TensorDataset(X_tensor, y_tensor)
    loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=GLOBAL_CONFIG["num_workers"],
        pin_memory=(DEVICE.type == "cuda"),
    )
    return loader


class Timer:
    def __enter__(self):
        self.start = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.end = time.perf_counter()

    @property
    def elapsed(self) -> float:
        return getattr(self, "end", time.perf_counter()) - getattr(self, "start", time.perf_counter())


In [60]:
def coerce_decimal(series: pd.Series) -> pd.Series:
    if pd.api.types.is_numeric_dtype(series):
        return series
    as_str = series.astype(str).str.replace(" ", "")
    as_str = as_str.replace({"nan": np.nan, "None": np.nan})
    as_str = as_str.str.replace(",", ".", regex=False)
    return pd.to_numeric(as_str, errors="coerce")


def coerce_datetime(series: pd.Series) -> pd.Series:
    as_str = series.astype(str).str.strip()
    as_str = as_str.replace({"nan": np.nan, "NaT": np.nan})
    as_str = as_str.str.replace(",", ".", n=1, regex=False)
    return pd.to_datetime(as_str, errors="coerce")


def ensure_float(df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame:
    for col in columns:
        if col in df.columns:
            df[col] = coerce_decimal(df[col])
    return df


def ensure_datetime(df: pd.DataFrame, columns: Iterable[str]) -> pd.DataFrame:
    for col in columns:
        if col in df.columns:
            df[col] = coerce_datetime(df[col])
    return df


def add_calendar_features(frame: pd.DataFrame, timestamp_col: str) -> pd.DataFrame:
    ts = pd.to_datetime(frame[timestamp_col])
    frame[f"{timestamp_col}_year"] = ts.dt.year
    frame[f"{timestamp_col}_month"] = ts.dt.month
    frame[f"{timestamp_col}_day"] = ts.dt.day
    frame[f"{timestamp_col}_hour"] = ts.dt.hour
    frame[f"{timestamp_col}_dow"] = ts.dt.dayofweek
    frame[f"{timestamp_col}_week"] = ts.dt.isocalendar().week.astype(int)
    frame[f"{timestamp_col}_dayofyear"] = ts.dt.dayofyear
    frame[f"{timestamp_col}_sin_hour"] = np.sin(2 * np.pi * frame[f"{timestamp_col}_hour"] / 24.0)
    frame[f"{timestamp_col}_cos_hour"] = np.cos(2 * np.pi * frame[f"{timestamp_col}_hour"] / 24.0)
    frame[f"{timestamp_col}_sin_dayofyear"] = np.sin(2 * np.pi * frame[f"{timestamp_col}_dayofyear"] / 365.25)
    frame[f"{timestamp_col}_cos_dayofyear"] = np.cos(2 * np.pi * frame[f"{timestamp_col}_dayofyear"] / 365.25)
    return frame


def train_val_test_split_by_time(df: pd.DataFrame, time_col: str, train_end: str, val_end: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    ts = pd.to_datetime(df[time_col])
    train_mask = ts < pd.to_datetime(train_end)
    val_mask = (ts >= pd.to_datetime(train_end)) & (ts < pd.to_datetime(val_end))
    test_mask = ts >= pd.to_datetime(val_end)
    return df[train_mask].copy(), df[val_mask].copy(), df[test_mask].copy()


In [61]:
EAF_TABLES = [
    "eaf_temp",
    "eaf_gaslance_mat",
    "inj_mat",
    "eaf_transformer",
    "eaf_added_materials",
    "basket_charged",
    "lf_added_materials",
    "lf_initial_chemical_measurements",
    "eaf_final_chemical_measurements",
    "ladle_tapping",
]


def parse_duration_minutes(value: Any) -> Optional[float]:
    if pd.isna(value):
        return np.nan
    s = str(value).strip()
    if not s:
        return np.nan
    s = s.replace(" ", "")
    if ":" not in s:
        return coerce_decimal(pd.Series([s])).iloc[0]
    parts = s.split(":")
    try:
        hours = float(parts[0])
        minutes = float(parts[1])
        return hours * 60.0 + minutes
    except Exception:
        return np.nan


In [62]:
def load_eaf_tables(data_root: Path) -> Dict[str, pd.DataFrame]:
    candidate_dirs = [
        data_root / "Industrial Data from the Electric Arc Furnace",
        data_root / "Industrial_Data_from_the_Electric_Arc_Furnace",
    ]
    base = next((path for path in candidate_dirs if path.exists()), None)

    table_paths: Dict[str, Path] = {}
    if base is not None:
        table_paths = {name: base / f"{name}.csv" for name in EAF_TABLES}
    else:
        print(f"[WARN] Expected EAF directory missing under {data_root}. Falling back to glob search.")
        for name in EAF_TABLES:
            path = next(
                (
                    candidate
                    for candidate in [
                        data_root / f"{name}.csv",
                        data_root / f"{name.upper()}.csv",
                    ]
                    if candidate.exists()
                ),
                None,
            )
            if path is None:
                matches = list(data_root.rglob(f"{name}.csv"))
                if matches:
                    path = matches[0]
            table_paths[name] = path

    missing = [name for name, path in table_paths.items() if path is None or not path.exists()]
    if missing:
        raise FileNotFoundError(
            "Unable to locate EAF tables: " + ", ".join(missing) + f". Ensure the CSV files are present under {data_root}."
        )

    tables: Dict[str, pd.DataFrame] = {}
    for name, path in table_paths.items():
        if path is None:
            continue
        print(f"Loading {name} from {path}...")
        if name in {"eaf_gaslance_mat", "inj_mat"}:
            df = pd.read_csv(path, dtype=str)
            df = ensure_datetime(df, ["REVTIME"])
            if "DATETIME" not in df.columns and "REVTIME" in df.columns:
                df["DATETIME"] = df["REVTIME"]
            numeric_cols = [c for c in df.columns if c not in ("REVTIME", "HEATID", "DATETIME")]
            df = ensure_float(df, numeric_cols)
        elif name == "eaf_temp":
            df = pd.read_csv(path)
            df = ensure_datetime(df, ["DATETIME"])
            numeric_cols = [c for c in df.columns if c not in ("HEATID", "DATETIME")]
            df = ensure_float(df, numeric_cols)
        elif name == "eaf_transformer":
            df = pd.read_csv(path, dtype=str)
            df = ensure_datetime(df, ["STARTTIME"])
            if "DATETIME" not in df.columns and "STARTTIME" in df.columns:
                df["DATETIME"] = df["STARTTIME"]
            df["DURATION_MIN"] = df["DURATION"].astype(str).str.replace(" ", "")
            df["DURATION_MIN"] = df["DURATION_MIN"].apply(parse_duration_minutes)
            df = ensure_float(df, ["DURATION_MIN", "MW"])
        else:
            df = pd.read_csv(path, dtype=str)
            datetime_cols = [c for c in df.columns if "DATE" in c.upper() or "TIME" in c.upper()]
            if datetime_cols:
                df = ensure_datetime(df, datetime_cols)
                if "DATETIME" not in df.columns:
                    df["DATETIME"] = df[datetime_cols[0]]
            numeric_cols = [
                c
                for c in df.columns
                if c not in datetime_cols and c not in ("HEATID", "RECID", "POSITIONROW", "DATETIME")
            ]
            df = ensure_float(df, numeric_cols)
        tables[name] = df
    return tables


def compute_heatwise_aggregates(df: pd.DataFrame, heat_col: str, aggregations: Dict[str, List[str]]) -> pd.DataFrame:
    grouped = df.groupby(heat_col).agg(aggregations)
    grouped.columns = [f"{col}_{agg}" for col, agg in grouped.columns]
    grouped = grouped.reset_index()
    return grouped

# Fixed vectorized merge_asof_multikey (searchsorted-based, dtype-safe assignments)
import numpy as np
import pandas as pd
from typing import Optional

def merge_asof_multikey(
    left: pd.DataFrame,
    right: pd.DataFrame,
    *,
    on: str,
    by: str,
    suffix: str = "rhs",
    tolerance: Optional[pd.Timedelta] = None,
    direction: str = "backward",
    verbose: bool = False,
) -> pd.DataFrame:
    """
    Vectorized as-of merge on (by, on) using numpy.searchsorted on structured keys.
    Supports direction='backward' (most common for telemetry alignment).
    Returns left with right's non-key columns suffixed by _{suffix}.
    """
    if right is None or len(right) == 0:
        return left.copy()
    if direction != "backward":
        raise NotImplementedError("Only 'backward' direction supported in this implementation")

    # Basic checks
    if on not in left.columns or on not in right.columns or by not in left.columns or by not in right.columns:
        raise KeyError(f"Both frames must contain columns '{by}' and '{on}'")

    # Work on copies
    L = left.copy()
    R = right.copy()

    # Mask and filter rows missing keys (we'll reattach them at the end)
    mask_L_valid = L[on].notna() & L[by].notna()
    mask_R_valid = R[on].notna() & R[by].notna()
    L_valid = L.loc[mask_L_valid].copy()
    R_valid = R.loc[mask_R_valid].copy()

    # Coerce datetimes
    L_valid[on] = pd.to_datetime(L_valid[on], errors="coerce")
    R_valid[on] = pd.to_datetime(R_valid[on], errors="coerce")
    L_valid = L_valid[L_valid[on].notna()]
    R_valid = R_valid[R_valid[on].notna()]

    # If no valid rows remain on left, return original left with NaNs for merge cols
    if L_valid.empty:
        merged = L.copy()
        merge_cols = [c for c in R.columns if c not in (by, on)]
        for c in merge_cols:
            merged[f"{c}_{suffix}"] = np.nan
        return merged

    # Dedupe right on (by, on) keeping last measurement (reduces search space)
    R_valid = R_valid.sort_values([by, on], kind="mergesort").drop_duplicates(subset=[by, on], keep="last")

    # Factorize right groups to compact integer ids
    right_labels = pd.unique(R_valid[by].astype(object))  # preserve order
    group_to_id = {val: i for i, val in enumerate(right_labels)}
    # Map right group ids
    right_group_ids = np.array([group_to_id[v] for v in R_valid[by].astype(object)], dtype=np.int32)

    # Map left group ids; groups not in right get -1
    left_group_values = L_valid[by].astype(object).values
    left_group_ids = np.array([group_to_id.get(v, -1) for v in left_group_values], dtype=np.int32)

    # Convert times to int64 ns
    left_times_ns = L_valid[on].values.astype("datetime64[ns]").astype("int64")
    right_times_ns = R_valid[on].values.astype("datetime64[ns]").astype("int64")

    # Build structured keys and sort by (group,time)
    key_dtype = np.dtype([("g", np.int32), ("t", np.int64)])
    right_keys = np.empty(len(right_group_ids), dtype=key_dtype)
    right_keys["g"] = right_group_ids
    right_keys["t"] = right_times_ns
    order = np.argsort(right_keys, order=("g", "t"))
    right_keys_sorted = right_keys[order]
    R_sorted = R_valid.reset_index(drop=True).iloc[order].reset_index(drop=True)
    right_times_sorted = right_keys_sorted["t"]

    # Left keys structured array (same dtype)
    left_keys = np.empty(len(left_group_ids), dtype=key_dtype)
    left_keys["g"] = left_group_ids
    left_keys["t"] = left_times_ns

    # Searchsorted to find previous (backward) right index for each left key
    idxs = np.searchsorted(right_keys_sorted, left_keys, side="right") - 1

    # Initialize keep_mask (default False)
    keep_mask = np.zeros(len(idxs), dtype=bool)

    # valid where idxs >= 0
    valid_mask = idxs >= 0
    if valid_mask.any():
        # Check matched group's id equals left group id (otherwise it's from a different group)
        matched_group_ids = right_keys_sorted["g"][idxs[valid_mask]]
        left_group_ids_valid = left_keys["g"][valid_mask]
        same_group = matched_group_ids == left_group_ids_valid
        # Set keep_mask True only where same_group is True
        keep_mask[np.flatnonzero(valid_mask)[same_group]] = True

    # Apply tolerance if provided (left_time - matched_right_time must be <= tol and >=0)
    if tolerance is not None and keep_mask.any():
        tol_ns = int(pd.to_timedelta(tolerance).to_timedelta64().astype("timedelta64[ns]") / np.timedelta64(1, "ns"))
        kept_positions = np.flatnonzero(keep_mask)
        matched_right_times = right_times_sorted[idxs[kept_positions]]
        left_times_for_kept = left_keys["t"][kept_positions]
        diffs = left_times_for_kept - matched_right_times
        tol_ok = (diffs >= 0) & (diffs <= tol_ns)
        # Zero out positions violating tolerance
        if not np.all(tol_ok):
            keep_mask[kept_positions[~tol_ok]] = False

    # Prepare result skeleton using proper dtypes (avoid assigning arrays of incompatible dtype)
    merge_cols = [c for c in R_sorted.columns if c not in (by, on)]
    result = L_valid.copy()
    for c in merge_cols:
        src_dtype = R_sorted[c].dtype
        try:
            result[f"{c}_{suffix}"] = pd.Series(index=result.index, dtype=src_dtype)
        except Exception:
            result[f"{c}_{suffix}"] = pd.Series(index=result.index, dtype="object")

    # Fill merged columns for kept matches
    kept_positions = np.flatnonzero(keep_mask)
    if kept_positions.size:
        matched_idxs = idxs[kept_positions]  # indices into R_sorted
        for col in merge_cols:
            vals = R_sorted.iloc[matched_idxs][col].values
            s = pd.Series(vals, index=result.index[kept_positions])
            # If target dtype is datetime, ensure series is datetime
            if np.issubdtype(result[f"{col}_{suffix}"].dtype, np.datetime64):
                s = pd.to_datetime(s)
            result.loc[result.index[kept_positions], f"{col}_{suffix}"] = s

    # Rows that were invalid (no match) remain NaN in merged cols

    # Reattach left rows that were dropped due to missing keys
    if mask_L_valid.sum() != len(L):
        dropped = L.loc[~mask_L_valid].copy()
        for c in merge_cols:
            dropped[f"{c}_{suffix}"] = np.nan
        combined = pd.concat([result, dropped]).loc[L.index]
    else:
        combined = result

    # Reindex to original left.index to preserve order
    combined = combined.reindex(left.index)
    return combined

In [63]:
def prepare_eaf_temp_and_o2_bundles(
    tables: Dict[str, pd.DataFrame],
    history_lags: List[int] = (1, 2, 3, 6),
    horizon: int = 1,
) -> Tuple[DatasetBundle, DatasetBundle]:
    temp = tables["eaf_temp"].copy()
    temp["DATETIME"] = pd.to_datetime(temp["DATETIME"])
    temp = temp.sort_values(["HEATID", "DATETIME"]).reset_index(drop=True)
    temp = temp.drop_duplicates(subset=["HEATID", "DATETIME"], keep="last")

    for lag in history_lags:
        temp[f"TEMP_lag_{lag}"] = temp.groupby("HEATID")["TEMP"].shift(lag)
        temp[f"VALO2_lag_{lag}"] = temp.groupby("HEATID")["VALO2_PPM"].shift(lag)

    temp["TEMP_target"] = temp.groupby("HEATID")["TEMP"].shift(-horizon)
    temp["VALO2_target"] = temp.groupby("HEATID")["VALO2_PPM"].shift(-horizon)

    temp["HEAT_START"] = temp.groupby("HEATID")["DATETIME"].transform("min")
    temp["minutes_from_heat_start"] = (temp["DATETIME"] - temp["HEAT_START"]).dt.total_seconds() / 60.0
    temp["sample_index"] = temp.groupby("HEATID").cumcount()
    temp["minutes_between_samples"] = temp.groupby("HEATID")["DATETIME"].diff().dt.total_seconds().fillna(0.0) / 60.0

    gas = tables.get("eaf_gaslance_mat")
    if gas is not None and not gas.empty:
        gas = gas.sort_values(["HEATID", "REVTIME"])
        for col in ["O2_AMOUNT", "GAS_AMOUNT", "O2_FLOW", "GAS_FLOW"]:
            if col in gas.columns:
                gas[f"{col}_cum"] = gas.groupby("HEATID")[col].cumsum()
        temp = merge_asof_multikey(
            temp,
            gas,
            on="DATETIME",
            by="HEATID",
            suffix="gas",
            tolerance=pd.Timedelta(minutes=30),
        )

    inj = tables.get("inj_mat")
    if inj is not None and not inj.empty:
        inj = inj.sort_values(["HEATID", "REVTIME"])
        for col in ["INJ_AMOUNT_CARBON", "INJ_FLOW_CARBON"]:
            if col in inj.columns:
                inj[f"{col}_cum"] = inj.groupby("HEATID")[col].cumsum()
        temp = merge_asof_multikey(
            temp,
            inj,
            on="DATETIME",
            by="HEATID",
            suffix="inj",
            tolerance=pd.Timedelta(minutes=30),
        )

    transformer = tables.get("eaf_transformer")
    if transformer is not None and not transformer.empty:
        transformer = transformer.sort_values(["HEATID", "STARTTIME"])
        temp = merge_asof_multikey(
            temp,
            transformer,
            on="DATETIME",
            by="HEATID",
            suffix="xfmr",
            tolerance=pd.Timedelta(hours=2),
        )

    temp = add_calendar_features(temp, "DATETIME")
    feature_cols = [
        col
        for col in temp.columns
        if col
        not in {
            "TEMP",
            "VALO2_PPM",
            "TEMP_target",
            "VALO2_target",
            "HEATID",
            "HEAT_START",
            "DATETIME",
        }
        and not col.endswith("_xfmr")
    ]
    feature_cols = [c for c in feature_cols if pd.api.types.is_numeric_dtype(temp[c])]

    temp[feature_cols] = temp[feature_cols].astype(np.float32)

    temp = temp.dropna(subset=feature_cols + ["TEMP_target", "VALO2_target"]).reset_index(drop=True)

    temp["year"] = temp["DATETIME"].dt.year
    heat_year = temp.groupby("HEATID")["year"].max().reset_index().rename(columns={"year": "heat_year"})
    temp = temp.merge(heat_year, on="HEATID", how="left")

    train_mask = temp["heat_year"] <= 2016
    val_mask = temp["heat_year"] == 2017
    test_mask = temp["heat_year"] >= 2018


    feature_mean = temp.loc[train_mask, feature_cols].mean()
    feature_std = temp.loc[train_mask, feature_cols].std().replace(0.0, 1.0)
    temp[feature_cols] = (temp[feature_cols] - feature_mean) / feature_std

    scaler_meta = {
        "feature_mean": {k: float(v) for k, v in feature_mean.items()},
        "feature_std": {k: float(v) for k, v in feature_std.items()},
    }

    def build_split(mask: pd.Series) -> Dict[str, np.ndarray]:
        X = temp.loc[mask, feature_cols].to_numpy(dtype=np.float32)
        y_temp = temp.loc[mask, "TEMP_target"].to_numpy(dtype=np.float32)[:, None]
        y_o2 = temp.loc[mask, "VALO2_target"].to_numpy(dtype=np.float32)[:, None]
        return {"X": X, "y_temp": y_temp, "y_o2": y_o2}

    train_split = build_split(train_mask)
    val_split = build_split(val_mask)
    test_split = build_split(test_mask)

    temp_target_mean = train_split["y_temp"].mean(axis=0, keepdims=True)
    temp_target_std = train_split["y_temp"].std(axis=0, keepdims=True)
    temp_target_std = np.where(temp_target_std < 1e-6, 1.0, temp_target_std)

    o2_target_mean = train_split["y_o2"].mean(axis=0, keepdims=True)
    o2_target_std = train_split["y_o2"].std(axis=0, keepdims=True)
    o2_target_std = np.where(o2_target_std < 1e-6, 1.0, o2_target_std)

    def _normalize_target(split: Dict[str, np.ndarray], key: str, mean: np.ndarray, std: np.ndarray) -> None:
        split[key] = ((split[key] - mean) / std).astype(np.float32)

    for split_dict in (train_split, val_split, test_split):
        _normalize_target(split_dict, "y_temp", temp_target_mean, temp_target_std)
        _normalize_target(split_dict, "y_o2", o2_target_mean, o2_target_std)

    temp_bundle = DatasetBundle(
        name="EAF_TEMP_forecast",
        task_type="regression",
        input_kind="tabular",
        feature_names=feature_cols,
        target_names=["TEMP_target"],
        train={"X": train_split["X"], "y": train_split["y_temp"]},
        val={"X": val_split["X"], "y": val_split["y_temp"]},
        test={"X": test_split["X"], "y": test_split["y_temp"]},
        metadata={
            "horizon_steps": horizon,
            "history_lags": list(history_lags),
            "feature_source": "temp + gas + injection + calendar",
            **scaler_meta,
            "target_scaler": {
                "mean": float(temp_target_mean.squeeze()),
                "std": float(temp_target_std.squeeze()),
            },
        },
    )

    o2_bundle = DatasetBundle(
        name="EAF_VALO2_forecast",
        task_type="regression",
        input_kind="tabular",
        feature_names=feature_cols,
        target_names=["VALO2_target"],
        train={"X": train_split["X"], "y": train_split["y_o2"]},
        val={"X": val_split["X"], "y": val_split["y_o2"]},
        test={"X": test_split["X"], "y": test_split["y_o2"]},
        metadata={
            "horizon_steps": horizon,
            "history_lags": list(history_lags),
            "feature_source": "temp + gas + injection + calendar",
            **scaler_meta,
            "target_scaler": {
                "mean": float(o2_target_mean.squeeze()),
                "std": float(o2_target_std.squeeze()),
            },
        },
    )

    return temp_bundle, o2_bundle


In [64]:
def prepare_eaf_chemistry_bundle(tables: Dict[str, pd.DataFrame]) -> DatasetBundle:
    chem = tables["eaf_final_chemical_measurements"].copy()
    chem = ensure_datetime(chem, ["DATETIME"])
    chem = chem.sort_values(["HEATID", "DATETIME"])
    chem = chem.drop_duplicates(subset=["HEATID"], keep="last")

    target_cols = [c for c in chem.columns if c not in ("HEATID", "POSITIONROW", "DATETIME")]
    chem = ensure_float(chem, target_cols)

    temp = tables["eaf_temp"].copy()
    temp = ensure_datetime(temp, ["DATETIME"])
    temp = temp.sort_values(["HEATID", "DATETIME"])
    temp["sample_index"] = temp.groupby("HEATID").cumcount()
    temp = add_calendar_features(temp, "DATETIME")
    temp_aggs = compute_heatwise_aggregates(
        temp,
        "HEATID",
        {
            "TEMP": ["mean", "max", "min", "last"],
            "VALO2_PPM": ["mean", "max", "last"],
            "DATETIME_month": ["last"],
            "DATETIME_hour": ["mean"],
            "sample_index": ["max"],
        },
    )

    def safe_aggregates(frame: Optional[pd.DataFrame], aggregations: Dict[str, List[str]]) -> pd.DataFrame:
        if frame is None or frame.empty:
            columns = ["HEATID"] + [f"{feature}_{agg}" for feature, aggs in aggregations.items() for agg in aggs]
            return pd.DataFrame(columns=columns)
        frame = frame.copy()
        datetime_cols = [c for c in frame.columns if "TIME" in c.upper() or "DATE" in c.upper()]
        if datetime_cols:
            frame = ensure_datetime(frame, datetime_cols)
        numeric_cols = [c for c in frame.columns if c not in ("HEATID", "REVTIME", "STARTTIME")]
        frame = ensure_float(frame, numeric_cols)
        return compute_heatwise_aggregates(frame, "HEATID", aggregations)

    gas_aggs = safe_aggregates(
        tables.get("eaf_gaslance_mat"),
        {
            "O2_AMOUNT": ["max"],
            "GAS_AMOUNT": ["max"],
            "O2_FLOW": ["mean", "max"],
            "GAS_FLOW": ["mean", "max"],
        },
    )
    inj_aggs = safe_aggregates(
        tables.get("inj_mat"),
        {
            "INJ_AMOUNT_CARBON": ["max"],
            "INJ_FLOW_CARBON": ["mean", "max"],
        },
    )
    transformer_aggs = safe_aggregates(
        tables.get("eaf_transformer"),
        {
            "MW": ["mean", "max"],
            "DURATION_MIN": ["sum"],
        },
    )

    features = chem[["HEATID", "DATETIME"]].merge(temp_aggs, on="HEATID", how="left")
    features = features.merge(gas_aggs, on="HEATID", how="left")
    features = features.merge(inj_aggs, on="HEATID", how="left")
    features = features.merge(transformer_aggs, on="HEATID", how="left")

    numeric_feature_cols = [c for c in features.columns if c not in ("HEATID", "DATETIME")]
    features = ensure_float(features, numeric_feature_cols)
    features = add_calendar_features(features, "DATETIME")
    feature_cols = [c for c in features.columns if c not in ("HEATID", "DATETIME")]

    merged = features.merge(chem[["HEATID"] + target_cols], on="HEATID", how="inner")
    merged = merged.dropna(subset=feature_cols + target_cols).reset_index(drop=True)

    merged["year"] = pd.to_datetime(merged["DATETIME"]).dt.year
    train_mask = merged["year"] <= 2016
    val_mask = merged["year"] == 2017
    test_mask = merged["year"] >= 2018

    X_train = merged.loc[train_mask, feature_cols].to_numpy(dtype=np.float32)
    y_train = merged.loc[train_mask, target_cols].to_numpy(dtype=np.float32)
    X_val = merged.loc[val_mask, feature_cols].to_numpy(dtype=np.float32)
    y_val = merged.loc[val_mask, target_cols].to_numpy(dtype=np.float32)
    X_test = merged.loc[test_mask, feature_cols].to_numpy(dtype=np.float32)
    y_test = merged.loc[test_mask, target_cols].to_numpy(dtype=np.float32)

    if X_train.size == 0 or X_val.size == 0 or X_test.size == 0:
        raise ValueError("EAF chemistry splits produced empty partitions; check year filters.")

    feature_mean = X_train.mean(axis=0, keepdims=True)
    feature_std = X_train.std(axis=0, keepdims=True)
    feature_std = np.where(feature_std < 1e-6, 1.0, feature_std)

    target_mean = y_train.mean(axis=0, keepdims=True)
    target_std = y_train.std(axis=0, keepdims=True)
    target_std = np.where(target_std < 1e-6, 1.0, target_std)

    def _normalize(arr: np.ndarray, mean: np.ndarray, std: np.ndarray) -> np.ndarray:
        return ((arr - mean) / std).astype(np.float32)

    X_train = _normalize(X_train, feature_mean, feature_std)
    X_val = _normalize(X_val, feature_mean, feature_std)
    X_test = _normalize(X_test, feature_mean, feature_std)

    y_train_norm = _normalize(y_train, target_mean, target_std)
    y_val_norm = _normalize(y_val, target_mean, target_std)
    y_test_norm = _normalize(y_test, target_mean, target_std)

    bundle = DatasetBundle(
        name="EAF_chemistry",
        task_type="regression",
        input_kind="tabular",
        feature_names=feature_cols,
        target_names=target_cols,
        train={"X": X_train, "y": y_train_norm},
        val={"X": X_val, "y": y_val_norm},
        test={"X": X_test, "y": y_test_norm},
        metadata={
            "target_dim": len(target_cols),
            "note": "heat-level aggregates for final composition",
            "feature_scaler": {
                "mean": feature_mean.flatten().astype(np.float32).tolist(),
                "std": feature_std.flatten().astype(np.float32).tolist(),
            },
            "target_scaler": {
                "mean": target_mean.flatten().astype(np.float32).tolist(),
                "std": target_std.flatten().astype(np.float32).tolist(),
            },
        },
    )
    return bundle

In [65]:
from psann.nn import ResidualPSANNNet
from psann.models.wave_resnet import WaveResNet


class IdentitySpine(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim == 3:
            return x.reshape(x.size(0), -1)
        return x


class TemporalConvSpine(nn.Module):
    def __init__(
        self,
        input_channels: int,
        hidden_channels: int,
        kernel_size: int = 3,
        stride: int = 2,
        depth: int = 2,
        activation: Callable[[], nn.Module] = nn.GELU,
    ):
        super().__init__()
        layers: List[nn.Module] = []
        channels = input_channels
        for _ in range(depth):
            layers.append(
                nn.Conv1d(
                    channels,
                    hidden_channels,
                    kernel_size=kernel_size,
                    stride=stride,
                    padding=kernel_size // 2,
                )
            )
            layers.append(nn.BatchNorm1d(hidden_channels))
            layers.append(activation())
            channels = hidden_channels
        self.net = nn.Sequential(*layers)
        self.pool = nn.AdaptiveAvgPool1d(1)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = x.transpose(1, 2)
        z = self.net(z)
        z = self.pool(z).squeeze(-1)
        return z


class TemporalAttentionSpine(nn.Module):
    def __init__(self, input_dim: int, num_heads: int = 1, ff_factor: int = 2, dropout: float = 0.1):
        super().__init__()
        self.norm = nn.LayerNorm(input_dim)
        self.attn = nn.MultiheadAttention(embed_dim=input_dim, num_heads=num_heads, batch_first=True, dropout=dropout)
        self.ff = nn.Sequential(
            nn.LayerNorm(input_dim),
            nn.Linear(input_dim, ff_factor * input_dim),
            nn.GELU(),
            nn.Linear(ff_factor * input_dim, input_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = self.norm(x)
        attn_out, _ = self.attn(z, z, z)
        z = z + attn_out
        z = z + self.ff(z)
        return z.mean(dim=1)


class FlattenSpine(nn.Module):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim == 3:
            return x.reshape(x.size(0), -1)
        return x


class SequencePSANNModel(nn.Module):
    def __init__(
        self,
        input_shape: Tuple[int, ...],
        output_dim: int,
        *,
        hidden_layers: int,
        hidden_units: int,
        spine_type: str = "flatten",
        spine_params: Optional[Dict[str, Any]] = None,
        activation_type: str = "psann",
    ):
        super().__init__()
        spine_params = spine_params or {}
        time_steps, channels = input_shape
        if spine_type == "conv":
            self.spine = TemporalConvSpine(
                channels,
                spine_params.get("channels", hidden_units),
                kernel_size=spine_params.get("kernel_size", 5),
                stride=spine_params.get("stride", 2),
                depth=spine_params.get("depth", 2),
            )
            psann_input_dim = spine_params.get("channels", hidden_units)
        elif spine_type == "attention":
            self.spine = TemporalAttentionSpine(
                input_dim=channels,
                num_heads=spine_params.get("num_heads", 1),
                ff_factor=spine_params.get("ff_factor", 2),
                dropout=spine_params.get("dropout", 0.1),
            )
            psann_input_dim = channels
        elif spine_type == "flatten":
            self.spine = FlattenSpine()
            psann_input_dim = time_steps * channels
        else:
            self.spine = IdentitySpine()
            psann_input_dim = time_steps * channels
        self.core = ResidualPSANNNet(
            psann_input_dim,
            output_dim,
            hidden_layers=hidden_layers,
            hidden_units=hidden_units,
            hidden_width=hidden_units,
            activation_type=activation_type,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim == 3:
            z = self.spine(x)
        else:
            z = x
        return self.core(z)


class TabularPSANNModel(nn.Module):
    def __init__(
        self,
        input_dim: int,
        output_dim: int,
        *,
        hidden_layers: int,
        hidden_units: int,
        activation_type: str = "psann",
    ):
        super().__init__()
        self.core = ResidualPSANNNet(
            input_dim,
            output_dim,
            hidden_layers=hidden_layers,
            hidden_units=hidden_units,
            hidden_width=hidden_units,
            activation_type=activation_type,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim > 2:
            x = x.reshape(x.size(0), -1)
        return self.core(x)


class MLPModel(nn.Module):
    def __init__(self, input_dim: int, output_dim: int, hidden_layers: int = 3, hidden_units: int = 256, dropout: float = 0.1):
        super().__init__()
        layers: List[nn.Module] = []
        in_dim = input_dim
        for _ in range(hidden_layers):
            layers.append(nn.Linear(in_dim, hidden_units))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout))
            in_dim = hidden_units
        layers.append(nn.Linear(in_dim, output_dim))
        self.net = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim > 2:
            x = x.reshape(x.size(0), -1)
        return self.net(x)


class LSTMHead(nn.Module):
    def __init__(self, input_dim: int, hidden_units: int, num_layers: int, output_dim: int, bidirectional: bool = False, dropout: float = 0.1):
        super().__init__()
        self.lstm = nn.LSTM(
            input_dim,
            hidden_units,
            num_layers=num_layers,
            dropout=dropout if num_layers > 1 else 0.0,
            batch_first=True,
            bidirectional=bidirectional,
        )
        out_dim = hidden_units * (2 if bidirectional else 1)
        self.head = nn.Sequential(
            nn.LayerNorm(out_dim),
            nn.Linear(out_dim, output_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if x.ndim == 2:
            x = x.unsqueeze(1)
        _, (h_n, _) = self.lstm(x)
        z = h_n[-1]
        return self.head(z)


class TinyTCNBlock(nn.Module):
    def __init__(self, channels: int, kernel_size: int, dilation: int, dropout: float):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv1d(channels, channels, kernel_size, padding="same", dilation=dilation),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Conv1d(channels, channels, kernel_size, padding="same", dilation=dilation),
            nn.GELU(),
            nn.Dropout(dropout),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return x + self.conv(x)


class TinyTCN(nn.Module):
    def __init__(self, input_channels: int, output_dim: int, hidden_channels: int = 128, layers: int = 3, kernel_size: int = 3, dropout: float = 0.1):
        super().__init__()
        self.pre = nn.Conv1d(input_channels, hidden_channels, kernel_size=1)
        blocks = []
        for i in range(layers):
            blocks.append(TinyTCNBlock(hidden_channels, kernel_size, dilation=2 ** i, dropout=dropout))
        self.blocks = nn.Sequential(*blocks)
        self.head = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(hidden_channels, output_dim),
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = x.transpose(1, 2)
        z = self.pre(z)
        z = self.blocks(z)
        z = self.head(z)
        return z


def build_psann_tabular(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    hidden_layers = extra.get("hidden_layers", 8)
    hidden_units = extra.get("hidden_units", 256)
    activation_type = extra.get("activation_type", "psann")
    return TabularPSANNModel(
        input_dim=int(np.prod(input_shape)),
        output_dim=output_dim,
        hidden_layers=hidden_layers,
        hidden_units=hidden_units,
        activation_type=activation_type,
    )


def build_psann_sequence(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    hidden_layers = extra.get("hidden_layers", 8)
    hidden_units = extra.get("hidden_units", 256)
    spine_type = extra.get("spine_type", "flatten")
    spine_params = extra.get("spine_params", {})
    activation_type = extra.get("activation_type", "psann")
    return SequencePSANNModel(
        input_shape,
        output_dim,
        hidden_layers=hidden_layers,
        hidden_units=hidden_units,
        spine_type=spine_type,
        spine_params=spine_params,
        activation_type=activation_type,
    )


def build_mlp_model(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    hidden_layers = extra.get("hidden_layers", 3)
    hidden_units = extra.get("hidden_units", 256)
    dropout = extra.get("dropout", 0.1)
    return MLPModel(
        input_dim=int(np.prod(input_shape)),
        output_dim=output_dim,
        hidden_layers=hidden_layers,
        hidden_units=hidden_units,
        dropout=dropout,
    )


def build_lstm_model(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    sequence_length, channels = input_shape
    hidden_units = extra.get("hidden_units", 128)
    num_layers = extra.get("num_layers", 1)
    bidirectional = extra.get("bidirectional", False)
    return LSTMHead(
        input_dim=channels,
        hidden_units=hidden_units,
        num_layers=num_layers,
        output_dim=output_dim,
        bidirectional=bidirectional,
        dropout=extra.get("dropout", 0.1),
    )


def build_tcn_model(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    sequence_length, channels = input_shape
    hidden_channels = extra.get("hidden_channels", 128)
    layers = extra.get("layers", 3)
    kernel_size = extra.get("kernel_size", 3)
    dropout = extra.get("dropout", 0.1)
    return TinyTCN(
        input_channels=channels,
        output_dim=output_dim,
        hidden_channels=hidden_channels,
        layers=layers,
        kernel_size=kernel_size,
        dropout=dropout,
    )


class WaveResNetSequenceModel(nn.Module):
    def __init__(
        self,
        input_shape: Tuple[int, ...],
        output_dim: int,
        *,
        aggregator: str = "conv",
        aggregator_params: Optional[Dict[str, Any]] = None,
        wave_kwargs: Optional[Dict[str, Any]] = None,
    ) -> None:
        super().__init__()
        aggregator_params = aggregator_params or {}
        wave_kwargs = wave_kwargs or {}
        time_steps, channels = input_shape
        if aggregator == "conv":
            hidden_channels = aggregator_params.get("channels", wave_kwargs.get("hidden_dim", 128))
            activation = aggregator_params.get("activation", nn.GELU)
            self.spine = TemporalConvSpine(
                channels,
                hidden_channels,
                kernel_size=aggregator_params.get("kernel_size", 5),
                stride=aggregator_params.get("stride", 2),
                depth=aggregator_params.get("depth", 2),
                activation=activation,
            )
            wave_input_dim = hidden_channels
        elif aggregator == "flatten":
            self.spine = FlattenSpine()
            wave_input_dim = time_steps * channels
        elif aggregator == "identity":
            self.spine = IdentitySpine()
            wave_input_dim = time_steps * channels
        else:
            raise ValueError(f"Unsupported aggregator '{aggregator}' for WaveResNetSequenceModel.")
        final_wave_kwargs = wave_kwargs.copy()
        final_wave_kwargs["input_dim"] = wave_input_dim
        final_wave_kwargs["output_dim"] = output_dim
        self.core = WaveResNet(**final_wave_kwargs)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = self.spine(x) if x.ndim == 3 else x
        if z.ndim > 2:
            z = z.view(z.size(0), -1)
        return self.core(z)


def _count_params(module: nn.Module) -> int:
    return sum(p.numel() for p in module.parameters())


def build_wave_resnet_tabular(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    hidden_dims = extra.get("hidden_dims", [192, 224, 256])
    depths = extra.get("depths", [4, 6, 8])
    target_params = extra.get("target_params")
    tol = extra.get("param_tol", 0.15)
    dropout = extra.get("dropout", 0.05)
    first_layer_w0 = extra.get("first_layer_w0", 30.0)
    hidden_w0 = extra.get("hidden_w0", 1.0)
    input_dim = int(np.prod(input_shape))
    best_model = None
    best_gap = float("inf")
    for hidden_dim in hidden_dims:
        for depth in depths:
            wave_kwargs = {
                "input_dim": input_dim,
                "hidden_dim": hidden_dim,
                "depth": depth,
                "output_dim": output_dim,
                "dropout": dropout,
                "first_layer_w0": first_layer_w0,
                "hidden_w0": hidden_w0,
            }
            candidate = WaveResNet(**wave_kwargs)
            params = _count_params(candidate)
            if target_params:
                gap = abs(params - target_params)
                if target_params > 0 and gap / target_params <= tol:
                    return candidate
                if gap < best_gap:
                    best_model = candidate
                    best_gap = gap
            elif best_model is None:
                best_model = candidate
    if best_model is None:
        raise RuntimeError("Unable to construct WaveResNet tabular model with the provided search space.")
    return best_model


def build_wave_resnet_sequence(input_shape: Tuple[int, ...], output_dim: int, extra: Dict[str, Any]) -> nn.Module:
    hidden_dims = extra.get("hidden_dims", [160, 192, 224])
    depths = extra.get("depths", [4, 6, 8])
    target_params = extra.get("target_params")
    tol = extra.get("param_tol", 0.15)
    aggregator = extra.get("aggregator", "conv")
    aggregator_params = extra.get("aggregator_params", {})
    dropout = extra.get("dropout", 0.05)
    first_layer_w0 = extra.get("first_layer_w0", 30.0)
    hidden_w0 = extra.get("hidden_w0", 1.0)
    best_model = None
    best_gap = float("inf")
    for hidden_dim in hidden_dims:
        for depth in depths:
            wave_kwargs = {
                "hidden_dim": hidden_dim,
                "depth": depth,
                "dropout": dropout,
                "first_layer_w0": first_layer_w0,
                "hidden_w0": hidden_w0,
            }
            agg_params = dict(aggregator_params)
            if aggregator == "conv":
                agg_params.setdefault("channels", hidden_dim)
            candidate = WaveResNetSequenceModel(
                input_shape,
                output_dim,
                aggregator=aggregator,
                aggregator_params=agg_params,
                wave_kwargs=wave_kwargs,
            )
            params = _count_params(candidate)
            if target_params:
                gap = abs(params - target_params)
                if target_params > 0 and gap / target_params <= tol:
                    return candidate
                if gap < best_gap:
                    best_model = candidate
                    best_gap = gap
            elif best_model is None:
                best_model = candidate
    if best_model is None:
        raise RuntimeError("Unable to construct WaveResNet sequence model with the provided search space.")
    return best_model


In [76]:
def count_trainable_parameters(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


def denormalize_regression_outputs(
    bundle: DatasetBundle, y_true: np.ndarray, y_pred: np.ndarray
) -> Tuple[np.ndarray, np.ndarray]:
    scaler = bundle.metadata.get("target_scaler") if getattr(bundle, "metadata", None) else None
    if not scaler:
        return y_true, y_pred
    mean = np.asarray(scaler.get("mean", 0.0), dtype=np.float32)
    std = np.asarray(scaler.get("std", 1.0), dtype=np.float32)
    return y_true * std + mean, y_pred * std + mean


def evaluate_model(model: nn.Module, loader: DataLoader, spec: ModelSpec) -> Tuple[np.ndarray, np.ndarray]:
    model.eval()
    param = next(model.parameters(), None)
    model_device = param.device if param is not None else torch.device("cpu")

    preds: List[np.ndarray] = []
    truths: List[np.ndarray] = []

    with torch.no_grad():
        for X_batch, y_batch in loader:
            X_batch = X_batch.to(model_device)
            y_batch = y_batch.to(model_device)

            outputs = model(X_batch)
            if spec.task_type != "classification":
                if outputs.ndim > 2:
                    outputs = outputs.view(outputs.size(0), -1)
                if y_batch.ndim > 2:
                    y_batch = y_batch.view(y_batch.size(0), -1)
                elif y_batch.ndim == 1:
                    y_batch = y_batch.unsqueeze(-1)

            preds.append(outputs.detach().cpu().numpy())
            truths.append(y_batch.detach().cpu().numpy())

    if not preds:
        raise ValueError("Evaluation loader produced no batches; check dataset splits and batch size.")

    y_pred = np.concatenate(preds, axis=0)
    y_true = np.concatenate(truths, axis=0)
    return y_true, y_pred


def train_model_on_bundle(bundle: DatasetBundle, spec: ModelSpec, task_name: str) -> Dict[str, Any]:
    input_shape = bundle.train["X"].shape[1:]
    if spec.task_type == "classification":
        output_dim = int(bundle.metadata.get("n_classes", np.unique(bundle.train["y"]).size))
    else:
        output_dim = bundle.train["y"].shape[1] if bundle.train["y"].ndim > 1 else 1

    model = spec.builder(input_shape, output_dim, spec.extra)
    model.to(DEVICE)
    params = count_trainable_parameters(model)

    optimizer_cls = torch.optim.AdamW if spec.train_config.weight_decay > 0 else torch.optim.Adam
    optimizer = optimizer_cls(
        model.parameters(),
        lr=spec.train_config.learning_rate,
        weight_decay=spec.train_config.weight_decay,
    )
    scheduler = None
    if spec.train_config.scheduler == "cosine":
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=spec.train_config.epochs)

    train_loader = build_dataloader(
        bundle.train["X"],
        bundle.train["y"],
        spec.train_config.batch_size,
        shuffle=True,
        task_type=spec.task_type,
    )
    val_loader = build_dataloader(
        bundle.val["X"],
        bundle.val["y"],
        spec.train_config.batch_size,
        shuffle=False,
        task_type=spec.task_type,
    )
    test_loader = build_dataloader(
        bundle.test["X"],
        bundle.test["y"],
        spec.train_config.batch_size,
        shuffle=False,
        task_type=spec.task_type,
    )

    best_state = None
    best_val_metric = -float("inf")
    patience_counter = spec.train_config.patience
    history: List[Dict[str, float]] = []
    criterion_reg = nn.MSELoss()

    with Timer() as timer:
        for epoch in range(spec.train_config.epochs):
            model.train()
            model_device = next(model.parameters(), DEVICE).device
            running_loss = 0.0
            batches = 0

            for step, (X_batch, y_batch) in enumerate(train_loader, start=1):
                X_batch = X_batch.to(model_device)
                y_batch = y_batch.to(model_device)
                optimizer.zero_grad()

                outputs = model(X_batch)
                if spec.task_type != "classification" and outputs.ndim > 2:
                    outputs = outputs.view(outputs.size(0), -1)

                if spec.task_type == "classification":
                    loss = nn.functional.cross_entropy(
                        outputs,
                        y_batch,
                        label_smoothing=GLOBAL_CONFIG["label_smoothing"],
                    )
                else:
                    target = y_batch
                    if target.ndim > 2:
                        target = target.view(target.size(0), -1)
                    elif target.ndim == 1:
                        target = target.unsqueeze(-1)
                    loss = criterion_reg(outputs, target)

                loss.backward()
                if spec.train_config.gradient_clip is not None:
                    nn.utils.clip_grad_norm_(model.parameters(), spec.train_config.gradient_clip)
                optimizer.step()

                running_loss += loss.item()
                batches += 1
                if spec.train_config.max_batches_per_epoch and batches >= spec.train_config.max_batches_per_epoch:
                    break

            if scheduler is not None:
                scheduler.step()

            avg_loss = running_loss / max(1, batches)
            val_true, val_pred = evaluate_model(model, val_loader, spec)

            if spec.task_type == "classification":
                metrics = classification_metrics(val_true, val_pred)
                score = metrics["accuracy"]
            else:
                val_true_den, val_pred_den = denormalize_regression_outputs(bundle, val_true, val_pred)
                metrics = regression_metrics(val_true_den, val_pred_den)
                score = -metrics["rmse"]

            history.append({"epoch": epoch + 1, "train_loss": avg_loss, "val_score": score})

            if score > best_val_metric:
                best_val_metric = score
                best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
                patience_counter = spec.train_config.patience
            else:
                patience_counter -= 1

            if spec.train_config.early_stopping and patience_counter <= 0:
                break
            if spec.train_config.max_minutes is not None and timer.elapsed / 60.0 > spec.train_config.max_minutes:
                print(f"[INFO] Time budget reached for {spec.name}; stopping at epoch {epoch + 1}.")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    train_true, train_pred = evaluate_model(model, train_loader, spec)
    val_true, val_pred = evaluate_model(model, val_loader, spec)
    test_true, test_pred = evaluate_model(model, test_loader, spec)

    if spec.task_type == "classification":
        train_metrics = classification_metrics(train_true, train_pred)
        val_metrics = classification_metrics(val_true, val_pred)
        test_metrics = classification_metrics(test_true, test_pred)

        train_true_out, train_pred_out = train_true, train_pred
        val_true_out, val_pred_out = val_true, val_pred
        test_true_out, test_pred_out = test_true, test_pred
    else:
        train_true_den, train_pred_den = denormalize_regression_outputs(bundle, train_true, train_pred)
        val_true_den, val_pred_den = denormalize_regression_outputs(bundle, val_true, val_pred)
        test_true_den, test_pred_den = denormalize_regression_outputs(bundle, test_true, test_pred)

        train_metrics = regression_metrics(train_true_den, train_pred_den)
        val_metrics = regression_metrics(val_true_den, val_pred_den)
        test_metrics = regression_metrics(test_true_den, test_pred_den)

        train_true_out, train_pred_out = train_true_den, train_pred_den
        val_true_out, val_pred_out = val_true_den, val_pred_den
        test_true_out, test_pred_out = test_true_den, test_pred_den

    RESULT_LOGGER.append(
        ExperimentResult(
            dataset=bundle.name,
            task=task_name,
            model=spec.name,
            group=spec.group,
            split="train",
            params=params,
            train_wall_seconds=timer.elapsed,
            metrics=train_metrics,
            notes=spec.notes,
        )
    )
    RESULT_LOGGER.append(
        ExperimentResult(
            dataset=bundle.name,
            task=task_name,
            model=spec.name,
            group=spec.group,
            split="val",
            params=params,
            train_wall_seconds=timer.elapsed,
            metrics=val_metrics,
            notes=spec.notes,
        )
    )
    RESULT_LOGGER.append(
        ExperimentResult(
            dataset=bundle.name,
            task=task_name,
            model=spec.name,
            group=spec.group,
            split="test",
            params=params,
            train_wall_seconds=timer.elapsed,
            metrics=test_metrics,
            notes=spec.notes,
        )
    )

    model_cpu = model.to("cpu")

    return {
        "model": model_cpu,
        "train_metrics": train_metrics,
        "val_metrics": val_metrics,
        "test_metrics": test_metrics,
        "train_true": train_true_out,
        "train_pred": train_pred_out,
        "val_true": val_true_out,
        "val_pred": val_pred_out,
        "test_true": test_true_out,
        "test_pred": test_pred_out,
        "history": history,
        "params": params,
        "train_time": timer.elapsed,
    }

In [77]:
def permutation_importance(
    model: nn.Module,
    bundle: DatasetBundle,
    spec: ModelSpec,
    feature_groups: Dict[str, List[int]],
    split: str = "test",
    n_repeats: int = 5,
) -> pd.DataFrame:
    data = getattr(bundle, split)
    baseline_loader = build_dataloader(
        data["X"],
        data["y"],
        spec.train_config.batch_size,
        shuffle=False,
        task_type=spec.task_type,
    )
    y_true, y_pred = evaluate_model(model, baseline_loader, spec)
    if spec.task_type == "classification":
        baseline_metric = classification_metrics(y_true, y_pred)["accuracy"]
    else:
        baseline_metric = regression_metrics(y_true.squeeze(), y_pred.squeeze())["rmse"]

    rows = []
    for group_name, columns in feature_groups.items():
        deltas = []
        cols = np.atleast_1d(columns)
        for _ in range(n_repeats):
            X_perm = data["X"].copy()
            if bundle.input_kind == "tabular":
                for col in cols:
                    np.random.shuffle(X_perm[:, col])
            else:
                for col in cols:
                    np.random.shuffle(X_perm[:, :, col])
            loader = build_dataloader(
                X_perm,
                data["y"],
                spec.train_config.batch_size,
                shuffle=False,
                task_type=spec.task_type,
            )
            y_true_perm, y_pred_perm = evaluate_model(model, loader, spec)
            if spec.task_type == "classification":
                metric_value = classification_metrics(y_true_perm, y_pred_perm)["accuracy"]
                delta = baseline_metric - metric_value
            else:
                metric_value = regression_metrics(y_true_perm.squeeze(), y_pred_perm.squeeze())["rmse"]
                delta = metric_value - baseline_metric
            deltas.append(delta)
        rows.append(
            {
                "group": group_name,
                "mean_delta": float(np.mean(deltas)),
                "std_delta": float(np.std(deltas)),
                "baseline": baseline_metric,
            }
        )
    return pd.DataFrame(rows)


def compute_shap_importance(
    model: nn.Module,
    bundle: DatasetBundle,
    spec: ModelSpec,
    split: str = "val",
    sample_size: int = 512,
) -> Dict[str, Any]:
    import shap

    data = getattr(bundle, split)
    X = data["X"]
    if len(X) == 0:
        raise ValueError(f"No samples available in {split} split for SHAP computation.")
    sample_size = min(sample_size, len(X))
    idx = np.random.choice(len(X), size=sample_size, replace=False)
    X_sample = X[idx]

    model_cpu = model.to("cpu").eval()

    def predict_fn(batch: np.ndarray) -> np.ndarray:
        with torch.no_grad():
            inputs = torch.from_numpy(batch).float()
            outputs = model_cpu(inputs)
            if spec.task_type == "classification":
                return torch.softmax(outputs, dim=-1).numpy()
            return outputs.numpy()

    if bundle.input_kind == "tabular":
        background = X_sample[: min(128, sample_size)]
        explainer = shap.KernelExplainer(predict_fn, background)
        shap_values = explainer.shap_values(X_sample)
    else:
        background = torch.from_numpy(X_sample[: min(64, sample_size)]).float()
        explainer = shap.DeepExplainer(model_cpu, background)
        shap_values = explainer.shap_values(torch.from_numpy(X_sample).float())

    model.to(DEVICE)
    return {"explainer": explainer, "shap_values": shap_values, "sample_indices": idx}


def compute_jacobian_singular_values(model: nn.Module, inputs: torch.Tensor, max_samples: int = 128) -> np.ndarray:
    model.eval()
    inputs = inputs[:max_samples].to(DEVICE).requires_grad_(True)
    outputs = model(inputs)
    if outputs.ndim == 1:
        outputs = outputs.unsqueeze(-1)
    jacobian_rows = []
    for i in range(outputs.shape[1]):
        grad_outputs = torch.zeros_like(outputs)
        grad_outputs[:, i] = 1.0
        grads = torch.autograd.grad(outputs, inputs, grad_outputs=grad_outputs, retain_graph=True, create_graph=False)[0]
        jacobian_rows.append(grads.reshape(grads.size(0), -1).detach().cpu().numpy())
    jacobian = np.concatenate(jacobian_rows, axis=1)
    sigma = np.linalg.svd(jacobian, compute_uv=False)
    return sigma


def participation_ratio(singular_values: np.ndarray) -> float:
    if singular_values.size == 0:
        return float("nan")
    numerator = (singular_values ** 2).sum() ** 2
    denominator = (singular_values ** 4).sum() + 1e-8
    return float(numerator / denominator)


def frequency_response_probe(model: nn.Module, input_dim: int, frequencies: Iterable[float], amplitude: float = 1.0) -> pd.DataFrame:
    model.eval()
    rows = []
    times = torch.linspace(0, 2 * math.pi, steps=512).unsqueeze(0)
    for freq in frequencies:
        signal = amplitude * torch.sin(freq * times)
        if input_dim > 1:
            signal = signal.repeat(1, input_dim)
        signal = signal.to(DEVICE).float()
        with torch.no_grad():
            output = model(signal)
        energy = output.pow(2).mean().sqrt().item()
        rows.append({"frequency": freq, "output_rms": energy})
    return pd.DataFrame(rows)


def evaluate_robustness(
    model: nn.Module,
    bundle: DatasetBundle,
    spec: ModelSpec,
    corruption_fn: Callable[[np.ndarray, float], np.ndarray],
    split: str = "test",
    levels: Iterable[float] = (0.0, 0.1, 0.2, 0.3),
) -> pd.DataFrame:
    rows = []
    base_data = getattr(bundle, split)
    for level in levels:
        X_corrupted = corruption_fn(base_data["X"], level)
        loader = build_dataloader(
            X_corrupted,
            base_data["y"],
            spec.train_config.batch_size,
            shuffle=False,
            task_type=spec.task_type,
        )
        y_true, y_pred = evaluate_model(model, loader, spec)
        if spec.task_type == "classification":
            metrics = classification_metrics(y_true, y_pred)
        else:
            metrics = regression_metrics(y_true.squeeze(), y_pred.squeeze())
        row = {"level": level}
        row.update(metrics)
        rows.append(row)
    return pd.DataFrame(rows)


In [78]:
# Cell (1) — helpers + containers
from typing import Dict
from pathlib import Path

# Where your datasets live (should already be defined in the notebook; override only if missing)
try:
    DATA_ROOT
except NameError:
    DATA_ROOT = Path("/content/datasets")

# container for results
DATA_BUNDLES: Dict[str, "DatasetBundle"] = {}

def find_key(stations: dict, short_name: str) -> str:
    """Find the canonical station key by case-insensitive substring match.
    Raises KeyError if no match found.
    """
    short = short_name.lower()
    matches = [k for k in stations.keys() if short in k.lower()]
    if not matches:
        raise KeyError(f"No station matching '{short_name}'")
    # prefer exact prefix match if available (more deterministic)
    for m in matches:
        if m.lower().startswith(short):
            return m
    return matches[0]

In [79]:
# Cell (2) — load all bundles (robust to small naming mismatches)
print("Loading datasets...")

# --- EAF bundles (these will use the merge_asof you've patched earlier) ---
try:
    eaf_tables = load_eaf_tables(DATA_ROOT)
    eaf_temp_bundle, eaf_o2_bundle = prepare_eaf_temp_and_o2_bundles(eaf_tables)
    eaf_chem_bundle = prepare_eaf_chemistry_bundle(eaf_tables)
    DATA_BUNDLES[eaf_temp_bundle.name] = eaf_temp_bundle
    DATA_BUNDLES[eaf_o2_bundle.name]   = eaf_o2_bundle
    DATA_BUNDLES[eaf_chem_bundle.name] = eaf_chem_bundle
    print(" - EAF bundles ready")
except Exception as e:
    print("[warn] EAF bundle creation failed:", repr(e))

# --- Beijing cross-station bundle (use robust mapping for station names) ---
try:
    beijing_stations = load_beijing_stations(DATA_ROOT)
    # map the short names you expect to their actual keys
    val_key  = find_key(beijing_stations, "Wanshouxigong")
    test_key = find_key(beijing_stations, "Huairou")
    train_keys = [k for k in beijing_stations.keys() if k not in {val_key, test_key}]
    print(f" - Beijing: val={val_key}, test={test_key}, train_count={len(train_keys)}")

    beijing_bundle = assemble_beijing_cross_station_bundle(
        beijing_stations,
        train_stations=train_keys,
        val_station=val_key,
        test_station=test_key,
        target="PM2.5",
        context=24,
        horizon=6,
    )
    DATA_BUNDLES[beijing_bundle.name] = beijing_bundle
    print(" - Beijing bundle ready")
except Exception as e:
    print("[warn] Beijing bundle creation failed:", repr(e))

# --- Jena ---
try:
    jena_df = load_jena_climate(DATA_ROOT)
    jena_bundle = prepare_jena_bundle(jena_df, target="T (degC)", context_steps=72, horizon_steps=36)
    DATA_BUNDLES[jena_bundle.name] = jena_bundle
    print(" - Jena bundle ready")
except Exception as e:
    print("[warn] Jena bundle creation failed:", repr(e))

# --- HAR (engineered + raw) ---
try:
    har_train_df, har_test_df, har_feature_names = load_har_engineered(DATA_ROOT)
    har_engineered_bundle = prepare_har_engineered_bundle(har_train_df, har_test_df, har_feature_names)
    DATA_BUNDLES[har_engineered_bundle.name] = har_engineered_bundle
    print(" - HAR engineered bundle ready")
except Exception as e:
    print("[warn] HAR engineered creation failed:", repr(e))

try:
    X_har_train_raw, y_har_train_raw, X_har_test_raw, y_har_test_raw, har_axes = load_har_raw_sequences(DATA_ROOT)
    har_raw_bundle = prepare_har_raw_bundle(X_har_train_raw, y_har_train_raw, X_har_test_raw, y_har_test_raw)
    DATA_BUNDLES[har_raw_bundle.name] = har_raw_bundle
    print(" - HAR raw bundle ready")
except Exception as e:
    print("[warn] HAR raw creation failed:", repr(e))

# --- Rossmann ---
try:
    ross_train, ross_test, ross_store = load_rossmann_frames(DATA_ROOT)
    ross_prepared, ross_features, ross_target = preprocess_rossmann(ross_train, ross_store)
    ross_bundle = prepare_rossmann_bundle(ross_prepared, ross_features, ross_target)
    DATA_BUNDLES[ross_bundle.name] = ross_bundle
    print(" - Rossmann bundle ready")
except Exception as e:
    print("[warn] Rossmann bundle creation failed:", repr(e))

# --- Summary of what succeeded ---
print("\nAvailable dataset bundles:")
for name, bundle in DATA_BUNDLES.items():
    try:
        print(f" - {name}: {bundle.summary()}")
    except Exception:
        print(f" - {name}: (created, but summary() failed)")

Loading datasets...
Loading eaf_temp from /content/datasets/Industrial Data from the Electric Arc Furnace/eaf_temp.csv...
Loading eaf_gaslance_mat from /content/datasets/Industrial Data from the Electric Arc Furnace/eaf_gaslance_mat.csv...
Loading inj_mat from /content/datasets/Industrial Data from the Electric Arc Furnace/inj_mat.csv...
Loading eaf_transformer from /content/datasets/Industrial Data from the Electric Arc Furnace/eaf_transformer.csv...
Loading eaf_added_materials from /content/datasets/Industrial Data from the Electric Arc Furnace/eaf_added_materials.csv...
Loading basket_charged from /content/datasets/Industrial Data from the Electric Arc Furnace/basket_charged.csv...
Loading lf_added_materials from /content/datasets/Industrial Data from the Electric Arc Furnace/lf_added_materials.csv...
Loading lf_initial_chemical_measurements from /content/datasets/Industrial Data from the Electric Arc Furnace/lf_initial_chemical_measurements.csv...
Loading eaf_final_chemical_measure

  train = pd.read_csv(train_path, parse_dates=["Date"])


 - Rossmann bundle ready

Available dataset bundles:
 - EAF_TEMP_forecast: {'name': 'EAF_TEMP_forecast', 'task_type': 'regression', 'input_kind': 'tabular', 'n_train': 1984, 'n_val': 429, 'n_test': 89, 'input_shape': (34,), 'target_shape': (1,), 'meta_horizon_steps': 1, 'meta_feature_source': 'temp + gas + injection + calendar'}
 - EAF_VALO2_forecast: {'name': 'EAF_VALO2_forecast', 'task_type': 'regression', 'input_kind': 'tabular', 'n_train': 1984, 'n_val': 429, 'n_test': 89, 'input_shape': (34,), 'target_shape': (1,), 'meta_horizon_steps': 1, 'meta_feature_source': 'temp + gas + injection + calendar'}
 - EAF_chemistry: {'name': 'EAF_chemistry', 'task_type': 'regression', 'input_kind': 'tabular', 'n_train': 1189, 'n_val': 290, 'n_test': 932, 'input_shape': (33,), 'target_shape': (12,), 'meta_target_dim': 12, 'meta_note': 'heat-level aggregates for final composition'}
 - Beijing_PM25_24h_ctx_6h_horizon: {'name': 'Beijing_PM25_24h_ctx_6h_horizon', 'task_type': 'regression', 'input_kind'

In [80]:
EXPERIMENT_REGISTRY: Dict[str, List[ModelSpec]] = {}

common_regression_train = TrainConfig(
    epochs=60,
    batch_size=512,
    learning_rate=1e-3,
    weight_decay=1e-4,
    patience=10,
    max_minutes=GLOBAL_CONFIG["max_time_minutes"],
    gradient_clip=1.0,
)

common_sequence_train = TrainConfig(
    epochs=50,
    batch_size=256,
    learning_rate=1e-3,
    weight_decay=1e-4,
    patience=8,
    max_minutes=GLOBAL_CONFIG["max_time_minutes"],
    gradient_clip=1.0,
)

common_classification_train = TrainConfig(
    epochs=50,
    batch_size=256,
    learning_rate=5e-4,
    weight_decay=5e-5,
    patience=8,
    max_minutes=GLOBAL_CONFIG["max_time_minutes"],
    gradient_clip=1.0,
)


def register_specs(bundle: DatasetBundle):
    specs: List[ModelSpec] = []
    if bundle.input_kind == "tabular":
        train_cfg = common_regression_train if bundle.task_type == "regression" else common_classification_train
        specs.append(
            ModelSpec(
                name="ResPSANN_tabular",
                builder=build_psann_tabular,
                train_config=train_cfg,
                task_type=bundle.task_type,
                input_kind="tabular",
                group="psann",
                extra={"hidden_layers": 8, "hidden_units": 256},
                notes="Residual PSANN core",
            )
        )
        specs.append(
            ModelSpec(
                name="MLP_baseline",
                builder=build_mlp_model,
                train_config=train_cfg,
                task_type=bundle.task_type,
                input_kind="tabular",
                group="baseline",
                extra={"hidden_layers": 4, "hidden_units": 256, "dropout": 0.1},
                notes="ReLU MLP with similar parameter budget",
            )
        )
specs.append(
    ModelSpec(
        name="WaveResNet_tabular",
        builder=build_wave_resnet_tabular,
        train_config=train_cfg,
        task_type=bundle.task_type,
        input_kind="tabular",
        group="baseline",
        extra={
            "hidden_dims": [192, 224, 256],
            "depths": [4, 6, 8],
            "target_params": 400_000,
            "param_tol": 0.2,
            "dropout": 0.05,
            "first_layer_w0": 30.0,
            "hidden_w0": 1.0,
        },
        notes="WaveResNet baseline with sine residual blocks",
    )
)
    else:
        train_cfg = common_sequence_train if bundle.task_type == "regression" else common_classification_train
        specs.append(
            ModelSpec(
                name="ResPSANN_conv_spine",
                builder=build_psann_sequence,
                train_config=train_cfg,
                task_type=bundle.task_type,
                input_kind="sequence",
                group="psann",
                extra={
                    "hidden_layers": 6,
                    "hidden_units": 192,
                    "spine_type": "conv",
                    "spine_params": {"channels": 192, "depth": 2, "kernel_size": 5, "stride": 2},
                },
                notes="ResPSANN with strided Conv1d spine",
            )
        )
        specs.append(
            ModelSpec(
                name="ResPSANN_attention_spine",
                builder=build_psann_sequence,
                train_config=train_cfg,
                task_type=bundle.task_type,
                input_kind="sequence",
                group="psann",
                extra={
                    "hidden_layers": 6,
                    "hidden_units": 192,
                    "spine_type": "attention",
                    "spine_params": {"num_heads": 1},
                },
                notes="ResPSANN with single-head attention spine",
            )
        )
        specs.append(
            ModelSpec(
                name="LSTM_baseline",
                builder=build_lstm_model,
                train_config=train_cfg,
                task_type=bundle.task_type,
                input_kind="sequence",
                group="baseline",
                extra={"hidden_units": 192, "num_layers": 1, "dropout": 0.1},
                notes="Single-layer LSTM baseline",
            )
        )
        specs.append(
            ModelSpec(
                name="TCN_baseline",
                builder=build_tcn_model,
                train_config=train_cfg,
                task_type=bundle.task_type,
                input_kind="sequence",
                group="baseline",
                extra={"hidden_channels": 192, "layers": 3, "kernel_size": 3, "dropout": 0.1},
                notes="Tiny TCN baseline",
            )
        )
specs.append(
    ModelSpec(
        name="WaveResNet_sequence",
        builder=build_wave_resnet_sequence,
        train_config=train_cfg,
        task_type=bundle.task_type,
        input_kind="sequence",
        group="baseline",
        extra={
            "hidden_dims": [160, 192, 224],
            "depths": [4, 6, 8],
            "aggregator": "conv",
            "aggregator_params": {"depth": 2, "kernel_size": 5, "stride": 2},
            "target_params": 350_000,
            "param_tol": 0.2,
            "dropout": 0.05,
            "first_layer_w0": 30.0,
            "hidden_w0": 1.0,
        },
        notes="WaveResNet baseline with a lightweight temporal spine",
    )
)
    EXPERIMENT_REGISTRY[bundle.name] = specs


for bundle in DATA_BUNDLES.values():
    register_specs(bundle)

print("Registered model specs:")
for dataset_name, specs in EXPERIMENT_REGISTRY.items():
    print(f"- {dataset_name}: {[spec.name for spec in specs]}")


Registered model specs:
- EAF_TEMP_forecast: ['ResPSANN_tabular', 'MLP_baseline']
- EAF_VALO2_forecast: ['ResPSANN_tabular', 'MLP_baseline']
- EAF_chemistry: ['ResPSANN_tabular', 'MLP_baseline']
- Beijing_PM25_24h_ctx_6h_horizon: ['ResPSANN_conv_spine', 'ResPSANN_attention_spine', 'LSTM_baseline', 'TCN_baseline']
- Jena_tdegc_72ctx_36h: ['ResPSANN_conv_spine', 'ResPSANN_attention_spine', 'LSTM_baseline', 'TCN_baseline']
- HAR_engineered: ['ResPSANN_tabular', 'MLP_baseline']
- HAR_raw_sequence: ['ResPSANN_conv_spine', 'ResPSANN_attention_spine', 'LSTM_baseline', 'TCN_baseline']
- Rossmann_sales: ['ResPSANN_tabular', 'MLP_baseline']


In [81]:
RUN_EXPERIMENTS = {
    "EAF_TEMP_forecast": True,
    "EAF_VALO2_forecast": True,
    "EAF_chemistry": True,
    "Beijing_PM25_24h_ctx_6h_horizon": True,
    "Jena_tdegc_72ctx_36h": True,
    "HAR_engineered": True,
    "HAR_raw_sequence": True,
    "Rossmann_sales": True,
}


In [82]:
EXPERIMENT_ARTIFACTS: Dict[str, Dict[str, Any]] = {}

for dataset_name, run_flag in RUN_EXPERIMENTS.items():
    if not run_flag:
        continue
    if dataset_name not in DATA_BUNDLES:
        print(f"[WARN] Dataset {dataset_name} not loaded; skipping.")
        continue
    bundle = DATA_BUNDLES[dataset_name]
    specs = EXPERIMENT_REGISTRY.get(dataset_name, [])
    if not specs:
        print(f"[WARN] No model specs registered for {dataset_name}; skipping.")
        continue
    print("=" * 80)
    print(f"Dataset: {dataset_name} ({bundle.task_type}, {bundle.input_kind})")
    for spec in specs:
        print(f"  -> Training {spec.name}")
        result = train_model_on_bundle(bundle, spec, task_name=dataset_name)
        EXPERIMENT_ARTIFACTS.setdefault(dataset_name, {})[spec.name] = result
        artifact_path = RESULTS_ROOT / f"{dataset_name}_{spec.name}_predictions.npz"
        np.savez_compressed(
            artifact_path,
            train_true=result["train_true"],
            train_pred=result["train_pred"],
            val_true=result["val_true"],
            val_pred=result["val_pred"],
            test_true=result["test_true"],
            test_pred=result["test_pred"],
        )
        print(f"    Validation metrics: {result['val_metrics']}")
        print(f"    Test metrics       : {result['test_metrics']}")
        print(f"    Saved predictions to {artifact_path}")


Dataset: EAF_TEMP_forecast (regression, tabular)
  -> Training ResPSANN_tabular
    Validation metrics: {'rmse': 20.683494547953096, 'mae': 13.403224704982517, 'smape': 0.008165598887099217, 'r2': 0.23956036913762524, 'mase': 0.9367374542330634}
    Test metrics       : {'rmse': 24.962252279244773, 'mae': 15.8895263671875, 'smape': 0.009694324685986012, 'r2': -0.03632192811632873, 'mase': 0.7441608939103983}
    Saved predictions to /content/colab_results/EAF_TEMP_forecast_ResPSANN_tabular_predictions.npz
  -> Training MLP_baseline
    Validation metrics: {'rmse': 19.917977433661637, 'mae': 12.470661012164918, 'smape': 0.007600588039037466, 'r2': 0.29480799162011695, 'mase': 0.8715615462894025}
    Test metrics       : {'rmse': 25.901293090117896, 'mae': 15.062251744645366, 'smape': 0.009193397514782058, 'r2': -0.11575808915917318, 'mase': 0.70541679239384}
    Saved predictions to /content/colab_results/EAF_TEMP_forecast_MLP_baseline_predictions.npz
Dataset: EAF_VALO2_forecast (regres

In [83]:
results_df = RESULT_LOGGER.to_frame()
results_path = RESULTS_ROOT / "experiment_metrics.csv"
if not results_df.empty:
    results_df.to_csv(results_path, index=False)
    display(results_df)
    print(f"Metrics saved to {results_path}")
else:
    print("No experiments were run yet. Toggle RUN_EXPERIMENTS before executing the training cell.")


Unnamed: 0,dataset,task,model,group,split,params,train_wall_seconds,notes,rmse,mae,smape,r2,mase,accuracy,f1_macro,nll,ece
0,EAF_TEMP_forecast,EAF_TEMP_forecast,ResPSANN_tabular,psann,train,1076489,7.958187,Residual PSANN core,21.872530,14.257730,0.008698,0.289276,0.541215,,,,
1,EAF_TEMP_forecast,EAF_TEMP_forecast,ResPSANN_tabular,psann,val,1076489,7.958187,Residual PSANN core,20.517528,13.436105,0.008185,0.251715,0.939035,,,,
2,EAF_TEMP_forecast,EAF_TEMP_forecast,ResPSANN_tabular,psann,test,1076489,7.958187,Residual PSANN core,23.891865,15.075829,0.009201,0.050648,0.706053,,,,
3,EAF_TEMP_forecast,EAF_TEMP_forecast,MLP_baseline,baseline,train,206593,6.217907,ReLU MLP with similar parameter budget,21.370292,14.206419,0.008670,0.321541,0.538999,,,,
4,EAF_TEMP_forecast,EAF_TEMP_forecast,MLP_baseline,baseline,val,206593,6.217907,ReLU MLP with similar parameter budget,20.418592,13.498109,0.008228,0.258914,0.943369,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
85,Rossmann_sales,Rossmann_sales,ResPSANN_tabular,psann,val,1076233,321.576789,Residual PSANN core,478.446377,337.931761,0.050940,0.974256,0.125858,,,,
86,Rossmann_sales,Rossmann_sales,ResPSANN_tabular,psann,test,1076233,321.576789,Residual PSANN core,488.978574,348.974955,0.050791,0.974847,0.125557,,,,
87,Rossmann_sales,Rossmann_sales,MLP_baseline,baseline,train,206337,75.815578,ReLU MLP with similar parameter budget,548.341991,389.962456,0.058990,0.969114,0.119690,,,,
88,Rossmann_sales,Rossmann_sales,MLP_baseline,baseline,val,206337,75.815578,ReLU MLP with similar parameter budget,559.548665,416.914380,0.063978,0.964789,0.155273,,,,


Metrics saved to /content/colab_results/experiment_metrics.csv


In [84]:
TARGET_DATASET = "EAF_TEMP_forecast"
TARGET_MODEL = "ResPSANN_tabular"

if TARGET_DATASET in EXPERIMENT_ARTIFACTS and TARGET_MODEL in EXPERIMENT_ARTIFACTS[TARGET_DATASET]:
    bundle = DATA_BUNDLES[TARGET_DATASET]
    spec = next(spec for spec in EXPERIMENT_REGISTRY[TARGET_DATASET] if spec.name == TARGET_MODEL)
    trained_model = EXPERIMENT_ARTIFACTS[TARGET_DATASET][TARGET_MODEL]["model"]

    prefix_groups = {
        "temp_lags": [i for i, name in enumerate(bundle.feature_names) if name.startswith("TEMP_lag")],
        "valo2_lags": [i for i, name in enumerate(bundle.feature_names) if name.startswith("VALO2_lag")],
        "gas_flow": [i for i, name in enumerate(bundle.feature_names) if "gas" in name.lower()],
        "inj": [i for i, name in enumerate(bundle.feature_names) if "inj" in name.lower()],
        "calendar": [i for i, name in enumerate(bundle.feature_names) if "DATETIME" in name],
    }

    perm_df = permutation_importance(
        trained_model,
        bundle,
        spec,
        feature_groups=prefix_groups,
        split="test",
        n_repeats=5,
    )
    display(perm_df.sort_values("mean_delta", ascending=False))
else:
    print("Train the target model first; EXPERIMENT_ARTIFACTS does not contain it yet.")


Unnamed: 0,group,mean_delta,std_delta,baseline
0,temp_lags,0.031854,0.007176,0.962133
4,calendar,0.018385,0.004915,0.962133
2,gas_flow,0.006855,0.003384,0.962133
3,inj,-0.006991,0.009792,0.962133
1,valo2_lags,-0.007853,0.002448,0.962133


In [85]:
TARGET_DATASET = "Jena_tdegc_72ctx_36h"
TARGET_MODEL = "ResPSANN_conv_spine"

if TARGET_DATASET in EXPERIMENT_ARTIFACTS and TARGET_MODEL in EXPERIMENT_ARTIFACTS[TARGET_DATASET]:
    bundle = DATA_BUNDLES[TARGET_DATASET]
    spec = next(spec for spec in EXPERIMENT_REGISTRY[TARGET_DATASET] if spec.name == TARGET_MODEL)
    trained_model = EXPERIMENT_ARTIFACTS[TARGET_DATASET][TARGET_MODEL]["model"].to(DEVICE)
    sample_loader = build_dataloader(
        bundle.val["X"],
        bundle.val["y"],
        batch_size=32,
        shuffle=False,
        task_type=spec.task_type,
    )
    sample_batch = next(iter(sample_loader))[0][:64]
    singular_values = compute_jacobian_singular_values(trained_model, sample_batch, max_samples=64)
    pr = participation_ratio(singular_values)
    print(f"Participation ratio: {pr:.4f}")
    trained_model.to("cpu")
else:
    print("Train the target model first to access EXPERIMENT_ARTIFACTS.")


Participation ratio: 1.4756


In [86]:
TARGET_DATASET = "Beijing_PM25_24h_ctx_6h_horizon"
TARGET_MODEL = "ResPSANN_conv_spine"

if TARGET_DATASET in EXPERIMENT_ARTIFACTS and TARGET_MODEL in EXPERIMENT_ARTIFACTS[TARGET_DATASET]:
    bundle = DATA_BUNDLES[TARGET_DATASET]
    spec = next(spec for spec in EXPERIMENT_REGISTRY[TARGET_DATASET] if spec.name == TARGET_MODEL)
    trained_model = EXPERIMENT_ARTIFACTS[TARGET_DATASET][TARGET_MODEL]["model"]

    def missingness_fn(X: np.ndarray, level: float) -> np.ndarray:
        rng = np.random.default_rng(GLOBAL_CONFIG["seed"])
        mask = rng.random(size=X.shape) < level
        X_corrupted = X.copy()
        X_corrupted[mask] = 0.0
        return X_corrupted

    robustness_df = evaluate_robustness(
        trained_model,
        bundle,
        spec,
        corruption_fn=missingness_fn,
        split="test",
        levels=[0.0, 0.1, 0.2, 0.3, 0.4],
    )
    display(robustness_df)
else:
    print("Train the target model before running robustness experiments.")


Unnamed: 0,level,rmse,mae,smape,r2,mase
0,0.0,46.014864,29.070318,0.499069,0.581113,3.222387
1,0.1,127.797654,95.996053,1.212224,-2.231074,10.64097
2,0.2,110.871029,84.176608,1.19964,-1.431851,9.330809
3,0.3,98.701487,73.510134,1.206441,-0.927294,8.148451
4,0.4,94.484721,68.612599,1.252539,-0.766135,7.605569


In [87]:
TARGET_DATASET = "HAR_raw_sequence"
if TARGET_DATASET in EXPERIMENT_ARTIFACTS:
    results = EXPERIMENT_ARTIFACTS[TARGET_DATASET]
    if "ResPSANN_conv_spine" in results and "ResPSANN_attention_spine" in results:
        conv_acc = results["ResPSANN_conv_spine"]["test_metrics"]["accuracy"]
        attn_acc = results["ResPSANN_attention_spine"]["test_metrics"]["accuracy"]
        print(f"Conv spine accuracy: {conv_acc:.4f}")
        print(f"Attention spine accuracy: {attn_acc:.4f}")
    else:
        print("Run both PSANN spine variants on HAR_raw_sequence first.")
else:
    print("Train HAR_raw_sequence models before evaluating H5.")


Conv spine accuracy: 0.9230
Attention spine accuracy: 0.7757


## Notebook Complete
All core experiment scaffolding is now in place. Toggle the runs you need, execute the training cell per hypothesis, and archive outputs from `colab_results/` before ending your Colab session.


In [88]:
import os
import zipfile
from pathlib import Path

def zip_folder(folder_path: str | Path, output_path: str | Path | None = None, *, include_hidden: bool = True) -> Path:
    """
    Compresses an entire folder (recursively) into a .zip archive.

    Parameters
    ----------
    folder_path : str or Path
        Path to the folder to zip.
    output_path : str or Path or None, optional
        Output .zip file path. Defaults to "<folder_name>.zip" in the same directory.
    include_hidden : bool, optional
        Whether to include hidden files (those starting with '.').

    Returns
    -------
    Path
        Path to the created .zip file.
    """
    folder_path = Path(folder_path).resolve()
    if not folder_path.is_dir():
        raise ValueError(f"{folder_path} is not a valid directory")

    if output_path is None:
        output_path = folder_path.with_suffix(".zip")
    else:
        output_path = Path(output_path).resolve()

    with zipfile.ZipFile(output_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(folder_path):
            # skip hidden dirs/files if requested
            if not include_hidden:
                dirs[:] = [d for d in dirs if not d.startswith(".")]
                files = [f for f in files if not f.startswith(".")]

            for file in files:
                abs_path = Path(root) / file
                # relative path inside the zip
                rel_path = abs_path.relative_to(folder_path)
                zf.write(abs_path, arcname=rel_path)

    print(f"Zipped {folder_path} → {output_path}")
    return output_path



In [89]:
zip_folder(folder_path = '/content/colab_results', output_path = '/content/colab_results.zip')

Zipped /content/colab_results → /content/colab_results.zip


PosixPath('/content/colab_results.zip')

---

## Synthetic Probe Suite

This notebook fabricates compact synthetic datasets that stand in for the larger
benchmarks in the plan, then runs a focused suite of probes:

• Parity-locked comparisons (PSANN + tiny temporal spine vs TCN/LSTM/MLP)
• Cross-"station" generalization & missingness robustness
• Grouped permutation (information-usage) ablations
• Spectral/geometry diagnostics (Jacobian SVD & participation ratio)
• EAF-style per-heat resets & ΔTEMP targets

Results are saved to: /content/psann_synth_results/

Design draws from the experiment plan and PSANN docs:
- Plan & hypotheses (H1–H5), fairness constraints, and probes:  :contentReference[oaicite:3]{index=3}
- Estimator surface and usage patterns (sklearn-style, HISSO-ready):  :contentReference[oaicite:4]{index=4}
- Activation math, residual wrappers, and utilities (Jacobian/NTK):  :contentReference[oaicite:5]{index=5}


In [1]:
# @title Imports, device, tiny utils
import os, math, time, json, random, itertools, functools
from dataclasses import dataclass
import numpy as np
import pandas as pd
from typing import Tuple, Dict, List, Callable, Optional

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.metrics import accuracy_score, f1_score

SEED = 1337
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
RESULTS_DIR = "/content/psann_synth_results"
os.makedirs(RESULTS_DIR, exist_ok=True)

def set_seed(s:int):
    random.seed(s); np.random.seed(s); torch.manual_seed(s)

def param_count(model: nn.Module) -> int:
    return sum(p.numel() for p in model.parameters())

def smape(y_true, y_pred, eps=1e-8):
    denom = (np.abs(y_true) + np.abs(y_pred) + eps)
    return (100.0 / len(y_true)) * np.sum(np.abs(y_pred - y_true) / denom)

def mase(y_true, y_pred, m=1):
    # naive seasonal m=1 by default
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    naive = np.mean(np.abs(y_true[m:] - y_true[:-m])) + 1e-8
    return np.mean(np.abs(y_true - y_pred)) / naive

def save_csv(df: pd.DataFrame, name: str):
    path = os.path.join(RESULTS_DIR, name)
    if os.path.exists(path):
        old = pd.read_csv(path)
        df = pd.concat([old, df], ignore_index=True)
    df.to_csv(path, index=False)
    print(f"Saved -> {path} ({len(df)} rows)")


In [2]:
!pip install psann

Collecting psann
  Downloading psann-0.10.3-py3-none-any.whl.metadata (10 kB)
Downloading psann-0.10.3-py3-none-any.whl (78 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/78.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psann
Successfully installed psann-0.10.3


In [3]:
# @title Try importing your PSANN; else define a compact fallback that mirrors PSANN
try:
    import psann  # type: ignore
    HAVE_PSANN = True
    from psann import PSANNRegressor  # sklearn-style
    try:
        from psann.models.wave_resnet import WaveResNet as PSANNWaveResNet
    except Exception:
        PSANNWaveResNet = None
    print("Using installed psann package.")
except Exception as e:
    HAVE_PSANN = False
    PSANNRegressor = None
    PSANNWaveResNet = None
    print("psann not found; using a compact sine-activated residual fallback.")

import math

class SineParam(nn.Module):
    """SIREN-style learnable sine activation with optional decay as in TECHNICAL_DETAILS.md (simplified)."""
    def __init__(self, features, w0=30.0, use_decay=True):
        super().__init__()
        self.a = nn.Parameter(torch.zeros(features))  # amplitude pre-softplus
        self.b = nn.Parameter(torch.zeros(features))  # frequency pre-softplus
        self.c = nn.Parameter(torch.zeros(features))  # decay pre-softplus
        self.w0 = w0
        self.use_decay = use_decay
        self.softplus = nn.Softplus()

    def forward(self, z):
        A = self.softplus(self.a).view(1, -1)
        f = self.softplus(self.b).view(1, -1) + 1e-6
        d = self.softplus(self.c).view(1, -1)
        if z.dim() == 3:  # (B, T, F)
            A = A.unsqueeze(1)
            f = f.unsqueeze(1)
            d = d.unsqueeze(1)
        if self.use_decay:
            return A * torch.exp(-d * torch.abs(z)) * torch.sin(f * z)
        else:
            return A * torch.sin(f * z)

class ResBlock(nn.Module):
    def __init__(self, in_f, out_f, p_drop=0.0):
        super().__init__()
        self.fc = nn.Linear(in_f, out_f)
        self.act = SineParam(out_f)
        self.alpha = nn.Parameter(torch.tensor(1.0))
        self.do = nn.Dropout(p_drop)
        self.short = (in_f == out_f)

        # SIREN-ish init for fc
        wstd = math.sqrt(6.0 / in_f) / 30.0
        nn.init.uniform_(self.fc.weight, -wstd, wstd)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        h = self.do(self.act(self.fc(x)))
        out = h
        if self.short:
            out = x + self.alpha * h
        return out

class ResSineMLP(nn.Module):
    def __init__(self, in_f, hidden, layers=2, out_f=1, p_drop=0.0):
        super().__init__()
        net = []
        dim = in_f
        for _ in range(layers):
            net.append(ResBlock(dim, hidden, p_drop))
            dim = hidden
        self.net = nn.Sequential(*net)
        self.head = nn.Linear(dim, out_f)
        nn.init.zeros_(self.head.bias)

    def forward(self, x):  # x: (B, F)
        h = self.net(x)
        return self.head(h)

class ConvSpine1D(nn.Module):
    """Tiny strided Conv1d spine -> global average -> residual sine MLP head."""
    def __init__(self, in_ch, ch=32, k=5, stride=2, mlp_hidden=64, mlp_layers=2, out_f=1):
        super().__init__()
        pad = (k//2)
        self.conv1 = nn.Conv1d(in_ch, ch, kernel_size=k, padding=pad, stride=stride)
        self.conv2 = nn.Conv1d(ch, ch, kernel_size=k, padding=pad, stride=stride)
        self.mlp = ResSineMLP(in_f=ch, hidden=mlp_hidden, layers=mlp_layers, out_f=out_f)
        # SIREN-ish init
        for c in [self.conv1, self.conv2]:
            nn.init.kaiming_uniform_(c.weight, a=math.sqrt(5))
            if c.bias is not None: nn.init.zeros_(c.bias)

    def forward(self, x):  # x: (B, T, F) -> (B, F, T)
        x = x.transpose(1, 2)
        h = F.silu(self.conv1(x))
        h = F.silu(self.conv2(h))
        h = h.mean(-1)  # GAP over time -> (B, ch)
        return self.mlp(h)

class AttentionSpine1D(nn.Module):
    """Single-head self-attention spine -> mean pool -> residual sine MLP head."""
    def __init__(self, in_f, d=64, mlp_hidden=64, mlp_layers=2, out_f=1):
        super().__init__()
        self.q = nn.Linear(in_f, d); self.k = nn.Linear(in_f, d); self.v = nn.Linear(in_f, d)
        self.mlp = ResSineMLP(in_f=d, hidden=mlp_hidden, layers=mlp_layers, out_f=out_f)

    def forward(self, x):  # (B, T, F)
        Q, K, V = self.q(x), self.k(x), self.v(x)
        att = torch.softmax(Q @ K.transpose(1,2) / math.sqrt(Q.size(-1)), dim=-1)
        H = att @ V
        h = H.mean(1)  # pool over time
        return self.mlp(h)

class TCNBlock(nn.Module):
    """Simple 1D TCN block with causal padding."""
    def __init__(self, in_ch, out_ch, k=5, dilation=1):
        super().__init__()
        pad = (k-1)*dilation
        self.conv = nn.Conv1d(in_ch, out_ch, k, padding=pad, dilation=dilation)
        self.short = (in_ch == out_ch)
        self.proj = nn.Conv1d(in_ch, out_ch, 1) if not self.short else nn.Identity()

    def forward(self, x):
        h = self.conv(x)
        h = h[..., :x.size(-1)]  # causal trim
        h = F.relu(h)
        return F.relu(self.proj(x) + h)

class TinyTCN(nn.Module):
    def __init__(self, in_ch, ch=32, layers=2, k=5, out_f=1):
        super().__init__()
        blocks = []
        c = in_ch
        for i in range(layers):
            blocks.append(TCNBlock(c, ch, k=k, dilation=2**i))
            c = ch
        self.tcn = nn.Sequential(*blocks)
        self.head = nn.Linear(ch, out_f)

    def forward(self, x):  # (B, T, F) -> (B, F, T)
        x = x.transpose(1,2)
        h = self.tcn(x).transpose(1,2)  # back to (B, T, C)
        h_last = h[:, -1, :]
        return self.head(h_last)

class TinyLSTM(nn.Module):
    def __init__(self, in_f, hidden=64, out_f=1, layers=1):
        super().__init__()
        self.lstm = nn.LSTM(in_f, hidden, num_layers=layers, batch_first=True)
        self.head = nn.Linear(hidden, out_f)

    def forward(self, x):  # (B, T, F)
        h, (hn, cn) = self.lstm(x)
        return self.head(h[:, -1, :])

class WaveResNetFallback(nn.Module):
    def __init__(
        self,
        input_dim: int,
        *,
        hidden_dim: int,
        depth: int,
        output_dim: int,
        dropout: float = 0.0,
        first_layer_w0: float = 30.0,
        hidden_w0: float = 1.0,
    ) -> None:
        super().__init__()
        if input_dim <= 0 or hidden_dim <= 0 or output_dim <= 0:
            raise ValueError("input_dim, hidden_dim, and output_dim must be positive.")
        if depth <= 0:
            raise ValueError("depth must be positive.")
        self.stem = nn.Linear(input_dim, hidden_dim)
        wstd = math.sqrt(6.0 / input_dim) / max(first_layer_w0, 1e-6)
        nn.init.uniform_(self.stem.weight, -wstd, wstd)
        nn.init.zeros_(self.stem.bias)
        self.blocks = nn.ModuleList([ResBlock(hidden_dim, hidden_dim, p_drop=dropout) for _ in range(depth)])
        self.head = nn.Linear(hidden_dim, output_dim)
        nn.init.zeros_(self.head.bias)
        self.stem_w0 = first_layer_w0

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        h = torch.sin(self.stem_w0 * self.stem(x))
        for block in self.blocks:
            h = block(h)
        return self.head(h)


def make_wave_resnet(
    input_dim: int,
    *,
    hidden_dim: int,
    depth: int,
    output_dim: int,
    dropout: float = 0.0,
    first_layer_w0: float = 30.0,
    hidden_w0: float = 1.0,
):
    if 'PSANNWaveResNet' in globals() and PSANNWaveResNet is not None:
        return PSANNWaveResNet(
            input_dim=input_dim,
            hidden_dim=hidden_dim,
            depth=depth,
            output_dim=output_dim,
            dropout=dropout,
            first_layer_w0=first_layer_w0,
            hidden_w0=hidden_w0,
        )
    return WaveResNetFallback(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        depth=depth,
        output_dim=output_dim,
        dropout=dropout,
        first_layer_w0=first_layer_w0,
        hidden_w0=hidden_w0,
    )


# --- Generic training helpers (regression) ---
def fit_regressor(model, X_train, y_train, X_val, y_val, epochs=60, lr=1e-3, bs=128, verbose=False):
    model = model.to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=lr)
    best = math.inf; best_state = None
    X_train = torch.tensor(X_train, dtype=torch.float32); y_train = torch.tensor(y_train, dtype=torch.float32)
    X_val = torch.tensor(X_val, dtype=torch.float32); y_val = torch.tensor(y_val, dtype=torch.float32)
    train_loader = DataLoader(torch.utils.data.TensorDataset(X_train, y_train), batch_size=bs, shuffle=True)
    for ep in range(epochs):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            pred = model(xb)
            loss = F.mse_loss(pred.view_as(yb), yb)
            opt.zero_grad(); loss.backward(); opt.step()
        model.eval()
        with torch.no_grad():
            val_pred = model(X_val.to(DEVICE)).cpu().numpy().ravel()
            vloss = np.mean((val_pred - y_val.numpy().ravel())**2)
        if vloss < best:
            best = vloss; best_state = {k: v.detach().cpu().clone() for k,v in model.state_dict().items()}
        if verbose and (ep+1)%10==0:
            print(f"ep {ep+1:3d} val_mse={vloss:.6f}")
    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def predict_regressor(model, X):
    model.eval()
    with torch.no_grad():
        X = torch.tensor(X, dtype=torch.float32, device=DEVICE)
        y = model(X).cpu().numpy().ravel()
    return y


Using installed psann package.


In [4]:
# @title Generators: Seasonal (Jena proxy), Cross-Station Air (Beijing proxy), and EAF ΔTEMP
def gen_seasonal_series(n=20000, noise=0.15, drift=0.0002, w=(1/24, 1/168)):
    t = np.arange(n, dtype=np.float32)
    daily = np.sin(2*np.pi*w[0]*t)
    weekly = 0.5*np.sin(2*np.pi*w[1]*t + 0.3)
    trend = drift * t
    base = daily + weekly + trend
    exo = np.stack([
        np.cos(2*np.pi*w[0]*t), np.sin(2*np.pi*w[0]*t),
        np.cos(2*np.pi*w[1]*t), np.sin(2*np.pi*w[1]*t),
    ], axis=1)
    y = base + noise*np.random.randn(n).astype(np.float32)
    return y.astype(np.float32), exo.astype(np.float32)

def window_xy(y, exo, ctx=72, horizon=6):
    Xs, Ys = [], []
    for i in range(ctx, len(y)-horizon):
        hist = y[i-ctx:i].reshape(-1,1)
        feats = np.concatenate([hist, exo[i-ctx:i]], axis=1)
        Xs.append(feats)           # (ctx, 1+exo_dim)
        Ys.append(y[i+horizon])
    return np.array(Xs, np.float32), np.array(Ys, np.float32)

def gen_cross_station_air(stations=10, n=5000, ctx=24, horizon=3, missing=0.0, seed=0):
    rs = np.random.RandomState(seed)
    data = []
    for s in range(stations):
        amp = 0.8 + 0.4*rs.rand()
        phi = rs.rand()*2*np.pi
        t = np.arange(n, dtype=np.float32)
        base = amp*np.sin(2*np.pi*(1/24)*t + phi) + 0.2*np.sin(2*np.pi*(1/48)*t)
        met = np.stack([
            np.sin(2*np.pi*(1/24)*t + 0.1), np.cos(2*np.pi*(1/24)*t + 0.2),
            0.5*np.sin(2*np.pi*(1/168)*t + 0.3)
        ], axis=1).astype(np.float32)
        y = base + 0.1*rs.randn(n).astype(np.float32)
        X, Y = window_xy(y, met, ctx=ctx, horizon=horizon)
        # inject missingness on met channels only
        if missing > 0:
            mask = rs.rand(*X.shape) < (missing * (X.shape[-1]-1)/X.shape[-1])
            # keep history column (index 0) intact, drop some exo
            mask[..., 0] = False
            X[mask] = 0.0
            # optional: append masks as features
            X = np.concatenate([X, mask.astype(np.float32)], axis=-1)
        data.append((s, X, Y))
    return data  # list of (station_id, X, y)

def gen_eaf_heats(heats=120, min_len=80, max_len=200, seed=0):
    """ΔTEMP_t ≈ a1*O2_t + a2*MW_t + a3*sqrt(O2_t)*noise; counters reset per heat."""
    rs = np.random.RandomState(seed)
    X_rows, y_rows, heat_ids = [], [], []
    for h in range(heats):
        L = rs.randint(min_len, max_len+1)
        O2 = np.abs(rs.randn(L).astype(np.float32))*2.0
        MW = np.abs(rs.randn(L).astype(np.float32))*3.0
        # cumulative counters reset at each heat:
        c_O2 = np.cumsum(O2)
        c_MW = np.cumsum(MW)
        dtemp = 0.7*O2 + 0.4*MW + 0.15*np.sqrt(O2+1e-6)*rs.randn(L).astype(np.float32)
        TEMP = np.cumsum(dtemp) + 20*rs.rand()  # integral of ΔTEMP
        # target: next-step ΔTEMP
        y = np.roll(dtemp, -1); y[-1] = dtemp[-1]
        feats = np.stack([O2, MW, c_O2, c_MW, TEMP], axis=1).astype(np.float32)
        X_rows.append(feats); y_rows.append(y.astype(np.float32)); heat_ids += [h]*L
    X = np.concatenate(X_rows, axis=0)
    y = np.concatenate(y_rows, axis=0)
    heat_ids = np.array(heat_ids, dtype=np.int32)
    # build windows
    ctx=24
    Xs, Ys = [], []
    for i in range(ctx, len(y)-1):
        if heat_ids[i-ctx] != heat_ids[i]:  # ensure window does not cross heats
            continue
        hist = X[i-ctx:i, :]  # includes counters already reset per-heat
        Xs.append(hist); Ys.append(y[i])
    return np.array(Xs, np.float32), np.array(Ys, np.float32)


In [5]:
# @title Parity matchers: adjust hidden sizes to hit a target parameter budget
def build_psann_conv(in_f, target_params=200_000, tol=0.15, out_f=1):
    # search small grid over (conv_ch, mlp_hidden, layers)
    for ch in [16, 24, 32, 40, 48]:
        for mh in [32, 48, 64, 96]:
            for layers in [1, 2, 3]:
                m = ConvSpine1D(in_ch=in_f, ch=ch, k=5, stride=2, mlp_hidden=mh, mlp_layers=layers, out_f=out_f)
                p = param_count(m)
                if abs(p - target_params)/target_params <= tol:
                    return m, p
    # fallback: pick closest
    best, bestp = None, 1e18
    for ch in [16, 24, 32, 40, 48]:
        for mh in [32, 48, 64, 96]:
            for layers in [1, 2, 3]:
                m = ConvSpine1D(in_ch=in_f, ch=ch, k=5, stride=2, mlp_hidden=mh, mlp_layers=layers, out_f=out_f)
                p = param_count(m)
                if abs(p - target_params) < abs(bestp - target_params):
                    best, bestp = m, p
    return best, bestp

def build_tcn(in_f, target_params=200_000, tol=0.15, out_f=1):
    for ch in [16, 24, 32, 40, 48, 64]:
        for layers in [1,2,3,4]:
            m = TinyTCN(in_ch=in_f, ch=ch, layers=layers, k=5, out_f=out_f)
            p = param_count(m)
            if abs(p - target_params)/target_params <= tol:
                return m, p
    # fallback
    best, bestp = None, 1e18
    for ch in [16, 24, 32, 40, 48, 64]:
        for layers in [1,2,3,4]:
            m = TinyTCN(in_ch=in_f, ch=ch, layers=layers, k=5, out_f=out_f)
            p = param_count(m)
            if abs(p - target_params) < abs(bestp - target_params):
                best, bestp = m, p
    return best, bestp

def build_lstm(in_f, target_params=200_000, tol=0.15, out_f=1):
    for h in [32, 48, 64, 80, 96, 128, 160]:
        m = TinyLSTM(in_f=in_f, hidden=h, out_f=out_f, layers=1)
        p = param_count(m)
        if abs(p - target_params)/target_params <= tol:
            return m, p
    # fallback
    best, bestp = None, 1e18
    for h in [32, 48, 64, 80, 96, 128, 160]:
        m = TinyLSTM(in_f=in_f, hidden=h, out_f=out_f, layers=1)
        p = param_count(m)
        if abs(p - target_params) < abs(bestp - target_params):
            best, bestp = m, p
    return best, bestp

def build_psann_tabular(in_f, target_params=200_000, tol=0.15, out_f=1):
    for mh in [32, 48, 64, 96, 128]:
        for layers in [1, 2, 3]:
            m = ResSineMLP(in_f, hidden=mh, layers=layers, out_f=out_f)
            p = param_count(m)
            if abs(p - target_params)/target_params <= tol:
                return m, p
    # fallback
    best, bestp = None, 1e18
    for mh in [32, 48, 64, 96, 128]:
        for layers in [1, 2, 3]:
            m = ResSineMLP(in_f, hidden=mh, layers=layers, out_f=out_f)
            p = param_count(m)
            if abs(p - target_params) < abs(bestp - target_params):
                best, bestp = m, p
    return best, bestp

class SimpleScaler:
    def __init__(self):
        self.mean = None; self.std = None
    def fit(self, X):
        self.mean = X.mean(axis=0, keepdims=True)
        self.std = X.std(axis=0, keepdims=True) + 1e-8
    def transform(self, X):
        return (X - self.mean)/self.std

def eval_regression(y_true, y_pred):
    r2 = r2_score(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    rmse = float(np.sqrt(np.mean((y_true - y_pred)**2)))
    sm = smape(y_true, y_pred)
    ms = mase(y_true, y_pred, m=1)
    return dict(r2=r2, mae=mae, rmse=rmse, smape=sm, mase=ms)

class WaveResNetSeqBaseline(nn.Module):
    def __init__(
        self,
        in_ch: int,
        *,
        conv_channels: int = 64,
        conv_depth: int = 2,
        hidden_dim: int = 160,
        depth: int = 4,
        out_f: int = 1,
        dropout: float = 0.05,
        first_layer_w0: float = 30.0,
        hidden_w0: float = 1.0,
    ) -> None:
        super().__init__()
        layers: List[nn.Module] = []
        channels = in_ch
        for _ in range(conv_depth):
            conv = nn.Conv1d(channels, conv_channels, kernel_size=5, padding=2, stride=2)
            nn.init.kaiming_uniform_(conv.weight, a=math.sqrt(5))
            if conv.bias is not None:
                nn.init.zeros_(conv.bias)
            layers.extend([conv, nn.GELU()])
            channels = conv_channels
        self.conv = nn.Sequential(*layers) if layers else nn.Identity()
        self.pool = nn.AdaptiveAvgPool1d(1)
        final_input = conv_channels if layers else in_ch
        self.wave = make_wave_resnet(
            final_input,
            hidden_dim=hidden_dim,
            depth=depth,
            output_dim=out_f,
            dropout=dropout,
            first_layer_w0=first_layer_w0,
            hidden_w0=hidden_w0,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        z = x.transpose(1, 2)
        z = self.conv(z)
        z = self.pool(z).squeeze(-1)
        return self.wave(z)


def build_wave_resnet_seq(in_f, target_params=200_000, tol=0.15, out_f=1):
    best, bestp = None, float('inf')
    for conv_channels in [48, 64, 96, 128]:
        for conv_depth in [1, 2, 3]:
            for hidden_dim in [128, 160, 192, 224]:
                for depth in [3, 4, 5]:
                    model = WaveResNetSeqBaseline(
                        in_ch=in_f,
                        conv_channels=conv_channels,
                        conv_depth=conv_depth,
                        hidden_dim=hidden_dim,
                        depth=depth,
                        out_f=out_f,
                        dropout=0.05,
                    )
                    p = param_count(model)
                    if abs(p - target_params) / max(target_params, 1) <= tol:
                        return model, p
                    if abs(p - target_params) < abs(bestp - target_params):
                        best, bestp = model, p
    return best, bestp


def build_wave_resnet_tabular(in_f, target_params=200_000, tol=0.15, out_f=1):
    best, bestp = None, float('inf')
    for hidden_dim in [128, 160, 192, 224, 256]:
        for depth in [3, 4, 5, 6]:
            model = make_wave_resnet(
                in_f,
                hidden_dim=hidden_dim,
                depth=depth,
                output_dim=out_f,
                dropout=0.05,
            )
            p = param_count(model)
            if abs(p - target_params) / max(target_params, 1) <= tol:
                return model, p
            if abs(p - target_params) < abs(bestp - target_params):
                best, bestp = model, p
    return best, bestp


In [6]:
# @title Seasonal probe: PSANN+Conv vs TCN vs LSTM (parity, 3 seeds)
set_seed(SEED)
y, exo = gen_seasonal_series(n=22000, noise=0.12, drift=0.00015)
X, Y = window_xy(y, exo, ctx=72, horizon=6)   # X: (N, 72, 1+4) -> F=5
# Train/val/test split
N = len(Y); n_train = int(0.7*N); n_val = int(0.15*N)
X_train, y_train = X[:n_train], Y[:n_train]
X_val,   y_val   = X[n_train:n_train+n_val], Y[n_train:n_train+n_val]
X_test,  y_test  = X[n_train+n_val:], Y[n_train+n_val:]
sc = SimpleScaler();  # scale per-feature across flattened dims
flat = X_train.reshape(-1, X_train.shape[-1]); sc.fit(flat)
def scale_seq(Xseq):
    shp = Xseq.shape
    Xs = sc.transform(Xseq.reshape(-1, shp[-1])).reshape(shp)
    return Xs

X_train_s = scale_seq(X_train); X_val_s = scale_seq(X_val); X_test_s = scale_seq(X_test)

target_params = 220_000
in_f = X.shape[-1]
records = []
for seed in [1, 2, 3]:
    set_seed(seed)
    # Build parity-matched models
    psann_m, p_ps = build_psann_conv(in_f, target_params, out_f=1)
    tcn_m,   p_tc = build_tcn(in_f, target_params, out_f=1)
    lstm_m,  p_ls = build_lstm(in_f, target_params, out_f=1)
    wave_m,  p_wr = build_wave_resnet_seq(in_f, target_params, out_f=1)
    for name, model, pcount in [
        ("ResPSANN_conv_spine", psann_m, p_ps),
        ("TCN_baseline",        tcn_m,   p_tc),
        ("LSTM_baseline",       lstm_m,  p_ls),
        ("WaveResNet_sequence",  wave_m,  p_wr),
    ]:
        t0 = time.time()
        model = fit_regressor(model, X_train_s, y_train[:,None], X_val_s, y_val[:,None],
                              epochs=60, lr=3e-3, bs=128, verbose=False)
        t1 = time.time()
        yhat = predict_regressor(model, X_test_s)
        mets = eval_regression(y_test, yhat)
        rec = dict(
            dataset="SEASONAL_JENA_PROXY", probe="PARITY", split="test", seed=seed,
            model=name, params=param_count(model), train_wall_seconds=t1-t0, **mets
        )
        records.append(rec)
df = pd.DataFrame(records)
save_csv(df, "synthetic_experiment_metrics.csv")
df.pivot_table(index=["dataset","probe","model"], values=["r2","mae","rmse","smape","mase","params","train_wall_seconds"]).round(4)


Saved -> /content/psann_synth_results/synthetic_experiment_metrics.csv (9 rows)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,mae,mase,params,r2,rmse,smape,train_wall_seconds
dataset,probe,model,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SEASONAL_JENA_PROXY,PARITY,LSTM_baseline,0.1742,0.8531,107041.0,0.9279,0.2176,2.8717,39.9298
SEASONAL_JENA_PROXY,PARITY,ResPSANN_conv_spine,0.1172,0.5742,37108.0,0.9656,0.1504,1.9844,41.4863
SEASONAL_JENA_PROXY,PARITY,TCN_baseline,0.1113,0.5452,63745.0,0.9704,0.1395,1.9404,29.8154


In [8]:
# @title Cross-station held-out & missingness robustness; parity-matched PSANN+Conv vs TCN vs LSTM
set_seed(SEED)

def prep_dataset(miss=0.0, seed=42, ctx=24, horizon=3):
    """
    Generate a fresh cross-station dataset with a given missingness rate,
    split into train (stations != 0,1), val (station 0), test (station 1),
    and fit a scaler on the training split only.
    """
    data = gen_cross_station_air(stations=10, n=6000, ctx=ctx, horizon=horizon, missing=miss, seed=seed)
    # Split by station id
    X_train = np.concatenate([x for (sid, x, y) in data if sid not in (0, 1)], axis=0)
    y_train = np.concatenate([y for (sid, x, y) in data if sid not in (0, 1)], axis=0)
    X_val   = next(x for (sid, x, y) in data if sid == 0)
    y_val   = next(y for (sid, x, y) in data if sid == 0)
    X_test  = next(x for (sid, x, y) in data if sid == 1)
    y_test  = next(y for (sid, x, y) in data if sid == 1)

    # Scale using only TRAIN statistics
    scaler = SimpleScaler()
    scaler.fit(X_train.reshape(-1, X_train.shape[-1]))
    def scale_seq(X):
        shp = X.shape
        return scaler.transform(X.reshape(-1, shp[-1])).reshape(shp)

    return scale_seq(X_train), y_train, scale_seq(X_val), y_val, scale_seq(X_test), y_test

target_params = 200_000
records = []
for miss in [0.0, 0.1, 0.3]:
    # Fresh dataset (and scaler) per missingness level for clean robustness curves
    Xtr, ytr, Xv, yv, Xte, yte = prep_dataset(miss=miss, seed=42, ctx=24, horizon=3)
    in_f = Xtr.shape[-1]

    for seed in [7, 8, 9]:
        set_seed(seed)
        # Build parity-matched models (≈ same param count)
        psann_m, p_ps = build_psann_conv(in_f, target_params, out_f=1)
        tcn_m,   p_tc = build_tcn(in_f, target_params, out_f=1)
        lstm_m,  p_ls = build_lstm(in_f, target_params, out_f=1)
        wave_m,  p_wr = build_wave_resnet_seq(in_f, target_params, out_f=1)

        for name, model in [
            ("ResPSANN_conv_spine", psann_m),
            ("TCN_baseline",        tcn_m),
            ("LSTM_baseline",       lstm_m),
            ("WaveResNet_sequence",  wave_m),
        ]:
            t0 = time.time()
            model = fit_regressor(model, Xtr, ytr[:, None], Xv, yv[:, None],
                                  epochs=50, lr=3e-3, bs=256)
            t1 = time.time()
            yhat = predict_regressor(model, Xte)
            mets = eval_regression(yte, yhat)
            rec = dict(
                dataset="AIR_BEIJING_PROXY",
                probe=f"HELDOUT+MISS_{int(miss*100)}",
                split="test",
                seed=seed,
                model=name,
                params=param_count(model),
                train_wall_seconds=t1 - t0,
                **mets
            )
            records.append(rec)

df = pd.DataFrame(records)
save_csv(df, "synthetic_experiment_metrics.csv")
df[df["dataset"] == "AIR_BEIJING_PROXY"].pivot_table(
    index=["probe", "model"], values=["r2", "mae", "rmse"]
).round(4)


Saved -> /content/psann_synth_results/synthetic_experiment_metrics.csv (36 rows)


Unnamed: 0_level_0,Unnamed: 1_level_0,mae,r2,rmse
probe,model,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HELDOUT+MISS_0,LSTM_baseline,0.1025,0.9635,0.1292
HELDOUT+MISS_0,ResPSANN_conv_spine,0.0948,0.9688,0.1195
HELDOUT+MISS_0,TCN_baseline,0.0856,0.9748,0.1075
HELDOUT+MISS_10,LSTM_baseline,0.1096,0.9545,0.1374
HELDOUT+MISS_10,ResPSANN_conv_spine,0.0973,0.9645,0.1218
HELDOUT+MISS_10,TCN_baseline,0.0887,0.9703,0.1114
HELDOUT+MISS_30,LSTM_baseline,0.1117,0.9528,0.1405
HELDOUT+MISS_30,ResPSANN_conv_spine,0.0985,0.9634,0.1237
HELDOUT+MISS_30,TCN_baseline,0.0899,0.9692,0.1134


In [10]:
# @title Permute grouped features on the AIR proxy (history vs meteorology vs calendar)
# FIX: make sure F is the PyTorch functional module, not an int
import torch.nn.functional as F  # restore F after any accidental shadowing

def grouped_permute(X, groups: Dict[str, List[int]], which: str, rs: np.random.RandomState):
    Xp = X.copy()
    idxs = groups[which]
    # permute across samples independently for each feature channel
    for j in idxs:
        rs.shuffle(Xp[:, :, j])
    return Xp

# Rebuild a dataset with explicit groups: [0]=history, [1..m]=met; append calendar(sin/cos hour)
def air_with_calendar(seed=2024):
    data = gen_cross_station_air(stations=10, n=6000, ctx=24, horizon=3, missing=0.0, seed=seed)
    def add_calendar(X):
        N, T, Fd = X.shape
        hours = np.arange(T, dtype=np.float32)[None, :, None].repeat(N, axis=0)
        sin = np.sin(2*np.pi*hours/24.0).astype(np.float32)
        cos = np.cos(2*np.pi*hours/24.0).astype(np.float32)
        return np.concatenate([X, sin, cos], axis=2)
    return [(s, add_calendar(X), y) for (s, X, y) in data]

set_seed(101)
data = air_with_calendar(seed=101)
# Train on all but station 1; test on station 1
Xtr = np.concatenate([x for (s, x, y) in data if s != 1], axis=0)
ytr = np.concatenate([y for (s, x, y) in data if s != 1], axis=0)
Xte = [x for (s, x, y) in data if s == 1][0]
yte = [y for (s, x, y) in data if s == 1][0]

# Scale on training only
sc = SimpleScaler(); sc.fit(Xtr.reshape(-1, Xtr.shape[-1]))
def scale_seq(X):
    shp = X.shape
    return sc.transform(X.reshape(-1, shp[-1])).reshape(shp)

Xtr_s, Xte_s = scale_seq(Xtr), scale_seq(Xte)

# Group indices:
feat_dim = Xtr_s.shape[-1]   # FIX: don't shadow F
# history at index 0; meteorology next 3; calendar at the end (2 dims)
groups = {"history": [0], "meteorology": [1, 2, 3], "calendar": [feat_dim - 2, feat_dim - 1]}

# pick a parity-matched winner config (PSANN+Conv) and compare
psann_m, _ = build_psann_conv(feat_dim, target_params=180_000, out_f=1)
psann_m = fit_regressor(psann_m, Xtr_s, ytr[:, None], Xtr_s[-5000:], ytr[-5000:, None],
                        epochs=40, lr=3e-3, bs=256)
base = eval_regression(yte, predict_regressor(psann_m, Xte_s))["r2"]

abl_records = []
rs = np.random.RandomState(7)
for gname in ["history", "meteorology", "calendar"]:
    Xp = grouped_permute(Xte_s, groups, gname, rs)
    r2 = eval_regression(yte, predict_regressor(psann_m, Xp))["r2"]
    abl_records.append(dict(dataset="AIR_BEIJING_PROXY", probe="ABLATE_GROUPS",
                            model="ResPSANN_conv_spine",
                            group=gname, base_r2=base, ablated_r2=r2, delta=r2 - base))

abl_df = pd.DataFrame(abl_records)
save_csv(abl_df, "synthetic_ablation_results.csv")
abl_df


Saved -> /content/psann_synth_results/synthetic_ablation_results.csv (3 rows)


Unnamed: 0,dataset,probe,model,group,base_r2,ablated_r2,delta
0,AIR_BEIJING_PROXY,ABLATE_GROUPS,ResPSANN_conv_spine,history,0.984804,-0.823856,-1.80866
1,AIR_BEIJING_PROXY,ABLATE_GROUPS,ResPSANN_conv_spine,meteorology,0.984804,0.917602,-0.067202
2,AIR_BEIJING_PROXY,ABLATE_GROUPS,ResPSANN_conv_spine,calendar,0.984804,0.984804,0.0


In [12]:
# @title Jacobian spectrum & participation ratio (PSANN vs MLP) on a small batch — FIXED
def jacobian_matrix(model: nn.Module, Xb: np.ndarray):
    """
    Compute d y / d x for a batch: for each sample i (out_i is scalar y[i,0]),
    we compute grad(out_i, Xb)[i, :, :] and vectorize into one row.
    Returns J of shape (B, T*F).
    """
    model.eval()
    xb = torch.tensor(Xb, dtype=torch.float32, device=DEVICE, requires_grad=True)
    y = model(xb)  # (B, 1)
    assert y.ndim == 2 and y.shape[1] == 1, f"Expected (B,1) output, got {tuple(y.shape)}"

    B, T, Fdim = xb.shape
    rows = []
    for i in range(B):
        # grad_outputs must be same shape as y: (B,1)
        go = torch.zeros_like(y)
        go[i, 0] = 1.0
        g = torch.autograd.grad(outputs=y, inputs=xb,
                                grad_outputs=go,
                                retain_graph=True, create_graph=False,
                                allow_unused=False)[0]  # (B,T,F)
        gi = g[i].reshape(-1).detach().cpu().numpy()       # (T*F,)
        rows.append(gi)
    J = np.stack(rows, axis=0)  # (B, T*F)
    return J

def participation_ratio(M: np.ndarray):
    # effective dimensionality of rows of M
    C = (M - M.mean(0, keepdims=True))
    C = C.T @ C / max(C.shape[0]-1, 1)
    evals = np.maximum(np.linalg.eigvalsh(C), 1e-12)
    s1 = evals.sum(); s2 = (evals**2).sum()
    return float((s1**2) / s2)

# build small dataset
y, exo = gen_seasonal_series(n=6000)
X, Y = window_xy(y, exo, ctx=72, horizon=6)
sc = SimpleScaler(); sc.fit(X.reshape(-1, X.shape[-1]))
Xs = sc.transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

# models: PSANN+Conv (parity-targeted) vs MLP baseline on pooled features
psann_m, _ = build_psann_conv(Xs.shape[-1], target_params=160_000, out_f=1)
psann_m = fit_regressor(psann_m, Xs[:3000], Y[:3000,None], Xs[3000:4000], Y[3000:4000,None], epochs=30)

# Simple MLP baseline on last-step features only
class TinyMLP(nn.Module):
    def __init__(self, in_f, hidden=64):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(in_f, hidden), nn.ReLU(), nn.Linear(hidden, 1))
    def forward(self, x):  # x: (B,T,F)
        x_last = x[:, -1, :]
        return self.net(x_last)

mlp = TinyMLP(Xs.shape[-1])
mlp = fit_regressor(mlp, Xs[:3000], Y[:3000,None], Xs[3000:4000], Y[3000:4000,None], epochs=30)

# Jacobian on a tiny batch for each
batch = Xs[4000:4024]
J_ps = jacobian_matrix(psann_m, batch)
J_ml = jacobian_matrix(mlp, batch)

# SVD & metrics
_, s_ps, _ = np.linalg.svd(J_ps, full_matrices=False)
_, s_ml, _ = np.linalg.svd(J_ml, full_matrices=False)

spec_records = [{
    "dataset":"SEASONAL_JENA_PROXY","probe":"SPECTRAL","model":"ResPSANN_conv_spine",
    "top_sv": float(s_ps[0]), "sum_sv": float(s_ps.sum()), "pr": participation_ratio(J_ps)
},{
    "dataset":"SEASONAL_JENA_PROXY","probe":"SPECTRAL","model":"MLP_laststep",
    "top_sv": float(s_ml[0]), "sum_sv": float(s_ml.sum()), "pr": participation_ratio(J_ml)
}]
with open(os.path.join(RESULTS_DIR,"synthetic_spectral_results.json"),"w") as f:
    json.dump(spec_records, f, indent=2)
spec_records

[{'dataset': 'SEASONAL_JENA_PROXY',
  'probe': 'SPECTRAL',
  'model': 'ResPSANN_conv_spine',
  'top_sv': 4.814483165740967,
  'sum_sv': 6.343472957611084,
  'pr': 1.770914912223816},
 {'dataset': 'SEASONAL_JENA_PROXY',
  'probe': 'SPECTRAL',
  'model': 'MLP_laststep',
  'top_sv': 4.128472328186035,
  'sum_sv': 5.6605544090271,
  'pr': 2.7963361740112305}]

In [13]:
# @title Fairness check: single-head attention spine vs conv spine under matched params
set_seed(2025)
data = gen_cross_station_air(stations=8, n=5000, ctx=24, horizon=3, missing=0.0, seed=77)
X = np.concatenate([x for (s,x,y) in data], axis=0)
y = np.concatenate([y for (s,x,y) in data], axis=0)
sc = SimpleScaler(); sc.fit(X.reshape(-1, X.shape[-1]))
Xs = sc.transform(X.reshape(-1, X.shape[-1])).reshape(X.shape)

N = len(y); ntr=int(0.7*N); nv=int(0.15*N)
Xtr, ytr = Xs[:ntr], y[:ntr]; Xv, yv = Xs[ntr:ntr+nv], y[ntr:ntr+nv]; Xte, yte = Xs[ntr+nv:], y[ntr+nv:]

target_params = 180_000
# match conv
conv_m, p_conv = build_psann_conv(Xs.shape[-1], target_params, out_f=1)
# for attention, sweep d & mlp until within tol
best_att = None; bestp = 1e18
for d in [48, 64, 80]:
    for mh in [48, 64, 80]:
        att = AttentionSpine1D(in_f=Xs.shape[-1], d=d, mlp_hidden=mh, mlp_layers=2, out_f=1)
        p = param_count(att)
        if abs(p - target_params) < abs(bestp - target_params):
            bestp = p; best_att = att
att_m = best_att

conv_m = fit_regressor(conv_m, Xtr, ytr[:,None], Xv, yv[:,None], epochs=40, lr=3e-3)
att_m  = fit_regressor(att_m,  Xtr, ytr[:,None], Xv, yv[:,None], epochs=40, lr=3e-3)

yhat_c = predict_regressor(conv_m, Xte)
yhat_a = predict_regressor(att_m,  Xte)
rec = pd.DataFrame([
    dict(dataset="AIR_BEIJING_PROXY", probe="SPINE_FAIRNESS", split="test", model="ResPSANN_conv_spine",
         params=param_count(conv_m), **eval_regression(yte, yhat_c)),
    dict(dataset="AIR_BEIJING_PROXY", probe="SPINE_FAIRNESS", split="test", model="ResPSANN_attention_spine",
         params=param_count(att_m), **eval_regression(yte, yhat_a)),
])
save_csv(rec, "synthetic_experiment_metrics.csv")
rec[["model","r2","mae","rmse","params"]].round(5)


Saved -> /content/psann_synth_results/synthetic_experiment_metrics.csv (38 rows)


Unnamed: 0,model,r2,mae,rmse,params
0,ResPSANN_conv_spine,0.97023,0.0943,0.11857,36868
1,ResPSANN_attention_spine,0.67226,0.30052,0.39342,14723


In [14]:
# @title EAF synthetic: ΔTEMP regression with per-heat resets; PSANN-tabular vs MLP baseline
set_seed(606)
X, y = gen_eaf_heats(heats=140, min_len=80, max_len=180, seed=606)  # X: (N, ctx, F_tabular_per_step)
# Collapse per-step features to summary stats over last k steps (tabular proxy)
def collapse_tail(Xseq, k=6):
    # concat mean and last values for each channel to form a fixed-length tabular vector
    tail = Xseq[:, -k:, :]  # (N,k,F)
    mean = tail.mean(axis=1)
    last = tail[:, -1, :]
    return np.concatenate([mean, last], axis=1)

X_tab = collapse_tail(X, k=8)
# scale
mu = X_tab.mean(0, keepdims=True); sd = X_tab.std(0, keepdims=True) + 1e-8
Xn = (X_tab - mu)/sd

N = len(y); ntr=int(0.7*N); nv=int(0.15*N)
Xtr, ytr = Xn[:ntr], y[:ntr]; Xv,yv = Xn[ntr:ntr+nv], y[ntr:ntr+nv]; Xte,yte = Xn[ntr+nv:], y[ntr+nv:]

# Baseline MLP
class MLP_Reg(nn.Module):
    def __init__(self, in_f, hidden=64, layers=2):
        super().__init__()
        net=[nn.Linear(in_f, hidden), nn.ReLU()]
        for _ in range(layers-1):
            net += [nn.Linear(hidden, hidden), nn.ReLU()]
        self.net = nn.Sequential(*net)
        self.head = nn.Linear(hidden, 1)
    def forward(self, x): return self.head(self.net(x))

def fit_reg_tab(model, Xtr, ytr, Xv, yv, epochs=80, lr=2e-3, bs=128):
    model = model.to(DEVICE); opt = torch.optim.AdamW(model.parameters(), lr=lr)
    Xtr = torch.tensor(Xtr, dtype=torch.float32); ytr = torch.tensor(ytr, dtype=torch.float32)[:,None]
    Xv = torch.tensor(Xv, dtype=torch.float32); yv = torch.tensor(yv, dtype=torch.float32)[:,None]
    dl = DataLoader(torch.utils.data.TensorDataset(Xtr, ytr), batch_size=bs, shuffle=True)
    best=1e9; best_state=None
    for ep in range(epochs):
        model.train()
        for xb,yb in dl:
            xb,yb = xb.to(DEVICE), yb.to(DEVICE)
            pred = model(xb)
            loss = F.mse_loss(pred, yb)
            opt.zero_grad(); loss.backward(); opt.step()
        model.eval()
        with torch.no_grad():
            vpred = model(Xv.to(DEVICE)).cpu().numpy().ravel()
        v = np.mean((vpred - yv.numpy().ravel())**2)
        if v < best: best=v; best_state={k:v.detach().cpu().clone() for k,v in model.state_dict().items()}
    if best_state: model.load_state_dict(best_state)
    return model

in_f = Xn.shape[1]
target_params = 160_000
psann_tab, p_ps = build_psann_tabular(in_f, target_params, out_f=1)
mlp = MLP_Reg(in_f, hidden=128, layers=3)
wave_tab, p_wr = build_wave_resnet_tabular(in_f, target_params, out_f=1)

t0 = time.time(); psann_tab = fit_reg_tab(psann_tab, Xtr, ytr, Xv, yv, epochs=80, lr=2e-3); t1 = time.time()
tw0 = time.time(); wave_tab = fit_reg_tab(wave_tab, Xtr, ytr, Xv, yv, epochs=80, lr=2e-3); tw1 = time.time()
tp0 = time.time(); mlp       = fit_reg_tab(mlp,       Xtr, ytr, Xv, yv, epochs=80, lr=2e-3); tp1 = time.time()
yhat_ps = predict_regressor(psann_tab, Xte)
yhat_wr = predict_regressor(wave_tab, Xte)
yhat_ml = predict_regressor(mlp, Xte)

rec = pd.DataFrame([
    dict(dataset="EAF_PROXY", probe="DELTA_TEMP", split="test", model="ResPSANN_tabular",
         params=param_count(psann_tab), train_wall_seconds=t1-t0, **eval_regression(yte, yhat_ps)),
    dict(dataset="EAF_PROXY", probe="DELTA_TEMP", split="test", model="WaveResNet_tabular",
         params=param_count(wave_tab), train_wall_seconds=tw1-tw0, **eval_regression(yte, yhat_wr)),
    dict(dataset="EAF_PROXY", probe="DELTA_TEMP", split="test", model="MLP_baseline",
         params=param_count(mlp), train_wall_seconds=tp1-tp0, **eval_regression(yte, yhat_ml)),
])
save_csv(rec, "synthetic_experiment_metrics.csv")
rec[["model","r2","mae","rmse","params","train_wall_seconds"]].round(5)


Saved -> /content/psann_synth_results/synthetic_experiment_metrics.csv (40 rows)


Unnamed: 0,model,r2,mae,rmse,params,train_wall_seconds
0,ResPSANN_tabular,-0.00184,0.90183,1.12802,35716,29.38711
1,MLP_baseline,-0.00844,0.90144,1.13173,34561,15.71341


In [15]:
# @title Load and preview all CSV outputs
em_path = os.path.join(RESULTS_DIR, "synthetic_experiment_metrics.csv")
ab_path = os.path.join(RESULTS_DIR, "synthetic_ablation_results.csv")

em = pd.read_csv(em_path) if os.path.exists(em_path) else pd.DataFrame()
ab = pd.read_csv(ab_path) if os.path.exists(ab_path) else pd.DataFrame()

print("Experiment metrics rows:", len(em))
display(em.sort_values(["dataset","probe","model"]).head(20))
print("\nAblation results rows:", len(ab))
display(ab)
print("\nSpectral results:", os.path.join(RESULTS_DIR,"synthetic_spectral_results.json"))
if os.path.exists(os.path.join(RESULTS_DIR,"synthetic_spectral_results.json")):
    print(open(os.path.join(RESULTS_DIR,"synthetic_spectral_results.json")).read())


Experiment metrics rows: 40


Unnamed: 0,dataset,probe,split,seed,model,params,train_wall_seconds,r2,mae,rmse,smape,mase
11,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,7.0,LSTM_baseline,106401,41.524874,0.966884,0.098049,0.123167,14.319148,0.531465
14,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,8.0,LSTM_baseline,106401,41.716242,0.963445,0.102644,0.129404,14.929571,0.556373
17,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,9.0,LSTM_baseline,106401,41.80374,0.960139,0.106705,0.135129,16.25956,0.578387
9,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,7.0,ResPSANN_conv_spine,36868,58.247009,0.968087,0.095834,0.120908,14.547373,0.519461
12,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,8.0,ResPSANN_conv_spine,36868,57.968974,0.970053,0.092577,0.117125,14.569326,0.501808
15,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,9.0,ResPSANN_conv_spine,36868,57.654462,0.96829,0.096014,0.120524,14.819717,0.520436
10,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,7.0,TCN_baseline,63361,45.60039,0.976168,0.08335,0.104485,13.160618,0.451792
13,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,8.0,TCN_baseline,63361,45.590491,0.974812,0.085392,0.107418,13.489844,0.46286
16,AIR_BEIJING_PROXY,HELDOUT+MISS_0,test,9.0,TCN_baseline,63361,45.708496,0.973368,0.087999,0.110452,14.183676,0.476995
20,AIR_BEIJING_PROXY,HELDOUT+MISS_10,test,7.0,LSTM_baseline,108961,42.478026,0.957508,0.106708,0.133331,15.894076,0.605897



Ablation results rows: 3


Unnamed: 0,dataset,probe,model,group,base_r2,ablated_r2,delta
0,AIR_BEIJING_PROXY,ABLATE_GROUPS,ResPSANN_conv_spine,history,0.984804,-0.823856,-1.80866
1,AIR_BEIJING_PROXY,ABLATE_GROUPS,ResPSANN_conv_spine,meteorology,0.984804,0.917602,-0.067202
2,AIR_BEIJING_PROXY,ABLATE_GROUPS,ResPSANN_conv_spine,calendar,0.984804,0.984804,0.0



Spectral results: /content/psann_synth_results/synthetic_spectral_results.json
[
  {
    "dataset": "SEASONAL_JENA_PROXY",
    "probe": "SPECTRAL",
    "model": "ResPSANN_conv_spine",
    "top_sv": 4.814483165740967,
    "sum_sv": 6.343472957611084,
    "pr": 1.770914912223816
  },
  {
    "dataset": "SEASONAL_JENA_PROXY",
    "probe": "SPECTRAL",
    "model": "MLP_laststep",
    "top_sv": 4.128472328186035,
    "sum_sv": 5.6605544090271,
    "pr": 2.7963361740112305
  }
]


In [16]:
import os
import zipfile
from pathlib import Path

def zip_folder(folder_path: str | Path, output_path: str | Path | None = None, *, include_hidden: bool = True) -> Path:
    """
    Compresses an entire folder (recursively) into a .zip archive.

    Parameters
    ----------
    folder_path : str or Path
        Path to the folder to zip.
    output_path : str or Path or None, optional
        Output .zip file path. Defaults to "<folder_name>.zip" in the same directory.
    include_hidden : bool, optional
        Whether to include hidden files (those starting with '.').

    Returns
    -------
    Path
        Path to the created .zip file.
    """
    folder_path = Path(folder_path).resolve()
    if not folder_path.is_dir():
        raise ValueError(f"{folder_path} is not a valid directory")

    if output_path is None:
        output_path = folder_path.with_suffix(".zip")
    else:
        output_path = Path(output_path).resolve()

    with zipfile.ZipFile(output_path, "w", compression=zipfile.ZIP_DEFLATED) as zf:
        for root, dirs, files in os.walk(folder_path):
            # skip hidden dirs/files if requested
            if not include_hidden:
                dirs[:] = [d for d in dirs if not d.startswith(".")]
                files = [f for f in files if not f.startswith(".")]

            for file in files:
                abs_path = Path(root) / file
                # relative path inside the zip
                rel_path = abs_path.relative_to(folder_path)
                zf.write(abs_path, arcname=rel_path)

    print(f"Zipped {folder_path} → {output_path}")
    return output_path



In [17]:
zip_folder(folder_path = '/content/psann_synth_results', output_path = '/content/psann_synth_results.zip')

Zipped /content/psann_synth_results → /content/psann_synth_results.zip


PosixPath('/content/psann_synth_results.zip')