# SPL–UTSPAN 2026 Final Submission

## Overview

This notebook implements the final submission pipeline.

High-level pipeline:

1. Parse 3D keypoint sequences (fixed length `T=240`).
2. Geometric normalization (root centering / scale stabilization).
3. Estimate an adaptive release frame `t0` per sample: the frame where the smoothed frame-to-frame motion magnitude (computed over all features) peaks.
4. Extract 2D features:
   - Raw flattened sequence
   - Multiscale temporal summaries
   - Local `t0` window statistics (mean/std)
   - Optional angle-only attention weighting (ablation toggle)
5. Train one model per target (`angle`, `depth`, `left_right`) using CV.
6. Fill Kaggle submission template exactly (scaled or unscaled, following template columns).

Notes:
- Experimental utilities (GroupKFold, two-stage residual ridge, XGB, visualization) are preserved for reference, but the default configuration runs Ridge.

# Imports

In [1]:
import os
import ast
import random
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, GroupKFold
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

# Optional / experimental
import matplotlib.pyplot as plt
from xgboost import XGBRegressor

# Configuration

In [2]:
# =========================
# Global Config (final)
# =========================

# Seed for numpy / random and the default CV shuffling seed of the *_kfold_predict helpers.
RANDOM_SEED = 42
# Default number of CV folds.
N_SPLITS = 5
# Ridge regularization strength (passed as `alpha` to sklearn's Ridge).
ALPHA = 6000.0
# Expected number of frames per sequence; parse_seq pads/truncates every cell to this length.
T_EXPECT = 240

np.random.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)

# Feature toggles read by build_fulltime_sequence:
#   USE_CORE       -> restrict to the keypoints chosen by select_core_keypoints
#   INCLUDE_VEL    -> append first differences of the positions
#   INCLUDE_ACC    -> append second differences of the positions
#   INCLUDE_ANGLES -> append the joint angles from build_angle_triplets
USE_CORE = True
INCLUDE_VEL = True
INCLUDE_ACC = True
INCLUDE_ANGLES = True

# (vmin, vmax) per target, used by minmax_scale_clip when the submission template
# asks for `scaled_*` columns (values carried over from the v02 notebook).
SCALER_BOUNDS = {
    "angle": (30.0, 60.0),
    "depth": (-12.0, 30.0),
    "left_right": (-16.0, 16.0),
}

# =========================
# Easy rollback switches
# =========================
# MODEL selects the CV routine used in main():
# - MODEL="xgb"        => xgb_kfold_predict
# - any other value    => ridge_kfold_predict (the default / final configuration)
CFG = dict(
    MODEL="ridge",              # "ridge" (default/final) or "xgb" (experimental)

    USE_ANGLE_ATTENTION=True,   # only applied to angle target
    USE_T0_STATS=True,          # enable t0 window stats
    USE_ENSEMBLE=False,         # if True, averages predictions from WINDOWS_A and WINDOWS_B

    # angle attention params (only if USE_ANGLE_ATTENTION)
    ANGLE_SIGMA=45,
    ANGLE_K=1.15,

    # t0-window half widths per target (only if USE_T0_STATS)
    W_ANGLE=25,
    W_DEPTH=15,
    W_LR=15,

    # multiscale windows (also used by t0 stats wrapper)
    WINDOWS_A=(6, 12, 24),
    # Only used if USE_ENSEMBLE. NOTE: identical to WINDOWS_A by default, so enabling
    # USE_ENSEMBLE without changing this averages two identical runs.
    WINDOWS_B=(6, 12, 24),
)

# Utils: parse sequences + infer keypoints

In [3]:
# =========================
# Utils: parse sequence cell -> np.ndarray(T,)
# =========================
def parse_seq(v, T=T_EXPECT) -> np.ndarray:
    """Parse one dataframe cell into a finite float32 vector of length ``T``.

    Accepted inputs:
      * ``list`` / ``tuple`` / ``np.ndarray`` of numbers (flattened if nested);
      * a string holding a Python literal such as ``"[0.1, 0.2, ...]"``.
        Strings that ``ast.literal_eval`` rejects only because they contain bare
        ``nan`` / ``inf`` tokens are still parsed token by token instead of
        discarding the whole sequence.

    Anything else (``None``, scalars including NaN, unparseable strings, ragged or
    non-numeric sequences) yields an all-zero vector, which is this function's
    convention for "missing".

    Sequences longer than ``T`` are truncated, shorter ones are zero-padded at the
    end, and non-finite samples (NaN / inf) are replaced by 0 so they cannot
    poison the downstream scaler and regressors. A fresh array is always returned.
    """
    blank = np.zeros(T, dtype=np.float32)

    if isinstance(v, str):
        try:
            obj = ast.literal_eval(v)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # literal_eval rejects bare `nan` / `inf`; fall back to a plain numeric
            # token parse so a single missing sample does not zero the sequence.
            tokens = v.translate(str.maketrans("[](),", "     ")).split()
            try:
                obj = [float(tok) for tok in tokens]
            except ValueError:
                return blank
    elif isinstance(v, (list, tuple, np.ndarray)):
        obj = v
    else:
        # None, NaN and every other scalar are treated as a missing cell.
        return blank

    try:
        arr = np.asarray(obj, dtype=np.float32).reshape(-1)
    except (TypeError, ValueError):
        # Ragged or non-numeric content: same "missing" convention as bad strings.
        return blank

    n = min(arr.shape[0], T)
    head = arr[:n]
    blank[:n] = np.where(np.isfinite(head), head, np.float32(0.0))
    return blank


def get_keypoints_from_columns(df: pd.DataFrame):
    """
    Infer keypoint names from dataframe columns.

    Expected naming: <kp>_x, <kp>_y, <kp>_z. A keypoint is reported only when all
    three axis columns are present, because the sequence builder reads every axis
    and would otherwise fail with a KeyError on the missing one. Non-string column
    labels are ignored.

    Returns:
        Sorted list of keypoint names.
    """
    axes = ("_x", "_y", "_z")
    names = {c for c in df.columns if isinstance(c, str)}
    stems = {c[:-2] for c in names if c.endswith(axes)}
    return sorted(kp for kp in stems if all(f"{kp}{ax}" in names for ax in axes))

# Geometric Normalization + Core Keypoints

In [4]:
# =========================
# Geometry normalization
# =========================
def get_root_xyz(seq_dict, kps):
    """
    Pick the root trajectory used for centering.

    Preference order:
      1. the first of "pelvis", "mid_hip", "hip_center" that appears in ``kps``
         (its array is returned as-is, not copied);
      2. the midpoint of "left_hip" and "right_hip";
      3. the mean over all keypoints in ``kps``.
    """
    pelvis_like = next(
        (name for name in ("pelvis", "mid_hip", "hip_center") if name in kps),
        None,
    )
    if pelvis_like is not None:
        return seq_dict[pelvis_like]

    if all(hip in kps for hip in ("left_hip", "right_hip")):
        return 0.5 * (seq_dict["left_hip"] + seq_dict["right_hip"])

    # No hip information at all: fall back to the centroid of every keypoint.
    return np.stack([seq_dict[name] for name in kps], axis=0).mean(axis=0)


def normalize_sequence(seq_dict, used_kps):
    """
    Root-center and rescale one sample's keypoint trajectories.

    ``seq_dict`` maps keypoint name -> array; the entries listed in ``used_kps``
    are overwritten in the dict, which is also returned.

    Scale choice:
      * median shoulder width over the frames where it exceeds 1e-6, when both
        shoulders are present;
      * otherwise (or if that value is unusable) the median distance of all
        centered points from the origin;
      * 1.0 as a last resort.
    """
    tiny = 1e-6
    root = get_root_xyz(seq_dict, used_kps)

    # Centering rebinds each entry to a new array, so `root` itself is never modified.
    for name in used_kps:
        seq_dict[name] = seq_dict[name] - root

    scale = None
    if "left_shoulder" in used_kps and "right_shoulder" in used_kps:
        widths = np.linalg.norm(
            seq_dict["left_shoulder"] - seq_dict["right_shoulder"], axis=1
        )
        usable = widths > tiny
        if np.any(usable):
            scale = np.median(widths[usable])

    if scale is None or not np.isfinite(scale) or scale < tiny:
        points = np.stack([seq_dict[name] for name in used_kps], axis=0).reshape(-1, 3)
        scale = np.median(np.linalg.norm(points, axis=1))
        if not np.isfinite(scale) or scale < tiny:
            scale = 1.0

    for name in used_kps:
        seq_dict[name] = seq_dict[name] / scale

    return seq_dict

# Core keypoints & angle triplets

In [5]:
# =========================
# Core keypoints & angle triplets (match v02 logic)
# =========================
def select_core_keypoints(all_kps, min_core=8):
    """
    Return the subset of ``all_kps`` that belongs to the core body-joint list.

    The result follows the order of the internal candidate list, not the order of
    ``all_kps``. If fewer than ``min_core`` candidates are present, every keypoint
    is returned instead (as a list) so the feature builder still has enough
    joints to work with.
    """
    candidates = [
        "nose", "left_eye", "right_eye",
        "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
        "left_wrist", "right_wrist",
        "neck", "chest", "pelvis", "mid_hip", "hip_center",
        "left_hip", "right_hip", "left_knee", "right_knee",
        "left_ankle", "right_ankle",
    ]
    # Build the lookup set once instead of once per candidate.
    available = set(all_kps)
    core = [name for name in candidates if name in available]
    if len(core) < min_core:
        return list(all_kps)
    return core


def angle_from_three_points(a, b, c, eps=1e-9):
    """
    Angle at vertex ``b`` formed by the rays b->a and b->c.

    ``a``, ``b``, ``c`` have shape (..., 3); the result is in radians with
    shape (...,). ``eps`` is added to the product of the two ray lengths so that
    degenerate (zero-length) rays do not divide by zero.
    """
    ray_a, ray_c = a - b, c - b
    length_product = np.linalg.norm(ray_a, axis=-1) * np.linalg.norm(ray_c, axis=-1)
    cosine = np.sum(ray_a * ray_c, axis=-1) / (length_product + eps)
    # Clip so rounding cannot push the cosine outside arccos' domain.
    return np.arccos(np.clip(cosine, -1.0, 1.0))


def build_angle_triplets(used_kps):
    """
    List the (a, b, c) joint triplets whose angle at ``b`` can be computed.

    A triplet from the fixed candidate list is kept only when all three of its
    keypoints are in ``used_kps``; candidate order is preserved.
    """
    wanted = (
        ("left_shoulder", "left_elbow", "left_wrist"),
        ("right_shoulder", "right_elbow", "right_wrist"),
        ("left_hip", "left_knee", "left_ankle"),
        ("right_hip", "right_knee", "right_ankle"),
        ("left_shoulder", "neck", "right_shoulder"),
        ("left_hip", "pelvis", "right_hip"),
    )
    return [trip for trip in wanted if all(joint in used_kps for joint in trip)]

# Build full-time sequence

In [6]:
# =========================
# Feature builder: full-time sequence (v02 core)
# =========================
def build_fulltime_sequence(df: pd.DataFrame, keypoints):
    """
    Build the per-sample feature tensor from the raw keypoint columns.

    For every row of ``df`` the ``<kp>_x/_y/_z`` cells of the selected keypoints
    are parsed with ``parse_seq``, normalized with ``normalize_sequence`` and
    stacked along the feature axis in this order:

        positions (3 per keypoint)
        velocities (3 per keypoint, if INCLUDE_VEL)     - first difference, frame 0 is 0
        accelerations (3 per keypoint, if INCLUDE_ACC)  - second difference
        joint angles (1 per triplet, if INCLUDE_ANGLES)

    Keypoints are restricted by ``select_core_keypoints`` when USE_CORE is set.

    Returns:
        X_seq:          float32 array of shape (N, T, F)
        used_kps:       keypoint names actually used, in feature order
        angle_triplets: the (a, b, c) triplets behind the angle features

    Raises:
        ValueError: if ``df`` has no rows (there is nothing to stack).
    """
    used_kps = select_core_keypoints(keypoints) if USE_CORE else list(keypoints)
    angle_triplets = build_angle_triplets(used_kps) if INCLUDE_ANGLES else []

    if len(df) == 0:
        raise ValueError("build_fulltime_sequence: dataframe has no rows")

    X_rows = []
    for i in range(len(df)):
        # Materialise the row once; calling df.iloc[i] per cell rebuilds it every time.
        row = df.iloc[i]
        seq_dict = {
            kp: np.stack(
                [
                    parse_seq(row[f"{kp}_x"]),
                    parse_seq(row[f"{kp}_y"]),
                    parse_seq(row[f"{kp}_z"]),
                ],
                axis=1,
            )  # (T,3)
            for kp in used_kps
        }

        # normalize geometry
        seq_dict = normalize_sequence(seq_dict, used_kps)

        # positions
        feats = [seq_dict[kp] for kp in used_kps]

        # optional velocity / acceleration (velocity is computed once and shared)
        if INCLUDE_VEL or INCLUDE_ACC:
            vel = {
                kp: np.diff(seq_dict[kp], axis=0, prepend=seq_dict[kp][0:1])
                for kp in used_kps
            }
            if INCLUDE_VEL:
                feats.extend(vel[kp] for kp in used_kps)
            if INCLUDE_ACC:
                feats.extend(
                    np.diff(vel[kp], axis=0, prepend=vel[kp][0:1]) for kp in used_kps
                )

        # optional angles, one column per triplet -> (T, n_angles)
        if angle_triplets:
            feats.append(
                np.stack(
                    [
                        angle_from_three_points(seq_dict[a], seq_dict[b], seq_dict[c])
                        for (a, b, c) in angle_triplets
                    ],
                    axis=1,
                )
            )

        X_rows.append(np.concatenate(feats, axis=1).astype(np.float32, copy=False))  # (T,F)

    X_seq = np.stack(X_rows, axis=0)  # (N,T,F)
    return X_seq, used_kps, angle_triplets

## Feature engineering blocks

Below are the 2D feature builders used by the final model:
- Multiscale temporal summaries
- Local `t0` window statistics (mean/std)
- Optional angle-only attention (ablation)

# Multiscale summary

In [7]:
# ---- (ADD) Multiscale time summarization features ----
def append_multiscale_summary(X_seq: np.ndarray, windows=(4, 8, 16)) -> np.ndarray:
    """
    Flatten a sequence tensor and append block-wise mean/std summaries.

    X_seq: (N, T, F)
    Return: (N, T*F + sum_w 2*(T//w)*F) float32, laid out as
      [raw flatten], then for each window w in order: [block means], [block stds].

    Each window splits the time axis into ``T // w`` consecutive blocks of
    ``w`` frames; frames that do not fill a complete block are ignored.
    """
    assert X_seq.ndim == 3
    n_samples, n_frames, n_feats = X_seq.shape

    def _block_stats(width):
        # Yields the flattened per-block mean, then the flattened per-block std.
        n_blocks = n_frames // width
        blocks = X_seq[:, :n_blocks * width, :].reshape(n_samples, n_blocks, width, n_feats)
        yield blocks.mean(axis=2).reshape(n_samples, -1)
        yield blocks.std(axis=2).reshape(n_samples, -1)

    parts = [X_seq.reshape(n_samples, -1)]
    for width in windows:
        parts.extend(_block_stats(width))

    return np.concatenate(parts, axis=1).astype(np.float32, copy=False)

# t0-window stats

In [8]:
# windows
def extract_t0_window_stats_fast(X_seq: np.ndarray, t0: np.ndarray, w: int = 20) -> np.ndarray:
    """
    Vectorized per-sample mean/std over a window centred on ``t0``.

    X_seq: (N, T, F)
    t0:    (N,) frame index per sample (cast to integer)
    w:     window half-width; the window is [t0-w, t0+w] inclusive, clipped to
           [0, T-1], so it holds at most 2w+1 frames and at least one.

    Returns: (N, 2F) float32 = [mean(F), std(F)] (population std).

    The window sums come from prefix sums, and the variance from
    E[x^2] - mean^2. Both are accumulated in float64: in float32 the running
    sums lose the low-order bits and the subtraction cancels catastrophically
    (variance collapsing to 0 or noise) whenever values are large relative to
    their spread.

    Raises:
        ValueError: if ``w`` is negative.
    """
    if w < 0:
        raise ValueError(f"window half-width must be >= 0, got {w}")

    X = np.asarray(X_seq)
    N, T, F = X.shape
    t0 = np.asarray(t0).astype(np.int64, copy=False)

    # window bounds: inclusive left, exclusive right (for prefix-sum slicing)
    left = np.clip(t0 - w, 0, T - 1)
    right_ex = np.clip(t0 + w, 0, T - 1) + 1
    lens = (right_ex - left).astype(np.float64)[:, None]  # (N,1)

    # prefix sums over time: P[:, t] = sum of frames 0..t-1
    zero = np.zeros((N, 1, F), dtype=np.float64)
    P = np.concatenate([zero, np.cumsum(X, axis=1, dtype=np.float64)], axis=1)                          # (N,T+1,F)
    P2 = np.concatenate([zero, np.cumsum(np.square(X, dtype=np.float64), axis=1)], axis=1)              # (N,T+1,F)

    rows = np.arange(N)
    mean = (P[rows, right_ex, :] - P[rows, left, :]) / lens      # (N,F)
    ex2 = (P2[rows, right_ex, :] - P2[rows, left, :]) / lens     # (N,F)

    # Guard against tiny negative values left by rounding.
    std = np.sqrt(np.maximum(ex2 - mean * mean, 0.0))

    return np.concatenate([mean, std], axis=1).astype(np.float32, copy=False)  # (N,2F)


def append_with_t0_stats_fast(X_seq: np.ndarray, t0: np.ndarray,
                              windows=(4, 8, 16), w: int = 20) -> np.ndarray:
    """
    Concatenate the multiscale summary features with the t0-window statistics.

    Return: [append_multiscale_summary(X_seq, windows)] + [t0-window mean/std with
    half-width ``w``], as one float32 matrix with one row per sample.
    """
    blocks = (
        append_multiscale_summary(X_seq, windows=windows),   # (N, big)
        extract_t0_window_stats_fast(X_seq, t0, w=w),        # (N, 2F)
    )
    return np.concatenate(blocks, axis=1).astype(np.float32, copy=False)

# Attention + t0 estimation

In [9]:
# attention
def kp_xyz_indices(used_kps, kp_name):
    """
    Return the (x, y, z) feature indices of ``kp_name``, or None if it is not used.

    Assumes the feature axis starts with [kp1_x, kp1_y, kp1_z, kp2_x, ...] in the
    order of ``used_kps``.
    """
    if kp_name in used_kps:
        start = 3 * used_kps.index(kp_name)
        return (start, start + 1, start + 2)
    return None

def _moving_average_1d(v, k):
    # v: (N, L)
    if k is None or k <= 1:
        return v
    pad = k // 2
    v_pad = np.pad(v, ((0,0),(pad,pad)), mode="edge")
    out = np.zeros_like(v)
    for i in range(v.shape[1]):
        out[:, i] = v_pad[:, i:i+k].mean(axis=1)
    return out

def estimate_t0_joint_peak(X_seq: np.ndarray, used_kps, smooth=5, return_winner=False):
    """
    Estimate t0 as the frame with the largest overall motion magnitude.

    Motion at frame t is the Euclidean norm, over *all* features, of the
    difference to the previous frame (frame 0 has zero motion). The curve is
    optionally smoothed with a moving average of width ``smooth`` before taking
    the argmax.

    ``used_kps`` is accepted for interface compatibility but not used.

    Returns:
      t0: (N,) int32
      winner (only if return_winner): a list of N placeholder strings
        "all_features"; no per-joint attribution is performed.
    """
    n_samples, _, _ = X_seq.shape

    frame_step = np.diff(X_seq, axis=1, prepend=X_seq[:, 0:1, :])
    motion = np.linalg.norm(frame_step, axis=2)  # (N,T)

    if smooth and smooth > 1:
        motion = _moving_average_1d(motion, smooth)

    t0 = np.argmax(motion, axis=1).astype(np.int32)

    if return_winner:
        return t0, ["all_features"] * n_samples
    return t0


def apply_angle_attention(X_seq: np.ndarray, used_kps, sigma=45, k=1.15):
    """
    Re-weight every frame with a fixed Gaussian window over time.

    The weight of frame t is exp(-0.5 * ((t - T/2) / sigma)^2), normalized by its
    maximum and raised to the power ``k``; it depends only on the frame index,
    not on the data or on ``used_kps`` (which is unused). The input is left
    untouched and a float32 array of the same shape (N,T,F) is returned.
    """
    _, n_frames, _ = X_seq.shape

    frame_idx = np.arange(n_frames, dtype=np.float32)
    midpoint = n_frames / 2.0
    gauss = np.exp(-0.5 * ((frame_idx - midpoint) / float(sigma))**2)
    weights = (gauss / (gauss.max() + 1e-9))**k

    weighted = X_seq.copy()
    weighted *= weights[None, :, None]
    return weighted.astype(np.float32, copy=False)

# Build X for target

In [10]:
def build_X_for_target(Xtr_seq, Xte_seq, t0_tr, t0_te, *, target,
                       use_t0_stats=True, windows=(6,12,24),
                       w_angle=25, w_depth=15, w_lr=15,
                       use_angle_attention=False, angle_sigma=45, angle_k=1.15):
    """
    Build the 2-D train/test design matrices for one target.

    Returns: Xtr_2d, Xte_2d

    ``target`` is "angle", "depth" or anything else (treated as left/right):
      * "angle" with ``use_angle_attention`` first applies apply_angle_attention;
      * with ``use_t0_stats`` the t0-window half-width is ``w_angle`` for "angle",
        ``w_depth`` for "depth" and ``w_lr`` for every other value;
      * without ``use_t0_stats`` only the multiscale summary is used.
    """
    attend = target == "angle" and use_angle_attention
    if attend:
        Xtr_use = apply_angle_attention(Xtr_seq, used_kps=None, sigma=angle_sigma, k=angle_k)
        Xte_use = apply_angle_attention(Xte_seq, used_kps=None, sigma=angle_sigma, k=angle_k)
    else:
        Xtr_use, Xte_use = Xtr_seq, Xte_seq

    if not use_t0_stats:
        Xtr_2d = append_multiscale_summary(Xtr_use, windows=windows)
        Xte_2d = append_multiscale_summary(Xte_use, windows=windows)
        return Xtr_2d, Xte_2d

    # Per-target half-width of the window around t0.
    if target == "angle":
        half_width = w_angle
    elif target == "depth":
        half_width = w_depth
    else:
        half_width = w_lr

    Xtr_2d = append_with_t0_stats_fast(Xtr_use, t0_tr, windows=windows, w=half_width)
    Xte_2d = append_with_t0_stats_fast(Xte_use, t0_te, windows=windows, w=half_width)
    return Xtr_2d, Xte_2d

## Models

Default: Ridge regression on standardized features (the scaler is fit on each fold's training split) with shuffled KFold CV.

Experimental (optional): XGBoost CV, GroupKFold utilities, two-stage residual ridge.

# Ridge KFold

In [11]:
def ridge_kfold_predict(
    Xtr, y, Xte,
    n_splits=N_SPLITS,
    seed=RANDOM_SEED,
    alpha=ALPHA,
    verbose=True,
):
    """
    Shuffled K-fold Ridge regression with per-fold standardization.

    For each fold a StandardScaler is fit on the training split only, a Ridge
    model is trained on the scaled data, the validation split is predicted
    (out-of-fold) and the test predictions are accumulated as a fold average.

    Args:
        Xtr: (N, D) training features.
        y: (N,) training targets.
        Xte: (M, D) test features.
        n_splits: number of CV folds.
        seed: seed for the fold shuffling and the Ridge solver.
        alpha: Ridge regularization strength.
        verbose: print fold-mean and overall OOF RMSE.

    Returns:
        oof: (N,) float32 out-of-fold predictions.
        te_pred: (M,) float32 fold-averaged test predictions.
        fold_scores: list of per-fold validation RMSEs.
        best_coef: coefficient vector (in standardized feature space) of the fold
            with the lowest validation RMSE.
        rmse_oof: RMSE of ``oof`` against ``y`` over all rows.
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = np.zeros(len(y), dtype=np.float32)
    te_pred = np.zeros(Xte.shape[0], dtype=np.float32)
    fold_scores = []
    best_coef = None
    best_rmse = np.inf

    for tr_idx, va_idx in kf.split(Xtr):
        X_tr, X_va = Xtr[tr_idx], Xtr[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        # Fit the scaler on the training split only so validation/test stay unseen.
        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(Xte)

        model = Ridge(alpha=alpha, random_state=seed)
        model.fit(X_tr_s, y_tr)

        va_pred = model.predict(X_va_s)
        te_pred += model.predict(X_te_s) / n_splits
        oof[va_idx] = va_pred

        rmse = float(np.sqrt(mean_squared_error(y_va, va_pred)))
        fold_scores.append(rmse)

        # Keep the coefficients of the best-scoring fold for inspection.
        if rmse < best_rmse:
            best_rmse = rmse
            best_coef = model.coef_.copy()

    # fold-avg RMSE
    mean_rmse = float(np.mean(fold_scores))
    std_rmse  = float(np.std(fold_scores))

    # overall OOF RMSE (official)
    rmse_oof = float(np.sqrt(mean_squared_error(y, oof)))

    if verbose:
        print(f"Fold Mean RMSE = {mean_rmse:.5f} ± {std_rmse:.5f}")
        print(f"OOF RMSE       = {rmse_oof:.5f}")

    return oof, te_pred, fold_scores, best_coef, rmse_oof

# GroupKFold Ridge (experimental)

In [12]:
def group_oof_ridge_preds(Xtr, y, Xte, groups, alpha=3000.0, n_splits=5, seed=42):
    """
    Ridge regression evaluated with GroupKFold (rows sharing a group never
    straddle the train/validation boundary).

    Each fold standardizes with statistics from its own training split.

    Returns: oof_pred, te_pred, fold_rmses
      oof_pred   - (N,) float32 out-of-fold predictions
      te_pred    - (M,) float32 test predictions averaged over the folds
      fold_rmses - per-fold validation RMSE
    """
    splitter = GroupKFold(n_splits=n_splits)

    oof_pred = np.zeros(len(y), dtype=np.float32)
    test_accum = np.zeros(Xte.shape[0], dtype=np.float32)
    fold_rmses = []

    for fit_rows, held_rows in splitter.split(Xtr, y, groups=groups):
        scaler = StandardScaler()
        fit_X = scaler.fit_transform(Xtr[fit_rows])
        held_X = scaler.transform(Xtr[held_rows])
        test_X = scaler.transform(Xte)

        ridge = Ridge(alpha=alpha, random_state=seed)
        ridge.fit(fit_X, y[fit_rows])

        held_pred = ridge.predict(held_X)
        oof_pred[held_rows] = held_pred
        test_accum += ridge.predict(test_X) / n_splits

        fold_rmses.append(float(np.sqrt(mean_squared_error(y[held_rows], held_pred))))

    return oof_pred, test_accum, fold_rmses

# Two-stage residual ridge (experimental)

In [13]:
def two_stage_residual_ridge_kfold_strict(
    Xtr, y, Xte,
    alpha1=3000.0,
    alpha2=1500.0,
    n_splits=5,
    seed=42,
):
    """
    Two-stage residual Ridge under shuffled KFold.

    Within each fold, using only that fold's training rows:
      - stage 1 (alpha1) is fit on the targets;
      - stage 2 (alpha2) is fit on the stage-1 training residuals, with the same
        standardized features.
    The validation and test predictions are stage 1 + stage 2; test predictions
    are averaged over the folds.

    Returns: (oof, te_pred, fold_scores, rmse_oof) where ``oof`` holds the
    combined two-stage out-of-fold predictions.
    """
    splitter = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    n_train = len(y)
    stage1_oof = np.zeros(n_train, dtype=np.float32)   # stage-1-only OOF, kept for debugging
    final_oof = np.zeros(n_train, dtype=np.float32)
    test_accum = np.zeros(Xte.shape[0], dtype=np.float32)
    fold_rmses = []

    for fit_rows, held_rows in splitter.split(Xtr):
        fit_y = y[fit_rows]

        # One scaler per fold, shared by both stages.
        scaler = StandardScaler()
        fit_X = scaler.fit_transform(Xtr[fit_rows])
        held_X = scaler.transform(Xtr[held_rows])
        test_X = scaler.transform(Xte)

        # Stage 1: fit the target directly.
        base = Ridge(alpha=alpha1, random_state=seed)
        base.fit(fit_X, fit_y)
        fit_base = base.predict(fit_X)
        held_base = base.predict(held_X)

        # Stage 2: fit what stage 1 left unexplained on the training rows.
        corrector = Ridge(alpha=alpha2, random_state=seed)
        corrector.fit(fit_X, fit_y - fit_base)
        held_resid = corrector.predict(held_X)
        test_resid = corrector.predict(test_X)

        held_final = held_base + held_resid
        test_final = base.predict(test_X) + test_resid

        stage1_oof[held_rows] = held_base
        final_oof[held_rows] = held_final
        test_accum += test_final / n_splits

        fold_rmses.append(float(np.sqrt(mean_squared_error(y[held_rows], held_final))))

    overall_rmse = float(np.sqrt(mean_squared_error(y, final_oof)))
    return final_oof, test_accum, fold_rmses, overall_rmse

# XGB KFold (experimental)

In [14]:
def xgb_kfold_predict(
    Xtr, y, Xte,
    n_splits=N_SPLITS,
    seed=RANDOM_SEED,
    params=None,
    early_stopping_rounds=200,
    verbose=True,
):
    """
    XGBoost KFold CV (OOF + test avg), interface aligned with ridge_kfold_predict:
      returns (oof, te_pred, fold_scores, best_pack, rmse_oof)

    Args:
        Xtr, y, Xte: training features, training targets, test features.
        n_splits: number of shuffled CV folds.
        seed: seed for the fold shuffling and the default model params.
        params: keyword arguments for XGBRegressor; a default set is used if None.
        early_stopping_rounds: stop a fold's training when its validation metric
            has not improved for this many rounds. Pass None to disable. A value
            already present in ``params`` takes precedence.
        verbose: print the overall OOF RMSE.

    Notes:
    - No scaling needed for tree models.
    - Early stopping is configured on the estimator (the constructor argument
      supported by recent XGBoost releases) and monitors each fold's validation
      split, so the reported OOF RMSE is slightly optimistic.
    - ``best_pack`` holds one dict per fold with the fold number and the model's
      ``best_iteration`` (None if the model does not expose it).
    """
    if params is None:
        params = dict(
            n_estimators=4000,
            learning_rate=0.03,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            random_state=seed,
            tree_method="hist",
        )

    # Copy so the caller's dict is not modified, then wire in early stopping,
    # which was previously accepted but never passed to the model.
    model_params = dict(params)
    if early_stopping_rounds is not None:
        model_params.setdefault("early_stopping_rounds", early_stopping_rounds)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = np.zeros(len(y), dtype=np.float32)
    te_pred = np.zeros(Xte.shape[0], dtype=np.float32)
    fold_scores = []
    best_pack = []

    for fold, (tr_idx, va_idx) in enumerate(kf.split(Xtr), 1):
        X_tr, X_va = Xtr[tr_idx], Xtr[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        model = XGBRegressor(**model_params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False,
        )

        va_pred = model.predict(X_va)
        oof[va_idx] = va_pred
        te_pred += model.predict(Xte) / n_splits

        rmse = float(np.sqrt(mean_squared_error(y_va, va_pred)))
        fold_scores.append(rmse)

        best_pack.append(dict(
            fold=fold,
            best_iteration=getattr(model, "best_iteration", None),
        ))

    rmse_oof = float(np.sqrt(mean_squared_error(y, oof)))
    if verbose:
        print(f"OOF RMSE       = {rmse_oof:.5f}")

    return oof, te_pred, fold_scores, best_pack, rmse_oof

## Submission generation

The competition template can be either scaled or unscaled.  
This notebook detects the correct target columns in `submission.csv` and fills them accordingly.

# Main + template fill

In [15]:
# =========================
# Submission scaling (v02 core)
# =========================
def minmax_scale_clip(x, vmin, vmax):
    """Map ``x`` linearly so vmin -> 0 and vmax -> 1, then clip the result to [0, 1]."""
    span = vmax - vmin
    return np.clip((x - vmin) / span, 0.0, 1.0)


def _row_order_for_template(test_ids, template_ids):
    """Return, for each template row, the position of the same id in the test set.

    Returns None when the ids cannot be matched one-to-one (duplicates, or a
    template id that is absent from the test set).
    """
    test_index = pd.Index(test_ids)
    template_index = pd.Index(template_ids)
    if not (test_index.is_unique and template_index.is_unique):
        return None
    order = test_index.get_indexer(template_index)
    return None if (order < 0).any() else order


def main(base_dir="/kaggle/input/spl-utspan-data-challenge-2026", out_path="submission.csv"):
    """
    Run the full pipeline: load data, build features, cross-validate one model per
    target, and write the submission file.

    Args:
        base_dir: directory containing train.csv, test.csv and the submission.csv
            template.
        out_path: where the filled submission is written.

    Behaviour controlled by CFG:
        MODEL                 "xgb" -> xgb_kfold_predict, anything else -> ridge.
        USE_ENSEMBLE          average the predictions of WINDOWS_A and WINDOWS_B;
                              the reported RMSE is then computed on the averaged
                              out-of-fold predictions.
        USE_T0_STATS, USE_ANGLE_ATTENTION, W_*, ANGLE_*   feature options.

    The template's columns decide whether each target is written raw or min-max
    scaled with SCALER_BOUNDS. If both the test set and the template carry an
    "id" column, predictions are matched to template rows by id; otherwise (or if
    the ids cannot be matched one-to-one) they are written in test-row order.

    Raises:
        ValueError: if the template and test set differ in length, or a target
            column cannot be found in the template.
    """
    train = pd.read_csv(os.path.join(base_dir, "train.csv"))
    test  = pd.read_csv(os.path.join(base_dir, "test.csv"))
    sub   = pd.read_csv(os.path.join(base_dir, "submission.csv"))
    template_cols = list(sub.columns)

    # Fail before the expensive feature/CV work rather than after it.
    if len(sub) != len(test):
        raise ValueError(
            f"Submission template has {len(sub)} rows but test set has {len(test)}"
        )

    keypoints = get_keypoints_from_columns(train)
    assert len(keypoints) > 0
    print("Keypoints:", len(keypoints))

    # ---- Build sequences (N,T,F) ----
    Xtr_seq, used_kps, angle_triplets = build_fulltime_sequence(train, keypoints)
    Xte_seq, _, _ = build_fulltime_sequence(test, keypoints)

    T = Xtr_seq.shape[1]
    F = Xtr_seq.shape[2]
    print("X_seq shape:", Xtr_seq.shape, "T=", T, "F=", F)

    # ---- estimate adaptive t0 (motion peak) ----
    t0_tr = estimate_t0_joint_peak(Xtr_seq, used_kps, smooth=5)
    t0_te = estimate_t0_joint_peak(Xte_seq, used_kps, smooth=5)

    print("t0_tr stats (raw):", int(t0_tr.min()), int(t0_tr.mean()), int(t0_tr.max()))

    # ---- targets ----
    # key -> (training target, use attention?, seed offset for xgb)
    y_by_target = {
        "angle": train["angle"].values.astype(np.float32),
        "depth": train["depth"].values.astype(np.float32),
        "lr":    train["left_right"].values.astype(np.float32),
    }
    target_opts = {
        "angle": (CFG["USE_ANGLE_ATTENTION"], 0),  # only angle can use attention
        "depth": (False, 1),
        "lr":    (False, 2),
    }
    use_xgb = CFG.get("MODEL", "ridge") == "xgb"

    def run_one_model(windows):
        """Cross-validate all three targets for one multiscale window setting."""
        oofs, preds, rmses = {}, {}, {}
        for target, y_t in y_by_target.items():
            use_attention, seed_offset = target_opts[target]
            Xtr_t, Xte_t = build_X_for_target(
                Xtr_seq, Xte_seq, t0_tr, t0_te, target=target,
                use_t0_stats=CFG["USE_T0_STATS"], windows=windows,
                w_angle=CFG["W_ANGLE"], w_depth=CFG["W_DEPTH"], w_lr=CFG["W_LR"],
                use_angle_attention=use_attention,
                angle_sigma=CFG["ANGLE_SIGMA"], angle_k=CFG["ANGLE_K"],
            )
            if use_xgb:
                oof, pred, _, _, rmse = xgb_kfold_predict(
                    Xtr_t, y_t, Xte_t, seed=RANDOM_SEED + seed_offset
                )
            else:
                oof, pred, _, _, rmse = ridge_kfold_predict(
                    Xtr_t, y_t, Xte_t, alpha=ALPHA, seed=RANDOM_SEED
                )
            oofs[target], preds[target], rmses[target] = oof, pred, rmse

        mean_of_3 = (rmses["angle"] + rmses["depth"] + rmses["lr"]) / 3.0
        print(f"[windows={windows}] OOF Mean-of-3 = {mean_of_3:.6f}")
        return oofs, preds, rmses

    if not CFG["USE_ENSEMBLE"]:
        _, preds, rmses = run_one_model(CFG["WINDOWS_A"])
        print("\n=== CV Summary (single, OOF) ===")
    else:
        oofs_a, preds_a, _ = run_one_model(CFG["WINDOWS_A"])
        oofs_b, preds_b, _ = run_one_model(CFG["WINDOWS_B"])

        preds = {k: 0.5 * (preds_a[k] + preds_b[k]) for k in y_by_target}
        # Score the ensemble itself: RMSE of the averaged OOF predictions,
        # not the average of the two models' RMSEs.
        rmses = {
            k: float(np.sqrt(mean_squared_error(y_by_target[k], 0.5 * (oofs_a[k] + oofs_b[k]))))
            for k in y_by_target
        }
        print("\n=== CV Summary (ensemble avg of 2 configs, OOF) ===")

    print(f"Angle RMSE: {rmses['angle']:.6f}")
    print(f"Depth RMSE: {rmses['depth']:.6f}")
    print(f"LR    RMSE: {rmses['lr']:.6f}")
    print(f"Mean-of-3:  {(rmses['angle'] + rmses['depth'] + rmses['lr'])/3.0:.6f}")

    # ---- Match predictions to template rows by id when possible ----
    if "id" in test.columns and "id" in sub.columns:
        order = _row_order_for_template(test["id"].to_numpy(), sub["id"].to_numpy())
        if order is None:
            print("WARNING: template ids do not match test ids one-to-one; "
                  "filling predictions in test-row order.")
        elif not np.array_equal(order, np.arange(len(order))):
            preds = {k: v[order] for k, v in preds.items()}

    # ---- Fill submission EXACTLY by template columns ----
    def pick_col(sub_cols, name):
        """Return (column name, is_scaled) for a target in the template."""
        if f"scaled_{name}" in sub_cols:
            return f"scaled_{name}", True
        if name in sub_cols:
            return name, False
        for c in sub_cols:
            if c != "id" and name in c:
                return c, c.startswith("scaled_")
        raise ValueError(f"Cannot find column for '{name}' in template: {list(sub_cols)}")

    for name, key in (("angle", "angle"), ("depth", "depth"), ("left_right", "lr")):
        col, scaled = pick_col(template_cols, name)
        if scaled:
            sub[col] = minmax_scale_clip(preds[key], *SCALER_BOUNDS[name])
        else:
            sub[col] = preds[key]

    # Keep exactly the template's columns, in the template's order.
    sub = sub[template_cols]
    assert list(sub.columns) == template_cols

    sub.to_csv(out_path, index=False)
    print("Saved:", out_path, "shape:", sub.shape)
    print(sub.head())

# Entrypoint

In [16]:
# Run the full pipeline only when this code is executed directly
# (__name__ == "__main__"), not when it is imported as a module.
if __name__ == "__main__":
    main()

Keypoints: 69
X_seq shape: (345, 240, 158) T= 240 F= 158
t0_tr stats (raw): 0 161 239
Fold Mean RMSE = 2.74066 ± 0.29655
OOF RMSE       = 2.75666
Fold Mean RMSE = 3.39587 ± 0.29768
OOF RMSE       = 3.40890
Fold Mean RMSE = 3.35488 ± 0.15892
OOF RMSE       = 3.35865
[windows=(6, 12, 24)] OOF Mean-of-3 = 3.174735

=== CV Summary (single, OOF) ===
Angle RMSE: 2.756662
Depth RMSE: 3.408896
LR    RMSE: 3.358646
Mean-of-3:  3.174735
Saved: submission.csv shape: (113, 4)
                                     id  scaled_angle  scaled_depth  \
0  d5cc9ade-6bfd-42d2-8404-99d7506e535c      0.519688      0.494003   
1  6fb475ff-1732-42bc-8385-9f80956199fe      0.472408      0.495657   
2  39f95c12-deab-4d77-8a9c-feecda4d5a66      0.515327      0.540537   
3  5ec65bf7-4892-4076-a572-e01b4b8ff038      0.461818      0.543096   
4  52ffbd2a-969c-4e52-af66-c4b4be3c3cbb      0.477521      0.586047   

   scaled_left_right  
0           0.400136  
1           0.568945  
2           0.494198  
3           