In [1]:
# v02_minimal.py
import os
import ast
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, GroupKFold
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error

In [2]:
# =========================
# Config (match v02)
# =========================
RANDOM_SEED = 42
N_SPLITS = 5
ALPHA = 6000.0
T_EXPECT = 240

USE_CORE = True
INCLUDE_VEL = True
INCLUDE_ACC = True
INCLUDE_ANGLES = True

# Submission scaling bounds (match your v02 notebook)
SCALER_BOUNDS = {
    "angle": (30.0, 60.0),
    "depth": (-12.0, 30.0),
    "left_right": (-16.0, 16.0),
}

In [3]:
# =========================
# Utils: parse sequence cell -> np.ndarray(T,)
# =========================
def parse_seq(v, T=T_EXPECT) -> np.ndarray:
    """Parse one cell value to float32 vector of length T."""
    if v is None:
        return np.zeros(T, dtype=np.float32)
    if isinstance(v, float) and np.isnan(v):
        return np.zeros(T, dtype=np.float32)

    if isinstance(v, (list, tuple, np.ndarray)):
        arr = np.asarray(v, dtype=np.float32).reshape(-1)
    elif isinstance(v, str):
        s = v.strip()
        if s == "":
            return np.zeros(T, dtype=np.float32)
        try:
            arr = np.asarray(ast.literal_eval(s), dtype=np.float32).reshape(-1)
        except Exception:
            return np.zeros(T, dtype=np.float32)
    else:
        # unknown type -> safest fallback
        return np.zeros(T, dtype=np.float32)

    if arr.size >= T:
        return arr[:T].astype(np.float32, copy=False)
    out = np.zeros(T, dtype=np.float32)
    out[: arr.size] = arr
    return out


def get_keypoints_from_columns(df: pd.DataFrame):
    """Infer keypoints by detecting *_x, *_y, *_z triplets."""
    cols = set(df.columns)
    kps = []
    for c in df.columns:
        if c.endswith("_x"):
            base = c[:-2]
            if (base + "_y") in cols and (base + "_z") in cols:
                kps.append(base)
    return sorted(kps)

In [4]:
# =========================
# Geometry normalization
# =========================
def get_root_xyz(seq_dict, kps):
    """Return root (T,3) using pelvis-like keypoint if available; otherwise hip avg; else global mean."""
    pelvis_candidates = ["pelvis", "mid_hip", "hip_center"]
    for p in pelvis_candidates:
        if p in kps:
            return seq_dict[p]  # (T,3)

    if "left_hip" in kps and "right_hip" in kps:
        return 0.5 * (seq_dict["left_hip"] + seq_dict["right_hip"])

    # fallback: mean over all keypoints
    stack = np.stack([seq_dict[k] for k in kps], axis=0)  # (K,T,3)
    return stack.mean(axis=0)  # (T,3)


def get_scale(seq_dict, kps, root_xyz):
    """Return scale (T,1) using shoulder/hip width if possible; else median distance to root."""
    eps = 1e-6

    def safe_norm(x):
        return np.sqrt((x * x).sum(axis=-1, keepdims=True))  # (T,1)

    if "left_shoulder" in kps and "right_shoulder" in kps:
        w = safe_norm(seq_dict["left_shoulder"] - seq_dict["right_shoulder"])
        return np.clip(w, eps, None)

    if "left_hip" in kps and "right_hip" in kps:
        w = safe_norm(seq_dict["left_hip"] - seq_dict["right_hip"])
        return np.clip(w, eps, None)

    # fallback: median distance to root across keypoints
    dists = []
    for k in kps:
        d = safe_norm(seq_dict[k] - root_xyz)  # (T,1)
        dists.append(d)
    D = np.concatenate(dists, axis=1)  # (T,K)
    med = np.median(D, axis=1, keepdims=True)  # (T,1)
    return np.clip(med, eps, None)


def diff1(x):
    """First difference along time, keeping same length."""
    # x: (T,F)
    out = np.empty_like(x)
    out[0] = x[0]
    out[1:] = x[1:] - x[:-1]
    return out


def diff2(x):
    return diff1(diff1(x))


def angle_3pts(a, b, c):
    """
    Angle ABC per frame in radians.
    a,b,c: (T,3)
    returns: (T,1)
    """
    ba = a - b
    bc = c - b
    ba_n = np.linalg.norm(ba, axis=1, keepdims=True) + 1e-6
    bc_n = np.linalg.norm(bc, axis=1, keepdims=True) + 1e-6
    cos = (ba * bc).sum(axis=1, keepdims=True) / (ba_n * bc_n)
    cos = np.clip(cos, -1.0, 1.0)
    return np.arccos(cos).astype(np.float32)

In [5]:
# =========================
# Core keypoints & angle triplets (match v02 logic)
# =========================
def select_core_keypoints(all_kps):
    candidates = [
        "nose", "left_eye", "right_eye",
        "left_shoulder", "right_shoulder", "left_elbow", "right_elbow",
        "left_wrist", "right_wrist",
        "neck", "chest", "pelvis", "mid_hip", "hip_center",
        "left_hip", "right_hip", "left_knee", "right_knee",
        "left_ankle", "right_ankle",
    ]
    core = [k for k in candidates if k in all_kps]
    if len(core) < 6:
        return list(all_kps)
    return core


def build_angle_triplets(used_kps):
    # Try common triplets; only keep if all three exist
    triplets = [
        ("left_shoulder", "left_elbow", "left_wrist"),
        ("right_shoulder", "right_elbow", "right_wrist"),
        ("left_hip", "left_knee", "left_ankle"),
        ("right_hip", "right_knee", "right_ankle"),
        ("left_hip", "pelvis", "right_hip"),
        ("left_shoulder", "chest", "right_shoulder"),
    ]
    out = []
    s = set(used_kps)
    for a, b, c in triplets:
        if a in s and b in s and c in s:
            out.append((a, b, c))
    return out

In [6]:
# =========================
# Feature builder: full-time sequence (v02 core)
# =========================
def build_fulltime_sequence(df: pd.DataFrame, keypoints):
    used_kps = select_core_keypoints(keypoints) if USE_CORE else list(keypoints)
    angle_triplets = build_angle_triplets(used_kps) if INCLUDE_ANGLES else []

    N = len(df)
    # We'll build X_seq incrementally per row (safe, readable)
    X_list = []

    for i in range(N):
        row = df.iloc[i]

        # gather raw sequences
        seq_dict = {}
        for kp in used_kps:
            x = parse_seq(row[f"{kp}_x"])
            y = parse_seq(row[f"{kp}_y"])
            z = parse_seq(row[f"{kp}_z"])
            seq_dict[kp] = np.stack([x, y, z], axis=1)  # (T,3)

        root = get_root_xyz(seq_dict, used_kps)         # (T,3)
        scale = get_scale(seq_dict, used_kps, root)     # (T,1)

        # normalize per frame
        for kp in used_kps:
            seq_dict[kp] = (seq_dict[kp] - root) / scale

        # pos: (T, 3K)
        pos = np.concatenate([seq_dict[kp] for kp in used_kps], axis=1).astype(np.float32)

        feats = [pos]
        if INCLUDE_VEL:
            feats.append(diff1(pos))
        if INCLUDE_ACC:
            feats.append(diff2(pos))
        if INCLUDE_ANGLES and len(angle_triplets) > 0:
            angs = []
            for a, b, c in angle_triplets:
                angs.append(angle_3pts(seq_dict[a], seq_dict[b], seq_dict[c]))  # (T,1)
            feats.append(np.concatenate(angs, axis=1))  # (T,n_ang)

        Xi = np.concatenate(feats, axis=1).astype(np.float32)  # (T,F)
        X_list.append(Xi)

    X_seq = np.stack(X_list, axis=0)  # (N,T,F)
    return X_seq, used_kps, angle_triplets

In [7]:
from sklearn.model_selection import GroupKFold
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import numpy as np

def group_oof_ridge_preds(Xtr, y, Xte, groups, alpha=3000.0, n_splits=5, seed=42):
    """
    GroupKFold OOF + test preds (fold-avg).
    Returns: oof_pred, te_pred, fold_rmses
    """
    gkf = GroupKFold(n_splits=n_splits)
    oof = np.zeros(len(y), dtype=np.float32)
    te_sum = np.zeros(Xte.shape[0], dtype=np.float64)
    fold_rmses = []

    # GroupKFold 不用 seed/shuffle；如果你想 shuffle，需要自己打乱 group 顺序（可先不搞）
    for fold, (tr_idx, va_idx) in enumerate(gkf.split(Xtr, y, groups=groups), 1):
        scaler = StandardScaler()
        Xtr_s = scaler.fit_transform(Xtr[tr_idx])
        Xva_s = scaler.transform(Xtr[va_idx])
        Xte_s = scaler.transform(Xte)

        model = Ridge(alpha=alpha)
        model.fit(Xtr_s, y[tr_idx])

        oof[va_idx] = model.predict(Xva_s).astype(np.float32)
        te_sum += model.predict(Xte_s).astype(np.float64)

        rmse = float(np.sqrt(mean_squared_error(y[va_idx], oof[va_idx])))
        fold_rmses.append(rmse)
        print(f"[GroupRidge fold {fold}] RMSE={rmse:.5f}")

    te_pred = (te_sum / n_splits).astype(np.float32)
    return oof, te_pred, fold_rmses

In [8]:
from sklearn.model_selection import KFold

def two_stage_residual_ridge_kfold_strict(
    Xtr, y, Xte,
    alpha1=3000.0,
    alpha2=1500.0,
    n_splits=5,
    seed=42,
):
    """
    Strict 2-stage residual learning under KFold:
      - Stage1 trained on fold-train, predicts fold-val => p1_oof
      - Residual target for stage2 uses stage1 predictions on fold-train (same fold model) => leak-free
      - Stage2 trained on fold-train residuals, predicts fold-val residual => r_oof
      - Final OOF = p1_oof + r_oof
      - Test pred = fold-avg of (stage1+stage2) fold models
    """
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    p1_oof = np.zeros(len(y), dtype=np.float32)
    r_oof  = np.zeros(len(y), dtype=np.float32)
    te_sum = np.zeros(Xte.shape[0], dtype=np.float64)
    fold_rmses = []

    for fold, (tr_idx, va_idx) in enumerate(kf.split(Xtr, y), 1):
        # ---- Stage 1 ----
        sc1 = StandardScaler()
        Xtr1 = sc1.fit_transform(Xtr[tr_idx])
        Xva1 = sc1.transform(Xtr[va_idx])
        Xte1 = sc1.transform(Xte)

        m1 = Ridge(alpha=alpha1)
        m1.fit(Xtr1, y[tr_idx])

        p1_va = m1.predict(Xva1).astype(np.float32)
        p1_oof[va_idx] = p1_va

        # ---- Stage 2 (residual) ----
        # leak-free residuals on training fold:
        p1_tr = m1.predict(Xtr1).astype(np.float32)
        r_tr = (y[tr_idx] - p1_tr).astype(np.float32)

        sc2 = StandardScaler()
        Xtr2 = sc2.fit_transform(Xtr[tr_idx])
        Xva2 = sc2.transform(Xtr[va_idx])
        Xte2 = sc2.transform(Xte)

        m2 = Ridge(alpha=alpha2)
        m2.fit(Xtr2, r_tr)

        r_va = m2.predict(Xva2).astype(np.float32)
        r_oof[va_idx] = r_va

        te_fold = m1.predict(Xte1).astype(np.float64) + m2.predict(Xte2).astype(np.float64)
        te_sum += te_fold

        rmse = float(np.sqrt(mean_squared_error(y[va_idx], p1_va + r_va)))
        fold_rmses.append(rmse)
        print(f"[2-stage KFold fold {fold}] RMSE={rmse:.5f}")

    te_pred = (te_sum / n_splits).astype(np.float32)
    oof_final = (p1_oof + r_oof).astype(np.float32)
    rmse_final = float(np.sqrt(mean_squared_error(y, oof_final)))
    print(f"[2-stage KFold OOF] RMSE={rmse_final:.5f}")
    return oof_final, te_pred, fold_rmses, rmse_final

In [9]:
def ridge_kfold_predict(
    Xtr, y, Xte,
    n_splits=N_SPLITS,
    seed=RANDOM_SEED,
    alpha=ALPHA,
    verbose=True,
):
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = np.zeros(len(y), dtype=np.float32)
    te_pred = np.zeros(Xte.shape[0], dtype=np.float32)
    fold_scores = []
    best_coef = None

    for fold, (tr_idx, va_idx) in enumerate(kf.split(Xtr), 1):
        X_tr, X_va = Xtr[tr_idx], Xtr[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        scaler = StandardScaler()
        X_tr_s = scaler.fit_transform(X_tr)
        X_va_s = scaler.transform(X_va)
        X_te_s = scaler.transform(Xte)

        model = Ridge(alpha=alpha)
        model.fit(X_tr_s, y_tr)

        if fold == 1:
            best_coef = model.coef_.copy()

        va_pred = model.predict(X_va_s)
        oof[va_idx] = va_pred.astype(np.float32)

        te_pred += model.predict(X_te_s).astype(np.float32) / n_splits

        rmse = float(np.sqrt(mean_squared_error(y_va, va_pred)))
        fold_scores.append(rmse)

        # if verbose:
            # print(f"[Fold {fold}] RMSE={rmse:.5f}  ||w||={np.linalg.norm(model.coef_):.6f}")

    # 折均 RMSE（调试用）
    mean_rmse = float(np.mean(fold_scores))
    std_rmse  = float(np.std(fold_scores))

    # 整体 OOF RMSE（正式口径）
    rmse_oof = float(np.sqrt(mean_squared_error(y, oof)))

    if verbose:
        print(f"Fold Mean RMSE = {mean_rmse:.5f} ± {std_rmse:.5f}")
        print(f"OOF RMSE       = {rmse_oof:.5f}")

    return oof, te_pred, fold_scores, best_coef, rmse_oof

In [10]:
# enable
def train_predict_three_targets(Xtr_seq, Xte_seq, t0_tr, t0_te, y_angle, y_depth, y_lr,
                                windows=(4,8,16), w=20):
    # 直接构造 2D 特征（不做 attention）
    Xtr_a = append_with_t0_stats_fast(Xtr_seq, t0_tr, windows=windows, w=w)
    Xte_a = append_with_t0_stats_fast(Xte_seq, t0_te, windows=windows, w=w)

    Xtr_d = append_with_t0_stats_fast(Xtr_seq, t0_tr, windows=windows, w=w)
    Xte_d = append_with_t0_stats_fast(Xte_seq, t0_te, windows=windows, w=w)

    Xtr_l = append_with_t0_stats_fast(Xtr_seq, t0_tr, windows=windows, w=w)
    Xte_l = append_with_t0_stats_fast(Xte_seq, t0_te, windows=windows, w=w)

    _, pred_a, scores_a, _ = cv_ridge_predict(Xtr_a, y_angle, Xte_a)
    _, pred_d, scores_d, _ = cv_ridge_predict(Xtr_d, y_depth, Xte_d)
    _, pred_l, scores_l, _ = cv_ridge_predict(Xtr_l, y_lr,    Xte_l)

    mean_of_3 = (float(np.mean(scores_a)) + float(np.mean(scores_d)) + float(np.mean(scores_l))) / 3.0
    print(f"[windows={windows}, w={w}] Mean-of-3 CV = {mean_of_3:.6f}")

    return pred_a, pred_d, pred_l


In [11]:
# windows
def extract_t0_window_stats_fast(X_seq: np.ndarray, t0: np.ndarray, w: int = 20) -> np.ndarray:
    """
    Vectorized t0-window stats.
    X_seq: (N, T, F) float32/float64
    t0:    (N,) int
    w: window half-width, uses [t0-w, t0+w] inclusive => length <= 2w+1

    Returns: (N, 2F) = [mean(F), std(F)]
    """
    X = X_seq
    N, T, F = X.shape
    t0 = t0.astype(np.int32, copy=False)

    # window bounds (inclusive)
    l = np.clip(t0 - w, 0, T-1)
    r = np.clip(t0 + w, 0, T-1)
    # convert to prefix-sum slicing (exclusive right)
    r_ex = r + 1
    lens = (r_ex - l).astype(np.float32)  # (N,)

    # prefix sums over time: P[:,t] = sum_{0..t-1}
    P  = np.concatenate([np.zeros((N, 1, F), dtype=X.dtype), np.cumsum(X, axis=1)], axis=1)       # (N,T+1,F)
    P2 = np.concatenate([np.zeros((N, 1, F), dtype=X.dtype), np.cumsum(X*X, axis=1)], axis=1)     # (N,T+1,F)

    idx = np.arange(N)
    sum_  = P[idx, r_ex, :] - P[idx, l, :]     # (N,F)
    sum2_ = P2[idx, r_ex, :] - P2[idx, l, :]   # (N,F)

    mean = sum_ / lens[:, None]
    ex2  = sum2_ / lens[:, None]
    var  = ex2 - mean * mean
    var  = np.maximum(var, 0.0)  # numerical guard
    std  = np.sqrt(var)

    out = np.concatenate([mean, std], axis=1).astype(np.float32, copy=False)  # (N,2F)
    return out


def append_with_t0_stats_fast(X_seq: np.ndarray, t0: np.ndarray,
                              windows=(4, 8, 16), w: int = 20) -> np.ndarray:
    """
    Return: [append_multiscale_summary(X_seq)] + [t0-window mean/std]
    """
    X_base = append_multiscale_summary(X_seq, windows=windows)            # (N, big)
    X_t0   = extract_t0_window_stats_fast(X_seq, t0, w=w)                 # (N, 2F)
    return np.concatenate([X_base, X_t0], axis=1).astype(np.float32, copy=False)


In [12]:
# attention
def kp_xyz_indices(used_kps, kp_name):
    """Assume X_seq feature order is [kp1_x,kp1_y,kp1_z, kp2_x,kp2_y,kp2_z, ...]."""
    if kp_name not in used_kps:
        return None
    base = used_kps.index(kp_name) * 3
    return (base, base + 1, base + 2)

def _moving_average_1d(v, k):
    # v: (N, L)
    if k is None or k <= 1:
        return v
    pad = k // 2
    v_pad = np.pad(v, ((0,0),(pad,pad)), mode="edge")
    c = np.cumsum(v_pad, axis=1)
    return (c[:, k:] - c[:, :-k]) / k

def estimate_t0_joint_peak(
    X_seq,
    used_kps,
    candidates=None,
    smooth=5,
    return_winner=True,
):
    """
    X_seq: (N,T,F)
    used_kps: list of keypoint base names in the same order used in X_seq
    candidates: list of kp names to consider. If None, use a strong default set.
    smooth: moving-average window on speed magnitude (helps reduce jitter)
    return_winner: also return which keypoint (string) triggered the peak for each sample

    Returns:
      t0: (N,) int  peak frame
      winner_kp: (N,) object (optional)  name of kp that achieved max-speed at t0
    """
    N, T, F = X_seq.shape

    if candidates is None:
        # High-potential release proxies: wrists + elbows + (optionally) fingertips.
        # These names must match entries in used_kps; missing ones are auto-skipped.
        candidates = [
            "right_wrist", "left_wrist",
            "right_elbow", "left_elbow",
            # fingertip/distal joints (only used if present in used_kps)
            "right_index_distal", "left_index_distal",
            "right_middle_distal", "left_middle_distal",
            "right_ring_distal", "left_ring_distal",
            "right_pinky_distal", "left_pinky_distal",
            "right_thumb_distal", "left_thumb_distal",
        ]

    # Keep only candidates actually present
    present = []
    idxs = []
    for kp in candidates:
        idx = kp_xyz_indices(used_kps, kp)
        if idx is not None:
            present.append(kp)
            idxs.append(idx)

    if len(present) == 0:
        raise ValueError("None of the candidate keypoints exist in used_kps. "
                         "Print used_kps[:50] and pick valid names.")

    # Compute per-kp speed magnitude: v_kp in R^{N x (T-1)}
    v_list = []
    for (ix, iy, iz) in idxs:
        x = X_seq[:, :, ix]
        y = X_seq[:, :, iy]
        z = X_seq[:, :, iz]
        vx = np.diff(x, axis=1)
        vy = np.diff(y, axis=1)
        vz = np.diff(z, axis=1)
        v = np.sqrt(vx*vx + vy*vy + vz*vz)           # (N, T-1)
        v = _moving_average_1d(v, smooth)            # (N, T-1)
        v_list.append(v)

    V = np.stack(v_list, axis=0)                     # (K, N, T-1)

    # At each (n,t), pick kp with maximum speed; then pick time of maximum.
    kp_arg = np.argmax(V, axis=0)                    # (N, T-1) kp index at each time
    v_max  = np.max(V, axis=0)                       # (N, T-1) max speed at each time
    t_peak = np.argmax(v_max, axis=1)                # (N,) index in [0..T-2]
    t0 = (t_peak + 1).astype(np.int32)               # map to frame index [1..T-1]

    if not return_winner:
        return t0

    winner_idx = kp_arg[np.arange(N), t_peak]        # (N,) kp index that won at peak time
    winner_kp = np.array([present[i] for i in winner_idx], dtype=object)
    return t0, winner_kp

def apply_adaptive_gaussian_attention(X_seq, t0, sigma=18, k=1.35):
    """
    X_seq: (N,T,F), t0: (N,)
    Multiplies each sample by a Gaussian bump centered at its own t0.
    """
    X = X_seq.copy()
    N, T, F = X.shape
    tt = np.arange(T)[None, :]
    t0 = t0[:, None]
    g = 1.0 + (k - 1.0) * np.exp(-0.5 * ((tt - t0) / sigma) ** 2)  # (N,T)
    X *= g[:, :, None].astype(X.dtype)
    return X

In [13]:
# Visualization
import numpy as np
import matplotlib.pyplot as plt

# -------------------------
# 1) coef -> (T,F) for raw flatten part
# -------------------------
def ridge_unpack_raw_coef(coef_1d: np.ndarray, T: int, F: int) -> np.ndarray:
    """
    coef_1d: Ridge coef on X columns.
    If you appended multiscale summary, coef_1d may be longer than T*F;
    we only visualize the raw flatten part (first T*F).
    """
    coef_1d = np.asarray(coef_1d).reshape(-1)
    raw_len = T * F
    if coef_1d.shape[0] < raw_len:
        raise ValueError(f"coef dim={coef_1d.shape[0]} < T*F={raw_len}")
    return coef_1d[:raw_len].reshape(T, F)

# -------------------------
# 2) aggregate over feature dims -> time importance
# -------------------------
def time_importance(coef_tf: np.ndarray, agg="l2") -> np.ndarray:
    """
    coef_tf: (T, F)
    agg: 'l1' | 'l2' | 'max'
    return: (T,)
    """
    if agg == "l1":
        return np.sum(np.abs(coef_tf), axis=1)
    if agg == "l2":
        return np.sqrt(np.sum(coef_tf**2, axis=1))
    if agg == "max":
        return np.max(np.abs(coef_tf), axis=1)
    raise ValueError("agg must be one of: l1, l2, max")

# -------------------------
# 3) plot time importance + mark peaks
# -------------------------
def plot_time_importance_with_peaks(coef_tf: np.ndarray, agg="l2", topk=8,
                                    title="Time importance (Ridge)"):
    imp = time_importance(coef_tf, agg=agg)
    peaks = np.argsort(imp)[::-1][:topk]

    plt.figure(figsize=(12, 3))
    plt.plot(imp)
    for p in peaks:
        plt.axvline(int(p), linestyle="--", linewidth=1)
        plt.text(int(p), float(imp[p]), f"{int(p)}", rotation=90,
                 va="bottom", ha="center")
    plt.xlabel("time index (frame)")
    plt.ylabel(f"importance ({agg})")
    plt.title(title)
    plt.tight_layout()
    plt.show()

    print("Top peak frames:", [(int(p), float(imp[p])) for p in peaks])

# -------------------------
# 4) heatmap view (optional but useful)
# -------------------------
def plot_coef_heatmap(coef_tf: np.ndarray, title="Coef heatmap (feature vs time)"):
    plt.figure(figsize=(12, 5))
    plt.imshow(coef_tf.T, aspect="auto")  # y=feature dim, x=time
    plt.colorbar()
    plt.xlabel("time index (frame)")
    plt.ylabel("feature dim (flattened)")
    plt.title(title)
    plt.tight_layout()
    plt.show()

# -------------------------
# 5) list top-k (t,f) coefficients
# -------------------------
def topk_coef_entries(coef_tf: np.ndarray, k=30):
    T, F = coef_tf.shape
    flat = coef_tf.reshape(-1)
    idx = np.argsort(np.abs(flat))[::-1][:k]
    out = []
    for j in idx:
        t = j // F
        f = j % F
        out.append((int(t), int(f), float(flat[j])))
    return out

def print_topk(topk_list, feature_names=None):
    for rank, (t, f, c) in enumerate(topk_list, 1):
        nm = feature_names[f] if feature_names is not None else f"f={f}"
        print(f"{rank:02d}. t={t:03d}  {nm:<20}  coef={c:+.6e}")


In [14]:
# ---- (ADD) Multiscale time summarization features ----
def append_multiscale_summary(X_seq: np.ndarray, windows=(4, 8, 16)) -> np.ndarray:
    """
    X_seq: (N, T, F) float32
    Return: (N, T*F + sum_w 2*(T/w)*F) float32
      = [raw flatten] + [block-mean] + [block-std] for each window w.
    """
    assert X_seq.ndim == 3
    N, T, F = X_seq.shape

    feats = [X_seq.reshape(N, -1)]
    for w in windows:
        if T % w != 0:
            continue  # only keep exact partitions
        Xw = X_seq.reshape(N, T // w, w, F)
        m = Xw.mean(axis=2)           # (N, T/w, F)
        s = Xw.std(axis=2)            # (N, T/w, F)
        feats.append(m.reshape(N, -1))
        feats.append(s.reshape(N, -1))

    return np.concatenate(feats, axis=1).astype(np.float32, copy=False)



In [15]:
def xgb_kfold_predict(
    Xtr, y, Xte,
    n_splits=N_SPLITS,
    seed=RANDOM_SEED,
    params=None,
    early_stopping_rounds=200,
    verbose=True,
):
    """
    XGBoost KFold CV (OOF + test avg), interface aligned with ridge_kfold_predict:
      returns (oof, te_pred, fold_scores, best_pack, rmse_oof)

    Notes:
    - No scaling needed for tree models.
    - Uses early stopping on each fold.
    """
    import numpy as np
    from sklearn.model_selection import KFold
    from sklearn.metrics import mean_squared_error

    # xgboost import
    import xgboost as xgb

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    oof = np.zeros(len(y), dtype=np.float32)
    te_sum = np.zeros(Xte.shape[0], dtype=np.float64)
    fold_scores = []

    # default params (safe/strong baseline for tabular regression)
    if params is None:
        params = dict(
            n_estimators=1000,
            learning_rate=0.05,
            max_depth=8,
            min_child_weight=10,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=5.0,
            reg_alpha=0.0,
            gamma=0.0,
            objective="reg:squarederror",
            eval_metric="rmse",
            tree_method="hist",   # fast on CPU; if GPU available you can switch to "gpu_hist"
            random_state=seed,
            n_jobs=-1,
        )

    best_pack = None  # store first fold booster + best_iteration for debugging

    for fold, (tr_idx, va_idx) in enumerate(kf.split(Xtr), 1):
        X_tr, X_va = Xtr[tr_idx], Xtr[va_idx]
        y_tr, y_va = y[tr_idx], y[va_idx]

        model = xgb.XGBRegressor(**params)

        # early stopping via callbacks (compatible with newer xgboost)
        callbacks = []
        if early_stopping_rounds is not None and early_stopping_rounds > 0:
            callbacks.append(xgb.callback.EarlyStopping(rounds=early_stopping_rounds, save_best=True))

        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            verbose=False)

        # predict val with best iteration
        va_pred = model.predict(X_va).astype(np.float32)
        oof[va_idx] = va_pred

        # predict test (use same best iteration)
        te_pred_fold = model.predict(Xte).astype(np.float64)
        te_sum += te_pred_fold

        rmse = float(np.sqrt(mean_squared_error(y_va, va_pred)))
        fold_scores.append(rmse)

        if fold == 1:
            # keep for inspection if needed
            best_pack = {
                "best_iteration": getattr(model, "best_iteration", None),
                "best_ntree_limit": getattr(model, "best_ntree_limit", None),
                "model": model,
            }

        if verbose:
            bi = getattr(model, "best_iteration", None)
            if bi is None:
                print(f"[XGB Fold {fold}] RMSE={rmse:.5f}")
            else:
                print(f"[XGB Fold {fold}] RMSE={rmse:.5f}  best_iter={bi}")

    te_pred = (te_sum / n_splits).astype(np.float32)

    rmse_oof = float(np.sqrt(mean_squared_error(y, oof)))
    mean_rmse = float(np.mean(fold_scores))
    std_rmse  = float(np.std(fold_scores))

    if verbose:
        print(f"Fold Mean RMSE = {mean_rmse:.5f} ± {std_rmse:.5f}")
        print(f"OOF RMSE       = {rmse_oof:.5f}")

    return oof, te_pred, fold_scores, best_pack, rmse_oof

Config

In [16]:
    # =========================
    # CONFIG (easy rollback)
    # =========================
    CFG = dict(
        # toggles
        MODEL = "xgb",  # "ridge" or "xgb"
        USE_ANGLE_ATTENTION=True,   # set False to rollback attention
        USE_T0_STATS=True,          # set False to rollback window stats
        USE_ENSEMBLE=False,          # set False to rollback to single model

        # angle attention params (only if USE_ANGLE_ATTENTION)
        ANGLE_SIGMA=45,
        ANGLE_K=1.15,

        # t0-window half widths per target (only if USE_T0_STATS)
        W_ANGLE=25,
        W_DEPTH=15,
        W_LR=15,

        # multiscale windows
        WINDOWS_A=(6, 12, 24),
        WINDOWS_B=(6, 12, 24), # (4, 8, 16), (6, 12, 24) # only used if USE_ENSEMBLE
    )


In [17]:
def build_X_for_target(Xtr_seq, Xte_seq, t0_tr, t0_te, *, target,
                       use_t0_stats=True, windows=(6,12,24),
                       w_angle=25, w_depth=15, w_lr=15,
                       use_angle_attention=False, angle_sigma=45, angle_k=1.15):
    """
    Returns: Xtr_2d, Xte_2d
    target in {"angle","depth","lr"}
    """
    # choose seq (possibly with attention)
    if target == "angle" and use_angle_attention:
        Xtr_s = apply_adaptive_gaussian_attention(Xtr_seq, t0_tr, sigma=angle_sigma, k=angle_k)
        Xte_s = apply_adaptive_gaussian_attention(Xte_seq, t0_te, sigma=angle_sigma, k=angle_k)
    else:
        Xtr_s, Xte_s = Xtr_seq, Xte_seq

    # choose window half width
    if target == "angle":
        w = w_angle
    elif target == "depth":
        w = w_depth
    else:
        w = w_lr

    # build final 2D features
    if use_t0_stats:
        Xtr_2d = append_with_t0_stats_fast(Xtr_s, t0_tr, windows=windows, w=w)
        Xte_2d = append_with_t0_stats_fast(Xte_s, t0_te, windows=windows, w=w)
    else:
        # rollback: multiscale only (still 2D)
        Xtr_2d = append_multiscale_summary(Xtr_s, windows=windows)
        Xte_2d = append_multiscale_summary(Xte_s, windows=windows)

    return Xtr_2d, Xte_2d


In [18]:
# =========================
# Submission scaling (v02 core)
# =========================
def minmax_scale_clip(x, vmin, vmax):
    x = (x - vmin) / (vmax - vmin)
    return np.clip(x, 0.0, 1.0)

def main():
    import os
    import numpy as np
    import pandas as pd

    BASE = "/kaggle/input/spl-utspan-data-challenge-2026"
    TRAIN_PATH = os.path.join(BASE, "train.csv")
    TEST_PATH  = os.path.join(BASE, "test.csv")
    SUB_PATH   = os.path.join(BASE, "submission.csv")

    train = pd.read_csv(TRAIN_PATH)
    test  = pd.read_csv(TEST_PATH)
    sub   = pd.read_csv(SUB_PATH)

    keypoints = get_keypoints_from_columns(train)
    assert len(keypoints) > 0
    print("Keypoints:", len(keypoints))
    
    # ---- Build features ----
    Xtr_seq, used_kps, angle_triplets = build_fulltime_sequence(train, keypoints)
    Xte_seq, _, _ = build_fulltime_sequence(test, keypoints)

    T = Xtr_seq.shape[1]  # <<< 必须先定义
    F = Xtr_seq.shape[2]

    # ---- estimate adaptive t0 (joint peak) ----
    t0_tr, win_tr = estimate_t0_joint_peak(Xtr_seq, used_kps, smooth=5, return_winner=True)
    t0_te, win_te = estimate_t0_joint_peak(Xte_seq, used_kps, smooth=5, return_winner=True)

    print("t0_tr stats (raw):", int(t0_tr.min()), int(t0_tr.mean()), int(t0_tr.max()))
    import collections
    print(collections.Counter(win_tr).most_common(10))

    # ---- IMPORTANT: clip t0 to avoid boundary noise (e.g. min=1) ----
    # t0_tr = np.clip(t0_tr, 40, T-20).astype(np.int32)
    # t0_te = np.clip(t0_te, 40, T-20).astype(np.int32)
    # print("t0_tr stats (clipped):", int(t0_tr.min()), int(t0_tr.mean()), int(t0_tr.max()))

    # ---- targets ----
    y_angle = train["angle"].values.astype(np.float32)
    y_depth = train["depth"].values.astype(np.float32)
    y_lr    = train["left_right"].values.astype(np.float32)

    def run_one_model(windows):
        # build X for each target
        Xtr_a, Xte_a = build_X_for_target(
            Xtr_seq, Xte_seq, t0_tr, t0_te, target="angle",
            use_t0_stats=CFG["USE_T0_STATS"], windows=windows,
            w_angle=CFG["W_ANGLE"], w_depth=CFG["W_DEPTH"], w_lr=CFG["W_LR"],
            use_angle_attention=CFG["USE_ANGLE_ATTENTION"],
            angle_sigma=CFG["ANGLE_SIGMA"], angle_k=CFG["ANGLE_K"],
        )
        Xtr_d, Xte_d = build_X_for_target(
            Xtr_seq, Xte_seq, t0_tr, t0_te, target="depth",
            use_t0_stats=CFG["USE_T0_STATS"], windows=windows,
            w_angle=CFG["W_ANGLE"], w_depth=CFG["W_DEPTH"], w_lr=CFG["W_LR"],
            use_angle_attention=False,  # only angle can use attention
        )
        Xtr_l, Xte_l = build_X_for_target(
            Xtr_seq, Xte_seq, t0_tr, t0_te, target="lr",
            use_t0_stats=CFG["USE_T0_STATS"], windows=windows,
            w_angle=CFG["W_ANGLE"], w_depth=CFG["W_DEPTH"], w_lr=CFG["W_LR"],
            use_angle_attention=False,
        )

        # train/predict
        if CFG.get("MODEL", "ridge") == "ridge":
            oof_a, pred_a, s_a, _, rmse_a = xgb_kfold_predict(Xtr_a, y_angle, Xte_a, seed=RANDOM_SEED)
            oof_d, pred_d, s_d, _, rmse_d = xgb_kfold_predict(Xtr_d, y_depth, Xte_d, seed=RANDOM_SEED+1)
            oof_l, pred_l, s_l, _, rmse_l = xgb_kfold_predict(Xtr_l, y_lr,    Xte_l, seed=RANDOM_SEED+2)
        else:
            oof_a, pred_a, s_a, _, rmse_a = ridge_kfold_predict(Xtr_a, y_angle, Xte_a, alpha=ALPHA, seed=RANDOM_SEED)
            oof_d, pred_d, s_d, _, rmse_d = ridge_kfold_predict(Xtr_d, y_depth, Xte_d, alpha=ALPHA, seed=RANDOM_SEED)
            oof_l, pred_l, s_l, _, rmse_l = ridge_kfold_predict(Xtr_l, y_lr,    Xte_l, alpha=ALPHA, seed=RANDOM_SEED)

            # oof_a, pred_a, s_a, rmse_a = two_stage_residual_ridge_kfold_strict(
                # Xtr_a, y_angle, Xte_a, alpha1=3000, alpha2=800, n_splits=5, seed=RANDOM_SEED
            # )
            # oof_d, pred_d, s_d, rmse_d = two_stage_residual_ridge_kfold_strict(
                # Xtr_d, y_depth, Xte_d, alpha1=3000, alpha2=800,  n_splits=5, seed=RANDOM_SEED
            # )
            # oof_l, pred_l, s_l, rmse_l = two_stage_residual_ridge_kfold_strict(
                # Xtr_l, y_lr,    Xte_l, alpha1=3000, alpha2=800, n_splits=5, seed=RANDOM_SEED
            # )

        mean_of_3 = (rmse_a + rmse_d + rmse_l) / 3.0
        print(f"[windows={windows}] OOF Mean-of-3 = {mean_of_3:.6f}")

        # Return fold rmses too (for debugging/compat)
        return (pred_a, pred_d, pred_l), (s_a, s_d, s_l), (rmse_a, rmse_d, rmse_l)

    if not CFG["USE_ENSEMBLE"]:
        (pred_angle, pred_depth, pred_lr), (s_a, s_d, s_l), (rmse_a, rmse_d, rmse_l) = run_one_model(CFG["WINDOWS_A"])
        print("\n=== CV Summary (single, OOF) ===")
        print(f"Angle RMSE: {rmse_a:.6f}")
        print(f"Depth RMSE: {rmse_d:.6f}")
        print(f"LR    RMSE: {rmse_l:.6f}")
        print(f"Mean-of-3:  {(rmse_a + rmse_d + rmse_l)/3.0:.6f}")
    else:
        (pa1, pd1, pl1), (sa1, sd1, sl1), (ra1, rd1, rl1) = run_one_model(CFG["WINDOWS_A"])
        (pa2, pd2, pl2), (sa2, sd2, sl2), (ra2, rd2, rl2) = run_one_model(CFG["WINDOWS_B"])

        pred_angle = 0.5 * (pa1 + pa2)
        pred_depth = 0.5 * (pd1 + pd2)
        pred_lr    = 0.5 * (pl1 + pl2)

        # ensemble 的 OOF 指标（按两个配置平均）
        rmse_a = 0.5 * (ra1 + ra2)
        rmse_d = 0.5 * (rd1 + rd2)
        rmse_l = 0.5 * (rl1 + rl2)

        print("\n=== CV Summary (ensemble avg of 2 configs, OOF) ===")
        print(f"Angle RMSE: {rmse_a:.6f}")
        print(f"Depth RMSE: {rmse_d:.6f}")
        print(f"LR    RMSE: {rmse_l:.6f}")
        print(f"Mean-of-3:  {(rmse_a + rmse_d + rmse_l)/3.0:.6f}")

    
    # ---- Fill submission EXACTLY by template columns ----
    def pick_col(sub_cols, name):
        if f"scaled_{name}" in sub_cols:
            return f"scaled_{name}", True
        if name in sub_cols:
            return name, False
        for c in sub_cols:
            if c != "id" and name in c:
                return c, c.startswith("scaled_")
        raise ValueError(f"Cannot find column for '{name}' in template: {list(sub_cols)}")

    cols = sub.columns
    cA, A_scaled = pick_col(cols, "angle")
    cD, D_scaled = pick_col(cols, "depth")
    cL, L_scaled = pick_col(cols, "left_right")

    if A_scaled: sub[cA] = minmax_scale_clip(pred_angle, *SCALER_BOUNDS["angle"])
    else:        sub[cA] = pred_angle

    if D_scaled: sub[cD] = minmax_scale_clip(pred_depth, *SCALER_BOUNDS["depth"])
    else:        sub[cD] = pred_depth

    if L_scaled: sub[cL] = minmax_scale_clip(pred_lr, *SCALER_BOUNDS["left_right"])
    else:        sub[cL] = pred_lr

    # keep only template columns, in template order
    sub = sub[cols]

    # sanity checks
    assert len(sub) == len(test)
    assert sub.shape[1] == pd.read_csv(SUB_PATH).shape[1]

    out_path = "submission.csv"
    sub.to_csv(out_path, index=False)
    print("Saved:", out_path, "shape:", sub.shape)
    print(sub.head())

In [19]:
main()

Keypoints: 69
t0_tr stats (raw): 1 163 234
[('right_wrist', 259), ('left_wrist', 86)]
Fold Mean RMSE = 2.80053 ± 0.24661
OOF RMSE       = 2.81137
Fold Mean RMSE = 3.49837 ± 0.29615
OOF RMSE       = 3.51088
Fold Mean RMSE = 3.40399 ± 0.18637
OOF RMSE       = 3.40909
[windows=(6, 12, 24)] OOF Mean-of-3 = 3.243781

=== CV Summary (single, OOF) ===
Angle RMSE: 2.811368
Depth RMSE: 3.510881
LR    RMSE: 3.409092
Mean-of-3:  3.243781
Saved: submission.csv shape: (113, 4)
                                     id  scaled_angle  scaled_depth  \
0  d5cc9ade-6bfd-42d2-8404-99d7506e535c      0.520770      0.487105   
1  6fb475ff-1732-42bc-8385-9f80956199fe      0.477969      0.481610   
2  39f95c12-deab-4d77-8a9c-feecda4d5a66      0.515179      0.540309   
3  5ec65bf7-4892-4076-a572-e01b4b8ff038      0.470096      0.542501   
4  52ffbd2a-969c-4e52-af66-c4b4be3c3cbb      0.482992      0.591587   

   scaled_left_right  
0           0.398150  
1           0.547909  
2           0.494540  
3           