Variants:
  1) Full PADRe (Ours)
  2) w/o Gate (Fixed Deg 1/2/3)
  3) Gate w/o STE (Soft weighted sum)
  4) w/o Hadamard
  5) w/o FFN
  6) w/o Residual

Metrics:
  - Clean Val Acc / Macro-F1
  - Noisy Val Acc / Macro-F1 (SNR=10 dB)
  - Robustness Drop (Clean F1 - Noisy F1)
  - Normalized Compute
  - Degree Entropy (gate diversity)

# UCI-HAR

In [1]:
import os, copy, random, time
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from sklearn.metrics import f1_score

# ──────────────────────────────────────────────────────────────────────────────
# Seed / Device
# ──────────────────────────────────────────────────────────────────────────────
def set_seed(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

DEVICE  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = DEVICE.type == "cuda"
print(f"Device: {DEVICE} | pin_memory: {USE_GPU}")


# ──────────────────────────────────────────────────────────────────────────────
# Dataset
# ──────────────────────────────────────────────────────────────────────────────
CLASS_NAMES = ["WALK", "UP", "DOWN", "SIT", "STAND", "LAY"]

class UCIHARDataset(Dataset):
    def __init__(self, data_dir, split="train", normalize=None):
        self.data_dir = Path(data_dir)
        self.split    = split
        self.X, self.y = self._load_data()
        self.X = torch.FloatTensor(self.X)
        self.y = torch.LongTensor(self.y) - 1

        self.normalize = normalize

    def _load_data(self):
        split_dir    = self.data_dir / self.split
        signal_types = [
            "body_acc_x","body_acc_y","body_acc_z",
            "body_gyro_x","body_gyro_y","body_gyro_z",
            "total_acc_x","total_acc_y","total_acc_z",
        ]
        signals = []
        for st in signal_types:
            fname = split_dir / "Inertial Signals" / f"{st}_{self.split}.txt"
            signals.append(np.loadtxt(fname))
        X = np.stack(signals, axis=1)
        y = np.loadtxt(split_dir / f"y_{self.split}.txt", dtype=int)
        return X, y

    def __len__(self):  return len(self.X)

    def __getitem__(self, idx):
        X = self.X[idx]
        y = self.y[idx]
        if self.normalize is not None:
            mean, std = self.normalize
            X = (X - mean.squeeze(0)) / std.squeeze(0)
        return X, y


# ──────────────────────────────────────────────────────────────────────────────
# Utils
# ──────────────────────────────────────────────────────────────────────────────
def cosine_temperature(ep, total, tmax=5.0, tmin=0.5):
    r = ep / max(total - 1, 1)
    return tmin + (tmax - tmin) * 0.5 * (1.0 + np.cos(np.pi * r))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# ──────────────────────────────────────────────────────────────────────────────
# Corruptions (SNR=10 uses this)
# ──────────────────────────────────────────────────────────────────────────────
def add_gaussian_noise(X, snr_db):
    """
    X: (B,C,T)
    snr_db: float
    """
    signal_power = (X ** 2).mean(dim=(1, 2), keepdim=True)
    snr = 10 ** (snr_db / 10.0)
    noise_power = signal_power / snr
    noise = torch.randn_like(X) * torch.sqrt(noise_power)
    return X + noise


# ──────────────────────────────────────────────────────────────────────────────
# Compute-Aware Degree Gate  (★ Variant behavior matches "예전버전")
#   - use_ste=True  : train=STE(hard fwd, soft bwd), eval=hard onehot
#   - use_ste=False : always soft_probs (train/eval 동일)
# ──────────────────────────────────────────────────────────────────────────────
class ComputeAwareDegreeGate(nn.Module):
    def __init__(self,
                 channels,
                 max_degree=3,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5
        ):
        super().__init__()
        self.max_degree = max_degree

        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(1),
            nn.Linear(channels, gate_hidden_dim),
            nn.GELU(),
            nn.Linear(gate_hidden_dim, max_degree),
        )

        nn.init.zeros_(self.gate[-1].bias)
        if max_degree >= 3:
            self.gate[-1].bias.data[1] = 0.4

        self.register_buffer("temperature", torch.tensor(float(temperature_initial)))
        self.temperature_min = float(temperature_min)

    def set_temperature(self, t):
        self.temperature.fill_(max(float(t), self.temperature_min))

    def forward(self, x, use_ste=True):
        logits = self.gate(x)  # (B,K)
        soft_probs = F.softmax(logits / self.temperature, dim=-1)

        if use_ste:
            if self.training:
                hard_idx = logits.argmax(dim=-1)
                hard_oh = F.one_hot(hard_idx, num_classes=self.max_degree).float()
                # STE: forward=hard, backward=soft
                degree_w = hard_oh - soft_probs.detach() + soft_probs
            else:
                degree_w = F.one_hot(
                    logits.argmax(dim=-1), num_classes=self.max_degree
                ).float()
        else:
            # Gate w/o STE: always soft (train/eval 동일)
            degree_w = soft_probs

        return degree_w, logits, soft_probs


# ──────────────────────────────────────────────────────────────────────────────
# PADRe Block (Ablation switches ONLY; ★ Variant behavior matches "예전버전")
#   - w/o Gate: fixed_degree (1..K) → build up to d, output Z[d-1]
#   - Gate w/o STE: soft weighted sum of ALL degrees (train/eval 동일)
#   - w/o Hadamard: prefix sum (Z[i]=Z[i-1]+Y[i])  (예전버전)
# ──────────────────────────────────────────────────────────────────────────────
class PADReBlockAblation(nn.Module):
    def __init__(self,
                 channels,
                 seq_len,
                 max_degree=3,
                 token_kernel=11,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,      # 1..K if w/o Gate (fixed)
                 use_ste=True,           # Gate w/o STE (soft routing)
                 use_hadamard=True,      # w/o Hadamard (prefix-sum)
        ):
        super().__init__()
        self.max_degree = max_degree

        self.use_gate = bool(use_gate)
        self.fixed_degree = fixed_degree  # None or int in [1..K]
        self.use_ste = bool(use_ste)
        self.use_hadamard = bool(use_hadamard)

        self.degree_gate = ComputeAwareDegreeGate(
            channels,
            max_degree=max_degree,
            gate_hidden_dim=gate_hidden_dim,
            temperature_initial=temperature_initial,
            temperature_min=temperature_min
        )

        self.channel_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree)
        ])

        self.token_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree)
        ])

        self.pre_hadamard_channel = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree-1)
        ])

        self.pre_hadamard_token = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree-1)
        ])

        self.norm = nn.LayerNorm(channels)

    def set_temperature(self, t):
        self.degree_gate.set_temperature(t)

    def _build_Y(self, x, max_deg):
        return [self.token_mixing[i](self.channel_mixing[i](x)) for i in range(max_deg)]

    def _build_Z(self, x, max_deg):
        """
        - use_hadamard=True : 예전버전(원본) Hadamard chain
            Z0=Y0, Zi = pre(Z_{i-1}) * Yi
        - use_hadamard=False: 예전버전 w/o Hadamard (prefix sum)
            Z0=Y0, Zi = Z_{i-1} + Yi
        """
        Y = self._build_Y(x, max_deg)

        if self.use_hadamard:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z_ = self.pre_hadamard_token[i-1](self.pre_hadamard_channel[i-1](Z[-1]))
                Z.append(Z_ * Y[i])
            return Z
        else:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z.append(Z[-1] + Y[i])
            return Z

    def _hard_select(self, Z_list, sel):
        B = Z_list[0].shape[0]
        Z_stack = torch.stack(Z_list, dim=0)  # (K,B,C,T)
        return Z_stack[sel, torch.arange(B, device=Z_stack.device)]

    def _soft_weighted_output(self, x, soft_probs):
        """
        예전버전 Gate w/o STE:
          - always compute ALL K degrees
          - weighted sum with soft_probs
          - hadamard / no_hadamard build rule은 동일하게 적용
        """
        B = x.size(0)
        Z = self._build_Z(x, max_deg=self.max_degree)      # list length K, each (B,C,T)
        Z_stack = torch.stack(Z, dim=1)                    # (B,K,C,T)
        w = soft_probs.view(B, self.max_degree, 1, 1)      # (B,K,1,1)
        out = (Z_stack * w).sum(dim=1)                     # (B,C,T)
        return out

    def forward(self, x, return_gate_info=False):
        B = x.shape[0]

        # ---- Case A) w/o Gate (fixed degree 1..K) ----
        if (not self.use_gate) or (self.fixed_degree is not None):
            d = int(self.fixed_degree) if self.fixed_degree is not None else self.max_degree
            d = max(1, min(d, self.max_degree))

            # build only up to d, output "degree d" path (예전버전)
            Z = self._build_Z(x, max_deg=d)
            out = Z[-1]

            # stats payload (예전버전 스타일)
            sel = torch.full((B,), d - 1, device=x.device, dtype=torch.long)
            K = self.max_degree
            sp = F.one_hot(sel, num_classes=K).float()
            dw = sp
            logits = sp

            out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)
            if return_gate_info:
                return out, {
                    "degree_selection": dw,
                    "soft_probs": sp,
                    "logits": logits,
                    "compute_cost": float(d),
                }
            return out

        # ---- Case B) Gate ON ----
        degree_w, logits, soft_probs = self.degree_gate(x, use_ste=self.use_ste)

        if (not self.use_ste):
            # 예전버전: Gate w/o STE는 train/eval 관계없이 ALWAYS soft weighted sum
            out = self._soft_weighted_output(x, degree_w)  # degree_w == soft_probs
            # (stats only) 대표 degree
            selected = soft_probs.argmax(dim=-1)
            # compute_cost (예전코드에선 argmax 기반이었지만, 여기서는 gi에만 들어가므로 그대로 둠)
            compute_cost = (selected + 1).float().mean().item()
        else:
            # 원본: hard select (STE는 train에서 degree_w에 반영됨)
            selected = degree_w.argmax(dim=-1)
            max_deg = max(1, min(int(selected.max().item()) + 1, self.max_degree))
            Z = self._build_Z(x, max_deg=max_deg)
            out = self._hard_select(Z, selected)
            compute_cost = (selected + 1).float().mean().item()

        out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)

        if return_gate_info:
            return out, {
                "degree_selection": degree_w,
                "soft_probs": soft_probs,
                "logits": logits,
                "compute_cost": compute_cost,
            }
        return out


# ──────────────────────────────────────────────────────────────────────────────
# Adaptive PADRe Model (Ablation switches ONLY; otherwise matches your logic)
# ──────────────────────────────────────────────────────────────────────────────
class PADReHAR_Ablation(nn.Module):
    def __init__(self,
                 in_channels=9,
                 seq_len=128,
                 num_classes=6,
                 hidden_dim=48,
                 num_layers=3,
                 max_degree=3,
                 gate_hidden_dim=16,
                 dropout=0.2,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,     # for w/o Gate (1/2/3)
                 use_ste=True,          # Gate w/o STE
                 use_hadamard=True,     # w/o Hadamard
                 use_ffn=True,          # w/o FFN
                 use_residual=True,     # w/o Residual
        ):
        super().__init__()
        self.num_layers = num_layers
        self.max_degree = max_degree

        self.use_ffn = bool(use_ffn)
        self.use_residual = bool(use_residual)

        self.input_proj = nn.Conv1d(in_channels, hidden_dim, kernel_size=1)

        self.padre_blocks = nn.ModuleList([
            PADReBlockAblation(
                hidden_dim, seq_len,
                max_degree=max_degree,
                token_kernel=11,
                gate_hidden_dim=gate_hidden_dim,
                temperature_initial=temperature_initial,
                temperature_min=temperature_min,
                use_gate=use_gate,
                fixed_degree=fixed_degree,
                use_ste=use_ste,
                use_hadamard=use_hadamard,
            )
            for _ in range(num_layers)
        ])

        self.ffn = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_dim, hidden_dim * 2, kernel_size=1),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
                nn.Dropout(dropout),
            )
            for _ in range(num_layers)
        ])

        self.norms1 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])
        self.norms2 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def set_temperature(self, t):
        for b in self.padre_blocks:
            b.set_temperature(t)

    def _ln(self, norm, x):
        return norm(x.permute(0, 2, 1)).permute(0, 2, 1)

    def forward(self, x, return_gate_info=False):
        x = self.input_proj(x)
        gate_info_list = [] if return_gate_info else None
        total_compute  = 0.0

        for i, block in enumerate(self.padre_blocks):
            res = x

            if return_gate_info:
                x, gi = block(x, return_gate_info=True)
                gate_info_list.append(gi)
                total_compute += gi["compute_cost"]
            else:
                x = block(x)

            if self.use_residual:
                x = self._ln(self.norms1[i], x + res)
            else:
                x = self._ln(self.norms1[i], x)

            res2 = x
            if self.use_ffn:
                x = self.ffn[i](x)

            if self.use_residual:
                x = self._ln(self.norms2[i], x + res2)
            else:
                x = self._ln(self.norms2[i], x)

        logits = self.classifier(self.global_pool(x).squeeze(-1))
        return (logits, gate_info_list, total_compute) if return_gate_info else logits


# ──────────────────────────────────────────────────────────────────────────────
# Train & Eval (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def train_model(model,
                train_loader,
                test_loader,
                device,
                lr=1e-3,
                weight_decay=1e-4,
                epochs=30,
                seed=42
    ):
    set_seed(seed)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)
    criterion = nn.CrossEntropyLoss()

    best_f1 = -1.0
    best_state = None

    for ep in range(epochs):
        temp = cosine_temperature(ep, epochs, tmax=5.0, tmin=0.5)
        model.set_temperature(temp)

        model.train()
        train_loss_sum = 0.0
        train_n = 0
        for X, y in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            bs = y.size(0)
            train_loss_sum += loss.item() * bs
            train_n += bs

        scheduler.step()
        train_loss = train_loss_sum / max(train_n, 1)

        model.eval()
        preds_all, labels_all = [], []
        with torch.no_grad():
            for X, y in test_loader:
                X, y = X.to(device), y.to(device)
                preds_all.extend(model(X).argmax(1).cpu().numpy())
                labels_all.extend(y.cpu().numpy())
        test_f1 = f1_score(labels_all, preds_all, average="macro")

        if test_f1 > best_f1:
            best_f1 = test_f1
            best_state = copy.deepcopy(model.state_dict())

        if (ep + 1) % 5 == 0:
            cur_lr = optimizer.param_groups[0]["lr"]
            print(f"Epoch {ep+1:02d}/{epochs} | LR={cur_lr:.4f} | Train Loss={train_loss:.4f} | TestF1={test_f1:.4f} | BestF1={best_f1:.4f} | Temp={temp:.3f}")

    model.load_state_dict(best_state)
    print(f"\nBest Test Macro-F1: {best_f1:.4f}")
    return model

@torch.no_grad()
def compute_train_stats(train_loader, device="cpu", eps=1e-6):
    """
    Returns:
      mean: (C,1) tensor
      std : (C,1) tensor
    Note:
      X shape from loader: (B,C,T)
      We compute stats over (B,T) for each channel.
    """
    sum_x = None
    sum_x2 = None
    n = 0

    for X, _ in train_loader:
        X = X.to(device)  # (B,C,T)
        B, C, T = X.shape
        if sum_x is None:
            sum_x = torch.zeros(C, device=device)
            sum_x2 = torch.zeros(C, device=device)

        # sum over batch and time
        sum_x  += X.sum(dim=(0, 2))                 # (C,)
        sum_x2 += (X * X).sum(dim=(0, 2))           # (C,)
        n += B * T

    mean = (sum_x / n)                              # (C,)
    var  = (sum_x2 / n) - mean * mean               # (C,)
    std  = torch.sqrt(torch.clamp(var, min=eps))    # (C,)

    # reshape for broadcasting: (1,C,1) or (C,1)
    mean = mean.view(1, -1, 1)
    std  = std.view(1, -1, 1)
    return mean.detach().cpu(), std.detach().cpu()
# ──────────────────────────────────────────────────────────────────────────────
# Table Metrics (keep as-is in your "now code")
# ──────────────────────────────────────────────────────────────────────────────
@torch.no_grad()
def eval_f1_and_gate_stats(model, loader, device, snr_db=None, max_degree=3):
    """
    Returns:
      macro_f1, degree_entropy, norm_comp
    Definitions:
      - degree_entropy: mean over (layers, samples) of normalized entropy of soft_probs
                        H(p)/log(K)  where K=max_degree
      - norm_comp: mean expected degree / max_degree, averaged over layers
    """
    model.eval()

    all_preds, all_labels = [], []
    ent_sum = 0.0
    ent_count = 0
    comp_sum = 0.0
    comp_count = 0

    eps = 1e-12
    logK = float(np.log(max_degree))

    deg_vals = torch.arange(1, max_degree + 1, device=device).float()

    for X, y in loader:
        X = X.to(device)
        y = y.to(device)

        if snr_db is not None:
            X = add_gaussian_noise(X, float(snr_db))

        logits, gate_info_list, _ = model(X, return_gate_info=True)
        preds = logits.argmax(dim=1)

        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

        # gate stats
        for gi in gate_info_list:
            sp = gi["soft_probs"]  # (B,K)
            ent = -(sp * (sp + eps).log()).sum(dim=-1) / logK
            ent_sum += ent.mean().item()
            ent_count += 1

            exp_deg = (sp * deg_vals).sum(dim=-1).mean().item()
            comp_sum += (exp_deg / max_degree)
            comp_count += 1

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    macro_f1 = float(f1_score(all_labels, all_preds, average="macro"))

    degree_entropy = float(ent_sum / max(ent_count, 1))
    norm_comp = float(comp_sum / max(comp_count, 1))
    return macro_f1, degree_entropy, norm_comp


def format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp):
    drop_pct = 100.0 * (clean_f1 - snr10_f1) / max(clean_f1, 1e-12)
    return {
        "Variant": name,
        "CleanF1": clean_f1,
        "SNR10F1": snr10_f1,
        "drop(%)": drop_pct,
        "DegreeEntropy": deg_ent,
        "NormComp": norm_comp,
    }


def print_table(rows):
    header = ["Variant", "Clean F1", "(SNR=10) F1", "drop(%)", "Degree Entropy", "NormComp"]
    print("\n" + "="*110)
    print("UCI-HAR Ablation Table")
    print("="*110)
    print(f"{header[0]:<22s} | {header[1]:>8s} | {header[2]:>11s} | {header[3]:>7s} | {header[4]:>14s} | {header[5]:>8s}")
    print("-"*110)
    for r in rows:
        print(
            f"{r['Variant']:<22s} | "
            f"{r['CleanF1']:>8.4f} | "
            f"{r['SNR10F1']:>11.4f} | "
            f"{r['drop(%)']:>7.2f} | "
            f"{r['DegreeEntropy']:>14.4f} | "
            f"{r['NormComp']:>8.4f}"
        )
    print("-"*110)

    print("\n[LaTeX rows]")
    for r in rows:
        print(
            f"{r['Variant']} & {r['CleanF1']:.4f} & {r['SNR10F1']:.4f} & "
            f"{r['drop(%)']:.2f} & {r['DegreeEntropy']:.4f} & {r['NormComp']:.4f} \\\\"
        )


# ──────────────────────────────────────────────────────────────────────────────
# Experiment Runner (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def build_variant(name, cfg):
    model = PADReHAR_Ablation(
        in_channels=cfg["in_channels"],
        seq_len=cfg["seq_len"],
        num_classes=cfg["num_classes"],
        hidden_dim=cfg["hidden_dim"],
        num_layers=cfg["num_layers"],
        max_degree=cfg["max_degree"],
        gate_hidden_dim=cfg["gate_hidden_dim"],
        dropout=cfg["dropout"],
        temperature_initial=cfg["temperature_initial"],
        temperature_min=cfg["temperature_min"],
        use_gate=cfg.get("use_gate", True),
        fixed_degree=cfg.get("fixed_degree", None),
        use_ste=cfg.get("use_ste", True),
        use_hadamard=cfg.get("use_hadamard", True),
        use_ffn=cfg.get("use_ffn", True),
        use_residual=cfg.get("use_residual", True),
    ).to(cfg["device"])
    return model


def run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg):
    variants = []
    variants.append(("Full (Ours)", dict()))
    variants.append(("w/o Gate-1", dict(use_gate=False, fixed_degree=1)))
    variants.append(("w/o Gate-2", dict(use_gate=False, fixed_degree=2)))
    variants.append(("w/o Gate-3", dict(use_gate=False, fixed_degree=3)))
    variants.append(("Gate w/o STE", dict(use_gate=True, use_ste=False)))
    variants.append(("w/o Hadamard", dict(use_hadamard=False)))
    variants.append(("w/o FFN", dict(use_ffn=False)))
    variants.append(("w/o Residual", dict(use_residual=False)))

    rows = []

    for name, delta in variants:
        print("\n" + "="*80)
        print(f"[Running Variant] {name}")
        print("="*80)

        cfg = copy.deepcopy(base_cfg)
        cfg.update(delta)

        set_seed(train_cfg["seed"])
        model = build_variant(name, cfg)
        print(f"Model params: {count_parameters(model):,}")

        model = train_model(
            model,
            train_loader,
            test_loader,
            cfg["device"],
            lr=train_cfg["lr"],
            weight_decay=train_cfg["wd"],
            epochs=train_cfg["epochs"],
            seed=train_cfg["seed"],
        )

        clean_f1, deg_ent, norm_comp = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=None, max_degree=cfg["max_degree"]
        )
        snr10_f1, _, _ = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=10.0, max_degree=cfg["max_degree"]
        )

        rows.append(format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp))

    return rows


# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    DATA_PATH = "/content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets/UCI_HAR"
    SEED = 42
    NUM_WORKERS = 2 if USE_GPU else 0
    PIN_MEMORY = USE_GPU

    BATCH_SIZE = 64
    EPOCHS = 100

    NUM_CLASSES = 6
    HIDDEN_DIM = 48
    NUM_LAYERS = 3
    MAX_DEGREE = 3
    GATE_HIDDEN_DIM = 16
    DROPOUT = 0.25
    LR = 1e-3
    WD = 1e-2

    train_dataset_raw = UCIHARDataset(DATA_PATH, split="train", normalize=None)

    stats_loader = DataLoader(
        train_dataset_raw,
        batch_size=BATCH_SIZE,
        shuffle=False,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY
    )

    mean, std = compute_train_stats(stats_loader, device="cpu")
    train_dataset = UCIHARDataset(DATA_PATH, split="train", normalize=(mean, std))
    test_dataset  = UCIHARDataset(DATA_PATH, split="test",  normalize=(mean, std))

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True,
                              num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False,
                             num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY)

    base_cfg = dict(
        device=DEVICE,
        in_channels=9,
        seq_len=128,
        num_classes=NUM_CLASSES,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        max_degree=MAX_DEGREE,
        gate_hidden_dim=GATE_HIDDEN_DIM,
        dropout=DROPOUT,
        temperature_initial=5.0,
        temperature_min=0.5,
        use_gate=True,
        fixed_degree=None,
        use_ste=True,
        use_hadamard=True,
        use_ffn=True,
        use_residual=True,
    )

    train_cfg = dict(
        seed=SEED,
        epochs=EPOCHS,
        lr=LR,
        wd=WD,
    )

    rows = run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg)
    print_table(rows)

Device: cuda | pin_memory: True

[Running Variant] Full (Ours)
Model params: 78,591
Epoch 05/100 | LR=0.0010 | Train Loss=0.1073 | TestF1=0.9187 | BestF1=0.9325 | Temp=4.982
Epoch 10/100 | LR=0.0010 | Train Loss=0.0853 | TestF1=0.9088 | BestF1=0.9361 | Temp=4.909
Epoch 15/100 | LR=0.0009 | Train Loss=0.0701 | TestF1=0.9241 | BestF1=0.9361 | Temp=4.782
Epoch 20/100 | LR=0.0009 | Train Loss=0.0278 | TestF1=0.9338 | BestF1=0.9382 | Temp=4.603
Epoch 25/100 | LR=0.0009 | Train Loss=0.0141 | TestF1=0.9471 | BestF1=0.9482 | Temp=4.378
Epoch 30/100 | LR=0.0008 | Train Loss=0.0060 | TestF1=0.9500 | BestF1=0.9547 | Temp=4.113
Epoch 35/100 | LR=0.0007 | Train Loss=0.0023 | TestF1=0.9436 | BestF1=0.9547 | Temp=3.813
Epoch 40/100 | LR=0.0007 | Train Loss=0.0016 | TestF1=0.9469 | BestF1=0.9547 | Temp=3.486
Epoch 45/100 | LR=0.0006 | Train Loss=0.0027 | TestF1=0.9524 | BestF1=0.9547 | Temp=3.141
Epoch 50/100 | LR=0.0005 | Train Loss=0.0002 | TestF1=0.9486 | BestF1=0.9547 | Temp=2.786
Epoch 55/100 | L

#PAMAP2

In [3]:
import os, copy, random, time, re, glob
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# ──────────────────────────────────────────────────────────────────────────────
# Seed / Device
# ──────────────────────────────────────────────────────────────────────────────
def set_seed(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

DEVICE  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = DEVICE.type == "cuda"
print(f"Device: {DEVICE} | pin_memory: {USE_GPU}")


# ──────────────────────────────────────────────────────────────────────────────
# Dataset
# ──────────────────────────────────────────────────────────────────────────────
def create_pamap2_windows(df: pd.DataFrame, window_size: int, step_size: int):
    feature_cols = [
        # hand
        "handAcc16_1","handAcc16_2","handAcc16_3",
        "handAcc6_1","handAcc6_2","handAcc6_3",
        "handGyro1","handGyro2","handGyro3",
        # chest
        "chestAcc16_1","chestAcc16_2","chestAcc16_3",
        "chestAcc6_1","chestAcc6_2","chestAcc6_3",
        "chestGyro1","chestGyro2","chestGyro3",
        # ankle
        "ankleAcc16_1","ankleAcc16_2","ankleAcc16_3",
        "ankleAcc6_1","ankleAcc6_2","ankleAcc6_3",
        "ankleGyro1","ankleGyro2","ankleGyro3",
    ]  # C = 27

    ORDERED_IDS = [1, 2, 3, 4, 5, 6, 7, 12, 13, 16, 17, 24]
    old2new = {
        1: 0,   # Lying
        2: 1,   # Sitting
        3: 2,   # Standing
        4: 3,   # Walking
        5: 4,   # Running
        6: 5,   # Cycling
        7: 6,   # Nordic walking
        12: 7,  # Ascending stairs
        13: 8,  # Descending stairs
        16: 9,  # Vacuum cleaning
        17: 10, # Ironing
        24: 11, # Rope jumping
    }
    label_names = [
        "Lying", "Sitting", "Standing", "Walking",
        "Running", "Cycling", "Nordic walking",
        "Ascending stairs", "Descending stairs",
        "Vacuum cleaning", "Ironing", "Rope jumping",
    ]

    X_list, y_list, subj_list = [], [], []

    for subj_id, g in df.groupby("subject_id"):
        if "timestamp" in g.columns:
            g = g.sort_values("timestamp")
        else:
            g = g.sort_index()

        data_arr  = g[feature_cols].to_numpy(dtype=np.float32)
        label_arr = g["activityID"].to_numpy(dtype=np.int64)
        L = data_arr.shape[0]

        start = 0
        while start + window_size <= L:
            end = start + window_size
            last_label_orig = int(label_arr[end - 1])

            if last_label_orig == 0:
                start += step_size
                continue
            if last_label_orig not in old2new:
                start += step_size
                continue

            window_ct = data_arr[start:end].T
            X_list.append(window_ct)
            y_list.append(old2new[last_label_orig])
            subj_list.append(int(subj_id))
            start += step_size

    if len(X_list) == 0:
        raise RuntimeError("No windows created. Check window_size/step_size and label filtering.")

    X = np.stack(X_list, axis=0).astype(np.float32)
    y = np.asarray(y_list, dtype=np.int64)
    subj_ids = np.asarray(subj_list, dtype=np.int64)
    return X, y, subj_ids, label_names


class PAMAP2Dataset(Dataset):
    def __init__(self, data_dir, window_size, step_size):
        super().__init__()

        csv_files = glob.glob(os.path.join(data_dir, "*.csv"))
        if len(csv_files) == 0:
            raise RuntimeError(f"No CSV files found under {data_dir}")

        dfs = []
        for fpath in sorted(csv_files):
            df_i = pd.read_csv(fpath)

            if "subject_id" not in df_i.columns:
                m = re.findall(r"\d+", os.path.basename(fpath))
                subj_guess = int(m[0]) if len(m) > 0 else 0
                df_i["subject_id"] = subj_guess

            dfs.append(df_i)

        df = pd.concat(dfs, ignore_index=True)

        df = df.dropna(subset=["activityID"])
        df["activityID"] = df["activityID"].astype(np.int64)
        df["subject_id"] = df["subject_id"].astype(np.int64)
        if "timestamp" in df.columns:
            df["timestamp"] = pd.to_numeric(df["timestamp"], errors="coerce")

        feature_cols = [
            # hand
            "handAcc16_1","handAcc16_2","handAcc16_3",
            "handAcc6_1","handAcc6_2","handAcc6_3",
            "handGyro1","handGyro2","handGyro3",
            # chest
            "chestAcc16_1","chestAcc16_2","chestAcc16_3",
            "chestAcc6_1","chestAcc6_2","chestAcc6_3",
            "chestGyro1","chestGyro2","chestGyro3",
            # ankle
            "ankleAcc16_1","ankleAcc16_2","ankleAcc16_3",
            "ankleAcc6_1","ankleAcc6_2","ankleAcc6_3",
            "ankleGyro1","ankleGyro2","ankleGyro3",
        ]

        def _fill_subject_group(g):
            if "timestamp" in g.columns:
                g = g.sort_values("timestamp")
            else:
                g = g.sort_index()
            g[feature_cols] = (
                g[feature_cols]
                .interpolate(method="linear", limit_direction="both", axis=0)
                .ffill()
                .bfill()
            )
            return g

        df = df.groupby("subject_id", group_keys=False).apply(_fill_subject_group)
        df[feature_cols] = df[feature_cols].fillna(0.0)

        X, y, subj_ids, label_names = create_pamap2_windows(df, window_size, step_size)

        self.X = X.astype(np.float32)
        self.y = y
        self.subject_ids = subj_ids
        self.label_names = label_names
        self.scaler = None

    def fit_scaler(self, indices):
        Xtr = self.X[indices]
        N, C, T = Xtr.shape

        X2 = np.transpose(Xtr, (0, 2, 1)).reshape(-1, C)

        scaler = StandardScaler()
        scaler.fit(X2)
        self.scaler = scaler
        return scaler

    def apply_scaler(self, scaler=None):
        if scaler is None:
            scaler = self.scaler
        assert scaler is not None, "Scaler is not fitted. Call fit_scaler() first."

        X = self.X
        N, C, T = X.shape
        X2 = np.transpose(X, (0, 2, 1)).reshape(-1, C)
        X2 = scaler.transform(X2)
        X_scaled = X2.reshape(N, T, C).transpose(0, 2, 1)

        self.X = X_scaled.astype(np.float32)

        print("Loaded PAMAP2 dataset")
        print(f"X shape : {self.X.shape}  (N, C, T)")
        print(f"y shape : {self.y.shape}  (N,)")
        print(f"Classes : {len(self.label_names)}")

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return (
            torch.from_numpy(self.X[idx]).float(),
            torch.tensor(self.y[idx], dtype=torch.long),
            int(self.subject_ids[idx]),
        )


# ──────────────────────────────────────────────────────────────────────────────
# Utils
# ──────────────────────────────────────────────────────────────────────────────
def cosine_temperature(ep, total, tmax=5.0, tmin=0.5):
    r = ep / max(total - 1, 1)
    return tmin + (tmax - tmin) * 0.5 * (1.0 + np.cos(np.pi * r))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# ──────────────────────────────────────────────────────────────────────────────
# Corruptions (SNR=10 uses this)
# ──────────────────────────────────────────────────────────────────────────────
def add_gaussian_noise(X, snr_db):
    """
    X: (B,C,T)
    snr_db: float
    """
    signal_power = (X ** 2).mean(dim=(1, 2), keepdim=True)
    snr = 10 ** (snr_db / 10.0)
    noise_power = signal_power / snr
    noise = torch.randn_like(X) * torch.sqrt(noise_power)
    return X + noise


# ──────────────────────────────────────────────────────────────────────────────
# Compute-Aware Degree Gate  (★ Variant behavior matches "예전버전")
#   - use_ste=True  : train=STE(hard fwd, soft bwd), eval=hard onehot
#   - use_ste=False : always soft_probs (train/eval 동일)
# ──────────────────────────────────────────────────────────────────────────────
class ComputeAwareDegreeGate(nn.Module):
    def __init__(self,
                 channels,
                 max_degree=3,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5
        ):
        super().__init__()
        self.max_degree = max_degree

        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(1),
            nn.Linear(channels, gate_hidden_dim),
            nn.GELU(),
            nn.Linear(gate_hidden_dim, max_degree),
        )

        nn.init.zeros_(self.gate[-1].bias)
        if max_degree >= 3:
            self.gate[-1].bias.data[1] = 0.4

        self.register_buffer("temperature", torch.tensor(float(temperature_initial)))
        self.temperature_min = float(temperature_min)

    def set_temperature(self, t):
        self.temperature.fill_(max(float(t), self.temperature_min))

    def forward(self, x, use_ste=True):
        logits = self.gate(x)  # (B,K)
        soft_probs = F.softmax(logits / self.temperature, dim=-1)

        if use_ste:
            if self.training:
                hard_idx = logits.argmax(dim=-1)
                hard_oh = F.one_hot(hard_idx, num_classes=self.max_degree).float()
                # STE: forward=hard, backward=soft
                degree_w = hard_oh - soft_probs.detach() + soft_probs
            else:
                degree_w = F.one_hot(
                    logits.argmax(dim=-1), num_classes=self.max_degree
                ).float()
        else:
            # Gate w/o STE: always soft (train/eval 동일)
            degree_w = soft_probs

        return degree_w, logits, soft_probs


# ──────────────────────────────────────────────────────────────────────────────
# PADRe Block (Ablation switches ONLY; ★ Variant behavior matches "예전버전")
#   - w/o Gate: fixed_degree (1..K) → build up to d, output Z[d-1]
#   - Gate w/o STE: soft weighted sum of ALL degrees (train/eval 동일)
#   - w/o Hadamard: prefix sum (Z[i]=Z[i-1]+Y[i])  (예전버전)
# ──────────────────────────────────────────────────────────────────────────────
class PADReBlockAblation(nn.Module):
    def __init__(self,
                 channels,
                 seq_len,
                 max_degree=3,
                 token_kernel=11,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,      # 1..K if w/o Gate (fixed)
                 use_ste=True,           # Gate w/o STE (soft routing)
                 use_hadamard=True,      # w/o Hadamard (prefix-sum)
        ):
        super().__init__()
        self.max_degree = max_degree

        self.use_gate = bool(use_gate)
        self.fixed_degree = fixed_degree  # None or int in [1..K]
        self.use_ste = bool(use_ste)
        self.use_hadamard = bool(use_hadamard)

        self.degree_gate = ComputeAwareDegreeGate(
            channels,
            max_degree=max_degree,
            gate_hidden_dim=gate_hidden_dim,
            temperature_initial=temperature_initial,
            temperature_min=temperature_min
        )

        self.channel_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree)
        ])

        self.token_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree)
        ])

        self.pre_hadamard_channel = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree-1)
        ])

        self.pre_hadamard_token = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree-1)
        ])

        self.norm = nn.LayerNorm(channels)

    def set_temperature(self, t):
        self.degree_gate.set_temperature(t)

    def _build_Y(self, x, max_deg):
        return [self.token_mixing[i](self.channel_mixing[i](x)) for i in range(max_deg)]

    def _build_Z(self, x, max_deg):
        """
        - use_hadamard=True : 예전버전(원본) Hadamard chain
            Z0=Y0, Zi = pre(Z_{i-1}) * Yi
        - use_hadamard=False: 예전버전 w/o Hadamard (prefix sum)
            Z0=Y0, Zi = Z_{i-1} + Yi
        """
        Y = self._build_Y(x, max_deg)

        if self.use_hadamard:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z_ = self.pre_hadamard_token[i-1](self.pre_hadamard_channel[i-1](Z[-1]))
                Z.append(Z_ * Y[i])
            return Z
        else:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z.append(Z[-1] + Y[i])
            return Z

    def _hard_select(self, Z_list, sel):
        B = Z_list[0].shape[0]
        Z_stack = torch.stack(Z_list, dim=0)  # (K,B,C,T)
        return Z_stack[sel, torch.arange(B, device=Z_stack.device)]

    def _soft_weighted_output(self, x, soft_probs):
        """
        예전버전 Gate w/o STE:
          - always compute ALL K degrees
          - weighted sum with soft_probs
          - hadamard / no_hadamard build rule은 동일하게 적용
        """
        B = x.size(0)
        Z = self._build_Z(x, max_deg=self.max_degree)      # list length K, each (B,C,T)
        Z_stack = torch.stack(Z, dim=1)                    # (B,K,C,T)
        w = soft_probs.view(B, self.max_degree, 1, 1)      # (B,K,1,1)
        out = (Z_stack * w).sum(dim=1)                     # (B,C,T)
        return out

    def forward(self, x, return_gate_info=False):
        B = x.shape[0]

        # ---- Case A) w/o Gate (fixed degree 1..K) ----
        if (not self.use_gate) or (self.fixed_degree is not None):
            d = int(self.fixed_degree) if self.fixed_degree is not None else self.max_degree
            d = max(1, min(d, self.max_degree))

            # build only up to d, output "degree d" path (예전버전)
            Z = self._build_Z(x, max_deg=d)
            out = Z[-1]

            # stats payload (예전버전 스타일)
            sel = torch.full((B,), d - 1, device=x.device, dtype=torch.long)
            K = self.max_degree
            sp = F.one_hot(sel, num_classes=K).float()
            dw = sp
            logits = sp

            out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)
            if return_gate_info:
                return out, {
                    "degree_selection": dw,
                    "soft_probs": sp,
                    "logits": logits,
                    "compute_cost": float(d),
                }
            return out

        # ---- Case B) Gate ON ----
        degree_w, logits, soft_probs = self.degree_gate(x, use_ste=self.use_ste)

        if (not self.use_ste):
            # 예전버전: Gate w/o STE는 train/eval 관계없이 ALWAYS soft weighted sum
            out = self._soft_weighted_output(x, degree_w)  # degree_w == soft_probs
            # (stats only) 대표 degree
            selected = soft_probs.argmax(dim=-1)
            # compute_cost (예전코드에선 argmax 기반이었지만, 여기서는 gi에만 들어가므로 그대로 둠)
            compute_cost = (selected + 1).float().mean().item()
        else:
            # 원본: hard select (STE는 train에서 degree_w에 반영됨)
            selected = degree_w.argmax(dim=-1)
            max_deg = max(1, min(int(selected.max().item()) + 1, self.max_degree))
            Z = self._build_Z(x, max_deg=max_deg)
            out = self._hard_select(Z, selected)
            compute_cost = (selected + 1).float().mean().item()

        out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)

        if return_gate_info:
            return out, {
                "degree_selection": degree_w,
                "soft_probs": soft_probs,
                "logits": logits,
                "compute_cost": compute_cost,
            }
        return out


# ──────────────────────────────────────────────────────────────────────────────
# Adaptive PADRe Model (Ablation switches ONLY; otherwise matches your logic)
# ──────────────────────────────────────────────────────────────────────────────
class PADReHAR_Ablation(nn.Module):
    def __init__(self,
                 in_channels=9,
                 seq_len=128,
                 num_classes=6,
                 hidden_dim=48,
                 num_layers=3,
                 max_degree=3,
                 gate_hidden_dim=16,
                 dropout=0.2,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,     # for w/o Gate (1/2/3)
                 use_ste=True,          # Gate w/o STE
                 use_hadamard=True,     # w/o Hadamard
                 use_ffn=True,          # w/o FFN
                 use_residual=True,     # w/o Residual
        ):
        super().__init__()
        self.num_layers = num_layers
        self.max_degree = max_degree

        self.use_ffn = bool(use_ffn)
        self.use_residual = bool(use_residual)

        self.input_proj = nn.Conv1d(in_channels, hidden_dim, kernel_size=1)

        self.padre_blocks = nn.ModuleList([
            PADReBlockAblation(
                hidden_dim, seq_len,
                max_degree=max_degree,
                token_kernel=11,
                gate_hidden_dim=gate_hidden_dim,
                temperature_initial=temperature_initial,
                temperature_min=temperature_min,
                use_gate=use_gate,
                fixed_degree=fixed_degree,
                use_ste=use_ste,
                use_hadamard=use_hadamard,
            )
            for _ in range(num_layers)
        ])

        self.ffn = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_dim, hidden_dim * 2, kernel_size=1),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
                nn.Dropout(dropout),
            )
            for _ in range(num_layers)
        ])

        self.norms1 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])
        self.norms2 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def set_temperature(self, t):
        for b in self.padre_blocks:
            b.set_temperature(t)

    def _ln(self, norm, x):
        return norm(x.permute(0, 2, 1)).permute(0, 2, 1)

    def forward(self, x, return_gate_info=False):
        x = self.input_proj(x)
        gate_info_list = [] if return_gate_info else None
        total_compute  = 0.0

        for i, block in enumerate(self.padre_blocks):
            res = x

            if return_gate_info:
                x, gi = block(x, return_gate_info=True)
                gate_info_list.append(gi)
                total_compute += gi["compute_cost"]
            else:
                x = block(x)

            if self.use_residual:
                x = self._ln(self.norms1[i], x + res)
            else:
                x = self._ln(self.norms1[i], x)

            res2 = x
            if self.use_ffn:
                x = self.ffn[i](x)

            if self.use_residual:
                x = self._ln(self.norms2[i], x + res2)
            else:
                x = self._ln(self.norms2[i], x)

        logits = self.classifier(self.global_pool(x).squeeze(-1))
        return (logits, gate_info_list, total_compute) if return_gate_info else logits


# ──────────────────────────────────────────────────────────────────────────────
# Train & Eval (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def train_model(model,
                train_loader,
                test_loader,
                device,
                lr=1e-3,
                weight_decay=1e-4,
                epochs=30,
                seed=42
    ):
    set_seed(seed)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)
    criterion = nn.CrossEntropyLoss()

    best_f1 = -1.0
    best_state = None

    for ep in range(epochs):
        temp = cosine_temperature(ep, epochs, tmax=5.0, tmin=0.5)
        model.set_temperature(temp)

        model.train()
        train_loss_sum = 0.0
        train_n = 0
        for X, y, _ in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            bs = y.size(0)
            train_loss_sum += loss.item() * bs
            train_n += bs

        scheduler.step()
        train_loss = train_loss_sum / max(train_n, 1)

        model.eval()
        preds_all, labels_all = [], []
        with torch.no_grad():
            for X, y, _ in test_loader:
                X, y = X.to(device), y.to(device)
                preds_all.extend(model(X).argmax(1).cpu().numpy())
                labels_all.extend(y.cpu().numpy())
        test_f1 = f1_score(labels_all, preds_all, average="macro")

        if test_f1 > best_f1:
            best_f1 = test_f1
            best_state = copy.deepcopy(model.state_dict())

        if (ep + 1) % 5 == 0:
            cur_lr = optimizer.param_groups[0]["lr"]
            print(f"Epoch {ep+1:02d}/{epochs} | LR={cur_lr:.4f} | Train Loss={train_loss:.4f} | TestF1={test_f1:.4f} | BestF1={best_f1:.4f} | Temp={temp:.3f}")

    model.load_state_dict(best_state)
    print(f"\nBest Test Macro-F1: {best_f1:.4f}")
    return model


# ──────────────────────────────────────────────────────────────────────────────
# Table Metrics (keep as-is in your "now code")
# ──────────────────────────────────────────────────────────────────────────────
@torch.no_grad()
def eval_f1_and_gate_stats(model, loader, device, snr_db=None, max_degree=3):
    """
    Returns:
      macro_f1, degree_entropy, norm_comp
    Definitions:
      - degree_entropy: mean over (layers, samples) of normalized entropy of soft_probs
                        H(p)/log(K)  where K=max_degree
      - norm_comp: mean expected degree / max_degree, averaged over layers
    """
    model.eval()

    all_preds, all_labels = [], []
    ent_sum = 0.0
    ent_count = 0
    comp_sum = 0.0
    comp_count = 0

    eps = 1e-12
    logK = float(np.log(max_degree))

    deg_vals = torch.arange(1, max_degree + 1, device=device).float()

    for X, y, _ in loader:
        X = X.to(device)
        y = y.to(device)

        if snr_db is not None:
            X = add_gaussian_noise(X, float(snr_db))

        logits, gate_info_list, _ = model(X, return_gate_info=True)
        preds = logits.argmax(dim=1)

        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

        # gate stats
        for gi in gate_info_list:
            sp = gi["soft_probs"]  # (B,K)
            ent = -(sp * (sp + eps).log()).sum(dim=-1) / logK
            ent_sum += ent.mean().item()
            ent_count += 1

            exp_deg = (sp * deg_vals).sum(dim=-1).mean().item()
            comp_sum += (exp_deg / max_degree)
            comp_count += 1

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    macro_f1 = float(f1_score(all_labels, all_preds, average="macro"))

    degree_entropy = float(ent_sum / max(ent_count, 1))
    norm_comp = float(comp_sum / max(comp_count, 1))
    return macro_f1, degree_entropy, norm_comp


def format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp):
    drop_pct = 100.0 * (clean_f1 - snr10_f1) / max(clean_f1, 1e-12)
    return {
        "Variant": name,
        "CleanF1": clean_f1,
        "SNR10F1": snr10_f1,
        "drop(%)": drop_pct,
        "DegreeEntropy": deg_ent,
        "NormComp": norm_comp,
    }


def print_table(rows):
    header = ["Variant", "Clean F1", "(SNR=10) F1", "drop(%)", "Degree Entropy", "NormComp"]
    print("\n" + "="*110)
    print("UCI-HAR Ablation Table")
    print("="*110)
    print(f"{header[0]:<22s} | {header[1]:>8s} | {header[2]:>11s} | {header[3]:>7s} | {header[4]:>14s} | {header[5]:>8s}")
    print("-"*110)
    for r in rows:
        print(
            f"{r['Variant']:<22s} | "
            f"{r['CleanF1']:>8.4f} | "
            f"{r['SNR10F1']:>11.4f} | "
            f"{r['drop(%)']:>7.2f} | "
            f"{r['DegreeEntropy']:>14.4f} | "
            f"{r['NormComp']:>8.4f}"
        )
    print("-"*110)

    print("\n[LaTeX rows]")
    for r in rows:
        print(
            f"{r['Variant']} & {r['CleanF1']:.4f} & {r['SNR10F1']:.4f} & "
            f"{r['drop(%)']:.2f} & {r['DegreeEntropy']:.4f} & {r['NormComp']:.4f} \\\\"
        )


# ──────────────────────────────────────────────────────────────────────────────
# Experiment Runner (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def build_variant(name, cfg):
    model = PADReHAR_Ablation(
        in_channels=cfg["in_channels"],
        seq_len=cfg["seq_len"],
        num_classes=cfg["num_classes"],
        hidden_dim=cfg["hidden_dim"],
        num_layers=cfg["num_layers"],
        max_degree=cfg["max_degree"],
        gate_hidden_dim=cfg["gate_hidden_dim"],
        dropout=cfg["dropout"],
        temperature_initial=cfg["temperature_initial"],
        temperature_min=cfg["temperature_min"],
        use_gate=cfg.get("use_gate", True),
        fixed_degree=cfg.get("fixed_degree", None),
        use_ste=cfg.get("use_ste", True),
        use_hadamard=cfg.get("use_hadamard", True),
        use_ffn=cfg.get("use_ffn", True),
        use_residual=cfg.get("use_residual", True),
    ).to(cfg["device"])
    return model


def run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg):
    variants = []
    variants.append(("Full (Ours)", dict()))
    variants.append(("w/o Gate-1", dict(use_gate=False, fixed_degree=1)))
    variants.append(("w/o Gate-2", dict(use_gate=False, fixed_degree=2)))
    variants.append(("w/o Gate-3", dict(use_gate=False, fixed_degree=3)))
    variants.append(("Gate w/o STE", dict(use_gate=True, use_ste=False)))
    variants.append(("w/o Hadamard", dict(use_hadamard=False)))
    variants.append(("w/o FFN", dict(use_ffn=False)))
    variants.append(("w/o Residual", dict(use_residual=False)))

    rows = []

    for name, delta in variants:
        print("\n" + "="*80)
        print(f"[Running Variant] {name}")
        print("="*80)

        cfg = copy.deepcopy(base_cfg)
        cfg.update(delta)

        set_seed(train_cfg["seed"])
        model = build_variant(name, cfg)
        print(f"Model params: {count_parameters(model):,}")

        model = train_model(
            model,
            train_loader,
            test_loader,
            cfg["device"],
            lr=train_cfg["lr"],
            weight_decay=train_cfg["wd"],
            epochs=train_cfg["epochs"],
            seed=train_cfg["seed"],
        )

        clean_f1, deg_ent, norm_comp = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=None, max_degree=cfg["max_degree"]
        )
        snr10_f1, _, _ = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=10.0, max_degree=cfg["max_degree"]
        )

        rows.append(format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp))

    return rows


# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    DATA_PATH =  "/content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets"
    SEED = 42
    NUM_WORKERS = 2 if USE_GPU else 0
    PIN_MEMORY = USE_GPU

    BATCH_SIZE = 64
    EPOCHS = 50

    NUM_CLASSES = 12
    HIDDEN_DIM = 48
    NUM_LAYERS = 3
    MAX_DEGREE = 3
    GATE_HIDDEN_DIM = 16
    DROPOUT = 0.25
    LR = 1e-3
    WD = 1e-2

    WINDOW_SIZE = 100
    STEP_SIZE = 50

    set_seed(SEED)

    full_dataset = PAMAP2Dataset(
        data_dir=DATA_PATH,
        window_size=WINDOW_SIZE,
        step_size=STEP_SIZE
    )

    n_total = len(full_dataset)
    n_test = int(0.2 * n_total)
    n_train = n_total - n_test

    g = torch.Generator().manual_seed(SEED)
    train_dataset, test_dataset = random_split(full_dataset, [n_train, n_test], generator=g)

    train_idx = np.array(train_dataset.indices, dtype=np.int64)
    scaler = full_dataset.fit_scaler(train_idx)
    full_dataset.apply_scaler(scaler)

    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )
    test_loader = DataLoader(
        test_dataset, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )

    base_cfg = dict(
        device=DEVICE,
        in_channels=27,
        seq_len=100,
        num_classes=NUM_CLASSES,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        max_degree=MAX_DEGREE,
        gate_hidden_dim=GATE_HIDDEN_DIM,
        dropout=DROPOUT,
        temperature_initial=5.0,
        temperature_min=0.5,
        use_gate=True,
        fixed_degree=None,
        use_ste=True,
        use_hadamard=True,
        use_ffn=True,
        use_residual=True,
    )

    train_cfg = dict(
        seed=SEED,
        epochs=EPOCHS,
        lr=LR,
        wd=WD,
    )

    rows = run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg)
    print_table(rows)

Device: cuda | pin_memory: True


KeyboardInterrupt: 

In [1]:
import os, copy, random, time, re, glob
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# ──────────────────────────────────────────────────────────────────────────────
# Seed / Device
# ──────────────────────────────────────────────────────────────────────────────
def set_seed(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

DEVICE  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = DEVICE.type == "cuda"
print(f"Device: {DEVICE} | pin_memory: {USE_GPU}")


# ──────────────────────────────────────────────────────────────────────────────
# Dataset
# ──────────────────────────────────────────────────────────────────────────────
def _load_single_mhealth_log(path: str, feature_cols: list[str]):
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=feature_cols + ["label"],
    )
    return df

def load_mhealth_dataframe(data_dir: str):
    feature_cols = [
        "acc_chest_x", "acc_chest_y", "acc_chest_z",      # 0,1,2
        "acc_ankle_x", "acc_ankle_y", "acc_ankle_z",      # 5,6,7
        "gyro_ankle_x", "gyro_ankle_y", "gyro_ankle_z",   # 8,9,10
        "acc_arm_x", "acc_arm_y", "acc_arm_z",            # 14,15,16
        "gyro_arm_x", "gyro_arm_y", "gyro_arm_z",         # 17,18,19
    ]  # total 15 channels

    log_files = glob.glob(os.path.join(data_dir, "mHealth_subject*.log"))
    if not log_files:
        raise FileNotFoundError(f"No mHealth_subject*.log files found in {data_dir}")
    print(f"Found {len(log_files)} log files in {data_dir}")

    dfs = []
    for fp in sorted(log_files):
        dfs.append(_load_single_mhealth_log(fp, feature_cols))

    full_df = pd.concat(dfs, ignore_index=True)

    full_df = full_df[full_df["label"] != 0].copy()

    full_df.loc[:, "label"] = full_df["label"] - 1

    return full_df, feature_cols


def create_mhealth_windows(
    df: pd.DataFrame,
    feature_cols: list[str],
    window_size: int,
    step_size: int,
):
    data_arr = df[feature_cols].to_numpy(dtype=np.float32)
    labels_arr = df["label"].to_numpy(dtype=np.int64)
    L = data_arr.shape[0]

    X_list, y_list = [], []
    start = 0
    while start + window_size <= L:
        end = start + window_size
        window_x = data_arr[start:end]
        window_label = labels_arr[end - 1]
        X_list.append(window_x.T)
        y_list.append(int(window_label))
        start += step_size

    if not X_list:
        raise RuntimeError("No windows created. Check window_size / step_size / dataset length.")

    X_np = np.stack(X_list, axis=0).astype(np.float32)
    y_np = np.array(y_list, dtype=np.int64)
    return X_np, y_np


class MHEALTHDataset(Dataset):
    def __init__(self, data_dir: str, window_size: int = 128, step_size: int = 64):
        super().__init__()

        full_df, feature_cols = load_mhealth_dataframe(data_dir)
        X, y = create_mhealth_windows(full_df, feature_cols, window_size, step_size)

        self.X = X
        self.y = y
        self.subjects = np.zeros(len(self.y), dtype=int)

        self.label_names = [
            "Standing still", "Sitting and relaxing", "Lying down",
            "Walking", "Climbing stairs", "Waist bends forward",
            "Frontal elevation of arms", "Knees bending", "Cycling",
            "Jogging", "Running", "Jump front & back",
        ]

        print("Loaded MHEALTH dataset")
        print(f"X shape : {self.X.shape}  (N, C, T)")
        print(f"y shape : {self.y.shape}  (N,)")
        print(f"Classes : {len(self.label_names)}")

    def fit_scaler(self, indices):
        Xtr = self.X[indices]
        N, C, T = Xtr.shape
        X2 = Xtr.transpose(0, 2, 1).reshape(-1, C)

        scaler = StandardScaler()
        scaler.fit(X2)
        self.scaler = scaler
        return scaler

    def apply_scaler(self, scaler=None):
        if scaler is None:
            scaler = self.scaler
        assert scaler is not None, "Scaler is not fitted. Call fit_scaler() first."

        X = self.X
        N, C, T = X.shape
        X2 = X.transpose(0, 2, 1).reshape(-1, C)
        X2 = scaler.transform(X2)
        self.X = X2.reshape(N, T, C).transpose(0, 2, 1).astype(np.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx: int):
        return (
            torch.from_numpy(self.X[idx]).float(),
            torch.tensor(self.y[idx]).long(),
            int(self.subjects[idx]),
        )


# ──────────────────────────────────────────────────────────────────────────────
# Utils
# ──────────────────────────────────────────────────────────────────────────────
def cosine_temperature(ep, total, tmax=5.0, tmin=0.5):
    r = ep / max(total - 1, 1)
    return tmin + (tmax - tmin) * 0.5 * (1.0 + np.cos(np.pi * r))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# ──────────────────────────────────────────────────────────────────────────────
# Corruptions (SNR=10 uses this)
# ──────────────────────────────────────────────────────────────────────────────
def add_gaussian_noise(X, snr_db):
    """
    X: (B,C,T)
    snr_db: float
    """
    signal_power = (X ** 2).mean(dim=(1, 2), keepdim=True)
    snr = 10 ** (snr_db / 10.0)
    noise_power = signal_power / snr
    noise = torch.randn_like(X) * torch.sqrt(noise_power)
    return X + noise


# ──────────────────────────────────────────────────────────────────────────────
# Compute-Aware Degree Gate  (★ Variant behavior matches "예전버전")
#   - use_ste=True  : train=STE(hard fwd, soft bwd), eval=hard onehot
#   - use_ste=False : always soft_probs (train/eval 동일)
# ──────────────────────────────────────────────────────────────────────────────
class ComputeAwareDegreeGate(nn.Module):
    def __init__(self,
                 channels,
                 max_degree=3,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5
        ):
        super().__init__()
        self.max_degree = max_degree

        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(1),
            nn.Linear(channels, gate_hidden_dim),
            nn.GELU(),
            nn.Linear(gate_hidden_dim, max_degree),
        )

        nn.init.zeros_(self.gate[-1].bias)
        if max_degree >= 3:
            self.gate[-1].bias.data[1] = 0.4

        self.register_buffer("temperature", torch.tensor(float(temperature_initial)))
        self.temperature_min = float(temperature_min)

    def set_temperature(self, t):
        self.temperature.fill_(max(float(t), self.temperature_min))

    def forward(self, x, use_ste=True):
        logits = self.gate(x)  # (B,K)
        soft_probs = F.softmax(logits / self.temperature, dim=-1)

        if use_ste:
            if self.training:
                hard_idx = logits.argmax(dim=-1)
                hard_oh = F.one_hot(hard_idx, num_classes=self.max_degree).float()
                # STE: forward=hard, backward=soft
                degree_w = hard_oh - soft_probs.detach() + soft_probs
            else:
                degree_w = F.one_hot(
                    logits.argmax(dim=-1), num_classes=self.max_degree
                ).float()
        else:
            # Gate w/o STE: always soft (train/eval 동일)
            degree_w = soft_probs

        return degree_w, logits, soft_probs


# ──────────────────────────────────────────────────────────────────────────────
# PADRe Block (Ablation switches ONLY; ★ Variant behavior matches "예전버전")
#   - w/o Gate: fixed_degree (1..K) → build up to d, output Z[d-1]
#   - Gate w/o STE: soft weighted sum of ALL degrees (train/eval 동일)
#   - w/o Hadamard: prefix sum (Z[i]=Z[i-1]+Y[i])  (예전버전)
# ──────────────────────────────────────────────────────────────────────────────
class PADReBlockAblation(nn.Module):
    def __init__(self,
                 channels,
                 seq_len,
                 max_degree=3,
                 token_kernel=11,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,      # 1..K if w/o Gate (fixed)
                 use_ste=True,           # Gate w/o STE (soft routing)
                 use_hadamard=True,      # w/o Hadamard (prefix-sum)
        ):
        super().__init__()
        self.max_degree = max_degree

        self.use_gate = bool(use_gate)
        self.fixed_degree = fixed_degree  # None or int in [1..K]
        self.use_ste = bool(use_ste)
        self.use_hadamard = bool(use_hadamard)

        self.degree_gate = ComputeAwareDegreeGate(
            channels,
            max_degree=max_degree,
            gate_hidden_dim=gate_hidden_dim,
            temperature_initial=temperature_initial,
            temperature_min=temperature_min
        )

        self.channel_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree)
        ])

        self.token_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree)
        ])

        self.pre_hadamard_channel = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree-1)
        ])

        self.pre_hadamard_token = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree-1)
        ])

        self.norm = nn.LayerNorm(channels)

    def set_temperature(self, t):
        self.degree_gate.set_temperature(t)

    def _build_Y(self, x, max_deg):
        return [self.token_mixing[i](self.channel_mixing[i](x)) for i in range(max_deg)]

    def _build_Z(self, x, max_deg):
        """
        - use_hadamard=True : 예전버전(원본) Hadamard chain
            Z0=Y0, Zi = pre(Z_{i-1}) * Yi
        - use_hadamard=False: 예전버전 w/o Hadamard (prefix sum)
            Z0=Y0, Zi = Z_{i-1} + Yi
        """
        Y = self._build_Y(x, max_deg)

        if self.use_hadamard:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z_ = self.pre_hadamard_token[i-1](self.pre_hadamard_channel[i-1](Z[-1]))
                Z.append(Z_ * Y[i])
            return Z
        else:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z.append(Z[-1] + Y[i])
            return Z

    def _hard_select(self, Z_list, sel):
        B = Z_list[0].shape[0]
        Z_stack = torch.stack(Z_list, dim=0)  # (K,B,C,T)
        return Z_stack[sel, torch.arange(B, device=Z_stack.device)]

    def _soft_weighted_output(self, x, soft_probs):
        """
        예전버전 Gate w/o STE:
          - always compute ALL K degrees
          - weighted sum with soft_probs
          - hadamard / no_hadamard build rule은 동일하게 적용
        """
        B = x.size(0)
        Z = self._build_Z(x, max_deg=self.max_degree)      # list length K, each (B,C,T)
        Z_stack = torch.stack(Z, dim=1)                    # (B,K,C,T)
        w = soft_probs.view(B, self.max_degree, 1, 1)      # (B,K,1,1)
        out = (Z_stack * w).sum(dim=1)                     # (B,C,T)
        return out

    def forward(self, x, return_gate_info=False):
        B = x.shape[0]

        # ---- Case A) w/o Gate (fixed degree 1..K) ----
        if (not self.use_gate) or (self.fixed_degree is not None):
            d = int(self.fixed_degree) if self.fixed_degree is not None else self.max_degree
            d = max(1, min(d, self.max_degree))

            # build only up to d, output "degree d" path (예전버전)
            Z = self._build_Z(x, max_deg=d)
            out = Z[-1]

            # stats payload (예전버전 스타일)
            sel = torch.full((B,), d - 1, device=x.device, dtype=torch.long)
            K = self.max_degree
            sp = F.one_hot(sel, num_classes=K).float()
            dw = sp
            logits = sp

            out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)
            if return_gate_info:
                return out, {
                    "degree_selection": dw,
                    "soft_probs": sp,
                    "logits": logits,
                    "compute_cost": float(d),
                }
            return out

        # ---- Case B) Gate ON ----
        degree_w, logits, soft_probs = self.degree_gate(x, use_ste=self.use_ste)

        if (not self.use_ste):
            # 예전버전: Gate w/o STE는 train/eval 관계없이 ALWAYS soft weighted sum
            out = self._soft_weighted_output(x, degree_w)  # degree_w == soft_probs
            # (stats only) 대표 degree
            selected = soft_probs.argmax(dim=-1)
            # compute_cost (예전코드에선 argmax 기반이었지만, 여기서는 gi에만 들어가므로 그대로 둠)
            compute_cost = (selected + 1).float().mean().item()
        else:
            # 원본: hard select (STE는 train에서 degree_w에 반영됨)
            selected = degree_w.argmax(dim=-1)
            max_deg = max(1, min(int(selected.max().item()) + 1, self.max_degree))
            Z = self._build_Z(x, max_deg=max_deg)
            out = self._hard_select(Z, selected)
            compute_cost = (selected + 1).float().mean().item()

        out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)

        if return_gate_info:
            return out, {
                "degree_selection": degree_w,
                "soft_probs": soft_probs,
                "logits": logits,
                "compute_cost": compute_cost,
            }
        return out


# ──────────────────────────────────────────────────────────────────────────────
# Adaptive PADRe Model (Ablation switches ONLY; otherwise matches your logic)
# ──────────────────────────────────────────────────────────────────────────────
class PADReHAR_Ablation(nn.Module):
    def __init__(self,
                 in_channels=9,
                 seq_len=128,
                 num_classes=6,
                 hidden_dim=48,
                 num_layers=3,
                 max_degree=3,
                 gate_hidden_dim=16,
                 dropout=0.2,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,     # for w/o Gate (1/2/3)
                 use_ste=True,          # Gate w/o STE
                 use_hadamard=True,     # w/o Hadamard
                 use_ffn=True,          # w/o FFN
                 use_residual=True,     # w/o Residual
        ):
        super().__init__()
        self.num_layers = num_layers
        self.max_degree = max_degree

        self.use_ffn = bool(use_ffn)
        self.use_residual = bool(use_residual)

        self.input_proj = nn.Conv1d(in_channels, hidden_dim, kernel_size=1)

        self.padre_blocks = nn.ModuleList([
            PADReBlockAblation(
                hidden_dim, seq_len,
                max_degree=max_degree,
                token_kernel=11,
                gate_hidden_dim=gate_hidden_dim,
                temperature_initial=temperature_initial,
                temperature_min=temperature_min,
                use_gate=use_gate,
                fixed_degree=fixed_degree,
                use_ste=use_ste,
                use_hadamard=use_hadamard,
            )
            for _ in range(num_layers)
        ])

        self.ffn = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_dim, hidden_dim * 2, kernel_size=1),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
                nn.Dropout(dropout),
            )
            for _ in range(num_layers)
        ])

        self.norms1 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])
        self.norms2 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def set_temperature(self, t):
        for b in self.padre_blocks:
            b.set_temperature(t)

    def _ln(self, norm, x):
        return norm(x.permute(0, 2, 1)).permute(0, 2, 1)

    def forward(self, x, return_gate_info=False):
        x = self.input_proj(x)
        gate_info_list = [] if return_gate_info else None
        total_compute  = 0.0

        for i, block in enumerate(self.padre_blocks):
            res = x

            if return_gate_info:
                x, gi = block(x, return_gate_info=True)
                gate_info_list.append(gi)
                total_compute += gi["compute_cost"]
            else:
                x = block(x)

            if self.use_residual:
                x = self._ln(self.norms1[i], x + res)
            else:
                x = self._ln(self.norms1[i], x)

            res2 = x
            if self.use_ffn:
                x = self.ffn[i](x)

            if self.use_residual:
                x = self._ln(self.norms2[i], x + res2)
            else:
                x = self._ln(self.norms2[i], x)

        logits = self.classifier(self.global_pool(x).squeeze(-1))
        return (logits, gate_info_list, total_compute) if return_gate_info else logits


# ──────────────────────────────────────────────────────────────────────────────
# Train & Eval (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def train_model(model,
                train_loader,
                test_loader,
                device,
                lr=1e-3,
                weight_decay=1e-4,
                epochs=30,
                seed=42
    ):
    set_seed(seed)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)
    criterion = nn.CrossEntropyLoss()

    best_f1 = -1.0
    best_state = None

    for ep in range(epochs):
        temp = cosine_temperature(ep, epochs, tmax=5.0, tmin=0.5)
        model.set_temperature(temp)

        model.train()
        train_loss_sum = 0.0
        train_n = 0
        for X, y, _ in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            bs = y.size(0)
            train_loss_sum += loss.item() * bs
            train_n += bs

        scheduler.step()
        train_loss = train_loss_sum / max(train_n, 1)

        model.eval()
        preds_all, labels_all = [], []
        with torch.no_grad():
            for X, y, _ in test_loader:
                X, y = X.to(device), y.to(device)
                preds_all.extend(model(X).argmax(1).cpu().numpy())
                labels_all.extend(y.cpu().numpy())
        test_f1 = f1_score(labels_all, preds_all, average="macro")

        if test_f1 > best_f1:
            best_f1 = test_f1
            best_state = copy.deepcopy(model.state_dict())

        if (ep + 1) % 5 == 0:
            cur_lr = optimizer.param_groups[0]["lr"]
            print(f"Epoch {ep+1:02d}/{epochs} | LR={cur_lr:.4f} | Train Loss={train_loss:.4f} | TestF1={test_f1:.4f} | BestF1={best_f1:.4f} | Temp={temp:.3f}")

    model.load_state_dict(best_state)
    print(f"\nBest Test Macro-F1: {best_f1:.4f}")
    return model


# ──────────────────────────────────────────────────────────────────────────────
# Table Metrics (keep as-is in your "now code")
# ──────────────────────────────────────────────────────────────────────────────
@torch.no_grad()
def eval_f1_and_gate_stats(model, loader, device, snr_db=None, max_degree=3):
    """
    Returns:
      macro_f1, degree_entropy, norm_comp
    Definitions:
      - degree_entropy: mean over (layers, samples) of normalized entropy of soft_probs
                        H(p)/log(K)  where K=max_degree
      - norm_comp: mean expected degree / max_degree, averaged over layers
    """
    model.eval()

    all_preds, all_labels = [], []
    ent_sum = 0.0
    ent_count = 0
    comp_sum = 0.0
    comp_count = 0

    eps = 1e-12
    logK = float(np.log(max_degree))

    deg_vals = torch.arange(1, max_degree + 1, device=device).float()

    for X, y, _ in loader:
        X = X.to(device)
        y = y.to(device)

        if snr_db is not None:
            X = add_gaussian_noise(X, float(snr_db))

        logits, gate_info_list, _ = model(X, return_gate_info=True)
        preds = logits.argmax(dim=1)

        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

        # gate stats
        for gi in gate_info_list:
            sp = gi["soft_probs"]  # (B,K)
            ent = -(sp * (sp + eps).log()).sum(dim=-1) / logK
            ent_sum += ent.mean().item()
            ent_count += 1

            exp_deg = (sp * deg_vals).sum(dim=-1).mean().item()
            comp_sum += (exp_deg / max_degree)
            comp_count += 1

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    macro_f1 = float(f1_score(all_labels, all_preds, average="macro"))

    degree_entropy = float(ent_sum / max(ent_count, 1))
    norm_comp = float(comp_sum / max(comp_count, 1))
    return macro_f1, degree_entropy, norm_comp


def format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp):
    drop_pct = 100.0 * (clean_f1 - snr10_f1) / max(clean_f1, 1e-12)
    return {
        "Variant": name,
        "CleanF1": clean_f1,
        "SNR10F1": snr10_f1,
        "drop(%)": drop_pct,
        "DegreeEntropy": deg_ent,
        "NormComp": norm_comp,
    }


def print_table(rows):
    header = ["Variant", "Clean F1", "(SNR=10) F1", "drop(%)", "Degree Entropy", "NormComp"]
    print("\n" + "="*110)
    print("UCI-HAR Ablation Table")
    print("="*110)
    print(f"{header[0]:<22s} | {header[1]:>8s} | {header[2]:>11s} | {header[3]:>7s} | {header[4]:>14s} | {header[5]:>8s}")
    print("-"*110)
    for r in rows:
        print(
            f"{r['Variant']:<22s} | "
            f"{r['CleanF1']:>8.4f} | "
            f"{r['SNR10F1']:>11.4f} | "
            f"{r['drop(%)']:>7.2f} | "
            f"{r['DegreeEntropy']:>14.4f} | "
            f"{r['NormComp']:>8.4f}"
        )
    print("-"*110)

    print("\n[LaTeX rows]")
    for r in rows:
        print(
            f"{r['Variant']} & {r['CleanF1']:.4f} & {r['SNR10F1']:.4f} & "
            f"{r['drop(%)']:.2f} & {r['DegreeEntropy']:.4f} & {r['NormComp']:.4f} \\\\"
        )


# ──────────────────────────────────────────────────────────────────────────────
# Experiment Runner (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def build_variant(name, cfg):
    model = PADReHAR_Ablation(
        in_channels=cfg["in_channels"],
        seq_len=cfg["seq_len"],
        num_classes=cfg["num_classes"],
        hidden_dim=cfg["hidden_dim"],
        num_layers=cfg["num_layers"],
        max_degree=cfg["max_degree"],
        gate_hidden_dim=cfg["gate_hidden_dim"],
        dropout=cfg["dropout"],
        temperature_initial=cfg["temperature_initial"],
        temperature_min=cfg["temperature_min"],
        use_gate=cfg.get("use_gate", True),
        fixed_degree=cfg.get("fixed_degree", None),
        use_ste=cfg.get("use_ste", True),
        use_hadamard=cfg.get("use_hadamard", True),
        use_ffn=cfg.get("use_ffn", True),
        use_residual=cfg.get("use_residual", True),
    ).to(cfg["device"])
    return model


def run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg):
    variants = []
    variants.append(("Full (Ours)", dict()))
    variants.append(("w/o Gate-1", dict(use_gate=False, fixed_degree=1)))
    variants.append(("w/o Gate-2", dict(use_gate=False, fixed_degree=2)))
    variants.append(("w/o Gate-3", dict(use_gate=False, fixed_degree=3)))
    variants.append(("Gate w/o STE", dict(use_gate=True, use_ste=False)))
    variants.append(("w/o Hadamard", dict(use_hadamard=False)))
    variants.append(("w/o FFN", dict(use_ffn=False)))
    variants.append(("w/o Residual", dict(use_residual=False)))

    rows = []

    for name, delta in variants:
        print("\n" + "="*80)
        print(f"[Running Variant] {name}")
        print("="*80)

        cfg = copy.deepcopy(base_cfg)
        cfg.update(delta)

        set_seed(train_cfg["seed"])
        model = build_variant(name, cfg)
        print(f"Model params: {count_parameters(model):,}")

        model = train_model(
            model,
            train_loader,
            test_loader,
            cfg["device"],
            lr=train_cfg["lr"],
            weight_decay=train_cfg["wd"],
            epochs=train_cfg["epochs"],
            seed=train_cfg["seed"],
        )

        clean_f1, deg_ent, norm_comp = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=None, max_degree=cfg["max_degree"]
        )
        snr10_f1, _, _ = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=10.0, max_degree=cfg["max_degree"]
        )

        rows.append(format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp))

    return rows


# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    DATA_PATH =  "/content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets/MHEALTHDATASET"
    SEED = 42
    NUM_WORKERS = 2 if USE_GPU else 0
    PIN_MEMORY = USE_GPU

    BATCH_SIZE = 64
    EPOCHS = 25

    NUM_CLASSES = 12
    HIDDEN_DIM = 48
    NUM_LAYERS = 3
    MAX_DEGREE = 3
    GATE_HIDDEN_DIM = 16
    DROPOUT = 0.25
    LR = 1e-3
    WD = 1e-2

    WINDOW_SIZE = 100
    STEP_SIZE = 50

    set_seed(SEED)

    full_dataset = MHEALTHDataset(DATA_PATH, window_size=WINDOW_SIZE, step_size=STEP_SIZE)

    n_total = len(full_dataset)
    n_test = int(0.2 * n_total)
    n_train = n_total - n_test

    g = torch.Generator().manual_seed(SEED)
    train_dataset, test_dataset = random_split(full_dataset, [n_train, n_test], generator=g)

    train_idx = np.array(train_dataset.indices, dtype=np.int64)
    scaler = full_dataset.fit_scaler(train_idx)
    full_dataset.apply_scaler(scaler)

    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )
    test_loader = DataLoader(
        test_dataset, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )

    base_cfg = dict(
        device=DEVICE,
        in_channels=15,
        seq_len=100,
        num_classes=NUM_CLASSES,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        max_degree=MAX_DEGREE,
        gate_hidden_dim=GATE_HIDDEN_DIM,
        dropout=DROPOUT,
        temperature_initial=5.0,
        temperature_min=0.5,
        use_gate=True,
        fixed_degree=None,
        use_ste=True,
        use_hadamard=True,
        use_ffn=True,
        use_residual=True,
    )

    train_cfg = dict(
        seed=SEED,
        epochs=EPOCHS,
        lr=LR,
        wd=WD,
    )

    rows = run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg)
    print_table(rows)

Device: cuda | pin_memory: True
Found 10 log files in /content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets/MHEALTHDATASET
Loaded MHEALTH dataset
X shape : (6862, 15, 100)  (N, C, T)
y shape : (6862,)  (N,)
Classes : 12

[Running Variant] Full (Ours)
Model params: 79,173
Epoch 05/25 | LR=0.0009 | Train Loss=0.0915 | TestF1=0.9812 | BestF1=0.9812 | Temp=4.699
Epoch 10/25 | LR=0.0007 | Train Loss=0.0425 | TestF1=0.9853 | BestF1=0.9853 | Temp=3.611
Epoch 15/25 | LR=0.0004 | Train Loss=0.0259 | TestF1=0.9853 | BestF1=0.9853 | Temp=2.168
Epoch 20/25 | LR=0.0001 | Train Loss=0.0153 | TestF1=0.9839 | BestF1=0.9853 | Temp=0.965
Epoch 25/25 | LR=0.0000 | Train Loss=0.0115 | TestF1=0.9839 | BestF1=0.9853 | Temp=0.500

Best Test Macro-F1: 0.9853

[Running Variant] w/o Gate-1
Model params: 79,173
Epoch 05/25 | LR=0.0009 | Train Loss=0.1706 | TestF1=0.9747 | BestF1=0.9747 | Temp=4.699
Epoch 10/25 | LR=0.0007 | Train Loss=0.0833 | TestF1=0.9762 | BestF1=0.9811 | Temp=3.611
Epoch 15/25 | LR=0.

# WISDM

In [2]:
import os, copy, random, time, re, glob
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# ──────────────────────────────────────────────────────────────────────────────
# Seed / Device
# ──────────────────────────────────────────────────────────────────────────────
def set_seed(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

DEVICE  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = DEVICE.type == "cuda"
print(f"Device: {DEVICE} | pin_memory: {USE_GPU}")


# ──────────────────────────────────────────────────────────────────────────────
# Dataset
# ──────────────────────────────────────────────────────────────────────────────
def _load_single_mhealth_log(path: str, feature_cols: list[str]):
    df = pd.read_csv(
        path,
        sep="\t",
        header=None,
        names=feature_cols + ["label"],
    )
    return df

def load_mhealth_dataframe(data_dir: str):
    feature_cols = [
        "acc_chest_x", "acc_chest_y", "acc_chest_z",      # 0,1,2
        "acc_ankle_x", "acc_ankle_y", "acc_ankle_z",      # 5,6,7
        "gyro_ankle_x", "gyro_ankle_y", "gyro_ankle_z",   # 8,9,10
        "acc_arm_x", "acc_arm_y", "acc_arm_z",            # 14,15,16
        "gyro_arm_x", "gyro_arm_y", "gyro_arm_z",         # 17,18,19
    ]  # total 15 channels

    log_files = glob.glob(os.path.join(data_dir, "mHealth_subject*.log"))
    if not log_files:
        raise FileNotFoundError(f"No mHealth_subject*.log files found in {data_dir}")
    print(f"Found {len(log_files)} log files in {data_dir}")

    dfs = []
    for fp in sorted(log_files):
        dfs.append(_load_single_mhealth_log(fp, feature_cols))

    full_df = pd.concat(dfs, ignore_index=True)

    full_df = full_df[full_df["label"] != 0].copy()

    full_df.loc[:, "label"] = full_df["label"] - 1

    return full_df, feature_cols


def create_mhealth_windows(
    df: pd.DataFrame,
    feature_cols: list[str],
    window_size: int,
    step_size: int,
):
    data_arr = df[feature_cols].to_numpy(dtype=np.float32)
    labels_arr = df["label"].to_numpy(dtype=np.int64)
    L = data_arr.shape[0]

    X_list, y_list = [], []
    start = 0
    while start + window_size <= L:
        end = start + window_size
        window_x = data_arr[start:end]
        window_label = labels_arr[end - 1]
        X_list.append(window_x.T)
        y_list.append(int(window_label))
        start += step_size

    if not X_list:
        raise RuntimeError("No windows created. Check window_size / step_size / dataset length.")

    X_np = np.stack(X_list, axis=0).astype(np.float32)
    y_np = np.array(y_list, dtype=np.int64)
    return X_np, y_np


class MHEALTHDataset(Dataset):
    def __init__(self, data_dir: str, window_size: int = 128, step_size: int = 64):
        super().__init__()

        full_df, feature_cols = load_mhealth_dataframe(data_dir)
        X, y = create_mhealth_windows(full_df, feature_cols, window_size, step_size)

        self.X = X
        self.y = y
        self.subjects = np.zeros(len(self.y), dtype=int)

        self.label_names = [
            "Standing still", "Sitting and relaxing", "Lying down",
            "Walking", "Climbing stairs", "Waist bends forward",
            "Frontal elevation of arms", "Knees bending", "Cycling",
            "Jogging", "Running", "Jump front & back",
        ]

        print("Loaded MHEALTH dataset")
        print(f"X shape : {self.X.shape}  (N, C, T)")
        print(f"y shape : {self.y.shape}  (N,)")
        print(f"Classes : {len(self.label_names)}")

    def fit_scaler(self, indices):
        Xtr = self.X[indices]
        N, C, T = Xtr.shape
        X2 = Xtr.transpose(0, 2, 1).reshape(-1, C)

        scaler = StandardScaler()
        scaler.fit(X2)
        self.scaler = scaler
        return scaler

    def apply_scaler(self, scaler=None):
        if scaler is None:
            scaler = self.scaler
        assert scaler is not None, "Scaler is not fitted. Call fit_scaler() first."

        X = self.X
        N, C, T = X.shape
        X2 = X.transpose(0, 2, 1).reshape(-1, C)
        X2 = scaler.transform(X2)
        self.X = X2.reshape(N, T, C).transpose(0, 2, 1).astype(np.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx: int):
        return (
            torch.from_numpy(self.X[idx]).float(),
            torch.tensor(self.y[idx]).long(),
            int(self.subjects[idx]),
        )


# ──────────────────────────────────────────────────────────────────────────────
# Utils
# ──────────────────────────────────────────────────────────────────────────────
def cosine_temperature(ep, total, tmax=5.0, tmin=0.5):
    r = ep / max(total - 1, 1)
    return tmin + (tmax - tmin) * 0.5 * (1.0 + np.cos(np.pi * r))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# ──────────────────────────────────────────────────────────────────────────────
# Corruptions (SNR=10 uses this)
# ──────────────────────────────────────────────────────────────────────────────
def add_gaussian_noise(X, snr_db):
    """
    X: (B,C,T)
    snr_db: float
    """
    signal_power = (X ** 2).mean(dim=(1, 2), keepdim=True)
    snr = 10 ** (snr_db / 10.0)
    noise_power = signal_power / snr
    noise = torch.randn_like(X) * torch.sqrt(noise_power)
    return X + noise


# ──────────────────────────────────────────────────────────────────────────────
# Compute-Aware Degree Gate  (★ Variant behavior matches "예전버전")
#   - use_ste=True  : train=STE(hard fwd, soft bwd), eval=hard onehot
#   - use_ste=False : always soft_probs (train/eval 동일)
# ──────────────────────────────────────────────────────────────────────────────
class ComputeAwareDegreeGate(nn.Module):
    def __init__(self,
                 channels,
                 max_degree=3,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5
        ):
        super().__init__()
        self.max_degree = max_degree

        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(1),
            nn.Linear(channels, gate_hidden_dim),
            nn.GELU(),
            nn.Linear(gate_hidden_dim, max_degree),
        )

        nn.init.zeros_(self.gate[-1].bias)
        if max_degree >= 3:
            self.gate[-1].bias.data[1] = 0.4

        self.register_buffer("temperature", torch.tensor(float(temperature_initial)))
        self.temperature_min = float(temperature_min)

    def set_temperature(self, t):
        self.temperature.fill_(max(float(t), self.temperature_min))

    def forward(self, x, use_ste=True):
        logits = self.gate(x)  # (B,K)
        soft_probs = F.softmax(logits / self.temperature, dim=-1)

        if use_ste:
            if self.training:
                hard_idx = logits.argmax(dim=-1)
                hard_oh = F.one_hot(hard_idx, num_classes=self.max_degree).float()
                # STE: forward=hard, backward=soft
                degree_w = hard_oh - soft_probs.detach() + soft_probs
            else:
                degree_w = F.one_hot(
                    logits.argmax(dim=-1), num_classes=self.max_degree
                ).float()
        else:
            # Gate w/o STE: always soft (train/eval 동일)
            degree_w = soft_probs

        return degree_w, logits, soft_probs


# ──────────────────────────────────────────────────────────────────────────────
# PADRe Block (Ablation switches ONLY; ★ Variant behavior matches "예전버전")
#   - w/o Gate: fixed_degree (1..K) → build up to d, output Z[d-1]
#   - Gate w/o STE: soft weighted sum of ALL degrees (train/eval 동일)
#   - w/o Hadamard: prefix sum (Z[i]=Z[i-1]+Y[i])  (예전버전)
# ──────────────────────────────────────────────────────────────────────────────
class PADReBlockAblation(nn.Module):
    def __init__(self,
                 channels,
                 seq_len,
                 max_degree=3,
                 token_kernel=11,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,      # 1..K if w/o Gate (fixed)
                 use_ste=True,           # Gate w/o STE (soft routing)
                 use_hadamard=True,      # w/o Hadamard (prefix-sum)
        ):
        super().__init__()
        self.max_degree = max_degree

        self.use_gate = bool(use_gate)
        self.fixed_degree = fixed_degree  # None or int in [1..K]
        self.use_ste = bool(use_ste)
        self.use_hadamard = bool(use_hadamard)

        self.degree_gate = ComputeAwareDegreeGate(
            channels,
            max_degree=max_degree,
            gate_hidden_dim=gate_hidden_dim,
            temperature_initial=temperature_initial,
            temperature_min=temperature_min
        )

        self.channel_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree)
        ])

        self.token_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree)
        ])

        self.pre_hadamard_channel = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree-1)
        ])

        self.pre_hadamard_token = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree-1)
        ])

        self.norm = nn.LayerNorm(channels)

    def set_temperature(self, t):
        self.degree_gate.set_temperature(t)

    def _build_Y(self, x, max_deg):
        return [self.token_mixing[i](self.channel_mixing[i](x)) for i in range(max_deg)]

    def _build_Z(self, x, max_deg):
        """
        - use_hadamard=True : 예전버전(원본) Hadamard chain
            Z0=Y0, Zi = pre(Z_{i-1}) * Yi
        - use_hadamard=False: 예전버전 w/o Hadamard (prefix sum)
            Z0=Y0, Zi = Z_{i-1} + Yi
        """
        Y = self._build_Y(x, max_deg)

        if self.use_hadamard:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z_ = self.pre_hadamard_token[i-1](self.pre_hadamard_channel[i-1](Z[-1]))
                Z.append(Z_ * Y[i])
            return Z
        else:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z.append(Z[-1] + Y[i])
            return Z

    def _hard_select(self, Z_list, sel):
        B = Z_list[0].shape[0]
        Z_stack = torch.stack(Z_list, dim=0)  # (K,B,C,T)
        return Z_stack[sel, torch.arange(B, device=Z_stack.device)]

    def _soft_weighted_output(self, x, soft_probs):
        """
        예전버전 Gate w/o STE:
          - always compute ALL K degrees
          - weighted sum with soft_probs
          - hadamard / no_hadamard build rule은 동일하게 적용
        """
        B = x.size(0)
        Z = self._build_Z(x, max_deg=self.max_degree)      # list length K, each (B,C,T)
        Z_stack = torch.stack(Z, dim=1)                    # (B,K,C,T)
        w = soft_probs.view(B, self.max_degree, 1, 1)      # (B,K,1,1)
        out = (Z_stack * w).sum(dim=1)                     # (B,C,T)
        return out

    def forward(self, x, return_gate_info=False):
        B = x.shape[0]

        # ---- Case A) w/o Gate (fixed degree 1..K) ----
        if (not self.use_gate) or (self.fixed_degree is not None):
            d = int(self.fixed_degree) if self.fixed_degree is not None else self.max_degree
            d = max(1, min(d, self.max_degree))

            # build only up to d, output "degree d" path (예전버전)
            Z = self._build_Z(x, max_deg=d)
            out = Z[-1]

            # stats payload (예전버전 스타일)
            sel = torch.full((B,), d - 1, device=x.device, dtype=torch.long)
            K = self.max_degree
            sp = F.one_hot(sel, num_classes=K).float()
            dw = sp
            logits = sp

            out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)
            if return_gate_info:
                return out, {
                    "degree_selection": dw,
                    "soft_probs": sp,
                    "logits": logits,
                    "compute_cost": float(d),
                }
            return out

        # ---- Case B) Gate ON ----
        degree_w, logits, soft_probs = self.degree_gate(x, use_ste=self.use_ste)

        if (not self.use_ste):
            # 예전버전: Gate w/o STE는 train/eval 관계없이 ALWAYS soft weighted sum
            out = self._soft_weighted_output(x, degree_w)  # degree_w == soft_probs
            # (stats only) 대표 degree
            selected = soft_probs.argmax(dim=-1)
            # compute_cost (예전코드에선 argmax 기반이었지만, 여기서는 gi에만 들어가므로 그대로 둠)
            compute_cost = (selected + 1).float().mean().item()
        else:
            # 원본: hard select (STE는 train에서 degree_w에 반영됨)
            selected = degree_w.argmax(dim=-1)
            max_deg = max(1, min(int(selected.max().item()) + 1, self.max_degree))
            Z = self._build_Z(x, max_deg=max_deg)
            out = self._hard_select(Z, selected)
            compute_cost = (selected + 1).float().mean().item()

        out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)

        if return_gate_info:
            return out, {
                "degree_selection": degree_w,
                "soft_probs": soft_probs,
                "logits": logits,
                "compute_cost": compute_cost,
            }
        return out


# ──────────────────────────────────────────────────────────────────────────────
# Adaptive PADRe Model (Ablation switches ONLY; otherwise matches your logic)
# ──────────────────────────────────────────────────────────────────────────────
class PADReHAR_Ablation(nn.Module):
    def __init__(self,
                 in_channels=9,
                 seq_len=128,
                 num_classes=6,
                 hidden_dim=48,
                 num_layers=3,
                 max_degree=3,
                 gate_hidden_dim=16,
                 dropout=0.2,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,     # for w/o Gate (1/2/3)
                 use_ste=True,          # Gate w/o STE
                 use_hadamard=True,     # w/o Hadamard
                 use_ffn=True,          # w/o FFN
                 use_residual=True,     # w/o Residual
        ):
        super().__init__()
        self.num_layers = num_layers
        self.max_degree = max_degree

        self.use_ffn = bool(use_ffn)
        self.use_residual = bool(use_residual)

        self.input_proj = nn.Conv1d(in_channels, hidden_dim, kernel_size=1)

        self.padre_blocks = nn.ModuleList([
            PADReBlockAblation(
                hidden_dim, seq_len,
                max_degree=max_degree,
                token_kernel=11,
                gate_hidden_dim=gate_hidden_dim,
                temperature_initial=temperature_initial,
                temperature_min=temperature_min,
                use_gate=use_gate,
                fixed_degree=fixed_degree,
                use_ste=use_ste,
                use_hadamard=use_hadamard,
            )
            for _ in range(num_layers)
        ])

        self.ffn = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_dim, hidden_dim * 2, kernel_size=1),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
                nn.Dropout(dropout),
            )
            for _ in range(num_layers)
        ])

        self.norms1 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])
        self.norms2 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def set_temperature(self, t):
        for b in self.padre_blocks:
            b.set_temperature(t)

    def _ln(self, norm, x):
        return norm(x.permute(0, 2, 1)).permute(0, 2, 1)

    def forward(self, x, return_gate_info=False):
        x = self.input_proj(x)
        gate_info_list = [] if return_gate_info else None
        total_compute  = 0.0

        for i, block in enumerate(self.padre_blocks):
            res = x

            if return_gate_info:
                x, gi = block(x, return_gate_info=True)
                gate_info_list.append(gi)
                total_compute += gi["compute_cost"]
            else:
                x = block(x)

            if self.use_residual:
                x = self._ln(self.norms1[i], x + res)
            else:
                x = self._ln(self.norms1[i], x)

            res2 = x
            if self.use_ffn:
                x = self.ffn[i](x)

            if self.use_residual:
                x = self._ln(self.norms2[i], x + res2)
            else:
                x = self._ln(self.norms2[i], x)

        logits = self.classifier(self.global_pool(x).squeeze(-1))
        return (logits, gate_info_list, total_compute) if return_gate_info else logits


# ──────────────────────────────────────────────────────────────────────────────
# Train & Eval (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def train_model(model,
                train_loader,
                test_loader,
                device,
                lr=1e-3,
                weight_decay=1e-4,
                epochs=30,
                seed=42
    ):
    set_seed(seed)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)
    criterion = nn.CrossEntropyLoss()

    best_f1 = -1.0
    best_state = None

    for ep in range(epochs):
        temp = cosine_temperature(ep, epochs, tmax=5.0, tmin=0.5)
        model.set_temperature(temp)

        model.train()
        train_loss_sum = 0.0
        train_n = 0
        for X, y, _ in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            bs = y.size(0)
            train_loss_sum += loss.item() * bs
            train_n += bs

        scheduler.step()
        train_loss = train_loss_sum / max(train_n, 1)

        model.eval()
        preds_all, labels_all = [], []
        with torch.no_grad():
            for X, y, _ in test_loader:
                X, y = X.to(device), y.to(device)
                preds_all.extend(model(X).argmax(1).cpu().numpy())
                labels_all.extend(y.cpu().numpy())
        test_f1 = f1_score(labels_all, preds_all, average="macro")

        if test_f1 > best_f1:
            best_f1 = test_f1
            best_state = copy.deepcopy(model.state_dict())

        if (ep + 1) % 5 == 0:
            cur_lr = optimizer.param_groups[0]["lr"]
            print(f"Epoch {ep+1:02d}/{epochs} | LR={cur_lr:.4f} | Train Loss={train_loss:.4f} | TestF1={test_f1:.4f} | BestF1={best_f1:.4f} | Temp={temp:.3f}")

    model.load_state_dict(best_state)
    print(f"\nBest Test Macro-F1: {best_f1:.4f}")
    return model


# ──────────────────────────────────────────────────────────────────────────────
# Table Metrics (keep as-is in your "now code")
# ──────────────────────────────────────────────────────────────────────────────
@torch.no_grad()
def eval_f1_and_gate_stats(model, loader, device, snr_db=None, max_degree=3):
    """
    Returns:
      macro_f1, degree_entropy, norm_comp
    Definitions:
      - degree_entropy: mean over (layers, samples) of normalized entropy of soft_probs
                        H(p)/log(K)  where K=max_degree
      - norm_comp: mean expected degree / max_degree, averaged over layers
    """
    model.eval()

    all_preds, all_labels = [], []
    ent_sum = 0.0
    ent_count = 0
    comp_sum = 0.0
    comp_count = 0

    eps = 1e-12
    logK = float(np.log(max_degree))

    deg_vals = torch.arange(1, max_degree + 1, device=device).float()

    for X, y, _ in loader:
        X = X.to(device)
        y = y.to(device)

        if snr_db is not None:
            X = add_gaussian_noise(X, float(snr_db))

        logits, gate_info_list, _ = model(X, return_gate_info=True)
        preds = logits.argmax(dim=1)

        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

        # gate stats
        for gi in gate_info_list:
            sp = gi["soft_probs"]  # (B,K)
            ent = -(sp * (sp + eps).log()).sum(dim=-1) / logK
            ent_sum += ent.mean().item()
            ent_count += 1

            exp_deg = (sp * deg_vals).sum(dim=-1).mean().item()
            comp_sum += (exp_deg / max_degree)
            comp_count += 1

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    macro_f1 = float(f1_score(all_labels, all_preds, average="macro"))

    degree_entropy = float(ent_sum / max(ent_count, 1))
    norm_comp = float(comp_sum / max(comp_count, 1))
    return macro_f1, degree_entropy, norm_comp


def format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp):
    drop_pct = 100.0 * (clean_f1 - snr10_f1) / max(clean_f1, 1e-12)
    return {
        "Variant": name,
        "CleanF1": clean_f1,
        "SNR10F1": snr10_f1,
        "drop(%)": drop_pct,
        "DegreeEntropy": deg_ent,
        "NormComp": norm_comp,
    }


def print_table(rows):
    header = ["Variant", "Clean F1", "(SNR=10) F1", "drop(%)", "Degree Entropy", "NormComp"]
    print("\n" + "="*110)
    print("UCI-HAR Ablation Table")
    print("="*110)
    print(f"{header[0]:<22s} | {header[1]:>8s} | {header[2]:>11s} | {header[3]:>7s} | {header[4]:>14s} | {header[5]:>8s}")
    print("-"*110)
    for r in rows:
        print(
            f"{r['Variant']:<22s} | "
            f"{r['CleanF1']:>8.4f} | "
            f"{r['SNR10F1']:>11.4f} | "
            f"{r['drop(%)']:>7.2f} | "
            f"{r['DegreeEntropy']:>14.4f} | "
            f"{r['NormComp']:>8.4f}"
        )
    print("-"*110)

    print("\n[LaTeX rows]")
    for r in rows:
        print(
            f"{r['Variant']} & {r['CleanF1']:.4f} & {r['SNR10F1']:.4f} & "
            f"{r['drop(%)']:.2f} & {r['DegreeEntropy']:.4f} & {r['NormComp']:.4f} \\\\"
        )


# ──────────────────────────────────────────────────────────────────────────────
# Experiment Runner (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def build_variant(name, cfg):
    model = PADReHAR_Ablation(
        in_channels=cfg["in_channels"],
        seq_len=cfg["seq_len"],
        num_classes=cfg["num_classes"],
        hidden_dim=cfg["hidden_dim"],
        num_layers=cfg["num_layers"],
        max_degree=cfg["max_degree"],
        gate_hidden_dim=cfg["gate_hidden_dim"],
        dropout=cfg["dropout"],
        temperature_initial=cfg["temperature_initial"],
        temperature_min=cfg["temperature_min"],
        use_gate=cfg.get("use_gate", True),
        fixed_degree=cfg.get("fixed_degree", None),
        use_ste=cfg.get("use_ste", True),
        use_hadamard=cfg.get("use_hadamard", True),
        use_ffn=cfg.get("use_ffn", True),
        use_residual=cfg.get("use_residual", True),
    ).to(cfg["device"])
    return model


def run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg):
    variants = []
    variants.append(("Full (Ours)", dict()))
    variants.append(("w/o Gate-1", dict(use_gate=False, fixed_degree=1)))
    variants.append(("w/o Gate-2", dict(use_gate=False, fixed_degree=2)))
    variants.append(("w/o Gate-3", dict(use_gate=False, fixed_degree=3)))
    variants.append(("Gate w/o STE", dict(use_gate=True, use_ste=False)))
    variants.append(("w/o Hadamard", dict(use_hadamard=False)))
    variants.append(("w/o FFN", dict(use_ffn=False)))
    variants.append(("w/o Residual", dict(use_residual=False)))

    rows = []

    for name, delta in variants:
        print("\n" + "="*80)
        print(f"[Running Variant] {name}")
        print("="*80)

        cfg = copy.deepcopy(base_cfg)
        cfg.update(delta)

        set_seed(train_cfg["seed"])
        model = build_variant(name, cfg)
        print(f"Model params: {count_parameters(model):,}")

        model = train_model(
            model,
            train_loader,
            test_loader,
            cfg["device"],
            lr=train_cfg["lr"],
            weight_decay=train_cfg["wd"],
            epochs=train_cfg["epochs"],
            seed=train_cfg["seed"],
        )

        clean_f1, deg_ent, norm_comp = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=None, max_degree=cfg["max_degree"]
        )
        snr10_f1, _, _ = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=10.0, max_degree=cfg["max_degree"]
        )

        rows.append(format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp))

    return rows


# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    DATA_PATH =  "/content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets/MHEALTHDATASET"
    SEED = 42
    NUM_WORKERS = 2 if USE_GPU else 0
    PIN_MEMORY = USE_GPU

    BATCH_SIZE = 64
    EPOCHS = 5

    NUM_CLASSES = 12
    HIDDEN_DIM = 48
    NUM_LAYERS = 3
    MAX_DEGREE = 3
    GATE_HIDDEN_DIM = 16
    DROPOUT = 0.25
    LR = 1e-3
    WD = 1e-2

    WINDOW_SIZE = 100
    STEP_SIZE = 50

    set_seed(SEED)

    full_dataset = MHEALTHDataset(DATA_PATH, window_size=WINDOW_SIZE, step_size=STEP_SIZE)

    n_total = len(full_dataset)
    n_test = int(0.2 * n_total)
    n_train = n_total - n_test

    g = torch.Generator().manual_seed(SEED)
    train_dataset, test_dataset = random_split(full_dataset, [n_train, n_test], generator=g)

    train_idx = np.array(train_dataset.indices, dtype=np.int64)
    scaler = full_dataset.fit_scaler(train_idx)
    full_dataset.apply_scaler(scaler)

    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )
    test_loader = DataLoader(
        test_dataset, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )

    base_cfg = dict(
        device=DEVICE,
        in_channels=15,
        seq_len=100,
        num_classes=NUM_CLASSES,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        max_degree=MAX_DEGREE,
        gate_hidden_dim=GATE_HIDDEN_DIM,
        dropout=DROPOUT,
        temperature_initial=5.0,
        temperature_min=0.5,
        use_gate=True,
        fixed_degree=None,
        use_ste=True,
        use_hadamard=True,
        use_ffn=True,
        use_residual=True,
    )

    train_cfg = dict(
        seed=SEED,
        epochs=EPOCHS,
        lr=LR,
        wd=WD,
    )

    rows = run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg)
    print_table(rows)

Device: cuda | pin_memory: True
Found 10 log files in /content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets/MHEALTHDATASET
Loaded MHEALTH dataset
X shape : (6862, 15, 100)  (N, C, T)
y shape : (6862,)  (N,)
Classes : 12

[Running Variant] Full (Ours)
Model params: 79,173
Epoch 05/5 | LR=0.0000 | Train Loss=0.1030 | TestF1=0.9762 | BestF1=0.9783 | Temp=0.500

Best Test Macro-F1: 0.9783

[Running Variant] w/o Gate-1
Model params: 79,173
Epoch 05/5 | LR=0.0000 | Train Loss=0.2083 | TestF1=0.9627 | BestF1=0.9627 | Temp=0.500

Best Test Macro-F1: 0.9627

[Running Variant] w/o Gate-2
Model params: 79,173
Epoch 05/5 | LR=0.0000 | Train Loss=0.1000 | TestF1=0.9797 | BestF1=0.9797 | Temp=0.500

Best Test Macro-F1: 0.9797

[Running Variant] w/o Gate-3
Model params: 79,173
Epoch 05/5 | LR=0.0000 | Train Loss=0.0921 | TestF1=0.9805 | BestF1=0.9805 | Temp=0.500

Best Test Macro-F1: 0.9805

[Running Variant] Gate w/o STE
Model params: 79,173
Epoch 05/5 | LR=0.0000 | Train Loss=0.1310 | TestF1

# WISDM

In [1]:
import os, copy, random, time, re, glob
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split

from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

# ──────────────────────────────────────────────────────────────────────────────
# Seed / Device
# ──────────────────────────────────────────────────────────────────────────────
def set_seed(seed=42):
    os.environ["PYTHONHASHSEED"] = str(seed)
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

DEVICE  = torch.device("cuda" if torch.cuda.is_available() else "cpu")
USE_GPU = DEVICE.type == "cuda"
print(f"Device: {DEVICE} | pin_memory: {USE_GPU}")


# ──────────────────────────────────────────────────────────────────────────────
# Dataset
# ──────────────────────────────────────────────────────────────────────────────
class WISDMDataset(Dataset):
    def __init__(self, file_path: str, window_size: int = 80, step_size: int = 40):
        super().__init__()
        self.file_path = file_path
        self.window_size = window_size
        self.step_size = step_size

        if not os.path.isfile(file_path):
            raise FileNotFoundError(f"WISDM txt file not found: {file_path}")

        df = self._load_file(file_path)
        self.X, self.y, self.subjects = self._create_windows(df)
        self.unique_subjects = sorted(np.unique(self.subjects))

        self.n_classes = int(len(np.unique(self.y)))

        print("=" * 80)
        print("Loaded WISDM dataset (single txt)")
        print(f"  X shape       : {self.X.shape}  (N, C, T)")
        print(f"  y shape       : {self.y.shape}  (N,)")
        print(f"  subjects shape: {self.subjects.shape} (N,)")
        print(f"  num classes   : {self.n_classes}")
        print(f"  unique subjects: {self.unique_subjects[:10]} ... (total {len(self.unique_subjects)})")
        print("=" * 80)

    def _load_file(self, file_path: str) -> pd.DataFrame:
        WISDM_LABEL_MAP = {
            "walking": 0,
            "jogging": 1,
            "sitting": 2,
            "standing": 3,
            "upstairs": 4,
            "downstairs": 5,
        }

        with open(file_path, "r") as f:
            lines = f.readlines()

        rows = []
        for line in lines:
            line = line.strip()
            if not line:
                continue
            line = line.replace(";", "")
            parts = line.split(",")

            if len(parts) != 6:
                continue

            subj, act, ts, x, y, z = parts
            if x.strip() == "" or y.strip() == "" or z.strip() == "":
                continue

            act_norm = act.strip().lower()
            if act_norm not in WISDM_LABEL_MAP:
                continue

            rows.append([subj, act_norm, ts, x, y, z])

        if not rows:
            raise ValueError(f"No valid rows parsed from file: {file_path}")

        df = pd.DataFrame(rows, columns=["subject", "activity", "timestamp", "x", "y", "z"])
        df = df.replace(["", "NaN", "nan"], np.nan).dropna(subset=["subject", "x", "y", "z"])

        df["subject"] = pd.to_numeric(df["subject"], errors="coerce")
        df["x"] = pd.to_numeric(df["x"], errors="coerce")
        df["y"] = pd.to_numeric(df["y"], errors="coerce")
        df["z"] = pd.to_numeric(df["z"], errors="coerce")
        df = df.dropna(subset=["subject", "x", "y", "z"])
        if df.empty:
            raise ValueError("After cleaning, WISDM DataFrame is empty. Check file format.")

        df["subject"] = df["subject"].astype(int)
        df["activity_id"] = df["activity"].map(WISDM_LABEL_MAP).astype(int)

        return df

    def _create_windows(self, df: pd.DataFrame):
        X_list, y_list, s_list = [], [], []

        for subj_id in sorted(df["subject"].unique()):
            df_sub = df[df["subject"] == subj_id]
            data = df_sub[["x", "y", "z"]].to_numpy(dtype=np.float32)
            labels = df_sub["activity_id"].to_numpy(dtype=np.int64)
            L = len(df_sub)

            start = 0
            while start + self.window_size <= L:
                end = start + self.window_size
                window_x = data[start:end]
                window_y = labels[end - 1]

                X_list.append(window_x.T)
                y_list.append(window_y)
                s_list.append(subj_id)

                start += self.step_size

        if len(X_list) == 0:
            raise ValueError("[WISDMDataset] No windows created. Try smaller window_size or check data.")

        X = np.stack(X_list, axis=0).astype(np.float32)
        y = np.array(y_list, dtype=np.int64)
        s = np.array(s_list, dtype=np.int64)
        return X, y, s

    def fit_scaler(self, indices):
        Xtr = self.X[indices]  # (N,C,T)
        N, C, T = Xtr.shape
        X2 = np.transpose(Xtr, (0, 2, 1)).reshape(-1, C)  # (N*T, C)
        scaler = StandardScaler()
        scaler.fit(X2)
        self.scaler = scaler
        return scaler

    def apply_scaler(self, scaler=None):
        if scaler is None:
            scaler = getattr(self, "scaler", None)
        assert scaler is not None, "Scaler is not fitted. Call fit_scaler() first."

        X = self.X
        N, C, T = X.shape
        X2 = np.transpose(X, (0, 2, 1)).reshape(-1, C)
        X2 = scaler.transform(X2)
        self.X = X2.reshape(N, T, C).transpose(0, 2, 1).astype(np.float32)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx: int):
        return (
            torch.FloatTensor(self.X[idx]),
            torch.LongTensor([self.y[idx]])[0],
            int(self.subjects[idx]),
        )


# ──────────────────────────────────────────────────────────────────────────────
# Utils
# ──────────────────────────────────────────────────────────────────────────────
def cosine_temperature(ep, total, tmax=5.0, tmin=0.5):
    r = ep / max(total - 1, 1)
    return tmin + (tmax - tmin) * 0.5 * (1.0 + np.cos(np.pi * r))

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)


# ──────────────────────────────────────────────────────────────────────────────
# Corruptions (SNR=10 uses this)
# ──────────────────────────────────────────────────────────────────────────────
def add_gaussian_noise(X, snr_db):
    """
    X: (B,C,T)
    snr_db: float
    """
    signal_power = (X ** 2).mean(dim=(1, 2), keepdim=True)
    snr = 10 ** (snr_db / 10.0)
    noise_power = signal_power / snr
    noise = torch.randn_like(X) * torch.sqrt(noise_power)
    return X + noise


# ──────────────────────────────────────────────────────────────────────────────
# Compute-Aware Degree Gate  (★ Variant behavior matches "예전버전")
#   - use_ste=True  : train=STE(hard fwd, soft bwd), eval=hard onehot
#   - use_ste=False : always soft_probs (train/eval 동일)
# ──────────────────────────────────────────────────────────────────────────────
class ComputeAwareDegreeGate(nn.Module):
    def __init__(self,
                 channels,
                 max_degree=3,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5
        ):
        super().__init__()
        self.max_degree = max_degree

        self.gate = nn.Sequential(
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(1),
            nn.Linear(channels, gate_hidden_dim),
            nn.GELU(),
            nn.Linear(gate_hidden_dim, max_degree),
        )

        nn.init.zeros_(self.gate[-1].bias)
        if max_degree >= 3:
            self.gate[-1].bias.data[1] = 0.4

        self.register_buffer("temperature", torch.tensor(float(temperature_initial)))
        self.temperature_min = float(temperature_min)

    def set_temperature(self, t):
        self.temperature.fill_(max(float(t), self.temperature_min))

    def forward(self, x, use_ste=True):
        logits = self.gate(x)  # (B,K)
        soft_probs = F.softmax(logits / self.temperature, dim=-1)

        if use_ste:
            if self.training:
                hard_idx = logits.argmax(dim=-1)
                hard_oh = F.one_hot(hard_idx, num_classes=self.max_degree).float()
                # STE: forward=hard, backward=soft
                degree_w = hard_oh - soft_probs.detach() + soft_probs
            else:
                degree_w = F.one_hot(
                    logits.argmax(dim=-1), num_classes=self.max_degree
                ).float()
        else:
            # Gate w/o STE: always soft (train/eval 동일)
            degree_w = soft_probs

        return degree_w, logits, soft_probs


# ──────────────────────────────────────────────────────────────────────────────
# PADRe Block (Ablation switches ONLY; ★ Variant behavior matches "예전버전")
#   - w/o Gate: fixed_degree (1..K) → build up to d, output Z[d-1]
#   - Gate w/o STE: soft weighted sum of ALL degrees (train/eval 동일)
#   - w/o Hadamard: prefix sum (Z[i]=Z[i-1]+Y[i])  (예전버전)
# ──────────────────────────────────────────────────────────────────────────────
class PADReBlockAblation(nn.Module):
    def __init__(self,
                 channels,
                 seq_len,
                 max_degree=3,
                 token_kernel=11,
                 gate_hidden_dim=16,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,      # 1..K if w/o Gate (fixed)
                 use_ste=True,           # Gate w/o STE (soft routing)
                 use_hadamard=True,      # w/o Hadamard (prefix-sum)
        ):
        super().__init__()
        self.max_degree = max_degree

        self.use_gate = bool(use_gate)
        self.fixed_degree = fixed_degree  # None or int in [1..K]
        self.use_ste = bool(use_ste)
        self.use_hadamard = bool(use_hadamard)

        self.degree_gate = ComputeAwareDegreeGate(
            channels,
            max_degree=max_degree,
            gate_hidden_dim=gate_hidden_dim,
            temperature_initial=temperature_initial,
            temperature_min=temperature_min
        )

        self.channel_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree)
        ])

        self.token_mixing = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree)
        ])

        self.pre_hadamard_channel = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=1) for _ in range(max_degree-1)
        ])

        self.pre_hadamard_token = nn.ModuleList([
            nn.Conv1d(channels, channels, kernel_size=token_kernel,
                      padding=token_kernel // 2, groups=channels)
            for _ in range(max_degree-1)
        ])

        self.norm = nn.LayerNorm(channels)

    def set_temperature(self, t):
        self.degree_gate.set_temperature(t)

    def _build_Y(self, x, max_deg):
        return [self.token_mixing[i](self.channel_mixing[i](x)) for i in range(max_deg)]

    def _build_Z(self, x, max_deg):
        """
        - use_hadamard=True : 예전버전(원본) Hadamard chain
            Z0=Y0, Zi = pre(Z_{i-1}) * Yi
        - use_hadamard=False: 예전버전 w/o Hadamard (prefix sum)
            Z0=Y0, Zi = Z_{i-1} + Yi
        """
        Y = self._build_Y(x, max_deg)

        if self.use_hadamard:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z_ = self.pre_hadamard_token[i-1](self.pre_hadamard_channel[i-1](Z[-1]))
                Z.append(Z_ * Y[i])
            return Z
        else:
            Z = [Y[0]]
            for i in range(1, max_deg):
                Z.append(Z[-1] + Y[i])
            return Z

    def _hard_select(self, Z_list, sel):
        B = Z_list[0].shape[0]
        Z_stack = torch.stack(Z_list, dim=0)  # (K,B,C,T)
        return Z_stack[sel, torch.arange(B, device=Z_stack.device)]

    def _soft_weighted_output(self, x, soft_probs):
        """
        예전버전 Gate w/o STE:
          - always compute ALL K degrees
          - weighted sum with soft_probs
          - hadamard / no_hadamard build rule은 동일하게 적용
        """
        B = x.size(0)
        Z = self._build_Z(x, max_deg=self.max_degree)      # list length K, each (B,C,T)
        Z_stack = torch.stack(Z, dim=1)                    # (B,K,C,T)
        w = soft_probs.view(B, self.max_degree, 1, 1)      # (B,K,1,1)
        out = (Z_stack * w).sum(dim=1)                     # (B,C,T)
        return out

    def forward(self, x, return_gate_info=False):
        B = x.shape[0]

        # ---- Case A) w/o Gate (fixed degree 1..K) ----
        if (not self.use_gate) or (self.fixed_degree is not None):
            d = int(self.fixed_degree) if self.fixed_degree is not None else self.max_degree
            d = max(1, min(d, self.max_degree))

            # build only up to d, output "degree d" path (예전버전)
            Z = self._build_Z(x, max_deg=d)
            out = Z[-1]

            # stats payload (예전버전 스타일)
            sel = torch.full((B,), d - 1, device=x.device, dtype=torch.long)
            K = self.max_degree
            sp = F.one_hot(sel, num_classes=K).float()
            dw = sp
            logits = sp

            out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)
            if return_gate_info:
                return out, {
                    "degree_selection": dw,
                    "soft_probs": sp,
                    "logits": logits,
                    "compute_cost": float(d),
                }
            return out

        # ---- Case B) Gate ON ----
        degree_w, logits, soft_probs = self.degree_gate(x, use_ste=self.use_ste)

        if (not self.use_ste):
            # 예전버전: Gate w/o STE는 train/eval 관계없이 ALWAYS soft weighted sum
            out = self._soft_weighted_output(x, degree_w)  # degree_w == soft_probs
            # (stats only) 대표 degree
            selected = soft_probs.argmax(dim=-1)
            # compute_cost (예전코드에선 argmax 기반이었지만, 여기서는 gi에만 들어가므로 그대로 둠)
            compute_cost = (selected + 1).float().mean().item()
        else:
            # 원본: hard select (STE는 train에서 degree_w에 반영됨)
            selected = degree_w.argmax(dim=-1)
            max_deg = max(1, min(int(selected.max().item()) + 1, self.max_degree))
            Z = self._build_Z(x, max_deg=max_deg)
            out = self._hard_select(Z, selected)
            compute_cost = (selected + 1).float().mean().item()

        out = self.norm(out.permute(0, 2, 1)).permute(0, 2, 1)

        if return_gate_info:
            return out, {
                "degree_selection": degree_w,
                "soft_probs": soft_probs,
                "logits": logits,
                "compute_cost": compute_cost,
            }
        return out


# ──────────────────────────────────────────────────────────────────────────────
# Adaptive PADRe Model (Ablation switches ONLY; otherwise matches your logic)
# ──────────────────────────────────────────────────────────────────────────────
class PADReHAR_Ablation(nn.Module):
    def __init__(self,
                 in_channels=9,
                 seq_len=128,
                 num_classes=6,
                 hidden_dim=48,
                 num_layers=3,
                 max_degree=3,
                 gate_hidden_dim=16,
                 dropout=0.2,
                 temperature_initial=5.0,
                 temperature_min=0.5,
                 # ablations
                 use_gate=True,
                 fixed_degree=None,     # for w/o Gate (1/2/3)
                 use_ste=True,          # Gate w/o STE
                 use_hadamard=True,     # w/o Hadamard
                 use_ffn=True,          # w/o FFN
                 use_residual=True,     # w/o Residual
        ):
        super().__init__()
        self.num_layers = num_layers
        self.max_degree = max_degree

        self.use_ffn = bool(use_ffn)
        self.use_residual = bool(use_residual)

        self.input_proj = nn.Conv1d(in_channels, hidden_dim, kernel_size=1)

        self.padre_blocks = nn.ModuleList([
            PADReBlockAblation(
                hidden_dim, seq_len,
                max_degree=max_degree,
                token_kernel=11,
                gate_hidden_dim=gate_hidden_dim,
                temperature_initial=temperature_initial,
                temperature_min=temperature_min,
                use_gate=use_gate,
                fixed_degree=fixed_degree,
                use_ste=use_ste,
                use_hadamard=use_hadamard,
            )
            for _ in range(num_layers)
        ])

        self.ffn = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(hidden_dim, hidden_dim * 2, kernel_size=1),
                nn.GELU(),
                nn.Dropout(dropout),
                nn.Conv1d(hidden_dim * 2, hidden_dim, kernel_size=1),
                nn.Dropout(dropout),
            )
            for _ in range(num_layers)
        ])

        self.norms1 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])
        self.norms2 = nn.ModuleList([nn.LayerNorm(hidden_dim) for _ in range(num_layers)])

        self.global_pool = nn.AdaptiveAvgPool1d(1)
        self.classifier = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim), nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, num_classes),
        )

    def set_temperature(self, t):
        for b in self.padre_blocks:
            b.set_temperature(t)

    def _ln(self, norm, x):
        return norm(x.permute(0, 2, 1)).permute(0, 2, 1)

    def forward(self, x, return_gate_info=False):
        x = self.input_proj(x)
        gate_info_list = [] if return_gate_info else None
        total_compute  = 0.0

        for i, block in enumerate(self.padre_blocks):
            res = x

            if return_gate_info:
                x, gi = block(x, return_gate_info=True)
                gate_info_list.append(gi)
                total_compute += gi["compute_cost"]
            else:
                x = block(x)

            if self.use_residual:
                x = self._ln(self.norms1[i], x + res)
            else:
                x = self._ln(self.norms1[i], x)

            res2 = x
            if self.use_ffn:
                x = self.ffn[i](x)

            if self.use_residual:
                x = self._ln(self.norms2[i], x + res2)
            else:
                x = self._ln(self.norms2[i], x)

        logits = self.classifier(self.global_pool(x).squeeze(-1))
        return (logits, gate_info_list, total_compute) if return_gate_info else logits


# ──────────────────────────────────────────────────────────────────────────────
# Train & Eval (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def train_model(model,
                train_loader,
                test_loader,
                device,
                lr=1e-3,
                weight_decay=1e-4,
                epochs=30,
                seed=42
    ):
    set_seed(seed)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs, eta_min=1e-5)
    criterion = nn.CrossEntropyLoss()

    best_f1 = -1.0
    best_state = None

    for ep in range(epochs):
        temp = cosine_temperature(ep, epochs, tmax=5.0, tmin=0.5)
        model.set_temperature(temp)

        model.train()
        train_loss_sum = 0.0
        train_n = 0
        for X, y, _ in train_loader:
            X, y = X.to(device), y.to(device)
            optimizer.zero_grad()
            loss = criterion(model(X), y)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

            bs = y.size(0)
            train_loss_sum += loss.item() * bs
            train_n += bs

        scheduler.step()
        train_loss = train_loss_sum / max(train_n, 1)

        model.eval()
        preds_all, labels_all = [], []
        with torch.no_grad():
            for X, y, _ in test_loader:
                X, y = X.to(device), y.to(device)
                preds_all.extend(model(X).argmax(1).cpu().numpy())
                labels_all.extend(y.cpu().numpy())
        test_f1 = f1_score(labels_all, preds_all, average="macro")

        if test_f1 > best_f1:
            best_f1 = test_f1
            best_state = copy.deepcopy(model.state_dict())

        if (ep + 1) % 5 == 0:
            cur_lr = optimizer.param_groups[0]["lr"]
            print(f"Epoch {ep+1:02d}/{epochs} | LR={cur_lr:.4f} | Train Loss={train_loss:.4f} | TestF1={test_f1:.4f} | BestF1={best_f1:.4f} | Temp={temp:.3f}")

    model.load_state_dict(best_state)
    print(f"\nBest Test Macro-F1: {best_f1:.4f}")
    return model


# ──────────────────────────────────────────────────────────────────────────────
# Table Metrics (keep as-is in your "now code")
# ──────────────────────────────────────────────────────────────────────────────
@torch.no_grad()
def eval_f1_and_gate_stats(model, loader, device, snr_db=None, max_degree=3):
    """
    Returns:
      macro_f1, degree_entropy, norm_comp
    Definitions:
      - degree_entropy: mean over (layers, samples) of normalized entropy of soft_probs
                        H(p)/log(K)  where K=max_degree
      - norm_comp: mean expected degree / max_degree, averaged over layers
    """
    model.eval()

    all_preds, all_labels = [], []
    ent_sum = 0.0
    ent_count = 0
    comp_sum = 0.0
    comp_count = 0

    eps = 1e-12
    logK = float(np.log(max_degree))

    deg_vals = torch.arange(1, max_degree + 1, device=device).float()

    for X, y, _ in loader:
        X = X.to(device)
        y = y.to(device)

        if snr_db is not None:
            X = add_gaussian_noise(X, float(snr_db))

        logits, gate_info_list, _ = model(X, return_gate_info=True)
        preds = logits.argmax(dim=1)

        all_preds.append(preds.detach().cpu().numpy())
        all_labels.append(y.detach().cpu().numpy())

        # gate stats
        for gi in gate_info_list:
            sp = gi["soft_probs"]  # (B,K)
            ent = -(sp * (sp + eps).log()).sum(dim=-1) / logK
            ent_sum += ent.mean().item()
            ent_count += 1

            exp_deg = (sp * deg_vals).sum(dim=-1).mean().item()
            comp_sum += (exp_deg / max_degree)
            comp_count += 1

    all_preds = np.concatenate(all_preds, axis=0)
    all_labels = np.concatenate(all_labels, axis=0)
    macro_f1 = float(f1_score(all_labels, all_preds, average="macro"))

    degree_entropy = float(ent_sum / max(ent_count, 1))
    norm_comp = float(comp_sum / max(comp_count, 1))
    return macro_f1, degree_entropy, norm_comp


def format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp):
    drop_pct = 100.0 * (clean_f1 - snr10_f1) / max(clean_f1, 1e-12)
    return {
        "Variant": name,
        "CleanF1": clean_f1,
        "SNR10F1": snr10_f1,
        "drop(%)": drop_pct,
        "DegreeEntropy": deg_ent,
        "NormComp": norm_comp,
    }


def print_table(rows):
    header = ["Variant", "Clean F1", "(SNR=10) F1", "drop(%)", "Degree Entropy", "NormComp"]
    print("\n" + "="*110)
    print("UCI-HAR Ablation Table")
    print("="*110)
    print(f"{header[0]:<22s} | {header[1]:>8s} | {header[2]:>11s} | {header[3]:>7s} | {header[4]:>14s} | {header[5]:>8s}")
    print("-"*110)
    for r in rows:
        print(
            f"{r['Variant']:<22s} | "
            f"{r['CleanF1']:>8.4f} | "
            f"{r['SNR10F1']:>11.4f} | "
            f"{r['drop(%)']:>7.2f} | "
            f"{r['DegreeEntropy']:>14.4f} | "
            f"{r['NormComp']:>8.4f}"
        )
    print("-"*110)

    print("\n[LaTeX rows]")
    for r in rows:
        print(
            f"{r['Variant']} & {r['CleanF1']:.4f} & {r['SNR10F1']:.4f} & "
            f"{r['drop(%)']:.2f} & {r['DegreeEntropy']:.4f} & {r['NormComp']:.4f} \\\\"
        )


# ──────────────────────────────────────────────────────────────────────────────
# Experiment Runner (unchanged)
# ──────────────────────────────────────────────────────────────────────────────
def build_variant(name, cfg):
    model = PADReHAR_Ablation(
        in_channels=cfg["in_channels"],
        seq_len=cfg["seq_len"],
        num_classes=cfg["num_classes"],
        hidden_dim=cfg["hidden_dim"],
        num_layers=cfg["num_layers"],
        max_degree=cfg["max_degree"],
        gate_hidden_dim=cfg["gate_hidden_dim"],
        dropout=cfg["dropout"],
        temperature_initial=cfg["temperature_initial"],
        temperature_min=cfg["temperature_min"],
        use_gate=cfg.get("use_gate", True),
        fixed_degree=cfg.get("fixed_degree", None),
        use_ste=cfg.get("use_ste", True),
        use_hadamard=cfg.get("use_hadamard", True),
        use_ffn=cfg.get("use_ffn", True),
        use_residual=cfg.get("use_residual", True),
    ).to(cfg["device"])
    return model


def run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg):
    variants = []
    variants.append(("Full (Ours)", dict()))
    variants.append(("w/o Gate-1", dict(use_gate=False, fixed_degree=1)))
    variants.append(("w/o Gate-2", dict(use_gate=False, fixed_degree=2)))
    variants.append(("w/o Gate-3", dict(use_gate=False, fixed_degree=3)))
    variants.append(("Gate w/o STE", dict(use_gate=True, use_ste=False)))
    variants.append(("w/o Hadamard", dict(use_hadamard=False)))
    variants.append(("w/o FFN", dict(use_ffn=False)))
    variants.append(("w/o Residual", dict(use_residual=False)))

    rows = []

    for name, delta in variants:
        print("\n" + "="*80)
        print(f"[Running Variant] {name}")
        print("="*80)

        cfg = copy.deepcopy(base_cfg)
        cfg.update(delta)

        set_seed(train_cfg["seed"])
        model = build_variant(name, cfg)
        print(f"Model params: {count_parameters(model):,}")

        model = train_model(
            model,
            train_loader,
            test_loader,
            cfg["device"],
            lr=train_cfg["lr"],
            weight_decay=train_cfg["wd"],
            epochs=train_cfg["epochs"],
            seed=train_cfg["seed"],
        )

        clean_f1, deg_ent, norm_comp = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=None, max_degree=cfg["max_degree"]
        )
        snr10_f1, _, _ = eval_f1_and_gate_stats(
            model, test_loader, cfg["device"], snr_db=10.0, max_degree=cfg["max_degree"]
        )

        rows.append(format_row(name, clean_f1, snr10_f1, deg_ent, norm_comp))

    return rows


# ──────────────────────────────────────────────────────────────────────────────
# Main
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    DATA_PATH =  "/content/drive/MyDrive/Colab Notebooks/HAR/har_orig_datasets/WISDM_ar_v1.1_raw.txt"
    SEED = 42
    NUM_WORKERS = 2 if USE_GPU else 0
    PIN_MEMORY = USE_GPU

    BATCH_SIZE = 64
    EPOCHS = 20

    NUM_CLASSES = 6
    HIDDEN_DIM = 48
    NUM_LAYERS = 3
    MAX_DEGREE = 3
    GATE_HIDDEN_DIM = 16
    DROPOUT = 0.25
    LR = 1e-3
    WD = 1e-2

    WINDOW_SIZE = 80
    STEP_SIZE = 40

    set_seed(SEED)

    full_dataset = WISDMDataset(DATA_PATH, window_size=WINDOW_SIZE, step_size=STEP_SIZE)

    n_total = len(full_dataset)
    n_test = int(0.2 * n_total)
    n_train = n_total - n_test

    g = torch.Generator().manual_seed(SEED)
    train_dataset, test_dataset = random_split(full_dataset, [n_train, n_test], generator=g)

    train_idx = np.array(train_dataset.indices, dtype=np.int64)
    scaler = full_dataset.fit_scaler(train_idx)
    full_dataset.apply_scaler(scaler)

    train_loader = DataLoader(
        train_dataset, batch_size=BATCH_SIZE, shuffle=True,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )
    test_loader = DataLoader(
        test_dataset, batch_size=BATCH_SIZE, shuffle=False,
        num_workers=NUM_WORKERS, pin_memory=PIN_MEMORY
    )

    base_cfg = dict(
        device=DEVICE,
        in_channels=3,
        seq_len=80,
        num_classes=NUM_CLASSES,
        hidden_dim=HIDDEN_DIM,
        num_layers=NUM_LAYERS,
        max_degree=MAX_DEGREE,
        gate_hidden_dim=GATE_HIDDEN_DIM,
        dropout=DROPOUT,
        temperature_initial=5.0,
        temperature_min=0.5,
        use_gate=True,
        fixed_degree=None,
        use_ste=True,
        use_hadamard=True,
        use_ffn=True,
        use_residual=True,
    )

    train_cfg = dict(
        seed=SEED,
        epochs=EPOCHS,
        lr=LR,
        wd=WD,
    )

    rows = run_ablation_suite(train_loader, test_loader, base_cfg, train_cfg)
    print_table(rows)

Device: cuda | pin_memory: True
Loaded WISDM dataset (single txt)
  X shape       : (27108, 3, 80)  (N, C, T)
  y shape       : (27108,)  (N,)
  subjects shape: (27108,) (N,)
  num classes   : 6
  unique subjects: [np.int64(1), np.int64(2), np.int64(3), np.int64(4), np.int64(5), np.int64(6), np.int64(7), np.int64(8), np.int64(9), np.int64(10)] ... (total 36)

[Running Variant] Full (Ours)
Model params: 78,303
Epoch 05/20 | LR=0.0009 | Train Loss=0.0658 | TestF1=0.9611 | BestF1=0.9641 | Temp=4.526
Epoch 10/20 | LR=0.0005 | Train Loss=0.0277 | TestF1=0.9729 | BestF1=0.9729 | Temp=2.936
Epoch 15/20 | LR=0.0002 | Train Loss=0.0077 | TestF1=0.9771 | BestF1=0.9771 | Temp=1.226
Epoch 20/20 | LR=0.0000 | Train Loss=0.0044 | TestF1=0.9776 | BestF1=0.9793 | Temp=0.500

Best Test Macro-F1: 0.9793

[Running Variant] w/o Gate-1
Model params: 78,303
Epoch 05/20 | LR=0.0009 | Train Loss=0.1321 | TestF1=0.9340 | BestF1=0.9367 | Temp=4.526
Epoch 10/20 | LR=0.0005 | Train Loss=0.0825 | TestF1=0.9632 | B