In [1]:
import os,re
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from pathlib import Path
import json
from datetime import datetime, timedelta
from scipy import signal
from scipy.interpolate import interp1d
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import LeaveOneGroupOut
from scipy.signal import butter, sosfiltfilt
import joblib
import warnings
import random 
import torch
warnings.filterwarnings('ignore')


In [2]:
SEED = 42

# cuBLAS only offers deterministic GEMMs on CUDA >= 10.2 when a fixed workspace
# configuration is exported. Without it, enabling
# torch.use_deterministic_algorithms(True) below makes CUDA matmuls raise a
# RuntimeError at first use. The variable has to be in place before the first
# CUDA call; setdefault() respects a value the user already exported.
os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")

# Seed every RNG family the notebook touches.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

# Deterministic CuDNN kernels - reproducibility > speed
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.use_deterministic_algorithms(True)

In [3]:
def load_wesad_subject(pkl_path):
    """
    Load one WESAD subject pickle and return the unpickled object.

    pkl_path: str or Path pointing at the subject's .pkl file.
    Raises FileNotFoundError when the path does not exist.

    NOTE: pickle executes arbitrary code on load - only point this at files
    from a trusted copy of the dataset.
    """
    source = Path(pkl_path)
    if source.exists():
        with source.open("rb") as handle:
            # latin1 decoding lets Python 3 read byte strings pickled by Python 2
            return pickle.load(handle, encoding="latin1")
    raise FileNotFoundError(f"File not found: {source}")

In [4]:

# Cell purpose: locate the project, make sure the expected folder layout exists,
# discover the per-subject WESAD pickles, sanity-check the subject list, peek into
# one pickle, and write a JSON manifest of what was found.

# ── 1) Project root ─────────────────────────────────────────────────────────────
# NOTE(review): absolute, machine-specific path; the notebook only runs unchanged
# on a machine that has this exact directory.
project_root = Path(r"C:\Users\Joseph\generative-health-models").resolve()
print(f"🏠 Project Root: {project_root}")

# ── 2) Ensure expected directories exist ───────────────────────────────────────
required_dirs = [
    project_root / "data" / "raw" / "wesad",
    project_root / "data" / "processed",
    project_root / "data" / "synthetic",
    project_root / "models" / "tc_multigan",
    project_root / "results" / "figures",
]

print("\n🏗️ Verifying project structure:")
for d in required_dirs:
    if d.exists():
        print(f"  ✅ {d.relative_to(project_root)}")
    else:
        # Missing directories are created (including parents) rather than reported as errors.
        d.mkdir(parents=True, exist_ok=True)
        print(f"  📁 Created: {d}")

# ── 3) Discover WESAD subject .pkl files (sorted, filtered) ────────────────────
# Recursive search; only files named exactly S<digits>.pkl are kept.
# NOTE: sorted() on Path objects is lexicographic, so S10 comes before S2 here;
# numeric ordering is applied later by the cell that builds the LOSO folds.
wesad_path = project_root / "data" / "raw" / "wesad"
subject_files = sorted(
    p for p in wesad_path.rglob("*.pkl") if re.match(r"S\d+\.pkl$", p.name)
)

print(f"\n📁 Found {len(subject_files)} WESAD subject files:")
for i, p in enumerate(subject_files[:5], 1):
    print(f"  {i}. {p.name}")
if len(subject_files) > 5:
    print(f"     ... and {len(subject_files) - 5} more files")

if not subject_files:
    print("❌ No WESAD files found! Please check data download.")
else:
    print(f"✅ Ready to process {len(subject_files)} subjects")

# ── 3b) Subject sanity check (expected S2–S17 excluding S12) ───────────────────
expected = [f"S{i}" for i in range(2, 18) if i != 12]   # 15 subjects
# .group(1) is safe: every name in subject_files already matched S<digits>.pkl above.
ids = [re.match(r"(S\d+)", p.name).group(1) for p in subject_files]

missing    = sorted(set(expected) - set(ids))
unexpected = sorted(set(ids) - set(expected))
# Duplicates can appear because rglob() searches sub-folders too.
dups       = sorted([k for k, c in Counter(ids).items() if c > 1])

if missing or unexpected or dups:
    # Only a warning: execution continues with whatever files were found.
    print("⚠️ Subject sanity check failed:")
    if missing:    print("  Missing:", missing)
    if unexpected: print("  Unexpected:", unexpected)
    if dups:       print("  Duplicates:", dups)
    # Optional hard stop:
    # raise ValueError("Subject list mismatch")
else:
    print("✅ Subject sanity check passed (15 subjects: S2–S11, S13–S17).")

# ── 4) Quick sanity-load one subject using the loader defined above ────────────
# (Helps catch encoding/path issues early.)
if subject_files:
    try:
        sample_path = subject_files[0]
        data = load_wesad_subject(sample_path)  # <-- loader from the previous cell
        print(f"\n🔍 Loaded {sample_path.name}")

        # Brief structure peek (optional but helpful)
        top_keys = list(data.keys())
        print("   Top-level keys:", top_keys)
        if "signal" in data and isinstance(data["signal"], dict):
            sig_keys = list(data["signal"].keys())
            print("   Signal keys:", sig_keys)
            if "chest" in data["signal"]:
                chest_keys = list(data["signal"]["chest"].keys())
                print("   Chest channels:", chest_keys)
    except Exception as e:
        # Best-effort peek: a load failure is reported but does not stop the cell.
        print(f"⚠️ Failed to load {sample_path.name}: {e}")

# ── 5) (Optional) Write a small manifest for traceability ──────────────────────
# One {"subject", "path"} record per discovered file; written even when the list is empty.
manifest = [
    {"subject": re.search(r"S\d+", p.name).group(0), "path": str(p)}
    for p in subject_files
]
manifest_path = project_root / "data" / "processed" / "wesad_manifest.json"
manifest_path.write_text(json.dumps(manifest, indent=2))
print(f"\n📝 Manifest written to: {manifest_path}")

🏠 Project Root: C:\Users\Joseph\generative-health-models

🏗️ Verifying project structure:
  ✅ data\raw\wesad
  ✅ data\processed
  ✅ data\synthetic
  ✅ models\tc_multigan
  ✅ results\figures

📁 Found 15 WESAD subject files:
  1. S10.pkl
  2. S11.pkl
  3. S13.pkl
  4. S14.pkl
  5. S15.pkl
     ... and 10 more files
✅ Ready to process 15 subjects
✅ Subject sanity check passed (15 subjects: S2–S11, S13–S17).

🔍 Loaded S10.pkl
   Top-level keys: ['signal', 'label', 'subject']
   Signal keys: ['chest', 'wrist']
   Chest channels: ['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp']

📝 Manifest written to: C:\Users\Joseph\generative-health-models\data\processed\wesad_manifest.json


In [5]:
# Cell purpose: build leave-one-subject-out (LOSO) folds, carve a fixed number of
# validation subjects out of each fold's training pool, and persist the folds as JSON.

# Ensure deterministic numeric order (S2, S3, …, S17)
def subject_sort_key(p):
    """Return the integer subject number parsed from a path stem such as 'S10'."""
    return int(re.search(r"S(\d+)", Path(p).stem).group(1))
subject_files = sorted(subject_files, key=subject_sort_key)

subjects = [Path(p).stem for p in subject_files]  # ['S2','S3',...]
groups   = subjects                                # group == subject

logo = LeaveOneGroupOut()
folds = []

# Separate seed for splits; fixed val size per fold (set to 0 to disable val)
SPLIT_SEED   = 42
VAL_SUBJECTS = 2
# One generator is shared by all folds, so each fold's validation pick depends on
# the fold iteration order as well as on SPLIT_SEED.
rng = np.random.default_rng(SPLIT_SEED)

for train_idx, test_idx in logo.split(subject_files, groups=groups):
    train_files = [subject_files[i] for i in train_idx]
    test_files  = [subject_files[i] for i in test_idx]

    # Pick validation subjects from the training pool (deterministic)
    val_k = min(VAL_SUBJECTS, len(train_files))
    if val_k > 0:
        # Indices are positions inside train_files, not inside subject_files.
        val_idx = rng.choice(len(train_files), size=val_k, replace=False)
        val_idx_set = set(int(i) for i in val_idx)
        val_files = [train_files[i] for i in sorted(val_idx_set)]
        train_files_final = [f for i, f in enumerate(train_files) if i not in val_idx_set]
    else:
        val_files = []
        train_files_final = train_files

    # Folds store subject ids (file stems), not paths.
    folds.append({
        "train": [Path(f).stem for f in train_files_final],
        "val":   [Path(f).stem for f in val_files],
        "test":  [Path(f).stem for f in test_files],   # exactly one subject
    })

# Assert full test coverage (each subject appears once as test)
test_counts = Counter(s for f in folds for s in f["test"])
assert len(test_counts) == len(subjects) and all(c == 1 for c in test_counts.values()), \
    "Each subject must appear exactly once in the test set across folds."

# Persist folds
splits_path = project_root / "data" / "processed" / "wesad_loso_splits.json"
splits_path.write_text(json.dumps(folds, indent=2))
print(f"📝 LOSO splits saved to {splits_path}")
print(f"↳ Total folds: {len(folds)} (should be {len(subject_files)})")
print(f"↳ Val subjects per fold: {VAL_SUBJECTS}")
print("↳ Example fold 1:", folds[0])

📝 LOSO splits saved to C:\Users\Joseph\generative-health-models\data\processed\wesad_loso_splits.json
↳ Total folds: 15 (should be 15)
↳ Val subjects per fold: 2
↳ Example fold 1: {'train': ['S2', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'S11', 'S13', 'S15', 'S16', 'S17'], 'val': ['S3', 'S14'], 'test': ['S10']}


In [6]:
def random_mask_interpolate(m2, mask_ratio=0.05, *, rng=None, protect_endpoints=True):
    """
    TRAINING-ONLY augmentation: randomly remove time points and linearly re-fill.
    Do NOT use on validation/test.

    Parameters
    ----------
    m2 : array-like, shape (T,) or (T, C)
        Signal window; time is axis 0. A 1-D input is treated as a single channel.
    mask_ratio : float
        Fraction of the T time points to drop (rounded to an integer count).
    rng : numpy.random.Generator, optional
        Source of randomness; a fresh unseeded generator is used when omitted.
    protect_endpoints : bool
        When True the first and last samples are never dropped, so the re-fill
        is pure interpolation. When False, dropped endpoints are linearly
        extrapolated from the nearest kept samples.

    Returns
    -------
    np.ndarray of float32 with shape (T, C). When nothing is masked
    (mask_ratio <= 0, T < 3, or the rounded count is 0) the converted input is
    returned as is, which may alias the caller's array.
    """
    x = np.asarray(m2, dtype=np.float32)
    if x.ndim == 1:
        x = x[:, None]
    T = x.shape[0]
    if mask_ratio <= 0 or T < 3:
        return x

    rng = np.random.default_rng() if rng is None else rng
    t = np.arange(T)
    pool = np.arange(1, T - 1) if protect_endpoints else t

    # Linear re-fill needs at least two surviving samples. The cap of T - 2 only
    # bites when protect_endpoints=False and mask_ratio is close to 1; previously
    # that combination left 0 or 1 samples and produced an error or NaNs.
    k = min(int(round(T * mask_ratio)), pool.size, T - 2)
    if k == 0:
        return x

    drop_idx = np.sort(rng.choice(pool, size=k, replace=False))
    keep_idx = np.setdiff1d(t, drop_idx)
    lin = interp1d(keep_idx, x[keep_idx], axis=0, kind="linear",
                   bounds_error=False, fill_value="extrapolate")
    return lin(t).astype(np.float32)

In [7]:
def extract_chest_data_correct(subject_data, 
                               channels=("ECG","EDA","RESP"), 
                               strict=True, 
                               verbose=True, 
                               as_float32=True):
    """
    Extract selected chest channels from a WESAD subject dict and stack to (N, C).

    Parameters
    ----------
    subject_data : dict
        Must provide subject_data["signal"]["chest"] (channel name -> array) and
        subject_data["label"]. The dict is only read, never modified.
    channels : tuple of str
        Channel names in the desired column order. Names are matched exactly
        first and case-insensitively otherwise, so 'RESP' finds a key stored as
        'Resp'. Only single-column channels can be stacked; a multi-column
        channel raises.
    strict : bool
        Raise if channel/label lengths differ; if False, truncate everything to
        the shortest length.
    verbose : bool
        Print a one-line shape summary.
    as_float32 : bool
        Cast the stacked array to float32.

    Returns
    -------
    (X, labels, channel_names): X is (N, C), labels is the flattened label
    array of length N, channel_names is list(channels).

    Raises
    ------
    KeyError   if a requested channel is absent.
    ValueError if a channel is not (N,) / (N, 1), or lengths differ in strict mode.
    """
    # 1) Access chest dict and labels
    chest = subject_data["signal"]["chest"]
    labels = np.asarray(subject_data["label"]).reshape(-1)

    # 2) Case-insensitive name table (e.g. 'RESP' -> 'Resp'). Resolving names
    #    through a lookup, instead of renaming keys inside `chest`, leaves the
    #    caller's loaded subject untouched.
    by_upper = {k.upper(): k for k in chest.keys()}

    # 3) Gather requested channels in a fixed order
    arrays = []
    for ch in channels:
        if ch in chest:
            key = ch
        elif ch.upper() in by_upper:
            key = by_upper[ch.upper()]
        else:
            raise KeyError(f"Channel '{ch}' not found. Available: {sorted(chest.keys())}")

        x = np.asarray(chest[key])
        # Expect (N,1) or (N,), enforce (N,1)
        if x.ndim == 1:
            x = x[:, None]
        elif x.ndim == 2 and x.shape[1] != 1:
            raise ValueError(f"Channel '{ch}' expected shape (N,1) or (N,), got {x.shape}")
        elif x.ndim > 2:
            raise ValueError(f"Channel '{ch}' has invalid ndim={x.ndim}")

        arrays.append(x)

    # 4) Length checks (signals vs labels)
    lens = [a.shape[0] for a in arrays] + [labels.shape[0]]
    if len(set(lens)) != 1:
        if strict:
            raise ValueError(f"Length mismatch across channels/labels: {lens}")
        # else truncate to shortest
        shortest = min(lens)
        arrays = [a[:shortest] for a in arrays]
        labels = labels[:shortest]

    # 5) Stack and cast
    X = np.concatenate(arrays, axis=1)
    if as_float32:
        X = X.astype(np.float32, copy=False)

    if verbose:
        ch_list = ", ".join(channels)
        print(f"📊 Extracted channels ({ch_list}) → X: {X.shape}, labels: {labels.shape}")

    return X, labels, list(channels)

In [8]:
def test_data_extraction(channels=("ECG","EDA","RESP"), fs=700):
    """
    Smoke-test loading + channel extraction on the first entry of `subject_files`.

    channels: channel names handed to extract_chest_data_correct.
    fs:       sampling rate in Hz, used only to report the recording duration.

    Returns True when every check passes. Returns False when there are no
    subject files or when anything inside the check block raises (the error is
    printed, not propagated).
    """
    if not subject_files:
        print("❌ No subject files available for testing")
        return False

    first_file = subject_files[0]
    subject_id = Path(first_file).stem
    print(f"🧪 Testing data extraction on: {subject_id} ({Path(first_file).name})")

    try:
        # Load through the shared loader, then extract in a fixed channel order.
        X, y, ch = extract_chest_data_correct(
            load_wesad_subject(first_file),
            channels=channels,
            strict=True,
            verbose=True,
            as_float32=True,
        )

        # Structural checks - any failure lands in the except branch below.
        assert X.ndim == 2, f"Expected 2D array (N,C), got ndim={X.ndim}"
        assert X.shape[1] == len(ch), f"C mismatch: X has {X.shape[1]} cols, channels={len(ch)}"
        assert X.dtype == np.float32, f"Expected float32, got {X.dtype}"
        assert len(X) == len(y), f"Length mismatch: X={len(X)} vs labels={len(y)}"
        y = np.asarray(y).astype(np.int64, copy=False)

        n_nans = int(np.count_nonzero(np.isnan(X)))
        assert n_nans == 0, f"Found {n_nans} NaNs in X"

        # Which label codes are present, and how often.
        codes, freqs = np.unique(y, return_counts=True)
        label_summary = dict(zip(map(int, codes), map(int, freqs)))
        print(f"   Label codes & counts: {label_summary}")

        minutes = X.shape[0] / fs / 60.0
        print(f"   Duration: {minutes:.1f} minutes at fs={fs} Hz")
        print(f"   Channels order: {ch}")

        print("✅ Data extraction test successful!")
        return True

    except Exception as e:
        print(f"❌ Error during test: {e}")
        return False


extraction_success = test_data_extraction()


🧪 Testing data extraction on: S2 (S2.pkl)
📊 Extracted channels (ECG, EDA, RESP) → X: (4255300, 3), labels: (4255300,)
   Label codes & counts: {0: 2142701, 1: 800800, 2: 430500, 3: 253400, 4: 537599, 6: 45500, 7: 44800}
   Duration: 101.3 minutes at fs=700 Hz
   Channels order: ['ECG', 'EDA', 'RESP']
✅ Data extraction test successful!


In [None]:

def _sos_zerophase(x, sos):
    """Run second-order sections forward and backward in float64; return float32."""
    return sosfiltfilt(sos, x.astype(np.float64)).astype(np.float32)


def butter_lowpass_zerophase(x, fs, cutoff_hz, order=4):
    """Zero-phase Butterworth low-pass of `x` sampled at `fs` Hz; returns float32."""
    nyquist = 0.5 * fs
    sections = butter(order, cutoff_hz / nyquist, btype="low", output="sos")
    return _sos_zerophase(x, sections)


def butter_highpass_zerophase(x, fs, cutoff_hz, order=2):
    """Zero-phase Butterworth high-pass of `x` sampled at `fs` Hz; returns float32."""
    nyquist = 0.5 * fs
    sections = butter(order, cutoff_hz / nyquist, btype="high", output="sos")
    return _sos_zerophase(x, sections)


def butter_bandpass_zerophase(x, fs, low_hz, high_hz, order=4):
    """Zero-phase Butterworth band-pass (low_hz..high_hz) of `x`; returns float32."""
    nyquist = 0.5 * fs
    band = [low_hz / nyquist, high_hz / nyquist]
    sections = butter(order, band, btype="band", output="sos")
    return _sos_zerophase(x, sections)

def make_label_mask(labels, fs, valid_labels, transition_pad_s=5.0, min_valid_run_s=30.0):
    """
    Build a boolean keep-mask over `labels`.

    A sample is kept when (1) its label is in `valid_labels`, (2) it is not
    within `transition_pad_s` seconds of any label change, and (3) it belongs to
    a contiguous kept run of at least `min_valid_run_s` seconds.

    labels: 1-D label sequence sampled at `fs` Hz.
    Returns a boolean array with one entry per label.
    """
    y = np.asarray(labels)
    n = y.size
    keep = np.isin(y, np.array(list(valid_labels)))

    # Blank out a window of `pad` samples on either side of every label change.
    pad = int(round(transition_pad_s * fs))
    if pad > 0 and n > 1:
        near_change = np.zeros(n, dtype=bool)
        for c in np.flatnonzero(np.diff(y) != 0):
            near_change[max(0, c - pad + 1):min(n, c + pad + 1)] = True
        keep &= ~near_change

    # Discard kept runs that are too short. Run boundaries are the positions
    # where the False-padded mask flips: even flips open a run, odd flips close it.
    min_len = int(round(min_valid_run_s * fs))
    if min_len > 1:
        bounded = np.concatenate(([False], keep, [False]))
        flips = np.flatnonzero(bounded[1:] != bounded[:-1])
        for start, stop in zip(flips[0::2], flips[1::2]):
            if stop - start < min_len:
                keep[start:stop] = False
    return keep

def block_mode_downsample(labels, in_len, factor=None, n_out=None):
    """
    Downsample a label sequence by taking the most frequent label per block.

    Parameters
    ----------
    labels : array-like
        Label sequence (flattened before use).
    in_len : int
        Number of leading samples to consume; a trailing partial block is dropped.
    factor : int, optional
        Block length in samples. Takes precedence over `n_out`.
    n_out : int, optional
        Desired number of output labels. Used only when `factor` is None: the
        block length becomes floor(in_len / n_out) and exactly `n_out` blocks are
        produced (previously rounding could yield more than `n_out`).

    Returns
    -------
    np.ndarray with the dtype of `labels`, one label per block. Ties go to the
    smallest label value.

    Raises
    ------
    ValueError if neither `factor` nor `n_out` is given, or if the resulting
    block length is smaller than 1.

    Cost grows with the number of distinct labels, which is assumed to be small.
    """
    y = np.asarray(labels).reshape(-1)
    if factor is None and n_out is None:
        raise ValueError("Provide factor or n_out")

    max_blocks = None
    if factor is None:
        if n_out <= 0:
            raise ValueError(f"n_out must be positive, got {n_out}")
        factor = int(np.floor(in_len / n_out))
        max_blocks = int(n_out)
    factor = int(factor)
    if factor < 1:
        raise ValueError(f"Block length must be >= 1, got {factor} (in_len={in_len}, n_out={n_out})")

    n_blocks = int(np.floor(in_len / factor))
    if max_blocks is not None:
        n_blocks = min(n_blocks, max_blocks)
    out = np.empty(max(n_blocks, 0), dtype=y.dtype)
    if n_blocks <= 0:
        return out

    # Vectorised vote over every block that is fully backed by data.
    n_full = min(n_blocks, y.shape[0] // factor)
    if n_full:
        blocks = y[: n_full * factor].reshape(n_full, factor)
        candidates = np.unique(blocks)                      # sorted ascending
        votes = np.stack([(blocks == v).sum(axis=1) for v in candidates])
        # argmax returns the first maximum, i.e. the smallest label on ties.
        out[:n_full] = candidates[votes.argmax(axis=0)]

    # Only reached when in_len overstates len(labels); handled block by block
    # on whatever samples remain.
    for i in range(n_full, n_blocks):
        vals, counts = np.unique(y[i * factor:(i + 1) * factor], return_counts=True)
        out[i] = vals[np.argmax(counts)]
    return out

# --- label config (4-class; drop code 4 for a 3-class {1,2,3} setup) ---
# Label code -> human-readable condition name; only these codes are kept.
LABEL_MAP = {
    1: "baseline",
    2: "stress",
    3: "amusement",
    4: "meditation",
}
# Kept label codes in ascending order: [1, 2, 3, 4]
VALID = sorted(LABEL_MAP)
# Label code -> zero-based class index used for one-hot conditioning: {1:0, 2:1, 3:2, 4:3}
COND = dict(zip(VALID, range(len(VALID))))


def _impute_linear_series(col):
    # col: 1D array
    col = col.astype(np.float32, copy=False)
    n = col.shape[0]
    mask = np.isfinite(col)
    if mask.all():
        return col
    if not mask.any():
        return np.zeros_like(col)             # last-resort fallback
    if mask.sum() == 1:
        col[~mask] = col[mask][0]             # fill with single value
        return col
    idx = np.arange(n, dtype=np.float32)
    col[~mask] = np.interp(idx[~mask], idx[mask], col[mask])
    return col

def preprocess_single_subject(subject_file, target_rate=4, original_rate=700, channels=("ECG","EDA","RESP"),
                              transition_pad_s=5.0, min_valid_run_s=30.0, verbose=True,
                              eda_hp_cutoff=0.03, eda_hp_order=2, eda_robust=False):
    """
    Turn one WESAD subject pickle into two label-aligned streams: EDA+RESP at
    `target_rate` Hz and ECG at 175 Hz.

    Pipeline: load -> per-channel zero-phase filtering at `original_rate` ->
    label mask (valid labels, transition padding, minimum run length) ->
    decimation to the two target rates -> optional EDA high-pass / robust
    scaling -> block-mode label downsampling -> one-hot conditioning.

    Parameters
    ----------
    subject_file : path-like
        Subject pickle; its stem is used as the subject id.
    target_rate : int
        Output rate (Hz) of the low-rate stream. The decimation factor is
        `original_rate // target_rate`.
    original_rate : int
        Sampling rate (Hz) of the raw chest signals.
    channels : tuple of str
        Channels to extract. The code below requires 'ECG', 'EDA' and 'RESP'
        to be among them.
    transition_pad_s, min_valid_run_s : float
        Forwarded to make_label_mask.
    verbose : bool
        Print shapes, label distributions and the EDA spectral peak before and
        after the high-pass.
    eda_hp_cutoff, eda_hp_order :
        Cutoff (Hz) and order of the high-pass applied to the decimated EDA;
        a falsy or non-positive cutoff skips it.
    eda_robust : bool
        If True, EDA is centred on its median and divided by IQR/1.349.

    Returns
    -------
    dict with keys subject_id, channels_low, channels_ecg, fs_low, fs_ecg,
    m2_low, m2_ecg, labels_low, labels_ecg, m1_low, duration_minutes;
    or None if any step raises (the error is printed, not propagated).
    """
    sid = Path(subject_file).stem
    if verbose:
        print(f"\n📂 Processing: {sid}")

    try:
        # 1) Load + extract
        data = load_wesad_subject(subject_file)
        X, y, ch = extract_chest_data_correct(data, channels=channels, strict=True, verbose=False)
        n = len(X)
        if verbose:
            print(f"  Original: X {X.shape}, y {y.shape} (~{n/original_rate/60:.1f} min)")

        # 2) Channel-specific filtering at original fs
        #    ECG: 0.5-40 Hz band-pass; EDA: 5 Hz low-pass; RESP: 0.1-0.35 Hz band-pass;
        #    any other channel is copied through unfiltered.
        Xf = np.empty_like(X, dtype=np.float32)
        for j, name in enumerate(ch):
            if name.upper() == "ECG":
                Xf[:, j] = butter_bandpass_zerophase(X[:, j], original_rate, low_hz=0.5, high_hz=40.0)
            elif name.upper() == "EDA":
                Xf[:, j] = butter_lowpass_zerophase(X[:, j], original_rate, cutoff_hz=5.0)
            elif name.upper() in ("RESP","RESPIRATION"):
                Xf[:, j] = butter_bandpass_zerophase(X[:, j], original_rate, low_hz=0.1, high_hz=0.35)
            else:
                Xf[:, j] = X[:, j]

        # 3) Label-based mask
        # NOTE(review): boolean indexing concatenates the kept samples, so stretches
        # that were not adjacent in the recording become neighbours; the decimation
        # and the EDA high-pass below then run across those joins. Confirm this is
        # acceptable for the downstream windows.
        keep = make_label_mask(y, fs=original_rate, valid_labels=VALID,
                               transition_pad_s=transition_pad_s, min_valid_run_s=min_valid_run_s)
        Xk = Xf[keep]; yk = y[keep]
        if verbose:
            uniq, cnt = np.unique(yk, return_counts=True)
            print(f"  After mask (orig fs): {Xk.shape}, label dist: {dict(zip(uniq.tolist(), cnt.tolist()))}")

        # 4) Multi-rate
        # The ECG rate is fixed at 175 Hz. Integer division means the factors are
        # only exact when original_rate is a multiple of both target rates
        # (the inline values below are for 700 Hz -> 4 Hz / 175 Hz).
        TARGET_LOW, TARGET_ECG = target_rate, 175
        fac_low = original_rate // TARGET_LOW     # 175
        fac_ecg = original_rate // TARGET_ECG     # 4

        # Trim to a multiple of both factors so the two streams cover the same time span.
        n_keep = (len(yk) // np.lcm(fac_low, fac_ecg)) * np.lcm(fac_low, fac_ecg)
        Xk, yk = Xk[:n_keep], yk[:n_keep]

        # Exact-name lookup: requires the channel to be spelled "ECG" in `channels`.
        idx_ecg   = ch.index("ECG")
        idx_other = [j for j in range(len(ch)) if j != idx_ecg]

        # EDA + RESP @ 4 Hz
        Xd_low = signal.decimate(Xk[:, idx_other], fac_low, ftype="fir", axis=0, zero_phase=True)

        # Enforce [EDA, RESP] order
        cols_low = [ch[j] for j in idx_other]
        assert "EDA" in cols_low and "RESP" in cols_low, f"Low-rate cols missing: {cols_low}"
        perm = [cols_low.index("EDA"), cols_low.index("RESP")]
        Xd_low = Xd_low[:, perm]

        # EDA high-pass @ 4 Hz
        # `welch` and `nps` are only defined when verbose is True; the second
        # verbose block further down relies on them.
        if verbose:
            from scipy.signal import welch
            nps = min(len(Xd_low), 1024)
            f0, P0 = welch(Xd_low[:,0], fs=TARGET_LOW, nperseg=nps)
            print(f"EDA BEFORE HP peak ~ {f0[P0.argmax()]:.4f} Hz")

        if eda_hp_cutoff and eda_hp_cutoff > 0:
            Xd_low[:,0] = butter_highpass_zerophase(Xd_low[:,0], fs=TARGET_LOW,
                                                    cutoff_hz=eda_hp_cutoff, order=eda_hp_order)

        if verbose:
            f1, P1 = welch(Xd_low[:,0], fs=TARGET_LOW, nperseg=nps)
            print(f"EDA AFTER  HP peak ~ {f1[P1.argmax()]:.4f} Hz")

        # Optional robust per-subject scale
        if eda_robust:
            def robust_z_1d(x):
                # Median-centred; IQR/1.349 is the scale, floored at 1e-6 to avoid division by ~0.
                med = np.nanmedian(x)
                iqr = np.nanpercentile(x,75) - np.nanpercentile(x,25)
                scale = max(iqr/1.349, 1e-6)
                return (x - med) / scale
            Xd_low[:,0] = robust_z_1d(Xd_low[:,0])

        # ECG @ 175 Hz
        Xd_ecg = signal.decimate(Xk[:, idx_ecg], fac_ecg, ftype="fir", axis=0, zero_phase=True)[:, None]

        # Labels: one label per decimated sample, taken as the block mode
        yd_low = block_mode_downsample(yk, in_len=n_keep, factor=fac_low)
        yd_ecg = block_mode_downsample(yk, in_len=n_keep, factor=fac_ecg)

        # Impute NaNs (defensive)
        for arr in (Xd_low, Xd_ecg):
            bad = ~np.isfinite(arr)
            if bad.any():
                for c in range(arr.shape[1]):
                    if bad[:, c].any():
                        arr[:, c] = _impute_linear_series(arr[:, c])

        # Sanity: signals and labels agree in length, and both streams span the same duration
        assert len(Xd_low) == len(yd_low)
        assert len(Xd_ecg) == len(yd_ecg)
        assert abs(len(Xd_low)/TARGET_LOW - len(Xd_ecg)/TARGET_ECG) < 1e-6

        # One-hot conditioning (label code -> class index via COND)
        y_cond_low = np.vectorize(COND.get)(yd_low)
        K = len(COND)
        m1_low = np.zeros((len(y_cond_low), K), dtype=np.float32)
        m1_low[np.arange(len(y_cond_low)), y_cond_low] = 1.0

        if verbose:
            print(f"  ➜ X_low {Xd_low.shape} @ {TARGET_LOW} Hz | X_ecg {Xd_ecg.shape} @ {TARGET_ECG} Hz | m1 {m1_low.shape}")

        return {
            "subject_id": sid,
            "channels_low": ["EDA","RESP"],
            "channels_ecg": ["ECG"],
            "fs_low": TARGET_LOW,
            "fs_ecg": TARGET_ECG,
            "m2_low":   Xd_low.astype(np.float32, copy=False),
            "m2_ecg":   Xd_ecg.astype(np.float32, copy=False),
            "labels_low": yd_low.astype(np.int64, copy=False),
            "labels_ecg": yd_ecg.astype(np.int64, copy=False),
            "m1_low":   m1_low,
            "duration_minutes": len(Xd_low) / TARGET_LOW / 60.0,
        }
    except Exception as e:
        # Best-effort per subject: every failure, including the asserts above,
        # is reported and converted into a None return.
        print(f"  ❌ Error processing {sid}: {e}")
        return None

In [10]:
# Smoke-run the single-subject pipeline on the first discovered file.
p = subject_files[0]            # Path object (from the discovery cell)

single_subject_cfg = dict(
    target_rate=4,              # must divide 700 exactly with current decimate()
    original_rate=700,
    channels=("ECG","EDA","RESP"),
    transition_pad_s=5.0,
    min_valid_run_s=30.0,
    verbose=True,
)
res = preprocess_single_subject(subject_file=p, **single_subject_cfg)

# Inspect the result (res is None when preprocessing failed)
if res is not None:
    print("\n— Summary —")
    print("Subject:", res["subject_id"])
    print("fs_low :", res["fs_low"], "Hz   | fs_ecg :", res["fs_ecg"], "Hz")
    print("m2_low :", res["m2_low"].shape)   # (N_low, 2)
    print("m2_ecg :", res["m2_ecg"].shape)   # (N_ecg, 1)
    print("m1_low :", res["m1_low"].shape)   # (N_low, K)

    # Label distribution after mask + downsample
    uniq, cnt = np.unique(res["labels_low"], return_counts=True)
    low_rate_counts = dict(zip(uniq.tolist(), cnt.tolist()))
    print("Label counts (low-rate):", low_rate_counts)


📂 Processing: S2
  Original: X (4255300, 3), y (4255300,) (~101.3 min)
  After mask (orig fs): (1987299, 3), label dist: {1: 793800, 2: 423500, 3: 246400, 4: 523599}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0430 Hz
  ➜ X_low (11352, 2) @ 4 Hz | X_ecg (496650, 1) @ 175 Hz | m1 (11352, 4)

— Summary —
Subject: S2
fs_low : 4 Hz   | fs_ecg : 175 Hz
m2_low : (11352, 2)
m2_ecg : (496650, 1)
m1_low : (11352, 4)
Label counts (low-rate): {1: 4536, 2: 2420, 3: 1408, 4: 2988}


In [11]:
def process_multiple_subjects(subject_files, max_subjects=None, *,
                              target_rate=4, channels=("ECG","EDA","RESP"),
                              transition_pad_s=5.0, min_valid_run_s=30.0, verbose=True,
                              eda_hp_cutoff=0.03, eda_hp_order=2, eda_robust=False,
                              original_rate=700):
    """
    Run preprocess_single_subject() on several files and stack the two
    sampling-rate streams separately.

    Parameters
    ----------
    subject_files : sequence of path-like
        Subject pickles, processed in the given order.
    max_subjects : int, optional
        Process only the first `max_subjects` files (None = all).
    original_rate : int
        Sampling rate (Hz) of the raw signals, forwarded to
        preprocess_single_subject (default 700, the value that used to be
        hard-coded here).
    target_rate, channels, transition_pad_s, min_valid_run_s, verbose,
    eda_hp_cutoff, eda_hp_order, eda_robust :
        Forwarded unchanged to preprocess_single_subject.

    Subjects for which preprocessing returns None are skipped with a warning.

    Returns
    -------
    None if no subject could be processed, otherwise a dict with keys
        X_low        (N_low, 2)   – EDA & RESP at the low rate
        m1_low       (N_low, K)   – one-hot labels at the low rate
        y_low        (N_low,)     – raw label codes at the low rate
        X_ecg        (N_ecg, 1)   – ECG waveform
        y_ecg        (N_ecg,)     – raw label codes at the ECG rate
        fs_low, fs_ecg, channels_low, channels_ecg
        segments     – per subject: id, [start, end) row ranges in both streams,
                       duration in minutes
        summary      – subject count, total minutes, low-rate label counts
        feature_names
    """
    iterable = subject_files if max_subjects is None else subject_files[:max_subjects]
    print(f"\n🔄 Processing {len(iterable)} subjects …")
    print("=" * 60)

    # accumulators
    all_low, all_m1, all_y_low = [], [], []
    all_ecg, all_y_ecg         = [], []
    segments = []           # per-subject index bookkeeping
    tot_min  = 0.0

    # Stream metadata is taken from the first successful subject and every
    # later subject is checked against it.
    low_fs = ecg_fs = None
    channels_low = channels_ecg = None
    for p in iterable:
        res = preprocess_single_subject(
            subject_file=p, target_rate=target_rate, original_rate=original_rate, channels=channels,
            transition_pad_s=transition_pad_s, min_valid_run_s=min_valid_run_s, verbose=verbose,
            eda_hp_cutoff=eda_hp_cutoff, eda_hp_order=eda_hp_order, eda_robust=eda_robust
        )
        if res is None:
            print(f"⚠️ Skipping {Path(p).name}")
            continue

        # -------- pull tensors ----------
        X_low, m1_low, y_low = res["m2_low"], res["m1_low"], res["labels_low"]
        X_ecg, y_ecg         = res["m2_ecg"], res["labels_ecg"]
        sid                  = res["subject_id"]

        # -------- sanity on fs ----------
        if low_fs is None:
            low_fs, ecg_fs = res["fs_low"], res["fs_ecg"]
            channels_low   = res["channels_low"]
            channels_ecg   = res["channels_ecg"]
        else:
            assert res["fs_low"] == low_fs and res["fs_ecg"] == ecg_fs, "fs mismatch"
            assert res["channels_low"] == channels_low and res["channels_ecg"] == channels_ecg, "channel order mismatch"

        # -------- append ----------
        # Each subject's rows start where the previous subject's rows ended.
        low_start  = 0 if not all_low else segments[-1]["low_end"]
        ecg_start  = 0 if not all_ecg else segments[-1]["ecg_end"]

        all_low.append(X_low);   all_m1.append(m1_low); all_y_low.append(y_low)
        all_ecg.append(X_ecg);   all_y_ecg.append(y_ecg)

        segments.append({
            "subject_id": sid,
            "low_start": low_start,
            "low_end":   low_start + len(X_low),
            "ecg_start": ecg_start,
            "ecg_end":   ecg_start + len(X_ecg),
            "duration_minutes": res["duration_minutes"]
        })
        tot_min += res["duration_minutes"]

    if not all_low:
        print("❌ No subjects processed successfully.")
        return None

    # -------- stack tensors ----------
    X_low_comb   = np.vstack(all_low).astype(np.float32, copy=False)
    m1_low_comb  = np.vstack(all_m1).astype(np.float32, copy=False)
    y_low_comb   = np.concatenate(all_y_low).astype(np.int64, copy=False)

    X_ecg_comb   = np.concatenate(all_ecg).astype(np.float32, copy=False)  # (N, 1) blocks joined along axis 0
    y_ecg_comb   = np.concatenate(all_y_ecg).astype(np.int64, copy=False)

    # -------- summary ----------
    print(f"\n📊 Combined Dataset:")
    print(f"  • Low-rate  : {X_low_comb.shape}  (EDA+RESP @ {low_fs} Hz)")
    print(f"  • ECG stream: {X_ecg_comb.shape}  (ECG @ {ecg_fs} Hz)")
    print(f"  • Subjects  : {len(segments)}   Total: {tot_min:.1f} min")

    # label counts (low-rate stream drives conditioning)
    uniq, cnt = np.unique(y_low_comb, return_counts=True)
    print(f"  • Label counts (low-rate): {dict(zip(uniq.tolist(), cnt.tolist()))}")

    return {
        # low-rate branch -------------------------------------------------------
        "X_low":  X_low_comb,
        "m1_low": m1_low_comb,
        "y_low":  y_low_comb,
        "fs_low": low_fs,
        "channels_low": channels_low,     # ['EDA','RESP']

        # ECG branch -----------------------------------------------------------
        "X_ecg":  X_ecg_comb,
        "y_ecg":  y_ecg_comb,
        "fs_ecg": ecg_fs,
        "channels_ecg": channels_ecg,     # ['ECG']

        # bookkeeping ----------------------------------------------------------
        "segments": segments,
        "summary": {
            "subjects": len(segments),
            "total_minutes": tot_min,
            "label_counts": {int(k): int(v) for k, v in zip(uniq, cnt)},
        },
        "feature_names": {
            "m1":  [LABEL_MAP[k] for k in sorted(LABEL_MAP)],   # one-hot col names
            "low": channels_low,
            "ecg": channels_ecg,
        }
    }

In [12]:
# Two-subject smoke test of the batch pipeline.
smoke_cfg = dict(
    max_subjects=2,                 # small smoke test
    target_rate=4,
    channels=("ECG","EDA","RESP"),
    transition_pad_s=5.0,
    min_valid_run_s=30.0,
    verbose=True,
)
ds = process_multiple_subjects(subject_files, **smoke_cfg)

# Basic invariants (ds is None when no subject could be processed)
if ds is not None:
    first_segments = ds["segments"][:2]
    print("\n— Combined check —")
    print("X_low :", ds["X_low"].shape,  "| m1_low :", ds["m1_low"].shape,
          "| y_low :", ds["y_low"].shape)
    print("X_ecg :", ds["X_ecg"].shape)
    print("fs_low :", ds["fs_low"], "Hz | fs_ecg :", ds["fs_ecg"], "Hz")
    print("channels_low :", ds["channels_low"], "| channels_ecg :", ds["channels_ecg"])
    print("Segments:", len(ds["segments"]), "→", first_segments)


🔄 Processing 2 subjects …

📂 Processing: S2
  Original: X (4255300, 3), y (4255300,) (~101.3 min)
  After mask (orig fs): (1987299, 3), label dist: {1: 793800, 2: 423500, 3: 246400, 4: 523599}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0430 Hz
  ➜ X_low (11352, 2) @ 4 Hz | X_ecg (496650, 1) @ 175 Hz | m1 (11352, 4)

📂 Processing: S3
  Original: X (4545100, 3), y (4545100,) (~108.2 min)
  After mask (orig fs): (2019501, 3), label dist: {1: 791000, 2: 441000, 3: 255500, 4: 532001}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0664 Hz
  ➜ X_low (11540, 2) @ 4 Hz | X_ecg (504875, 1) @ 175 Hz | m1 (11540, 4)

📊 Combined Dataset:
  • Low-rate  : (22892, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (1001525, 1)  (ECG @ 175 Hz)
  • Subjects  : 2   Total: 95.4 min
  • Label counts (low-rate): {1: 9056, 2: 4940, 3: 2868, 4: 6028}

— Combined check —
X_low : (22892, 2) | m1_low : (22892, 4) | y_low : (22892,)
X_ecg : (1001525, 1)
fs_low : 4 Hz | fs_ecg : 175 Hz
channels_low : ['EDA',

In [13]:
def _constant_label_windows(X, y_raw, y_cond, m1, fs, T, step, require_single_label=True):
    """Slice windows within a subject segment; enforce single label if requested."""
    Xw, ycw, m1w = [], [], []
    n = len(y_raw)
    if require_single_label:
        i = 0
        while i < n:
            j = i + 1
            while j < n and y_raw[j] == y_raw[i]:
                j += 1
            run_len = j - i
            if run_len >= T:
                for t0 in range(i, j - T + 1, step):
                    mid = t0 + T//2
                    Xw.append(X[t0:t0+T])
                    ycw.append(int(y_cond[mid]))   # 0..K-1
                    m1w.append(m1[mid])            # one-hot for that label
            i = j
    else:
        for t0 in range(0, n - T + 1, step):
            mid = t0 + T//2
            Xw.append(X[t0:t0+T])
            ycw.append(int(y_cond[mid]))
            m1w.append(m1[mid])
    if not Xw:
        return None, None, None
    return (np.stack(Xw, 0).astype(np.float32),
            np.asarray(ycw, dtype=np.int64),
            np.stack(m1w, 0).astype(np.float32))

def _fit_norm_stats(Xw):
    """Per-channel mean/std across windows and time."""
    mean = Xw.mean(axis=(0,1), dtype=np.float64)
    std  = Xw.std(axis=(0,1), dtype=np.float64)
    std[std < 1e-8] = 1.0
    return mean.astype(np.float32), std.astype(np.float32)

def _apply_norm(Xw, mean, std):
    return ((Xw - mean) / std).astype(np.float32)

def create_training_sequences_from_combined(
    ds,
    window_s=60,                # 60-s windows
    step_s=30,                  # 50 % overlap
    train_subject_ids=None,
    test_subject_ids=None,
    require_single_label=True
):
    """
    Build train/test windows for the **two-stream** dataset produced by
    process_multiple_subjects (X_low @ fs_low, X_ecg @ fs_ecg).

    Parameters
    ----------
    ds : dict
        Combined dataset. Keys read here: "fs_low", "fs_ecg", "segments"
        (each with "subject_id", "low_start", "low_end", "ecg_start",
        "ecg_end"), "X_low", "y_low", "m1_low", "X_ecg", "y_ecg",
        "channels_low", "channels_ecg".
    window_s, step_s : float
        Window length and hop in seconds; converted to samples per stream
        with ``int(seconds * fs)``.
    train_subject_ids, test_subject_ids : sequence or None
        Subject ids of each split. If either is None, the unique ids (in
        order of first appearance) are split 80/20, keeping at least one
        train subject.
    require_single_label : bool
        True  -> windows lie entirely inside constant-label runs.
        False -> plain sliding windows over each subject segment; the window
                 label is the one found at the window centre.

    Returns
    -------
    dict with keys
      train : {"X_low","X_ecg","m1_seq","cond", "mean_low","std_low",
               "mean_ecg","std_ecg"}
      test  : {"X_low","X_ecg","m1_seq","cond"}
      config: meta information
    "cond" holds the ds["y_low"] value at each window centre; "m1_seq" is the
    centre row of ds["m1_low"] repeated over the T_low time steps.

    Raises
    ------
    ValueError
        Window or step is shorter than one sample in either stream.
    RuntimeError
        One of the splits ends up without any window.
    """
    # ------------------------------------------------------------------ constants
    fs_low = ds["fs_low"]           # 4
    fs_ecg = ds["fs_ecg"]           # 175
    T_low  = int(window_s * fs_low)     # 240
    T_ecg  = int(window_s * fs_ecg)     # 10 500
    step_low = int(step_s * fs_low)
    step_ecg = int(step_s * fs_ecg)
    if min(T_low, T_ecg, step_low, step_ecg) < 1:
        raise ValueError(
            f"window_s={window_s} / step_s={step_s} give a window or step "
            f"shorter than one sample (T_low={T_low}, T_ecg={T_ecg}, "
            f"step_low={step_low}, step_ecg={step_ecg})."
        )

    # ------------------------------------------------------------------ split by id
    all_ids = [seg["subject_id"] for seg in ds["segments"]]
    uniq_ids = list(dict.fromkeys(all_ids))          # stable, unique
    if train_subject_ids is None or test_subject_ids is None:
        n_train = max(1, int(round(0.8 * len(uniq_ids))))
        train_subject_ids, test_subject_ids = uniq_ids[:n_train], uniq_ids[n_train:]

    n_classes = ds["m1_low"].shape[1]

    # ------------------------------------------------------------------ helper
    def slice_per_subject(ids):
        """Window every segment whose subject is in `ids`; None if nothing fits."""
        Xl, Xe, m1s, ycs = [], [], [], []
        for seg in ds["segments"]:
            if seg["subject_id"] not in ids:
                continue

            # -------- low-rate slice (EDA+RESP) -----------------------
            low = slice(seg["low_start"], seg["low_end"])
            y_low = ds["y_low"][low]
            out_low = _constant_label_windows(
                ds["X_low"][low], y_low, y_low, ds["m1_low"][low],
                fs_low, T_low, step_low,
                require_single_label=require_single_label
            )

            # -------- ECG slice --------------------------------------
            ecg = slice(seg["ecg_start"], seg["ecg_end"])
            y_ecg = ds["y_ecg"][ecg]
            # The one-hot is taken from the low-rate stream only; the ECG call
            # just needs an array of matching length.
            dummy_m1 = np.zeros((len(y_ecg), n_classes), dtype=np.float32)
            out_ecg = _constant_label_windows(
                ds["X_ecg"][ecg], y_ecg, y_ecg, dummy_m1,
                fs_ecg, T_ecg, step_ecg,
                require_single_label=require_single_label
            )

            if out_low[0] is None or out_ecg[0] is None:
                continue

            # The two streams are windowed independently and then paired by
            # index, so their window counts must agree.
            assert out_low[0].shape[0] == out_ecg[0].shape[0], (
                f"{seg['subject_id']}: {out_low[0].shape[0]} low-rate vs "
                f"{out_ecg[0].shape[0]} ECG windows"
            )

            Xl.append(out_low[0]);   Xe.append(out_ecg[0])
            ycs.append(out_low[1]);  m1s.append(out_low[2])

        if not Xl:
            return None
        return (np.concatenate(Xl, 0),
                np.concatenate(Xe, 0),
                np.concatenate(m1s, 0),
                np.concatenate(ycs, 0))

    # ------------------------------------------------------------------ collect
    tr = slice_per_subject(train_subject_ids)
    te = slice_per_subject(test_subject_ids)
    if tr is None or te is None:
        raise RuntimeError("No windows created – try smaller window_s or step_s.")

    Xl_tr, Xe_tr, m1_tr, yc_tr = tr
    Xl_te, Xe_te, m1_te, yc_te = te

    # ------------------------------------------------------------------ normalise
    # Statistics come from the train split only and are applied to both splits.
    mean_low, std_low = _fit_norm_stats(Xl_tr)
    mean_ecg, std_ecg = _fit_norm_stats(Xe_tr)

    Xl_tr = _apply_norm(Xl_tr, mean_low, std_low)
    Xl_te = _apply_norm(Xl_te, mean_low, std_low)
    Xe_tr = _apply_norm(Xe_tr, mean_ecg, std_ecg)
    Xe_te = _apply_norm(Xe_te, mean_ecg, std_ecg)

    # one-hot needs repeating across time if the model expects per-step cond
    m1_tr_seq = np.repeat(m1_tr[:, None, :], T_low, axis=1)
    m1_te_seq = np.repeat(m1_te[:, None, :], T_low, axis=1)

    # ------------------------------------------------------------------ return
    return {
        "train": {
            "X_low": Xl_tr, "X_ecg": Xe_tr,
            "m1_seq": m1_tr_seq, "cond": yc_tr,
            "mean_low": mean_low.astype(np.float32),
            "std_low":  std_low.astype(np.float32),
            "mean_ecg": mean_ecg.astype(np.float32),
            "std_ecg":  std_ecg.astype(np.float32)
        },
        "test": {
            "X_low": Xl_te, "X_ecg": Xe_te,
            "m1_seq": m1_te_seq, "cond": yc_te
        },
        "config": {
            "fs_low": fs_low, "fs_ecg": fs_ecg,
            "T_low": T_low, "T_ecg": T_ecg,
            "step_low": step_low, "step_ecg": step_ecg,
            "channels_low": ds["channels_low"],
            "channels_ecg": ds["channels_ecg"],
            "K": m1_tr.shape[-1],
            "train_subject_ids": train_subject_ids,
            "test_subject_ids":  test_subject_ids
        }
    }

In [14]:
def _constant_label_windows_slice(X, y_raw, y_cond, m1, T, step):
    """Windows fully inside constant-label runs for a single subject slice."""
    Xw, ycw, m1w = [], [], []
    n = len(y_raw)
    i = 0
    while i < n:
        j = i + 1
        while j < n and y_raw[j] == y_raw[i]:
            j += 1
        run_len = j - i
        if run_len >= T:
            for t0 in range(i, j - T + 1, step):
                mid = t0 + T//2
                Xw.append(X[t0:t0+T])
                ycw.append(int(y_cond[mid]))   # 0..K-1
                m1w.append(m1[mid])            # one-hot for that label
        i = j
    if not Xw:
        return None, None, None
    return (np.stack(Xw, 0).astype(np.float32),
            np.asarray(ycw, dtype=np.int64),
            np.stack(m1w, 0).astype(np.float32))

def _windows_from_ds(ds, T, step):
    """Window every segment of a single-stream ``ds`` dict and pool the results.

    Reads ``ds["segments"]`` (each with "start"/"end") and slices "X", "y_raw",
    "y_cond" and "m1" accordingly. Segments that yield no window are skipped.
    Returns the concatenated ``(X, cond, m1)`` arrays, or ``(None, None, None)``
    when no segment produced a window.
    """
    per_segment = []
    for seg in ds["segments"]:
        span = slice(seg["start"], seg["end"])
        result = _constant_label_windows_slice(
            ds["X"][span], ds["y_raw"][span],
            ds["y_cond"][span], ds["m1"][span],
            T, step
        )
        if result[0] is not None:
            per_segment.append(result)

    if not per_segment:
        return None, None, None

    x_parts, cond_parts, m1_parts = zip(*per_segment)
    return (np.concatenate(x_parts, 0),
            np.concatenate(cond_parts, 0),
            np.concatenate(m1_parts, 0))

def _fit_norm_stats(Xw):
    mean = Xw.mean(axis=(0,1), dtype=np.float64)
    std  = Xw.std(axis=(0,1), dtype=np.float64)
    std[std < 1e-8] = 1.0
    return mean.astype(np.float32), std.astype(np.float32)

def _apply_norm(Xw, mean, std):
    return ((Xw - mean) / std).astype(np.float32)


def _impute_linear_windows(Xw, ycw, m1w, split_name, return_stats=False):
    """
    Replace non‑finite values in each window/channel by linear interpolation along time.
    • 0 valid points in a channel → drop the window
    • 1 valid point            → fill that channel with the single value
    • ≥2 valid points          → linear interpolation (with constant edge fill)
    """
    import numpy as np
    Xw = Xw.copy().astype(np.float32, copy=False)
    Xw[~np.isfinite(Xw)] = np.nan

    if Xw is None:
        if return_stats: 
            return None, None, None, {"N_in": 0, "dropped": 0, "split": split_name}
        return None, None, None

    N, T, C = Xw.shape
    t = np.arange(T, dtype=np.float32)
    keep = np.ones(N, dtype=bool)
    dropped = 0

    for i in range(N):
        Xi = Xw[i]
        ok_window = True
        for c in range(C):
            col = Xi[:, c]
            good = np.isfinite(col)
            n_good = int(good.sum())
            if n_good == 0:
                ok_window = False
                break
            elif n_good == T:
                continue
            elif n_good == 1:
                Xi[:, c] = float(col[good][0])
            else:
                Xi[~good, c] = np.interp(t[~good], t[good], col[good])
        if not ok_window:
            keep[i] = False
            dropped += 1

    stats = {"N_in": N, "dropped": dropped, "split": split_name}
    if dropped:
        print(f"⚠️ {split_name}: dropped {dropped} / {N} windows (0 valid points in a channel).")
    if return_stats:
        return Xw[keep], ycw[keep], m1w[keep], stats
    return Xw[keep], ycw[keep], m1w[keep]

def create_train_test_sequences(proc_train, proc_test,
                                sequence_length=240, step_overlap=0.5,
                                impute=True):
    """Window two single-stream datasets and z-score both with train statistics.

    ``sequence_length`` is the window length in samples; ``step_overlap`` is the
    fraction shared by consecutive windows (the hop is at least one sample).
    With ``impute`` enabled, non-finite samples are interpolated per window and
    unrecoverable windows are dropped. Raises RuntimeError when a split has no
    windows, either before or after imputation.
    """
    T = int(sequence_length)
    step = max(1, int(round(T * (1 - step_overlap))))

    train_X, train_y, train_m1 = _windows_from_ds(proc_train, T, step)
    test_X, test_y, test_m1 = _windows_from_ds(proc_test, T, step)
    if train_X is None or test_X is None:
        raise RuntimeError("No windows created; reduce sequence_length or step_overlap.")

    if impute:
        print(f"Before impute: train={train_X.shape[0]} test={test_X.shape[0]}")
        train_X, train_y, train_m1, train_stats = _impute_linear_windows(
            train_X, train_y, train_m1, "TRAIN", return_stats=True)
        test_X, test_y, test_m1, test_stats = _impute_linear_windows(
            test_X, test_y, test_m1, "TEST", return_stats=True)
        impute_stats = {"train": train_stats, "test": test_stats}
        print(f"After  impute: train={train_X.shape[0]} test={test_X.shape[0]}")
    else:
        # Nothing is dropped when imputation is skipped.
        impute_stats = {
            "train": {"N_in": len(train_X), "dropped": 0, "split": "TRAIN"},
            "test":  {"N_in": len(test_X), "dropped": 0, "split": "TEST"},
        }

    if train_X.size == 0 or test_X.size == 0:
        raise RuntimeError("All windows dropped. Try shorter windows or enable imputation.")

    # Normalisation statistics are fitted on the train windows only.
    mean, std = _fit_norm_stats(train_X)
    train_part = {
        "X": _apply_norm(train_X, mean, std),
        "m1_seq": np.repeat(train_m1[:, None, :], T, axis=1),
        "cond": train_y,
        "mean": mean,
        "std": std,
    }
    test_part = {
        "X": _apply_norm(test_X, mean, std),
        "m1_seq": np.repeat(test_m1[:, None, :], T, axis=1),
        "cond": test_y,
    }
    config = {
        "T": T, "step": step, "fs": proc_train["fs_out"],
        "channels": proc_train["channels"], "K": train_part["m1_seq"].shape[-1],
        "train_subjects": [s["subject_id"] for s in proc_train["segments"]],
        "test_subjects":  [s["subject_id"] for s in proc_test["segments"]],
    }
    return {
        "train": train_part,
        "test": test_part,
        "config": config,
        "impute_stats": impute_stats,
    }

In [15]:
# Smoke test of the two-stream windowing on the combined dataset `ds`:
# the first segment's subject is used for training, the second for testing.
# NOTE(review): `ds` comes from an earlier cell and must contain at least two
# segments, otherwise the [1] lookup below raises IndexError.
train_ids = [ds["segments"][0]["subject_id"]]   # e.g., 'S2'
test_ids  = [ds["segments"][1]["subject_id"]]   # e.g., 'S3'

# 30-s windows with a 15-s hop, restricted to constant-label runs.
seqs = create_training_sequences_from_combined(
    ds,
    window_s=30, step_s=15,
    train_subject_ids=train_ids,
    test_subject_ids=test_ids,
    require_single_label=True
)

# Both streams should report the same number of windows (first dimension).
print("Train low-rate :", seqs["train"]["X_low"].shape,
      "| Train ECG :",   seqs["train"]["X_ecg"].shape)

Train low-rate : (182, 120, 2) | Train ECG : (182, 5250, 1)


In [16]:
def visualize_preprocessing_results(seqs, save_name=None, show=False):
    """Visualize sample window, conditioning, correlations, and class balance for current schema.

    Expects the single-stream layout: ``seqs["config"]`` with "fs", "T",
    "channels", "K" and ``seqs["train"]`` with "X" [N,T,C], "m1_seq" [N,T,K]
    and "cond" [N]. The two-stream dict from
    create_training_sequences_from_combined does not carry these keys.

    Parameters
    ----------
    seqs : dict
        Windowed dataset as described above.
    save_name : str or None
        File name under ``<project_root>/results/figures``. When None the
        figure is not written to disk.
    show : bool
        Display the figure if True, otherwise close it after the optional save.
    """

    fs  = seqs["config"]["fs"]
    T   = seqs["config"]["T"]
    chs = seqs["config"]["channels"]
    K   = seqs["config"]["K"]

    X_tr   = seqs["train"]["X"]         # [N,T,C], normalized
    M1_tr  = seqs["train"]["m1_seq"]    # [N,T,K] one‑hot per time step
    y_tr   = seqs["train"]["cond"]      # [N], 0..K-1

    # pick the window whose mean |amplitude| is the median across windows
    median_idx = np.argsort(np.abs(X_tr).mean(axis=(1,2)))[len(X_tr)//2]
    x = X_tr[median_idx]        # [T,C]
    m = M1_tr[median_idx]       # [T,K]
    t = np.arange(T) / fs

    # optional class names (LABEL_MAP is a notebook-level global, if defined)
    try:
        class_names = [LABEL_MAP[k] for k in sorted(LABEL_MAP.keys())][:K]
    except Exception:
        class_names = [f"class_{i}" for i in range(K)]

    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle("WESAD – Preprocessed Windows Overview", fontsize=16, fontweight="bold")

    # (1) Signals (normalized)
    for c in range(x.shape[1]):
        axes[0,0].plot(t, x[:, c])
    axes[0,0].set_title("Signals (median window)")
    axes[0,0].set_xlabel("Time (s)"); axes[0,0].set_ylabel("z‑score")
    axes[0,0].legend(chs)
    axes[0,0].grid(True, alpha=0.3)

    # (2) Conditioning one‑hot (sample window)
    im = axes[0,1].imshow(m.T, aspect="auto", interpolation="nearest")
    axes[0,1].set_title("Conditioning one‑hot (sample window)")
    axes[0,1].set_ylabel("Class")
    axes[0,1].set_xlabel("Time step")
    axes[0,1].set_yticks(range(K))
    axes[0,1].set_yticklabels(class_names)
    plt.colorbar(im, ax=axes[0,1])

    # (3) Correlation across channels (flatten several windows)
    n_corr = min(100, X_tr.shape[0])
    flat = X_tr[:n_corr].reshape(-1, X_tr.shape[-1])  # [(n_corr*T), C]
    corr = np.corrcoef(flat.T)                        # [C,C]
    im2 = axes[1,0].imshow(corr, vmin=-1, vmax=1)
    axes[1,0].set_title(f"Channel correlation (first {n_corr} windows)")
    axes[1,0].set_xticks(range(len(chs))); axes[1,0].set_xticklabels(chs)
    axes[1,0].set_yticks(range(len(chs))); axes[1,0].set_yticklabels(chs)
    plt.colorbar(im2, ax=axes[1,0])

    # (4) Class balance (train split)
    cnt = Counter(y_tr.tolist())
    xs = np.arange(K)
    axes[1,1].bar(xs, [cnt.get(i,0) for i in xs])
    axes[1,1].set_title("Train windows per class")
    axes[1,1].set_xticks(xs); axes[1,1].set_xticklabels(class_names, rotation=0)
    axes[1,1].set_ylabel("# windows")
    axes[1,1].grid(True, axis="y", alpha=0.3)

    plt.tight_layout()

    # Save figure only when a file name was given; with the default
    # save_name=None the original `figures_dir / save_name` raised TypeError.
    if save_name is not None:
        figures_dir = Path(project_root) / "results" / "figures"
        figures_dir.mkdir(parents=True, exist_ok=True)
        out_path = figures_dir / save_name
        fig.savefig(out_path, dpi=300, bbox_inches="tight")
        print(f"✅ Visualization saved to: {out_path}")

    if show:
        plt.show()
    else:
        plt.close(fig)

    # Data quality summary
    print("\n📊 Data Quality Summary (train split):")
    print(f"  • X shape: {X_tr.shape} (N,T,C) at fs={fs} Hz, channels={chs}")
    print(f"  • m1_seq shape: {M1_tr.shape} (N,T,K), K={K}, classes={class_names}")
    print(f"  • No NaNs in X: {not np.isnan(X_tr).any()}")
    print(f"  • No NaNs in m1_seq: {not np.isnan(M1_tr).any()}")
    mu = X_tr.mean(axis=(0,1)); sd = X_tr.std(axis=(0,1))
    print(f"  • Per‑channel mean ≈ 0: {mu}")
    print(f"  • Per‑channel std  ≈ 1: {sd}")

In [17]:
def preprocessing_summary(seqs, processed_dir):
    """
    Print a summary for the *two-stream* dataset produced by
    create_training_sequences_from_combined + save_processed_data_two_stream.

    Parameters
    ----------
    seqs : dict
        Needs seqs["config"] ("channels_low", "channels_ecg", "fs_low",
        "fs_ecg", "T_low", "T_ecg", "K"), seqs["train"] ("X_low", "cond",
        "mean_low", "std_low", "mean_ecg", "std_ecg") and seqs["test"]
        ("X_low", "cond").
    processed_dir : str or Path
        Directory checked for the expected output files.

    Notes
    -----
    The window duration is derived from T / fs instead of being hard-coded.
    Per-class counts follow the label ids actually stored in "cond": 0..K-1
    when all labels fall in that range, 1..K when they are 1-based, otherwise
    the sorted set of observed labels.
    """
    cfg = seqs["config"]

    # ── meta info ────────────────────────────────────────────────────────────
    ch_low  = cfg["channels_low"]          # ['EDA','RESP']
    ch_ecg  = cfg["channels_ecg"]          # ['ECG']
    fs_low  = cfg["fs_low"]                # 4
    fs_ecg  = cfg["fs_ecg"]                # 175
    T_low   = cfg["T_low"]
    T_ecg   = cfg["T_ecg"]
    K       = cfg["K"]
    win_s_low = T_low / fs_low             # window duration in seconds
    win_s_ecg = T_ecg / fs_ecg

    # ── basic counts ────────────────────────────────────────────────────────
    n_tr = seqs["train"]["X_low"].shape[0]
    n_te = seqs["test"]["X_low"].shape[0]

    mu_low,  sd_low  = seqs["train"]["mean_low"],  seqs["train"]["std_low"]
    mu_ecg,  sd_ecg  = seqs["train"]["mean_ecg"],  seqs["train"]["std_ecg"]

    # LABEL_MAP is an optional notebook-level global; fall back to generic names.
    try:
        label_names = [LABEL_MAP[k] for k in sorted(LABEL_MAP)][:K]
    except Exception:
        label_names = [f"class_{i}" for i in range(K)]

    # ── print summary ───────────────────────────────────────────────────────
    print("\n" + "="*60)
    print("🎉 Two-Stream WESAD Preprocessing Completed!")
    print("="*60)

    print("📊 Final Dataset Statistics:")
    print(f"   • Train windows: {n_tr:,}   Test windows: {n_te:,}")
    print(f"   • Window length: low-rate {T_low} steps  ({win_s_low:g} s @ {fs_low} Hz)")
    print(f"                    ECG      {T_ecg} steps  ({win_s_ecg:g} s @ {fs_ecg} Hz)")
    print(f"   • Channels (low): {ch_low}   (ECG): {ch_ecg}")
    print(f"   • Classes (K): {K} → {label_names}")

    # per-class counts
    tr_cnt = Counter(seqs["train"]["cond"].tolist())
    te_cnt = Counter(seqs["test"]["cond"].tolist())
    observed = set(tr_cnt) | set(te_cnt)
    if all(0 <= lab < K for lab in observed):
        class_ids = list(range(K))
    elif all(1 <= lab <= K for lab in observed):
        # raw 1-based labels: counting over range(K) would drop class K
        class_ids = list(range(1, K + 1))
    else:
        class_ids = sorted(observed)
    if class_ids != list(range(K)):
        print(f"   • Label ids counted: {class_ids}")
    print(f"   • Train per-class: {[tr_cnt.get(i,0) for i in class_ids]}")
    print(f"   • Test  per-class: {[te_cnt.get(i,0) for i in class_ids]}")

    print("\n🧭 Normalisation (train only):")
    print(f"   • low-rate μ: {mu_low}   σ: {sd_low}")
    print(f"   • ECG      μ: {mu_ecg}   σ: {sd_ecg}")

    # ── file overview ───────────────────────────────────────────────────────
    processed_dir = Path(processed_dir)
    files = [
        "train_X_low.npy", "train_X_ecg.npy", "train_m1_seq.npy", "train_cond.npy",
        "test_X_low.npy",  "test_X_ecg.npy",  "test_m1_seq.npy",  "test_cond.npy",
        "norm_low.npz", "norm_ecg.npz", "dataset_config.json"
    ]
    print("\n📁 Output Files:")
    print(f"   • Directory: {processed_dir}")
    for f in files:
        p = processed_dir / f
        status = "✅" if p.exists() else "❌"
        size   = f" ({p.stat().st_size/1_048_576:.1f} MB)" if p.exists() else ""
        print(f"   • {status} {f}{size}")

    print("\n🎯 Ready for next steps:")
    print("   1) Train your dual-branch GAN on (X_low, X_ecg, m1_seq).")
    print("   2) Generate synthetic windows per class; keep them normalised with the train stats.")
    print("   3) Evaluate KS/W1/JS (EDA, RESP, ECG) + HRV, then classifier F1 ≥ 95 %.")

In [18]:
def save_processed_data_two_stream(seqs, subdir):
    """Write the two-stream windows, train normalisation stats and config to disk.

    Files go to ``<project_root>/data/processed/<subdir>`` (created if needed):
    eight ``.npy`` arrays (train/test × X_low, X_ecg, m1_seq, cond), two
    ``.npz`` archives with the train mean/std per stream, and the config as
    JSON. Returns the output directory.
    """
    out_dir = Path(project_root) / "data" / "processed" / subdir
    out_dir.mkdir(parents=True, exist_ok=True)

    # (file name, split, key) — written in this order
    array_files = (
        ("train_X_low.npy",  "train", "X_low"),
        ("train_X_ecg.npy",  "train", "X_ecg"),
        ("train_m1_seq.npy", "train", "m1_seq"),
        ("train_cond.npy",   "train", "cond"),
        ("test_X_low.npy",   "test",  "X_low"),
        ("test_X_ecg.npy",   "test",  "X_ecg"),
        ("test_m1_seq.npy",  "test",  "m1_seq"),
        ("test_cond.npy",    "test",  "cond"),
    )
    for file_name, split, key in array_files:
        np.save(out_dir / file_name, seqs[split][key])

    # Normalisation statistics exist for the train split only.
    train_part = seqs["train"]
    norm_files = (
        ("norm_low.npz", "mean_low", "std_low"),
        ("norm_ecg.npz", "mean_ecg", "std_ecg"),
    )
    for file_name, mean_key, std_key in norm_files:
        np.savez(out_dir / file_name, mean=train_part[mean_key], std=train_part[std_key])

    config_text = json.dumps(seqs["config"], indent=2)
    (out_dir / "dataset_config.json").write_text(config_text)
    return out_dir

In [25]:
def sweep_eda_hp(train_paths, test_paths, train_ids, test_ids,
                 cutoffs=(0.03, 0.04, 0.05), orders=(2,4), robust=False):
    """Grid-search the EDA high-pass filter (cutoff × order) on one fold.

    For every combination the fold is re-processed with
    process_multiple_subjects, windowed (30 s, 15 s hop, single label), and
    the train-normalised EDA samples of the train and test splits are compared
    with the two-sample KS statistic and the Wasserstein-1 distance on equally
    sized random subsamples (fixed seed 0).

    Parameters
    ----------
    train_paths, test_paths : list
        Subject .pkl paths; concatenated and processed together.
    train_ids, test_ids : list
        Subject ids defining the split.
    cutoffs, orders : iterables
        Values forwarded as eda_hp_cutoff / eda_hp_order.
    robust : bool
        Forwarded as eda_robust.

    Returns
    -------
    pandas.DataFrame sorted by KS, W1, cutoff, order (also printed).
    """
    # Imported here so the function does not rely on a later diagnostic cell
    # having already executed `from scipy import stats`.
    from scipy import stats

    rows = []
    for co in cutoffs:
        for od in orders:
            ds_fold = process_multiple_subjects(
                train_paths + test_paths,
                target_rate=4, channels=("ECG","EDA","RESP"),
                transition_pad_s=5.0, min_valid_run_s=30.0, verbose=False,
                eda_hp_cutoff=co, eda_hp_order=od, eda_robust=robust
            )
            seqs = create_training_sequences_from_combined(
                ds_fold, window_s=30, step_s=15,
                train_subject_ids=train_ids, test_subject_ids=test_ids,
                require_single_label=True
            )

            eda_train = seqs["train"]["X_low"][:,:,0]  # EDA train (z to train stats)
            eda_test  = seqs["test"]["X_low"][:,:,0]   # EDA test
            r = eda_train.ravel()
            t = eda_test.ravel()
            # Subsample both sides to the same size; a fresh seed-0 generator
            # per configuration keeps the draw identical across the grid.
            n = min(len(r), len(t))
            rng = np.random.default_rng(0)
            r = r[rng.choice(len(r), n, replace=False)]
            t = t[rng.choice(len(t), n, replace=False)]
            ks = float(stats.ks_2samp(r, t).statistic)
            w1 = float(stats.wasserstein_distance(r, t))
            mu_te = float(eda_test.mean())
            sd_te = float(eda_test.std())

            rows.append(dict(cutoff=co, order=od, robust=robust,
                             KS=ks, W1=w1, test_mu=mu_te, test_sd=sd_te,
                             N_tr=int(seqs["train"]["X_low"].shape[0]),
                             N_te=int(seqs["test"]["X_low"].shape[0])))
    df = pd.DataFrame(rows).sort_values(["KS","W1","cutoff","order"])
    print(df.to_string(index=False, float_format=lambda x: f"{x:.3f}"))
    return df

In [26]:
# Run the EDA high-pass sweep (3 cutoffs x 2 orders) on the currently selected fold.
# NOTE(review): train_paths / test_paths / train_ids / test_ids are set by the
# LOSO fold cell (In [19]), so that cell has to run before this one.
df_hp = sweep_eda_hp(
    train_paths, test_paths, train_ids, test_ids,
    cutoffs=(0.03, 0.04, 0.05), orders=(2, 4), robust=False
)
# optional: save it
# (the parent directory is created first so to_csv cannot fail on a missing folder)
(project_root / "results" / "eda_hp_sweep.csv").parent.mkdir(parents=True, exist_ok=True)
df_hp.to_csv(project_root / "results" / "eda_hp_sweep.csv", index=False)


🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🔄 P

In [19]:
# End-to-end preprocessing of ONE leave-one-subject-out fold:
# load the fold definition, process all of its subjects into a combined
# two-stream dataset, window it into train/test, save the arrays and print a summary.
assert subject_files, "No subject files found."
print("🚀 Starting two-stream preprocessing pipeline (LOSO)…")

# 0)  ── load the 15 LOSO folds you generated earlier ──────────────────────────
splits_path = project_root / "data" / "processed" / "wesad_loso_splits.json"
folds = json.loads(splits_path.read_text())

# >>> choose the fold you want to run
FOLD = 0                            # 0 … 14
test_ids  = folds[FOLD]["test"]     # e.g. ['S10']
# validation subjects (if the fold defines any) are merged into the train set
train_ids = folds[FOLD]["train"] + folds[FOLD].get("val", [])

# map subject ID → .pkl path
# (the id is the file stem, so a fold id without a matching file raises KeyError below)
id2path = {Path(p).stem: p for p in subject_files}
train_paths = [id2path[sid] for sid in train_ids]
test_paths  = [id2path[sid] for sid in test_ids]

print(f"Fold {FOLD} — Test: {test_ids} | Train(+val): {len(train_paths)} subjects")

# 1) ── combine *all* subjects of this fold into one two-stream dataset ────────
# EDA high-pass arguments are not passed here, so process_multiple_subjects
# uses its own defaults for them.
ds_fold = process_multiple_subjects(
    train_paths + test_paths,       # combine; slicing happens later
    target_rate=4,
    channels=("ECG", "EDA", "RESP"),
    transition_pad_s=5.0,
    min_valid_run_s=30.0,
    verbose=True
)

# 2) ── slice train / test windows (30-s, 50 % overlap, single-label) ─────────
seqs = create_training_sequences_from_combined(
    ds_fold,
    window_s=30,
    step_s=15,
    train_subject_ids=train_ids,
    test_subject_ids=test_ids,
    require_single_label=True
)

print("Train low-rate :", seqs["train"]["X_low"].shape,
      "| Train ECG :",   seqs["train"]["X_ecg"].shape)
print("Test  low-rate :",  seqs["test"]["X_low"].shape,
      "| Test  ECG :",    seqs["test"]["X_ecg"].shape)

# 3) ── save + (optional) visualise + summary ─────────────────────────────────
# the output folder is named after the first (held-out) test subject
out_dir = save_processed_data_two_stream(
    seqs, subdir=f"tc_multigan_fold_{test_ids[0]}"
)

# ⬇ if your old visualiser expects single-stream keys, skip or adapt it
# visualize_preprocessing_results_two_stream is a tiny wrapper you can add
# visualize_preprocessing_results_two_stream(seqs, save_name=f"overview_{test_ids[0]}.png")

preprocessing_summary(seqs, out_dir)
print("✅ Fold finished →", out_dir)

🚀 Starting two-stream preprocessing pipeline (LOSO)…
Fold 0 — Test: ['S10'] | Train(+val): 14 subjects

🔄 Processing 15 subjects …

📂 Processing: S2
  Original: X (4255300, 3), y (4255300,) (~101.3 min)
  After mask (orig fs): (1987299, 3), label dist: {1: 793800, 2: 423500, 3: 246400, 4: 523599}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0430 Hz
  ➜ X_low (11352, 2) @ 4 Hz | X_ecg (496650, 1) @ 175 Hz | m1 (11352, 4)

📂 Processing: S4
  Original: X (4496100, 3), y (4496100,) (~107.0 min)
  After mask (orig fs): (2044001, 3), label dist: {1: 803601, 2: 437500, 3: 253400, 4: 549500}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0352 Hz
  ➜ X_low (11680, 2) @ 4 Hz | X_ecg (511000, 1) @ 175 Hz | m1 (11680, 4)

📂 Processing: S5
  Original: X (4380600, 3), y (4380600,) (~104.3 min)
  After mask (orig fs): (2072700, 3), label dist: {1: 831600, 2: 444500, 3: 254800, 4: 541800}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0430 Hz
  ➜ X_low (11844, 2) @ 4 Hz | X_ecg 

In [27]:
# Re-process the current fold with the best EDA high-pass setting found by the
# sweep and save it under a directory tagged with that setting.

# pick best row (lowest KS, ties broken by W1, then cutoff, then order)
best = df_hp.sort_values(["KS","W1","cutoff","order"]).iloc[0]
BEST_CUTOFF = float(best["cutoff"])
BEST_ORDER  = int(best["order"])
print("Chosen EDA HP:", BEST_CUTOFF, "Hz, order", BEST_ORDER)

# rebuild with chosen params
ds_fold = process_multiple_subjects(
    train_paths + test_paths,
    target_rate=4, channels=("ECG","EDA","RESP"),
    transition_pad_s=5.0, min_valid_run_s=30.0, verbose=True,
    eda_hp_cutoff=BEST_CUTOFF, eda_hp_order=BEST_ORDER, eda_robust=False
)
seqs = create_training_sequences_from_combined(
    ds_fold, window_s=30, step_s=15,
    train_subject_ids=train_ids, test_subject_ids=test_ids,
    require_single_label=True
)

# Cutoff in mHz, zero-padded. round() instead of bare int(): a product such as
# 0.029 * 1000 evaluates to 28.999999999999996 and would be truncated to 28.
tag = f"hp{int(round(BEST_CUTOFF*1000)):03d}_o{BEST_ORDER}"
out_dir = save_processed_data_two_stream(
    seqs, subdir=f"tc_multigan_fold_{test_ids[0]}_{tag}"
)
print("Saved to:", out_dir)

Chosen EDA HP: 0.05 Hz, order 2

🔄 Processing 15 subjects …

📂 Processing: S2
  Original: X (4255300, 3), y (4255300,) (~101.3 min)
  After mask (orig fs): (1987299, 3), label dist: {1: 793800, 2: 423500, 3: 246400, 4: 523599}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0742 Hz
  ➜ X_low (11352, 2) @ 4 Hz | X_ecg (496650, 1) @ 175 Hz | m1 (11352, 4)

📂 Processing: S4
  Original: X (4496100, 3), y (4496100,) (~107.0 min)
  After mask (orig fs): (2044001, 3), label dist: {1: 803601, 2: 437500, 3: 253400, 4: 549500}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0742 Hz
  ➜ X_low (11680, 2) @ 4 Hz | X_ecg (511000, 1) @ 175 Hz | m1 (11680, 4)

📂 Processing: S5
  Original: X (4380600, 3), y (4380600,) (~104.3 min)
  After mask (orig fs): (2072700, 3), label dist: {1: 831600, 2: 444500, 3: 254800, 4: 541800}
EDA BEFORE HP peak ~ 0.0039 Hz
EDA AFTER  HP peak ~ 0.0625 Hz
  ➜ X_low (11844, 2) @ 4 Hz | X_ecg (518175, 1) @ 175 Hz | m1 (11844, 4)

📂 Processing: S6
  Original: X (4

In [20]:
# Sanity checks on the combined fold dataset (ds_fold) and on the windowed
# low-rate stream (seqs): channel order, NaNs, stream durations, normalisation,
# class balance, EDA spectrum and train-vs-test distribution shift.

# Low stream is [EDA, RESP] in this code
assert ds_fold["channels_low"] == ["EDA","RESP"]
assert not np.isnan(ds_fold["X_low"]).any(), "NaNs in X_low"
assert not np.isnan(ds_fold["X_ecg"]).any(), "NaNs in X_ecg"

# Both streams must cover the same total duration in seconds.
span_low = len(ds_fold["X_low"]) / ds_fold["fs_low"]
span_ecg = len(ds_fold["X_ecg"]) / ds_fold["fs_ecg"]
print(span_low, span_ecg)
assert abs(span_low - span_ecg) < 1e-6

for seg in ds_fold["segments"]:
    a,b = seg["low_start"], seg["low_end"]
    c,d = seg["ecg_start"], seg["ecg_end"]
    # Each segment must have the same duration in the low-rate and the ECG stream
    assert abs((b-a)/ds_fold["fs_low"] - (d-c)/ds_fold["fs_ecg"]) < 1e-6

print("Train windows:", seqs["train"]["X_low"].shape[0], 
      "Test windows:", seqs["test"]["X_low"].shape[0])


# Train windows were z-scored with their own statistics → mean ≈ 0, std ≈ 1 expected.
mu = seqs["train"]["X_low"].mean((0,1)); sd = seqs["train"]["X_low"].std((0,1))
print("Train mean≈0:", mu, "Train std≈1:", sd)


# Test windows use the TRAIN statistics, so their moments show the subject shift.
mu_te = seqs["test"]["X_low"].mean((0,1))
sd_te = seqs["test"]["X_low"].std((0,1))
print("Test mean (not forced to 0):", mu_te, "Test std:", sd_te)

# Windows per label in each split (keys are the label values stored in "cond").
import numpy as np, collections
for split in ["train","test"]:
    cnt = collections.Counter(seqs[split]["cond"].tolist())
    print(split, dict(cnt))

# Welch PSD of the first 4096 EDA samples of the combined (un-windowed) low-rate stream.
from scipy.signal import welch
x_eda = ds_fold["X_low"][:4096,0]  # first EDA samples
f, P = welch(x_eda, fs=ds_fold["fs_low"], nperseg=256)
print("EDA PSD peak ~", f[P.argmax()], "Hz")  # EDA dynamics are very low freq

# Compare train vs test REAL distributions (EDA, RESP) after train z-scoring
real_train = seqs["train"]["X_low"].reshape(-1, 2)
real_test  = seqs["test"]["X_low"].reshape(-1, 2)
from scipy import stats
import numpy as np

for i, name in enumerate(["EDA","RESP"]):
    r = real_train[:,i]; t = real_test[:,i]
    r = r[np.isfinite(r)]; t = t[np.isfinite(t)]
    # subsample both sides to the same size (fixed seed) before comparing
    n = min(len(r), len(t))
    rng = np.random.default_rng(0)
    r = r[rng.choice(len(r), n, replace=False)]
    t = t[rng.choice(len(t), n, replace=False)]
    ks = stats.ks_2samp(r, t).statistic
    w1 = stats.wasserstein_distance(r, t)
    print(name, "KS(train,test)=", round(ks,3), "W1=", round(w1,3))




44205.0 44205.0
Train windows: 2645 Test windows: 194
Train mean≈0: [-1.1201316e-08  5.7689181e-10] Train std≈1: [0.9999996 0.9999992]
Test mean (not forced to 0): [-0.01319962  0.00013314] Test std: [0.14074133 0.9113119 ]
train {1: 1069, 2: 588, 4: 669, 3: 319}
test {1: 77, 3: 23, 4: 48, 2: 46}
EDA PSD peak ~ 0.0 Hz
EDA KS(train,test)= 0.205 W1= 0.16
RESP KS(train,test)= 0.036 W1= 0.07


In [21]:
# ECG-stream sanity checks: normalisation moments, train-vs-test distribution
# shift, and a crude R-peak / heart-rate estimate on the first test window.
from scipy import stats
from scipy.signal import find_peaks

mu_tr = seqs["train"]["X_ecg"].mean((0,1))    # shape (1,)
sd_tr = seqs["train"]["X_ecg"].std((0,1))
mu_te = seqs["test"]["X_ecg"].mean((0,1))
sd_te = seqs["test"]["X_ecg"].std((0,1))
print("ECG train mean≈0,std≈1:", mu_tr, sd_tr)
print("ECG test mean/std:",      mu_te, sd_te)

r = seqs["train"]["X_ecg"].reshape(-1)  # already z-scored to train stats
t = seqs["test"]["X_ecg"].reshape(-1)
# balance sample sizes
n = min(len(r), len(t))
rng = np.random.default_rng(0)
r = r[rng.choice(len(r), n, replace=False)]
t = t[rng.choice(len(t), n, replace=False)]
print("ECG KS(train,test) =", stats.ks_2samp(r, t).statistic)
print("ECG W1 =", stats.wasserstein_distance(r, t))

# Sampling rate taken from the dataset config instead of a hard-coded 175 Hz,
# so the detector stays correct if the ECG stream is resampled differently.
fs_ecg = seqs["config"]["fs_ecg"]
min_peak_gap = max(1, int(round(0.25 * fs_ecg)))   # ≥ 0.25 s between peaks (44 samples @ 175 Hz)

ecg = seqs["test"]["X_ecg"][0,:,0]  # first test window
# simple heuristic: peaks > 1.0 z, spaced ≥ 0.25 s
peaks, _ = find_peaks(ecg, height=1.0, distance=min_peak_gap)
rr = np.diff(peaks) / float(fs_ecg)  # seconds
if len(rr) > 0:
    bpm = 60.0 / rr.mean()
    print(f"R-peaks: {len(peaks)} | mean RR={rr.mean():.3f}s | bpm≈{bpm:.1f}")
else:
    print("No clear R-peaks with this simple detector — try lower height (e.g., 0.7) or better preprocessing.")


    




ECG train mean≈0,std≈1: [-5.614325e-09] [0.9999998]
ECG test mean/std: [8.217065e-06] [0.53762823]
ECG KS(train,test) = 0.2373637702503682
ECG W1 = 0.24367693083576528
R-peaks: 43 | mean RR=0.700s | bpm≈85.7


In [22]:
# Notebook-level setup shared by summarize_fold / run_all_folds below.
# These two globals repeat what the single-fold cell builds, so this cell also
# works on its own as long as `subject_files` and `project_root` are defined.

# Map subject id -> .pkl path once (the id is the file stem)
id2path = {Path(p).stem: p for p in subject_files}

# Load LOSO splits you generated earlier
# (a list of dicts; "test" and "train" keys are read by the code that follows)
splits_path = project_root / "data" / "processed" / "wesad_loso_splits.json"
folds = json.loads(splits_path.read_text())

def summarize_fold(seqs, fold_name, class_names=("baseline","stress","amusement","meditation")):
    """Collect one flat row of statistics for a single-stream fold.

    Parameters
    ----------
    seqs : dict
        Output in the single-stream layout: seqs["config"] ("K", "T", "step",
        "fs", optionally "channels"), seqs["train"]/["test"] with "X" [N,T,C]
        and "cond", and seqs["impute_stats"].
    fold_name : str
        Identifier stored under "fold".
    class_names : sequence of str
        Names for classes 0..K-1; classes beyond its length are reported as
        "class_<k>".

    Returns
    -------
    dict with window counts, dropped-window counts, per-channel train mean/std
    ("mu_<channel>", "sd_<channel>") and per-class window counts
    ("train_<class>", "test_<class>").

    Channel names come from seqs["config"]["channels"] (falling back to
    ECG/EDA/RESP), so the statistics are labelled correctly for any channel
    count or order instead of assuming exactly three channels.
    """
    cfg = seqs["config"]
    K = int(cfg["K"])
    tr_y = seqs["train"]["cond"]; te_y = seqs["test"]["cond"]
    tr_counts = np.bincount(tr_y, minlength=K)
    te_counts = np.bincount(te_y, minlength=K)
    X_tr = seqs["train"]["X"]
    mu = X_tr.mean(axis=(0,1)); sd = X_tr.std(axis=(0,1))

    configured = list(cfg.get("channels", ("ECG", "EDA", "RESP")))
    # one name per actual channel; unnamed extras get a positional label
    channel_names = [configured[i] if i < len(configured) else f"ch{i}"
                     for i in range(len(mu))]

    row = {
        "fold": fold_name,
        "T": cfg["T"],
        "step": cfg["step"],
        "fs": cfg["fs"],
        "train_N": int(X_tr.shape[0]),
        "test_N": int(seqs["test"]["X"].shape[0]),
        "impute_dropped_train": int(seqs["impute_stats"]["train"]["dropped"]),
        "impute_dropped_test":  int(seqs["impute_stats"]["test"]["dropped"]),
    }
    # keep the original key order: all means first, then all stds
    for name, value in zip(channel_names, mu):
        row[f"mu_{name}"] = float(value)
    for name, value in zip(channel_names, sd):
        row[f"sd_{name}"] = float(value)

    for k in range(K):
        label = class_names[k] if k < len(class_names) else f"class_{k}"
        row[f"train_{label}"] = int(tr_counts[k])
        row[f"test_{label}"]  = int(te_counts[k])
    return row

def run_all_folds(sequence_length=120,    # 30-s low-rate windows (120 steps)
                  step_overlap=0.5,
                  impute=True,            # not used; kept for API compat
                  out_root=None,
                  channels=("ECG","EDA","RESP"),
                  target_rate=4):
    """Preprocess every LOSO fold, save its sequences, and write a summary CSV.

    For each entry of the module-level ``folds`` list the subjects are combined
    with ``process_multiple_subjects``, windowed with
    ``create_training_sequences_from_combined`` and stored through
    ``save_processed_data_two_stream`` under ``tc_multigan_fold_<test id>``.

    Parameters
    ----------
    sequence_length : int
        Window length in low-rate samples; the window duration handed to the
        windowing helper is ``sequence_length / target_rate`` seconds.
    step_overlap : float
        Fraction of a window shared with the next one. Must be below 1,
        otherwise the stride would be zero or negative.
    impute : bool
        Ignored; kept so existing callers do not break.
    out_root : path-like or None
        Directory for the summary CSV (created if missing). Defaults to
        ``project_root / "data" / "processed"``.
        NOTE(review): the per-fold arrays are saved by
        ``save_processed_data_two_stream``, which only receives ``subdir``
        here — ``out_root`` does not appear to influence where they land.
    channels : sequence of str
        Forwarded unchanged to ``process_multiple_subjects``.
    target_rate : int or float
        Low-rate sampling frequency (Hz) passed to
        ``process_multiple_subjects`` and used to convert ``sequence_length``
        into seconds. Defaults to the previously hard-coded 4.

    Returns
    -------
    pandas.DataFrame
        One row per fold (``fold``, ``train_N``, ``test_N``, ``mu_low_ECG``,
        ``sd_low_ECG``), sorted by ``fold``; empty if ``folds`` is empty.

    Raises
    ------
    ValueError
        If ``step_overlap >= 1`` or ``target_rate <= 0``.
    KeyError
        If a fold references a subject id with no matching file in
        ``subject_files``.
    """
    # Validate cheap arguments first, before any expensive per-fold processing.
    if not step_overlap < 1:
        raise ValueError(f"step_overlap must be < 1 (got {step_overlap!r}); "
                         "an overlap of 1 or more yields a non-positive step.")
    if not target_rate > 0:
        raise ValueError(f"target_rate must be positive (got {target_rate!r}).")

    out_root = Path(out_root or (project_root / "data" / "processed"))
    # Create the destination now so the final to_csv cannot fail after all the work.
    out_root.mkdir(parents=True, exist_ok=True)
    id2path  = {Path(p).stem: p for p in subject_files}

    window_s = sequence_length / target_rate          # e.g. 120 / 4 → 30-s windows
    step_s   = window_s * (1-step_overlap)

    summary_cols = ["fold", "train_N", "test_N", "mu_low_ECG", "sd_low_ECG"]
    all_rows = []
    for f_idx, split in enumerate(folds):
        test_ids  = split["test"]                # e.g. ['S10']
        train_ids = split["train"] + split.get("val", [])
        fold_tag  = f"fold_{test_ids[0]}"

        print(f"\n🚀 Fold {f_idx} — Test: {test_ids} | Train: {len(train_ids)} subjects")

        # ---------- combine all subjects of this fold ----------
        fold_ids = train_ids + test_ids
        missing = [s for s in fold_ids if s not in id2path]
        if missing:
            raise KeyError(f"{fold_tag}: no .pkl file found for subject(s) {missing}")
        paths = [id2path[s] for s in fold_ids]
        # NOTE(review): train + test subjects are both passed, so the combined
        # dataset is rebuilt on every fold; it could be cached if
        # process_multiple_subjects does not depend on subject order — confirm.
        ds_fold = process_multiple_subjects(paths,
                                            target_rate=target_rate,
                                            channels=channels,
                                            transition_pad_s=5.0,
                                            min_valid_run_s=30.0,
                                            verbose=False)

        # ---------- slice train / test windows ----------
        seqs = create_training_sequences_from_combined(
            ds_fold,
            window_s=window_s,
            step_s=step_s,
            train_subject_ids=train_ids,
            test_subject_ids=test_ids,
            require_single_label=True
        )

        # ---------- save ----------
        save_processed_data_two_stream(seqs, subdir=f"tc_multigan_{fold_tag}")

        # ---------- simple stats ----------
        all_rows.append({
            "fold": fold_tag,
            "train_N": int(seqs["train"]["X_low"].shape[0]),
            "test_N":  int(seqs["test"]["X_low"].shape[0]),
            "mu_low_ECG": float(seqs["train"]["mean_ecg"][0]),
            "sd_low_ECG": float(seqs["train"]["std_ecg"][0])
        })

    # Fixed columns keep sort_values("fold") valid even when there are no folds.
    df = pd.DataFrame(all_rows, columns=summary_cols).sort_values("fold")
    csv_path = out_root / "loso_preproc_two_stream_summary.csv"
    df.to_csv(csv_path, index=False)
    print(f"\n📊 Aggregated preprocessing stats → {csv_path}")
    return df

In [23]:
# Preprocess every LOSO fold (120-step windows, 50 % overlap) and keep the per-fold summary table.
df_stats = run_all_folds(sequence_length=120, step_overlap=0.5)
# Transposed describe(): one row per summary column, aggregated across folds.
df_stats.describe().T


🚀 Fold 0 — Test: ['S10'] | Train: 14 subjects

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🚀 Fold 1 — Test: ['S11'] | Train: 14 subjects

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🚀 Fold 2 — Test: ['S13'] | Train: 14 subjects

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : (176820, 2)  (EDA+RESP @ 4 Hz)
  • ECG stream: (7735875, 1)  (ECG @ 175 Hz)
  • Subjects  : 15   Total: 736.8 min
  • Label counts (low-rate): {1: 69844, 2: 39264, 3: 21700, 4: 46012}

🚀 Fold 3 — Test: ['S14'] | Train: 14 subjects

🔄 Processing 15 subjects …

📊 Combined Dataset:
  • Low-rate  : 

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
train_N,15.0,2649.733333,2.711527,2645.0,2648.5,2649.0,2650.0,2657.0
test_N,15.0,189.266667,2.711527,182.0,189.0,190.0,190.5,194.0
mu_low_ECG,15.0,-2e-06,6.052112e-07,-3e-06,-3e-06,-2e-06,-2e-06,-8.484654e-07
sd_low_ECG,15.0,0.262519,0.003728952,0.255415,0.260487,0.262263,0.264563,0.2691641
