In [1]:
# ============================================================
# Phase 2 — 5-blocked time-wise CV (p043774)
#  - BlockGap = 0 (contiguous 5 folds)
#  - Sweep TimeGap_min: [0, 5, 10, 15]
#  - TrainDur_min = 10 (fixed)
#  - TestDur_min  = 5  (fixed; last 5 min of each fold)
#  - Within each fold:
#       Test  = last 5 minutes of the fold
#       Train = 10 minutes ending at (test_start - time_gap)
#       Val   = last 20% of TRAIN (time-aware, contiguous; no gap)
#  - Input: PPG_F only (already per-segment min-max 0~1)
#  - Label scaling: TRAIN-ONLY per fold (no future leakage)
#  - Report:
#       * per-fold MAE/SD for SBP/DBP
#       * per-gap average across valid folds (mean of fold metrics)
#  - No CSV saving
# ============================================================

import time
import random
import numpy as np
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from scipy.interpolate import interp1d
from scipy.integrate import trapezoid

# ==========================================
# [0] Experiment settings
# ==========================================
# Input recording (HDF5-readable .mat); opened with h5py by load_data_from_mat.
MAT_FILE = "/content/drive/MyDrive/Colab Notebooks/PusleDB/p043774.mat"
SEGMENT_LIMIT = None          # None = all segments
# Number of samples each waveform is resampled to before it is fed to the CNN.
PAD_LEN = 200
# Duration assigned to one segment; used to convert minutes into segment counts.
SEC_PER_SEGMENT = 10.0

# Optimisation hyper-parameters (DataLoader batch size, epoch count, Adam lr / weight decay).
BATCH_SIZE = 32
EPOCHS = 100
LR = 1e-3
WEIGHT_DECAY = 1e-4
# Use the GPU when PyTorch reports one, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed consumed by set_seed(); the cuDNN flags below are set once at import time
# to favour reproducible runs over autotuned kernels.
SEED = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print(f"Using Device: {DEVICE}")

# Phase-2 protocol config
N_FOLDS = 5
BLOCK_GAP_MIN = 0            # fixed: 0
# Gap (in minutes) between the end of the train window and the start of the test window.
TIME_GAP_SWEEP_MIN = [0, 5, 10, 15]

TRAIN_DUR_MIN = 10           # fixed
TEST_DUR_MIN  = 5            # fixed
VAL_FRAC_IN_TRAIN = 0.20     # last 20% of train indices

# ==========================================
# [1] Re-sample + Prior feature
# ==========================================
def cubic_resample(ppg, target_len=PAD_LEN):
    """Resample a 1-D signal to ``target_len`` points on a normalised [0, 1] axis.

    Cubic interpolation is used when the input has at least four samples.
    Shorter inputs, and any failure while building or evaluating the cubic
    interpolant, fall back to plain linear interpolation. The result is
    always returned as float32.
    """
    n_src = len(ppg)
    src_axis = np.linspace(0, 1, n_src)
    dst_axis = np.linspace(0, 1, target_len)

    def _linear():
        # Shared fallback path: piecewise-linear resampling.
        return np.interp(dst_axis, src_axis, ppg).astype(np.float32)

    if n_src < 4:
        return _linear()

    try:
        spline = interp1d(src_axis, ppg, kind="cubic", bounds_error=False, fill_value="extrapolate")
        resampled = spline(dst_axis).astype(np.float32)
    except Exception:
        return _linear()
    return resampled

def extract_multiscale_morph_features(ppg_01):
    """
    Build the multi-scale morphological prior vector for one segment.

    The segment is resampled to 100, 150, 200 and 250 points; at each scale
    11 descriptors are computed, giving 4 x 11 = 44 values (float32).
    The input is used as-is (no rescaling is applied here).

    Per-scale descriptor order:
      peak value, last-sample value, peak minus last, mean, max first
      difference, position of max first difference, area before the peak,
      standard deviation, peak position, skewness ratio, kurtosis ratio.
    """

    def _features_at(n_points):
        # All descriptors for a single resampling length.
        wave = cubic_resample(ppg_01, n_points)

        i_peak = int(np.argmax(wave))
        peak_val = float(wave[i_peak])
        tail_val = float(wave[n_points - 1])
        mean_val = float(np.mean(wave))
        spread = float(np.std(wave))

        slope = np.diff(wave)
        if len(slope) > 0:
            max_slope = float(np.max(slope))
            t_max_slope = int(np.argmax(slope)) / n_points
        else:
            max_slope, t_max_slope = 0.0, 0.0

        # Trapezoidal area of the samples strictly before the peak index.
        area_to_peak = float(trapezoid(wave[:i_peak])) if i_peak > 0 else 0.0

        if spread > 0:
            dev = wave - mean_val
            skew = float(np.mean(dev**3) / (spread**3))
            kurt = float(np.mean(dev**4) / (spread**4))
        else:
            # Constant signal: the moment ratios are undefined, report zeros.
            skew = kurt = 0.0

        return [
            peak_val,
            tail_val,
            peak_val - tail_val,
            mean_val,
            max_slope,
            t_max_slope,
            area_to_peak,
            spread,
            i_peak / n_points,
            skew,
            kurt,
        ]

    collected = []
    for n_points in (100, 150, 200, 250):
        collected.extend(_features_at(n_points))

    return np.array(collected, dtype=np.float32)

# ==========================================
# [2] Load data (PPG_F only)
# ==========================================
def load_data_from_mat(mat_path, segment_limit=None):
    """Read waveforms and blood-pressure labels from an HDF5-readable .mat file.

    Row 0 of ``Subj_Wins/PPG_F``, ``Subj_Wins/SegSBP`` and ``Subj_Wins/SegDBP``
    is read as a list of object references, one per segment; each reference
    is dereferenced through the file handle.

    Args:
        mat_path: path of the file to open.
        segment_limit: maximum number of segments to read; a falsy value
            (``None`` or ``0``) reads every segment.

    Returns:
        (segments, priors, targets): a list of float32 waveforms, a float32
        array of stacked prior-feature vectors, and a float32 array of
        ``[sbp, dbp]`` rows.
    """
    waveforms = []
    prior_rows = []
    bp_rows = []

    with h5py.File(mat_path, "r") as mat:
        ppg_refs = mat["Subj_Wins"]["PPG_F"][0]
        sbp_refs = mat["Subj_Wins"]["SegSBP"][0]
        dbp_refs = mat["Subj_Wins"]["SegDBP"][0]

        n_available = len(ppg_refs)
        total = min(n_available, segment_limit) if segment_limit else n_available

        for i in range(total):
            # Original author's note: (1250,) samples, already scaled to 0~1.
            wave = mat[ppg_refs[i]][()].squeeze().astype(np.float32)
            systolic = float(mat[sbp_refs[i]][()][0][0])
            diastolic = float(mat[dbp_refs[i]][()][0][0])

            waveforms.append(wave)
            prior_rows.append(extract_multiscale_morph_features(wave))
            bp_rows.append([systolic, diastolic])

            # Lightweight progress indicator for long recordings.
            if i % 1000 == 0:
                print(f"  Processed {i}/{total} ...")

    prior_matrix = np.stack(prior_rows).astype(np.float32)
    target_matrix = np.array(bp_rows, dtype=np.float32)
    return waveforms, prior_matrix, target_matrix

# ==========================================
# [3] Dataset (RAW y; scaler is train-only)
# ==========================================
class PPGDatasetRawY(Dataset):
    """Dataset yielding (resampled waveform, prior features, raw labels).

    Labels are returned exactly as stored (no scaling); any label scaling
    is left to the caller. The waveform is resampled to ``PAD_LEN`` samples
    on every access and returned with a leading channel axis.
    """

    def __init__(self, segments, priors, targets_mmHg):
        # segments: per-item waveforms; priors: per-item feature rows;
        # targets_mmHg: per-item label rows. Stored by reference, not copied.
        self.segments, self.priors, self.targets = segments, priors, targets_mmHg

    def __len__(self):
        return len(self.segments)

    def __getitem__(self, idx):
        def _f32(values):
            return torch.tensor(values, dtype=torch.float32)

        # (PAD_LEN,) -> (1, PAD_LEN): add the channel axis expected by Conv1d.
        waveform = _f32(cubic_resample(self.segments[idx], PAD_LEN))[None, :]
        prior_vec = _f32(self.priors[idx])
        label = _f32(self.targets[idx])
        return waveform, prior_vec, label

# ==========================================
# [4] Model
# ==========================================
class MorphCNNRegressor(nn.Module):
    """Two-branch regressor: a 1-D CNN over the waveform plus an MLP over priors.

    Both branches produce 256 features; they are concatenated and mapped by
    a fully connected head to two outputs per sample.
    """

    def __init__(self, prior_dim=44):
        super().__init__()

        # (in_channels, out_channels, kernel); padding = kernel // 2 keeps length.
        conv_specs = [(1, 32, 7), (32, 64, 5), (64, 128, 5), (128, 256, 3)]
        final_stage = len(conv_specs) - 1
        conv_layers = []
        for stage, (c_in, c_out, kernel) in enumerate(conv_specs):
            conv_layers.append(nn.Conv1d(c_in, c_out, kernel, padding=kernel // 2))
            conv_layers.append(nn.BatchNorm1d(c_out))
            conv_layers.append(nn.ReLU())
            # Intermediate stages halve the length; the last one pools to length 1.
            if stage == final_stage:
                conv_layers.append(nn.AdaptiveAvgPool1d(1))
            else:
                conv_layers.append(nn.MaxPool1d(2))
        self.cnn = nn.Sequential(*conv_layers)

        prior_layers = [
            nn.Linear(prior_dim, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
        ]
        self.fc_prior = nn.Sequential(*prior_layers)

        head_layers = [
            nn.Linear(256 + 256, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        ]
        self.fc_out = nn.Sequential(*head_layers)

    def forward(self, x, prior):
        # CNN output ends with a length-1 axis from the adaptive pool; drop it.
        wave_feat = self.cnn(x)
        wave_feat = wave_feat.squeeze(-1)
        prior_feat = self.fc_prior(prior)
        fused = torch.cat([wave_feat, prior_feat], dim=1)
        return self.fc_out(fused)

# ==========================================
# [5] Train-only label scaler
# ==========================================
class LabelScaler2D:
    """Column-wise label scaler fitted on training labels only.

    Two modes are supported:
      * ``"minmax"`` (default): ``(y - min) / (max - min + eps)``
      * ``"zscore"``:           ``(y - mean) / (std + eps)``

    Statistics are computed per column (axis 0) in :meth:`fit` and stored as
    NumPy arrays (``y_min``/``y_max`` or ``y_mean``/``y_std``). ``eps`` keeps
    the divisor non-zero when a column is constant.

    Validation uses explicit exceptions rather than ``assert`` so the checks
    still run under ``python -O``.

    Raises:
        ValueError: if ``mode`` is not one of the supported modes.
    """

    _MODES = ("minmax", "zscore")

    def __init__(self, mode="minmax", eps=1e-6):
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.eps = eps
        self.fitted = False

    def fit(self, y_train_mmHg: np.ndarray):
        """Compute per-column statistics from the training labels.

        Args:
            y_train_mmHg: array-like of labels; reduced along axis 0.

        Returns:
            self, so the call can be chained after construction.

        Raises:
            ValueError: if no labels are supplied.
        """
        y = np.asarray(y_train_mmHg, dtype=np.float32)
        if y.size == 0:
            raise ValueError("cannot fit LabelScaler2D on an empty label array")
        if self.mode == "minmax":
            self.y_min = y.min(axis=0)
            self.y_max = y.max(axis=0)
        else:
            self.y_mean = y.mean(axis=0)
            self.y_std = y.std(axis=0)
        self.fitted = True
        return self

    def _offset_and_scale(self, like: torch.Tensor):
        """Return (offset, scale) tensors matching ``like``'s device and dtype."""
        if not self.fitted:
            raise RuntimeError("LabelScaler2D must be fitted before transform/inverse")
        if self.mode == "minmax":
            offset = torch.tensor(self.y_min, device=like.device, dtype=like.dtype)
            upper = torch.tensor(self.y_max, device=like.device, dtype=like.dtype)
            scale = upper - offset + self.eps
        else:
            offset = torch.tensor(self.y_mean, device=like.device, dtype=like.dtype)
            spread = torch.tensor(self.y_std, device=like.device, dtype=like.dtype)
            scale = spread + self.eps
        return offset, scale

    def transform(self, y_mmHg: torch.Tensor) -> torch.Tensor:
        """Scale raw labels with the fitted statistics.

        Raises:
            RuntimeError: if :meth:`fit` has not been called.
        """
        offset, scale = self._offset_and_scale(y_mmHg)
        return (y_mmHg - offset) / scale

    def inverse(self, y_scaled: torch.Tensor) -> torch.Tensor:
        """Map scaled values back to the original label units.

        Raises:
            RuntimeError: if :meth:`fit` has not been called.
        """
        offset, scale = self._offset_and_scale(y_scaled)
        return y_scaled * scale + offset

# ==========================================
# [6] Train / Eval
# ==========================================
def set_seed(seed=SEED):
    """Seed Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices)."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_fn in seeders:
        seed_fn(seed)

def train_one_model(train_loader, val_loader, scaler: LabelScaler2D):
    """Train a fresh MorphCNNRegressor and return it at its best validation epoch.

    The model is trained for ``EPOCHS`` epochs with Adam and MSE loss on
    labels scaled by ``scaler``. After every epoch the validation loss is
    computed; the weights of the epoch with the lowest validation loss are
    restored before returning.

    The validation loss is a per-sample mean: each batch's mean loss is
    weighted by its batch size, so a short final batch does not count as
    much as a full one when selecting the best epoch.

    Args:
        train_loader: iterable of ``(x, prior, y_raw)`` training batches.
        val_loader: iterable of ``(x, prior, y_raw)`` validation batches.
        scaler: fitted label scaler providing ``transform``.

    Returns:
        The trained model (on ``DEVICE``). If the validation loader yields no
        batches, no checkpoint is selected and the last-epoch weights are kept.
    """
    model = MorphCNNRegressor(prior_dim=44).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = nn.MSELoss()

    best_val = float("inf")
    best_state = None

    for _ in range(EPOCHS):
        # ---- training pass ----
        model.train()
        for x, p, y_mmHg in train_loader:
            x, p, y_mmHg = x.to(DEVICE), p.to(DEVICE), y_mmHg.to(DEVICE)
            y = scaler.transform(y_mmHg)
            pred = model(x, p)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            # Clip the global gradient norm to keep updates bounded.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        # ---- validation pass ----
        model.eval()
        val_loss_sum = 0.0
        val_count = 0
        with torch.no_grad():
            for x, p, y_mmHg in val_loader:
                x, p, y_mmHg = x.to(DEVICE), p.to(DEVICE), y_mmHg.to(DEVICE)
                y = scaler.transform(y_mmHg)
                pred = model(x, p)
                batch_n = int(y.shape[0])
                # criterion returns the batch mean; re-weight it by batch size.
                val_loss_sum += float(criterion(pred, y).item()) * batch_n
                val_count += batch_n
        avg_val = val_loss_sum / val_count if val_count else float("inf")

        if avg_val < best_val:
            best_val = avg_val
            # Snapshot on CPU so the checkpoint does not hold device memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def eval_mae_sd_mmHg(model, loader, scaler: LabelScaler2D):
    """
    Evaluate a model on a loader, with errors measured in the raw label units.

    Predictions are mapped back through ``scaler.inverse`` before the signed
    error (prediction minus target) is taken.

    Returns a dict with:
      - mae_sbp / mae_dbp: mean absolute error of column 0 / column 1
      - sd_sbp  / sd_dbp : population std (ddof=0) of the signed error
      - n: number of evaluated samples
    All metrics are NaN and n is 0 when the loader yields no batches.
    """
    model.eval()
    signed_errors = []
    with torch.no_grad():
        for batch in loader:
            wave, prior, target = (part.to(DEVICE) for part in batch)
            pred_mmHg = scaler.inverse(model(wave, prior))
            signed_errors.append((pred_mmHg - target).detach().cpu().numpy())

    if not signed_errors:
        return dict(mae_sbp=np.nan, sd_sbp=np.nan, mae_dbp=np.nan, sd_dbp=np.nan, n=0)

    all_err = np.concatenate(signed_errors, axis=0)  # (N, 2)
    sbp_err = all_err[:, 0]
    dbp_err = all_err[:, 1]

    summary = {}
    summary["mae_sbp"] = float(np.abs(sbp_err).mean())
    summary["sd_sbp"] = float(sbp_err.std(ddof=0))
    summary["mae_dbp"] = float(np.abs(dbp_err).mean())
    summary["sd_dbp"] = float(dbp_err.std(ddof=0))
    summary["n"] = int(all_err.shape[0])
    return summary

# ==========================================
# [7] Phase-2 Engine (5-blocked time-wise CV, TimeGap sweep)
# ==========================================
def segs_from_minutes(minutes: float) -> int:
    """Convert a duration in minutes to a whole number of segments (truncated)."""
    seconds = minutes * 60.0
    n_segments = seconds / SEC_PER_SEGMENT
    return int(n_segments)

def run_phase2_blocked_timewise_cv_timegap_sweep():
    """Run the blocked, time-wise cross-validation over every configured time gap.

    The recording is cut into ``N_FOLDS`` consecutive blocks (separated by
    ``BLOCK_GAP_MIN``). Inside each block:
      * test  = the last ``TEST_DUR_MIN`` minutes of the block,
      * train = ``TRAIN_DUR_MIN`` minutes ending ``gap`` minutes before the test,
      * val   = the last ``VAL_FRAC_IN_TRAIN`` share of the train window.
    A label scaler is fitted on the training part only, a model is trained,
    and MAE / SD are reported on the test window. This is repeated for every
    gap in ``TIME_GAP_SWEEP_MIN``; per-fold and per-gap averages are printed.

    Folds whose train window does not fit in the block, or that keep fewer
    than two training samples after the validation split, are skipped (the
    batch-norm layers cannot be trained on a single sample).

    Returns:
        dict mapping each gap (minutes) to a list with one entry per fold:
        the fold's metric dict, or ``None`` when the fold was skipped.

    Raises:
        ValueError: if the data is too short for the requested fold layout.
    """
    set_seed(SEED)

    segments, priors, targets_mmHg = load_data_from_mat(MAT_FILE, segment_limit=SEGMENT_LIMIT)
    ds = PPGDatasetRawY(segments, priors, targets_mmHg)

    total_len = len(ds)
    print(f"\n[Data Ready] total_len={total_len}")

    b_gap_segs = segs_from_minutes(BLOCK_GAP_MIN)
    test_dur_segs = segs_from_minutes(TEST_DUR_MIN)
    train_dur_segs = segs_from_minutes(TRAIN_DUR_MIN)

    # With BlockGap=0, fold_len is simply total_len // N_FOLDS (use contiguous blocks)
    available_len = total_len - (N_FOLDS - 1) * b_gap_segs
    if available_len <= 0:
        raise ValueError("Not enough segments for the requested N_FOLDS and BLOCK_GAP_MIN.")

    fold_len = available_len // N_FOLDS
    if fold_len <= 0:
        raise ValueError("Computed fold_len <= 0. Reduce N_FOLDS or BLOCK_GAP_MIN.")
    if fold_len <= test_dur_segs + 1:
        raise ValueError("Fold too short for test duration. Reduce TEST_DUR_MIN or N_FOLDS.")

    print("\n=== PHASE 2 CONFIG ===")
    print(f"N_FOLDS={N_FOLDS} | BlockGap={BLOCK_GAP_MIN}min ({b_gap_segs} segs)")
    print(f"FoldLen={fold_len} segs (~{fold_len*SEC_PER_SEGMENT/60.0:.2f} min)")
    print(f"TrainDur={TRAIN_DUR_MIN}min ({train_dur_segs} segs) | TestDur={TEST_DUR_MIN}min ({test_dur_segs} segs)")
    print(f"TimeGap sweep (min): {TIME_GAP_SWEEP_MIN}")
    print(f"ValFracInTrain={VAL_FRAC_IN_TRAIN:.2f}")

    def _mean_over(stats, key):
        # Average one metric over the valid folds; NaN when none are valid.
        vals = [s[key] for s in stats]
        return float(np.mean(vals)) if vals else float("nan")

    results = {}
    t_global0 = time.time()

    for gap_min in TIME_GAP_SWEEP_MIN:
        gap_segs = segs_from_minutes(gap_min)
        print(f"\n===============================")
        print(f" TimeGap = {gap_min} min")
        print(f"===============================")

        fold_stats = []
        t_gap0 = time.time()

        for f_idx in range(N_FOLDS):
            fold_start = f_idx * (fold_len + b_gap_segs)
            fold_end = fold_start + fold_len

            # test = last TEST_DUR_MIN minutes of the fold
            test_end = fold_end
            test_start = test_end - test_dur_segs

            # train = TRAIN_DUR_MIN minutes ending at (test_start - gap)
            train_end = test_start - gap_segs
            train_start = train_end - train_dur_segs

            # Feasibility: train must be inside the fold
            if train_start < fold_start or train_end > test_start:
                print(f"[Fold {f_idx+1}] SKIP (insufficient room): "
                      f"fold=({fold_start},{fold_end}) train=({train_start},{train_end}) test=({test_start},{test_end})")
                fold_stats.append(None)
                continue

            train_indices = list(range(train_start, train_end))
            test_indices = list(range(test_start, test_end))

            # time-aware val split inside train (last 20%)
            n_total = len(train_indices)
            n_val = max(1, int(n_total * VAL_FRAC_IN_TRAIN))
            # At least two training samples are required: the model's
            # BatchNorm1d layers reject a single-sample batch in train mode.
            if n_total - n_val < 2:
                print(f"[Fold {f_idx+1}] SKIP (train too small after val split).")
                fold_stats.append(None)
                continue

            real_train_idx = train_indices[:-n_val]
            val_idx = train_indices[-n_val:]

            # TRAIN-only scaler fit (per fold)
            y_train = targets_mmHg[np.array(real_train_idx)]
            scaler = LabelScaler2D(mode="minmax", eps=1e-6).fit(y_train)

            # Drop the trailing batch only when it would hold exactly one
            # sample (which would crash batch norm); otherwise keep all data.
            n_train = len(real_train_idx)
            drop_singleton_tail = n_train > BATCH_SIZE and n_train % BATCH_SIZE == 1

            train_loader = DataLoader(Subset(ds, real_train_idx), batch_size=BATCH_SIZE,
                                      shuffle=True, drop_last=drop_singleton_tail)
            val_loader = DataLoader(Subset(ds, val_idx), batch_size=BATCH_SIZE, shuffle=False)
            test_loader = DataLoader(Subset(ds, test_indices), batch_size=BATCH_SIZE, shuffle=False)

            t0 = time.time()
            model = train_one_model(train_loader, val_loader, scaler)
            stat = eval_mae_sd_mmHg(model, test_loader, scaler)
            elapsed = time.time() - t0

            stat.update({
                "fold": f_idx + 1,
                "gap_min": gap_min,
                "train_n": len(real_train_idx),
                "val_n": len(val_idx),
                "test_n": len(test_indices),
                "elapsed_s": float(elapsed),
                "fold_start": fold_start,
                "fold_end": fold_end,
                "train_start": train_start,
                "train_end": train_end,
                "test_start": test_start,
                "test_end": test_end,
            })
            fold_stats.append(stat)

            print(f"\n[Fold {f_idx+1}] fold=({fold_start},{fold_end}) | "
                  f"train=({train_start},{train_end}) | test=({test_start},{test_end})")
            print(f"  sizes Train/Val/Test: {len(real_train_idx)}/{len(val_idx)}/{len(test_indices)}")
            print(f"  SBP: MAE={stat['mae_sbp']:.4f} | SD={stat['sd_sbp']:.4f}")
            print(f"  DBP: MAE={stat['mae_dbp']:.4f} | SD={stat['sd_dbp']:.4f}")
            print(f"  elapsed: {elapsed:.1f}s")

        # Only folds that actually ran and produced finite metrics are averaged.
        valid = [fs for fs in fold_stats if fs is not None and np.isfinite(fs["mae_sbp"])]

        print(f"\n--- TimeGap {gap_min} min SUMMARY ---")
        print(f"ValidFolds: {len(valid)}/{N_FOLDS}")
        print(f"Avg SBP: MAE={_mean_over(valid, 'mae_sbp'):.4f} | SD={_mean_over(valid, 'sd_sbp'):.4f}")
        print(f"Avg DBP: MAE={_mean_over(valid, 'mae_dbp'):.4f} | SD={_mean_over(valid, 'sd_dbp'):.4f}")
        print(f"Elapsed for this gap: {time.time() - t_gap0:.1f}s")

        results[gap_min] = fold_stats

    print(f"\n[Phase2 Done] Total elapsed: {time.time() - t_global0:.1f}s")
    return results


# Script entry point: run the full Phase-2 sweep only when executed directly.
if __name__ == "__main__":
    run_phase2_blocked_timewise_cv_timegap_sweep()


Using Device: cuda
  Processed 0/1943 ...
  Processed 1000/1943 ...

[Data Ready] total_len=1943

=== PHASE 2 CONFIG ===
N_FOLDS=5 | BlockGap=0min (0 segs)
FoldLen=388 segs (~64.67 min)
TrainDur=10min (60 segs) | TestDur=5min (30 segs)
TimeGap sweep (min): [0, 5, 10, 15]
ValFracInTrain=0.20

 TimeGap = 0 min

[Fold 1] fold=(0,388) | train=(298,358) | test=(358,388)
  sizes Train/Val/Test: 48/12/30
  SBP: MAE=1.4385 | SD=1.7433
  DBP: MAE=0.7990 | SD=1.0108
  elapsed: 11.7s

[Fold 2] fold=(388,776) | train=(686,746) | test=(746,776)
  sizes Train/Val/Test: 48/12/30
  SBP: MAE=4.6554 | SD=5.5065
  DBP: MAE=3.3579 | SD=3.2262
  elapsed: 4.8s

[Fold 3] fold=(776,1164) | train=(1074,1134) | test=(1134,1164)
  sizes Train/Val/Test: 48/12/30
  SBP: MAE=1.8334 | SD=2.0012
  DBP: MAE=1.0361 | SD=1.3582
  elapsed: 5.3s

[Fold 4] fold=(1164,1552) | train=(1462,1522) | test=(1522,1552)
  sizes Train/Val/Test: 48/12/30
  SBP: MAE=3.3943 | SD=1.4474
  DBP: MAE=2.0387 | SD=0.8061
  elapsed: 4.6s

[Fo