In [1]:
# ============================================================
# PulseDB / MIMIC single-subject (p043774) — time-aware protocol sweep
# CLEANED:
#   - Use PPG_F only (already per-segment min-max 0~1) => NO extra bandpass / NO extra input min-max
#   - Keep the key logic (time split + block gap + time gap + block-test) unchanged
# FIXED (critical):
#   - Label scaling is TRAIN-ONLY per fold (NO future leakage)
# ADDED:
#   - Report MAE + SD (std of signed error) in mmHg for SBP/DBP
# ============================================================

import time
import random
import numpy as np
import h5py
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from scipy.interpolate import interp1d
from scipy.integrate import trapezoid
import pandas as pd

# ==========================================
# [0] Experiment configuration
# ==========================================
# Input .mat file, opened with h5py by load_data_from_mat().
# NOTE(review): the folder is spelled "PusleDB" (not "PulseDB") — confirm it matches the real Drive path.
MAT_FILE = "/content/drive/MyDrive/Colab Notebooks/PusleDB/p043774.mat"
SEGMENT_LIMIT = None          # max number of segments to load; None loads every segment
PAD_LEN = 200                 # number of samples each segment is resampled to before entering the CNN
SEC_PER_SEGMENT = 10.0        # seconds per segment; used to convert minutes into segment counts

# Optimisation hyper-parameters consumed by the DataLoaders and train_one_model().
BATCH_SIZE = 32
EPOCHS = 100
LR = 1e-3
WEIGHT_DECAY = 1e-4
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Default seed for set_seed(); the cuDNN flags below request deterministic kernels
# and disable the cuDNN autotuner.
SEED = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

print(f"Using Device: {DEVICE}")

# ==========================================
# [1] Re-sample + Prior feature
# ==========================================
def cubic_resample(ppg, target_len=PAD_LEN):
    """Resample a 1-D signal onto ``target_len`` evenly spaced points.

    Source and target samples are both placed on a normalised [0, 1] axis.
    A cubic interpolant (extrapolation enabled) is used when the input has at
    least four samples; shorter inputs, or any failure while building or
    evaluating the cubic interpolant, fall back to linear interpolation.

    Args:
        ppg: Sequence of samples to resample.
        target_len: Number of output samples.

    Returns:
        ``np.float32`` array with ``target_len`` elements.
    """
    n_src = len(ppg)
    src_axis = np.linspace(0, 1, n_src)
    dst_axis = np.linspace(0, 1, target_len)

    def _linear():
        return np.interp(dst_axis, src_axis, ppg).astype(np.float32)

    # Inputs shorter than four samples skip the cubic path entirely.
    if n_src < 4:
        return _linear()

    try:
        cubic = interp1d(src_axis, ppg, kind="cubic", bounds_error=False, fill_value="extrapolate")
        return cubic(dst_axis).astype(np.float32)
    except Exception:
        # Best effort: any interpolation failure degrades to the linear result.
        return _linear()

def extract_multiscale_morph_features(ppg_01):
    """Compute the 44-dim morphological prior of one PPG segment.

    The segment is resampled (via ``cubic_resample``) to 100, 150, 200 and 250
    points; at each scale 11 descriptors are appended, in this order:

        peak value, last-sample value, peak minus last sample, mean,
        max first difference, normalised index of that max difference,
        trapezoidal area of the samples before the peak (peak excluded),
        standard deviation, normalised peak index,
        third and fourth standardised central moments (0.0 when std is 0).

    Args:
        ppg_01: 1-D PPG segment; used as-is (the file header states PPG_F is
            already min-max scaled to 0~1 per segment — not checked here).

    Returns:
        ``np.float32`` vector with 4 * 11 = 44 entries.
    """
    feature_vec = []

    for n_pts in (100, 150, 200, 250):
        wave = cubic_resample(ppg_01, n_pts)

        # Amplitude descriptors.
        i_peak = int(np.argmax(wave))
        peak_val = float(wave[i_peak])
        tail_val = float(wave[n_pts - 1])
        mean_val = float(np.mean(wave))
        sigma = float(np.std(wave))

        # Steepest sample-to-sample rise and where it happens.
        slope = np.diff(wave)
        if len(slope) > 0:
            max_slope = float(np.max(slope))
            t_max_slope = int(np.argmax(slope)) / n_pts
        else:
            max_slope = 0.0
            t_max_slope = 0.0

        # Area accumulated before the peak sample.
        if i_peak > 0:
            rise_area = float(trapezoid(wave[:i_peak]))
        else:
            rise_area = 0.0

        # Shape moments; a flat signal (std == 0) maps to 0.0 for both.
        dev = wave - mean_val
        if sigma > 0:
            skew = float(np.mean(dev**3) / (sigma**3))
            kurt = float(np.mean(dev**4) / (sigma**4))
        else:
            skew = 0.0
            kurt = 0.0

        feature_vec += [
            peak_val,
            tail_val,
            peak_val - tail_val,
            mean_val,
            max_slope,
            t_max_slope,
            rise_area,
            sigma,
            i_peak / n_pts,
            skew,
            kurt,
        ]

    return np.array(feature_vec, dtype=np.float32)

# ==========================================
# [2] Data loading (PPG_F only)
# ==========================================
def load_data_from_mat(mat_path, segment_limit=None):
    """Read PPG segments and SBP/DBP labels from an HDF5-based ``.mat`` file.

    The first row of ``Subj_Wins/PPG_F``, ``Subj_Wins/SegSBP`` and
    ``Subj_Wins/SegDBP`` is read as a list of object references; each
    reference is dereferenced through the open file. The morphological prior
    is computed for every segment while loading.

    Args:
        mat_path: Path of the file to open with h5py.
        segment_limit: Maximum number of segments to load. A falsy value
            (``None`` or 0) loads every segment.

    Returns:
        Tuple ``(segments, priors, targets)``: a list of float32 PPG arrays,
        a float32 array of stacked prior vectors, and a float32 array of
        ``[sbp, dbp]`` pairs.
    """
    seg_list, prior_list, label_list = [], [], []

    with h5py.File(mat_path, "r") as mat:
        wins = mat["Subj_Wins"]
        ppg_refs = wins["PPG_F"][0]
        sbp_refs = wins["SegSBP"][0]
        dbp_refs = wins["SegDBP"][0]

        n_available = len(ppg_refs)
        total = min(n_available, segment_limit) if segment_limit else n_available

        for i in range(total):
            # Expected to be a 1-D window already scaled to 0~1 (per the file header; not verified here).
            ppg = mat[ppg_refs[i]][()].squeeze().astype(np.float32)
            # Each label dataset is read as a 2-D array; its [0][0] element is the value.
            sbp = float(mat[sbp_refs[i]][()][0][0])
            dbp = float(mat[dbp_refs[i]][()][0][0])

            seg_list.append(ppg)
            prior_list.append(extract_multiscale_morph_features(ppg))
            label_list.append([sbp, dbp])

            if i % 1000 == 0:
                print(f"  Processed {i}/{total} ...")

    prior_arr = np.stack(prior_list).astype(np.float32)
    label_arr = np.array(label_list, dtype=np.float32)
    return seg_list, prior_arr, label_arr

# ==========================================
# [3] Dataset (RAW y; label scaling is fold-wise train-only)
# ==========================================
class PPGDatasetRawY(Dataset):
    """Dataset yielding ``(waveform, prior, label)`` with labels left unscaled.

    Labels are returned exactly as stored so that any label scaling can be
    fitted per fold on training data and applied outside the dataset.
    """

    def __init__(self, segments, priors, targets_mmHg):
        # Indexable collection of 1-D PPG segments (one entry per sample).
        self.segments = segments
        # Per-sample prior feature vectors (expected (N, 44) from the loader — not checked).
        self.priors = priors
        # Per-sample labels in mmHg (expected (N, 2) = [SBP, DBP] — not checked).
        self.targets = targets_mmHg

    def __len__(self):
        """Number of samples, taken from the segment collection."""
        return len(self.segments)

    def __getitem__(self, idx):
        """Return the resampled waveform (1, PAD_LEN), prior and raw label as float32 tensors."""
        resampled = cubic_resample(self.segments[idx], PAD_LEN)
        # Add a leading channel axis for Conv1d.
        wave = torch.tensor(resampled, dtype=torch.float32)[None, :]

        prior = torch.tensor(self.priors[idx], dtype=torch.float32)
        label = torch.tensor(self.targets[idx], dtype=torch.float32)  # still in mmHg
        return wave, prior, label

# ==========================================
# [4] Model
# ==========================================
class MorphCNNRegressor(nn.Module):
    """Two-branch regressor: 1-D CNN on the waveform + MLP on the prior vector.

    The CNN branch ends in global average pooling (256 features), the prior
    branch maps ``prior_dim`` inputs to 256 features, and the concatenated
    512-dim vector is regressed to 2 outputs.

    Layers are created in the same order and at the same ``nn.Sequential``
    positions as a flat layer listing would be, so seeded initialisation and
    ``state_dict`` keys do not depend on the helper functions used below.
    """

    def __init__(self, prior_dim=44):
        super().__init__()

        def conv_stage(c_in, c_out, kernel):
            # Conv -> BatchNorm -> ReLU; padding = kernel // 2 keeps the length for odd kernels.
            return [
                nn.Conv1d(c_in, c_out, kernel, padding=kernel // 2),
                nn.BatchNorm1d(c_out),
                nn.ReLU(),
            ]

        def dense_stage(n_in, n_out, drop=None):
            # Linear -> BatchNorm -> ReLU, optionally followed by Dropout.
            layers = [nn.Linear(n_in, n_out), nn.BatchNorm1d(n_out), nn.ReLU()]
            if drop is not None:
                layers.append(nn.Dropout(drop))
            return layers

        # Waveform branch: three pooled conv stages, then a conv stage with global average pooling.
        self.cnn = nn.Sequential(
            *conv_stage(1, 32, 7),
            nn.MaxPool1d(2),
            *conv_stage(32, 64, 5),
            nn.MaxPool1d(2),
            *conv_stage(64, 128, 5),
            nn.MaxPool1d(2),
            *conv_stage(128, 256, 3),
            nn.AdaptiveAvgPool1d(1),
        )

        # Prior branch: prior_dim -> 128 -> 256.
        self.fc_prior = nn.Sequential(
            *dense_stage(prior_dim, 128, drop=0.3),
            *dense_stage(128, 256),
        )

        # Fusion head: 512 -> 256 -> 128 -> 64 -> 2.
        self.fc_out = nn.Sequential(
            *dense_stage(256 + 256, 256, drop=0.3),
            *dense_stage(256, 128, drop=0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 2),
        )

    def forward(self, x, prior):
        """Return the 2-dim prediction for waveform batch ``x`` and prior batch ``prior``."""
        wave_feat = self.cnn(x).squeeze(-1)  # drop the pooled length-1 axis
        prior_feat = self.fc_prior(prior)
        fused = torch.cat([wave_feat, prior_feat], dim=1)
        return self.fc_out(fused)

# ==========================================
# [5] Train-only label scaler (fold-wise)
# ==========================================
class LabelScaler2D:
    """Column-wise label scaler whose statistics come only from the data given to ``fit``.

    Modes:
        * ``"minmax"``: ``(y - min) / (max - min + eps)``
        * ``"zscore"``: ``(y - mean) / (std + eps)``

    ``eps`` is added to the denominator so constant label columns do not
    divide by zero; ``inverse`` uses the same denominator, so it exactly
    undoes ``transform``.

    Validation uses explicit exceptions rather than ``assert`` so the checks
    survive ``python -O``.
    """

    _MODES = ("minmax", "zscore")

    def __init__(self, mode="minmax", eps=1e-6):
        """Create an unfitted scaler.

        Raises:
            ValueError: if ``mode`` is not ``"minmax"`` or ``"zscore"``.
        """
        if mode not in self._MODES:
            raise ValueError(f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.eps = eps
        self.fitted = False

    def fit(self, y_train_mmHg: np.ndarray):
        """Compute per-column statistics (reduction over axis 0) and return ``self``.

        Raises:
            ValueError: if ``y_train_mmHg`` contains no elements.
        """
        y = np.asarray(y_train_mmHg, dtype=np.float32)
        if y.size == 0:
            # Fail identically in both modes instead of silently storing NaN statistics.
            raise ValueError("Cannot fit LabelScaler2D on empty labels.")
        if self.mode == "minmax":
            self.y_min = y.min(axis=0)
            self.y_max = y.max(axis=0)
        else:
            self.y_mean = y.mean(axis=0)
            self.y_std = y.std(axis=0)
        self.fitted = True
        return self

    def _offset_and_scale(self, ref: torch.Tensor):
        """Return ``(offset, scale)`` tensors on the device/dtype of ``ref``."""
        if not self.fitted:
            raise RuntimeError("LabelScaler2D must be fitted before transform()/inverse().")

        def as_tensor(arr):
            return torch.tensor(arr, device=ref.device, dtype=ref.dtype)

        if self.mode == "minmax":
            low = as_tensor(self.y_min)
            high = as_tensor(self.y_max)
            return low, high - low + self.eps
        return as_tensor(self.y_mean), as_tensor(self.y_std) + self.eps

    def transform(self, y_mmHg: torch.Tensor) -> torch.Tensor:
        """Scale labels with the fitted statistics.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        offset, scale = self._offset_and_scale(y_mmHg)
        return (y_mmHg - offset) / scale

    def inverse(self, y_scaled: torch.Tensor) -> torch.Tensor:
        """Map scaled values back to the original label units.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        offset, scale = self._offset_and_scale(y_scaled)
        return y_scaled * scale + offset

# ==========================================
# [6] Train / Eval
# ==========================================
def set_seed(seed=SEED):
    """Seed every RNG this script relies on with the same value.

    Calls, in order: ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed_all``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)

def train_one_model(train_loader, val_loader, scaler: LabelScaler2D):
    """Train a fresh ``MorphCNNRegressor`` and return it with its best-validation weights.

    The model is optimised with Adam (``LR``, ``WEIGHT_DECAY``) on MSE between
    its outputs and the *scaled* labels for ``EPOCHS`` epochs, with gradient
    norm clipped to 1.0. After every epoch the validation loss is computed and
    the weights with the lowest value are kept.

    The validation loss is a per-sample mean: each batch's mean loss is
    weighted by its batch size, so a short final batch does not count as much
    as a full one when choosing the checkpoint.

    Args:
        train_loader: Iterable of ``(x, prior, y_mmHg)`` training batches.
        val_loader: Iterable of ``(x, prior, y_mmHg)`` validation batches.
        scaler: Fitted label scaler; its ``transform`` is applied to labels.

    Returns:
        The trained model on ``DEVICE``. If no epoch produced a validation
        loss below ``inf`` (e.g. empty ``val_loader``), the last-epoch weights
        are returned unchanged.
    """
    model = MorphCNNRegressor(prior_dim=44).to(DEVICE)
    optimizer = optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
    criterion = nn.MSELoss()

    best_val = float("inf")
    best_state = None

    for _ in range(EPOCHS):
        # ---- optimisation pass ----
        model.train()
        for x, p, y_mmHg in train_loader:
            x, p, y_mmHg = x.to(DEVICE), p.to(DEVICE), y_mmHg.to(DEVICE)
            y = scaler.transform(y_mmHg)
            pred = model(x, p)
            loss = criterion(pred, y)
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

        # ---- validation pass (sample-weighted mean loss) ----
        model.eval()
        loss_sum = 0.0
        n_seen = 0
        with torch.no_grad():
            for x, p, y_mmHg in val_loader:
                x, p, y_mmHg = x.to(DEVICE), p.to(DEVICE), y_mmHg.to(DEVICE)
                y = scaler.transform(y_mmHg)
                pred = model(x, p)
                n_batch = int(y.shape[0])
                # criterion returns the batch mean; re-weight it by the batch size.
                loss_sum += float(criterion(pred, y).item()) * n_batch
                n_seen += n_batch
        avg_val = loss_sum / n_seen if n_seen else float("inf")

        if avg_val < best_val:
            best_val = avg_val
            # Snapshot on CPU so later optimiser steps cannot modify the saved weights.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}

    if best_state is not None:
        model.load_state_dict(best_state)
    return model

def eval_mae_sd_mmHg(model, loader, scaler: LabelScaler2D):
    """Evaluate a model in original label units.

    Predictions are mapped back through ``scaler.inverse`` and compared with
    the raw labels from ``loader``. Column 0 is reported as SBP and column 1
    as DBP.

    Returns:
        Dict with keys ``mae_sbp``, ``sd_sbp``, ``mae_dbp``, ``sd_dbp`` and
        ``n``: mean absolute error and population standard deviation
        (``ddof=0``) of the signed error per column, plus the sample count.
        When ``loader`` yields no batches, the four metrics are NaN and
        ``n`` is 0.
    """
    model.eval()
    signed_chunks = []
    with torch.no_grad():
        for x, p, y_mmHg in loader:
            x, p, y_mmHg = x.to(DEVICE), p.to(DEVICE), y_mmHg.to(DEVICE)
            residual = scaler.inverse(model(x, p)) - y_mmHg  # signed error
            signed_chunks.append(residual.detach().cpu().numpy())

    if not signed_chunks:
        return {
            "mae_sbp": np.nan,
            "sd_sbp": np.nan,
            "mae_dbp": np.nan,
            "sd_dbp": np.nan,
            "n": 0,
        }

    all_err = np.concatenate(signed_chunks, axis=0)  # one row per sample, columns = [SBP, DBP]
    sbp_err = all_err[:, 0]
    dbp_err = all_err[:, 1]
    return {
        "mae_sbp": float(np.mean(np.abs(sbp_err))),
        "sd_sbp": float(np.std(sbp_err, ddof=0)),
        "mae_dbp": float(np.mean(np.abs(dbp_err))),
        "sd_dbp": float(np.std(dbp_err, ddof=0)),
        "n": int(all_err.shape[0]),
    }

# ==========================================
# [7] Sweep Engine
# ==========================================
def segs_from_minutes(minutes: float) -> int:
    """Convert a duration in minutes to a segment count, truncating any fraction.

    Uses ``SEC_PER_SEGMENT`` as the length of one segment in seconds.
    """
    duration_sec = minutes * 60.0
    return int(duration_sec / SEC_PER_SEGMENT)

def _sweep_result_row(tr_dur, t_gap, test_n, stat=None, train_n=0, val_n=0):
    """Build one row of the sweep table; ``stat=None`` marks a skipped configuration (NaN metrics, OK=0)."""
    ok = stat is not None
    return {
        "TrainDur_min": tr_dur,
        "TimeGap_min": t_gap,
        "MAE_SBP": stat["mae_sbp"] if ok else np.nan,
        "SD_SBP": stat["sd_sbp"] if ok else np.nan,
        "MAE_DBP": stat["mae_dbp"] if ok else np.nan,
        "SD_DBP": stat["sd_dbp"] if ok else np.nan,
        "train_n": train_n,
        "val_n": val_n,
        "test_n": int(test_n),
        "OK": int(ok),
    }


def run_timewise_holdout_gap_sweep():
    """Sweep train duration x train/test time gap on a fixed, final test window.

    Layout along the segment index axis (time order)::

        ... [ train | val ] <gap> [ test = last TEST_DURATION_MIN minutes ]

    For every (train duration, gap) pair the train window ends ``gap``
    minutes before the test window; its last 20% (at least one segment) is
    used for validation. The label scaler is fitted on the training part
    only. Configurations that do not fit in the recording, or that leave
    fewer than two training segments, are recorded with NaN metrics and
    ``OK=0``.

    Differences from a naive sweep, all deliberate:
        * RNGs are re-seeded before every configuration, so a cell's result
          does not depend on which configurations ran before it.
        * A trailing training batch of exactly one sample is dropped
          (``drop_last``), because ``BatchNorm1d`` cannot compute batch
          statistics from a single sample in training mode.

    Returns:
        ``pandas.DataFrame`` with one row per configuration; the same table
        is written to ``mimic_p043774_timewise_holdout_gap_sweep.csv``.

    Raises:
        ValueError: if the recording is too short to hold the test window.
    """
    set_seed(SEED)

    segments, priors, targets_mmHg = load_data_from_mat(MAT_FILE, segment_limit=SEGMENT_LIMIT)
    ds = PPGDatasetRawY(segments, priors, targets_mmHg)

    total_len = len(ds)
    print(f"\n[Data Ready] total_len={total_len}")

    sweep_train_dur = [3, 5, 10, 15]    # train length (min)
    sweep_time_gap  = [0, 5, 10, 15]    # gap between train and test (min)

    TEST_DURATION_MIN = 5
    test_dur_segs = segs_from_minutes(TEST_DURATION_MIN)

    # The test window is always the final TEST_DURATION_MIN minutes of the recording.
    test_end = total_len
    test_start = test_end - test_dur_segs
    if test_start <= 0:
        raise ValueError("Not enough data to allocate the test segment.")
    test_indices = list(range(test_start, test_end))  # identical for every configuration

    results = []
    total_iter = len(sweep_train_dur) * len(sweep_time_gap)
    it = 0
    t0 = time.time()

    for tr_dur in sweep_train_dur:
        tr_dur_segs = segs_from_minutes(tr_dur)

        for t_gap in sweep_time_gap:
            gap_segs = segs_from_minutes(t_gap)

            it += 1
            elapsed = time.time() - t0
            print(f"[{it}/{total_iter}] Train:{tr_dur}m | TGap:{t_gap}m (elapsed {elapsed:.1f}s)")

            # The train window sits before the test window, separated by the gap.
            train_end = test_start - gap_segs
            train_start = train_end - tr_dur_segs

            # Infeasible when the window starts before the recording or is empty.
            if train_start < 0 or train_start >= train_end:
                results.append(_sweep_result_row(tr_dur, t_gap, test_dur_segs))
                continue

            train_indices = list(range(train_start, train_end))

            # Time-aware validation split: the last 20% of the train window.
            n_total = len(train_indices)
            n_val = max(1, int(n_total * 0.20))
            if n_total - n_val < 2:
                # BatchNorm1d needs at least two samples per training batch.
                results.append(_sweep_result_row(tr_dur, t_gap, test_dur_segs))
                continue

            real_train_idx = train_indices[:-n_val]
            val_idx        = train_indices[-n_val:]

            # Same RNG state for every configuration: identical model init and
            # shuffling seed regardless of sweep order.
            set_seed(SEED)

            # Fit the label scaler on training labels only (no leakage from val/test).
            y_train = targets_mmHg[np.array(real_train_idx)]
            scaler = LabelScaler2D(mode="minmax", eps=1e-6).fit(y_train)

            # Drop a trailing single-sample batch; with shuffle=True the dropped
            # sample changes every epoch, so no sample is permanently excluded.
            n_train = len(real_train_idx)
            drop_single = n_train > BATCH_SIZE and n_train % BATCH_SIZE == 1

            train_loader = DataLoader(
                Subset(ds, real_train_idx),
                batch_size=BATCH_SIZE,
                shuffle=True,
                drop_last=drop_single,
            )
            val_loader   = DataLoader(Subset(ds, val_idx), batch_size=BATCH_SIZE, shuffle=False)
            test_loader  = DataLoader(Subset(ds, test_indices), batch_size=BATCH_SIZE, shuffle=False)

            model = train_one_model(train_loader, val_loader, scaler)
            stat = eval_mae_sd_mmHg(model, test_loader, scaler)

            results.append(
                _sweep_result_row(
                    tr_dur,
                    t_gap,
                    len(test_indices),
                    stat=stat,
                    train_n=n_train,
                    val_n=len(val_idx),
                )
            )

    df = pd.DataFrame(results)
    print("\n=== TIME-WISE HOLDOUT GAP SWEEP RESULTS ===")
    print(df[["TrainDur_min","TimeGap_min","MAE_SBP","SD_SBP","MAE_DBP","SD_DBP","train_n","val_n","test_n","OK"]])

    out_csv = "mimic_p043774_timewise_holdout_gap_sweep.csv"
    df.to_csv(out_csv, index=False)
    print(f"Saved: {out_csv}")
    return df


# Script entry point: run the full sweep only when this file is executed directly.
if __name__ == "__main__":
    # The result table stays bound to a module-level name so it can be inspected afterwards.
    df = run_timewise_holdout_gap_sweep()

Using Device: cuda
  Processed 0/1943 ...
  Processed 1000/1943 ...

[Data Ready] total_len=1943
[1/16] Train:3m | TGap:0m (elapsed 0.0s)
[2/16] Train:3m | TGap:5m (elapsed 8.3s)
[3/16] Train:3m | TGap:10m (elapsed 11.0s)
[4/16] Train:3m | TGap:15m (elapsed 13.2s)
[5/16] Train:5m | TGap:0m (elapsed 15.0s)
[6/16] Train:5m | TGap:5m (elapsed 17.9s)
[7/16] Train:5m | TGap:10m (elapsed 20.6s)
[8/16] Train:5m | TGap:15m (elapsed 23.8s)
[9/16] Train:10m | TGap:0m (elapsed 27.0s)
[10/16] Train:10m | TGap:5m (elapsed 31.8s)
[11/16] Train:10m | TGap:10m (elapsed 37.3s)
[12/16] Train:10m | TGap:15m (elapsed 42.5s)
[13/16] Train:15m | TGap:0m (elapsed 47.2s)
[14/16] Train:15m | TGap:5m (elapsed 55.4s)
[15/16] Train:15m | TGap:10m (elapsed 63.3s)
[16/16] Train:15m | TGap:15m (elapsed 71.0s)

=== TIME-WISE HOLDOUT GAP SWEEP RESULTS ===
    TrainDur_min  TimeGap_min    MAE_SBP    SD_SBP   MAE_DBP    SD_DBP  \
0              3            0   5.362437  3.011940  2.590262  1.661627   
1              3 