In [1]:
import os, re, glob, time, json, warnings, math
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedGroupKFold, GroupKFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve

print("Torch:", torch.__version__, "| CUDA:", torch.version.cuda, "| cuda_available:", torch.cuda.is_available())
try:
    import numpy as np
    import numpy
    print("NumPy:", numpy.__version__)
except Exception as e:
    print("NumPy import issue:", e)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)


Torch: 2.8.0+cpu | CUDA: None | cuda_available: False
NumPy: 1.26.4
Device: cpu


In [3]:
DATA_DIR = "perg/csv"  # <-- change path if needed

NEED_DATA = "per_eye_avg" not in globals()


In [4]:
try:
    from scipy.signal import butter, filtfilt, welch
    from scipy.stats import skew, kurtosis
    SCIPY_OK = True
except Exception:
    SCIPY_OK = False
    def skew(x): return 0.0
    def kurtosis(x): return 0.0

if NEED_DATA:
    def _estimate_fs(time_series):
        try:
            t = pd.to_datetime(time_series, format="%Y-%m-%d %H:%M:%S.%f")
            dt = (t.view("int64")[1:] - t.view("int64")[:-1]) / 1e9
            return float(1.0 / np.median(dt))
        except Exception:
            try:
                vals = pd.to_numeric(time_series, errors="coerce").to_numpy(dtype=float)
                dt = np.diff(vals) / 1000.0
                return float(1.0 / np.median(dt))
            except Exception:
                return 1700.0

    def load_per_eye_table(data_dir):
        demo_path = os.path.join(data_dir, "D:/perg/data/raw/participants_info.csv") 
        if not os.path.exists(demo_path):
            raise FileNotFoundError(f"D:/perg/data/raw/participants_info.csv not found in {data_dir}")
        demo = pd.read_csv(demo_path)
        demo.columns = [c.strip() for c in demo.columns]

        files = sorted(glob.glob(os.path.join(data_dir, "[0-9][0-9][0-9][0-9].csv")))
        rows = []
        time_pat = re.compile(r'^TIME_(\d+)$')
        re_pat   = re.compile(r'^RE_(\d+)$')
        le_pat   = re.compile(r'^LE_(\d+)$')

        for fp in files:
            rec = pd.read_csv(fp)
            rec.columns = [c.strip() for c in rec.columns]
            rec_id = int(Path(fp).stem)
            cols = set(rec.columns)

            # discover repeats k
            ks = set()
            for c in cols:
                for pat in (time_pat, re_pat, le_pat):
                    m = pat.match(c)
                    if m: ks.add(int(m.group(1)))
            if not ks: ks = {1}

            for k in sorted(ks):
                time_col = f"TIME_{k}" if f"TIME_{k}" in cols else ("TIME" if "TIME" in cols else None)
                re_col   = f"RE_{k}"   if f"RE_{k}"   in cols else ("RE" if "RE" in cols else None)
                le_col   = f"LE_{k}"   if f"LE_{k}"   in cols else ("LE" if "LE" in cols else None)
                fs = _estimate_fs(rec[time_col]) if time_col is not None else 1700.0
                for eye, col in (("RE", re_col), ("LE", le_col)):
                    if col and col in rec.columns:
                        sig = pd.to_numeric(rec[col], errors="coerce").to_numpy(dtype=float)
                        rows.append({"record_id": rec_id, "eye": eye, "repeat": k,
                                     "n_samples": len(sig), "fs_hz": float(fs), "signal": sig})

        per_eye = pd.DataFrame(rows)
        key_col = "id_record" if "id_record" in demo.columns else "record_id"
        per_eye = per_eye.merge(demo, left_on="record_id", right_on=key_col, how="left")

        # targets
        if "diagnosis1" not in per_eye.columns:
            warnings.warn("diagnosis1 not found; default y_class=0")
            per_eye["y_class"] = 0
        else:
            per_eye["y_class"] = (per_eye["diagnosis1"].fillna("Normal").ne("Normal")).astype(int)

        re_col = next((c for c in per_eye.columns if c.lower() in ("va_re_logmar","logmar_re","va_re")), None)
        le_col = next((c for c in per_eye.columns if c.lower() in ("va_le_logmar","logmar_le","va_le")), None)
        per_eye["y_reg"] = np.where(per_eye["eye"].eq("RE"), per_eye.get(re_col), per_eye.get(le_col))
        return per_eye

    def baseline_and_filter(x, fs, baseline_ms=(0,10), band=(1.0,100.0)):
        x = np.asarray(x, dtype=float)
        if not np.isfinite(fs) or fs <= 0: fs = 1700.0
        n0 = max(1, int(fs*baseline_ms[1]/1000.0))
        y = x - float(np.nanmean(x[:n0]))
        if SCIPY_OK and band is not None:
            lo, hi = band
            if hi >= fs/2.0: hi = fs/2.0 - 1.0
            if hi > lo and hi > 0:
                b,a = butter(2, [lo/(fs/2.0), hi/(fs/2.0)], btype="band")
                y = filtfilt(b,a,y,method="gust")
        s = np.nanstd(y)
        if s > 1e-8: y = (y - np.nanmean(y)) / s
        return y

    def average_repeats(per_eye):
        rows = []
        for (rec, eye), grp in per_eye.groupby(["record_id", "eye"]):
            signals = []
            for _, r in grp.iterrows():
                fs = float(r.get("fs_hz", 1700.0)) if pd.notna(r.get("fs_hz")) else 1700.0
                sig = baseline_and_filter(r["signal"], fs)
                signals.append(sig)
            L = min(len(s) for s in signals)
            signals = [s[:L] for s in signals]
            avg_sig = np.mean(np.stack(signals, axis=0), axis=0)
            rows.append({
                "record_id": rec, "eye": eye, "signal": avg_sig,
                "fs_hz": float(grp["fs_hz"].iloc[0]) if pd.notna(grp["fs_hz"].iloc[0]) else 1700.0,
                "y_class": int(grp["y_class"].iloc[0]),
                "y_reg": float(grp["y_reg"].iloc[0]) if pd.notna(grp["y_reg"].iloc[0]) else np.nan,
                "n_repeats": len(grp)
            })
        return pd.DataFrame(rows)

    print("[data] building per_eye_avg …")
    per_eye = load_per_eye_table(DATA_DIR)
    per_eye_avg = average_repeats(per_eye)
    print("per_eye:", per_eye.shape, "| per_eye_avg:", per_eye_avg.shape)
else:
    print("[data] using existing per_eye_avg in notebook.")

# expose baseline_and_filter if not defined
if "baseline_and_filter" not in globals():
    def baseline_and_filter(x, fs, baseline_ms=(0,10), band=(1.0,100.0)):
        x = np.asarray(x, dtype=float)
        n0 = max(1, int(fs*baseline_ms[1]/1000.0))
        y = x - float(np.nanmean(x[:n0]))
        s = np.nanstd(y)
        if s > 1e-8: y = (y - np.nanmean(y)) / s
        return y

[data] building per_eye_avg …


KeyError: 'record_id'