In [6]:
import numpy as np
import pandas as pd
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union

# Alias used to annotate array inputs; currently this is exactly np.ndarray.
ArrayLike = np.ndarray

## Partition of rs-fMRI time series

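We follow the paper’s partition strategy: each subject’s time series is cut into
non-overlapping windows of length L=20 timepoints, and trailing timepoints that do
not fill a complete window are discarded (e.g. KKI: 119 timepoints → 5 windows,
19 timepoints discarded).
To ensure site-consistent window counts, each site is first truncated to the
time-series length reported in the paper (KKI 119, NYU 171, OHSU 74,
NeuroIMAGE 257, Peking_1 231). Subjects shorter than their site’s target length
are dropped.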

In [7]:
@dataclass
class WindowingResult:
    """Windows cut from one 2D time series, plus the bookkeeping needed to audit them."""
    windows: np.ndarray  # shape: (T, N, L)
    T: int               # number of windows returned
    dropped: int         # trailing timepoints not covered by any returned window
    L: int               # window length
    step: int            # step size between windows
    N: int               # number of ROIs
    M: int               # original number of timepoints


def partition_time_series(
    X: ArrayLike,
    L: int = 20,
    *,
    overlap: Optional[int] = None,
    step: Optional[int] = None,
    time_axis: int = -1,
    drop_incomplete: bool = True,
    return_metadata: bool = True,
) -> Union[np.ndarray, WindowingResult]:
    """
    Sliding-window partitioning of ROI time series.

    Params:
      - X: 2D array, (N, M) when time_axis=-1 or (M, N) when time_axis=0
      - L: window length (timepoints)
      - overlap: number of timepoints overlapped between consecutive windows (0..L-1)
      - step: hop size between window starts (1..L). If provided, it overrides overlap.
      - time_axis: 0 (time first) or -1 (time last)
      - drop_incomplete: if True, only keep full windows of length L (paper behavior).
        If False, windows that run past the end of the series are kept and their
        missing tail is zero-padded so that every window has length L.
      - return_metadata: if True return a WindowingResult, otherwise only the
        (T, N, L) window array.

    Returns:
      WindowingResult, or the bare window array of shape (T, N, L).

    Raises:
      ValueError: X is not 2D, L is not positive, time_axis is unsupported, or
        overlap/step fall outside their documented ranges.
    """
    X = np.asarray(X)
    if X.ndim != 2:
        raise ValueError(f"X must be 2D, got shape {X.shape}")
    if L <= 0:
        raise ValueError("L must be positive")

    if time_axis == 0:
        X_nt = X.T
    elif time_axis == -1:
        X_nt = X
    else:
        raise ValueError("time_axis must be 0 (time first) or -1 (time last)")

    N, M = X_nt.shape

    if step is None:
        if overlap is None:
            step = L  # no overlap
        else:
            if not (0 <= overlap < L):
                raise ValueError(f"overlap must be in [0, {L-1}], got {overlap}")
            step = L - overlap
    else:
        if not (1 <= step <= L):
            raise ValueError(f"step must be in [1, {L}], got {step}")

    # Compute window start indices
    starts = np.arange(0, M, step, dtype=int)
    if drop_incomplete:
        starts = starts[starts + L <= M]

    T = len(starts)

    # Fill a pre-allocated (T, N, L) buffer instead of stacking slices: with
    # drop_incomplete=False the trailing slices are shorter than L, and stacking
    # ragged slices raises. The zero initialisation doubles as the padding.
    windows = np.zeros((T, N, L), dtype=X.dtype)
    for t, s in enumerate(starts):
        chunk = X_nt[:, s:s + L]
        windows[t, :, :chunk.shape[1]] = chunk

    if T == 0:
        # No window could be formed, so every timepoint is unused.
        dropped = M
    else:
        # Tail left over after the last returned window (0 when that window was padded).
        dropped = max(0, M - (int(starts[-1]) + L))

    if return_metadata:
        return WindowingResult(windows, T, dropped, L, step, N, M)
    return windows


## What this notebook outputs

This notebook converts each subject’s ROI time series into non-overlapping windows
of shape `(T, 116, 20)` and stores them for reproducible model training.

**Outputs:**
- Window tensors:
  - `data/processed/windows/train/<SITE>/<SUBJECT_ID>.npy`
  - `data/processed/windows/val/<SITE>/<SUBJECT_ID>.npy`
- A manifest mapping each subject to its saved window file:
  - `data/processed/windows_manifest.csv`

In [8]:
# =========================
# 1) Config
# =========================
# Window geometry: N ROIs per subject, windows of L timepoints, hop of STEP timepoints.
EXPECTED_N = 116
L = 20
STEP = 20

# Processing switches.
DROP_IF_SHORTER = True   # default for truncate_to_paper_length(drop_if_shorter=...)
DTYPE = np.float32       # dtype of the saved window tensors
PROGRESS_EVERY = 50      # progress line cadence, in subjects

# Per-site time-series length to truncate to.
PAPER_M = dict(
    KKI=119,
    NYU=171,
    OHSU=74,
    NeuroIMAGE=257,
    Peking_1=231,
)

# Column names tried, in order, when locating the time-series path column.
CANDIDATE_PATH_COLS = [
    "tc_path",
    "timeseries_path",
    "time_series_path",
    "path",
    "roi_ts_path",
    "roi_timeseries_path",
    "file",
    "filepath",
    "roi_path",
    "aal_path",
]

# =========================
# 2) Paths + load manifests
# =========================
# Everything is resolved relative to the parent of the current working directory.
PROJECT_ROOT = Path("..").resolve()
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")

# Input: per-split subject tables.
TRAIN_CSV = DATA_PROCESSED.joinpath("subjects_train_split_paper.csv")
VAL_CSV = DATA_PROCESSED.joinpath("subjects_val_split_paper.csv")

# Output: one directory per split under a common root.
OUT_ROOT = DATA_PROCESSED.joinpath("windows")
OUT_TRAIN = OUT_ROOT.joinpath("train")
OUT_VAL = OUT_ROOT.joinpath("val")
OUT_TRAIN.mkdir(exist_ok=True, parents=True)
OUT_VAL.mkdir(exist_ok=True, parents=True)

df_train = pd.read_csv(TRAIN_CSV)
df_val = pd.read_csv(VAL_CSV)

print(f"Train rows: {len(df_train)} | cols: {list(df_train.columns)}")
print(f"Val rows  : {len(df_val)} | cols: {list(df_val.columns)}")

def infer_path_col(df: pd.DataFrame, candidates: Optional[list] = None) -> str:
    """
    Pick the column of `df` that holds the time-series file path.

    Lookup order:
      1. the first name in `candidates` that matches a column case-insensitively;
      2. otherwise the first column whose name contains "path" (case-insensitive).

    Params:
      - df: subject table to inspect
      - candidates: column names to try, in priority order. Defaults to the
        module-level CANDIDATE_PATH_COLS.

    Returns:
      The matching column label exactly as it appears in `df.columns`.

    Raises:
      ValueError: no column matches either rule.
    """
    if candidates is None:
        candidates = CANDIDATE_PATH_COLS

    # Column labels are not guaranteed to be strings (e.g. integer labels), so
    # compare on their string form instead of calling .lower() on the label itself.
    by_lower = {str(col).lower(): col for col in df.columns}
    for cand in candidates:
        key = str(cand).lower()
        if key in by_lower:
            return by_lower[key]

    for col in df.columns:
        if "path" in str(col).lower():
            return col

    raise ValueError("Could not infer a time-series path column (expected something like tc_path).")

# The column is inferred from the train table; the val table must expose the same one.
PATH_COL = infer_path_col(df_train)
if PATH_COL not in df_val.columns.tolist():
    raise ValueError(f"PATH_COL='{PATH_COL}' not found in val CSV columns.")
print("Inferred PATH_COL:", PATH_COL)

Train rows: 408 | cols: ['site', 'subject_id', 'tc_path', 'T', 'R', 'DX', 'dx_raw', 'label']
Val rows  : 103 | cols: ['site', 'subject_id', 'tc_path', 'T', 'R', 'DX', 'dx_raw', 'label']
Inferred PATH_COL: tc_path


In [9]:
# =========================
# 3) Time series loading + shape utilities
# =========================
def load_subject_timeseries(tc_path: str) -> np.ndarray:
    """
    Load Athena AAL ROI time series from .1D (or other simple formats).

    Supported extensions: .npy, .npz (first array in the archive), .csv, .tsv,
    .txt and .1D. Relative paths are resolved against PROJECT_ROOT.

    For text formats, columns named "Mean_*" are used when present; otherwise
    known bookkeeping columns are dropped and the remainder is coerced to numeric.
    Rows that are entirely NaN are removed.

    Returns:
      A 2D array, typically (T, N) from CSV parsing; orientation is fixed later.

    Raises:
      FileNotFoundError: the resolved path does not exist.
      ValueError: unsupported extension, empty .npz archive, or the loaded
        array is not 2D (checked for every format).
    """
    p = Path(tc_path)
    if not p.is_absolute():
        p = (PROJECT_ROOT / p).resolve()
    if not p.exists():
        raise FileNotFoundError(f"Time-series file not found: {p}")

    suf = p.suffix.lower()
    if suf == ".npy":
        X = np.asarray(np.load(p))
    elif suf == ".npz":
        # np.load keeps the archive's file handle open until closed, so scope it.
        with np.load(p) as z:
            if not z.files:
                raise ValueError(f"No arrays found in archive: {p}")
            X = np.asarray(z[z.files[0]])
    else:
        if suf == ".csv":
            df = pd.read_csv(p, sep=",")
        elif suf == ".tsv":
            df = pd.read_csv(p, sep="\t")
        elif suf in [".txt", ".1d"]:
            df = pd.read_csv(p, sep=r"\s+|\t+", engine="python", comment="#")
        else:
            raise ValueError(f"Unsupported file extension: {suf} for {p}")

        # A single parsed column means the delimiter guess was wrong; retry on whitespace.
        if df.shape[1] == 1:
            df = pd.read_csv(p, sep=r"\s+", engine="python", comment="#")

        mean_cols = [c for c in df.columns if str(c).startswith("Mean_")]
        if mean_cols:
            roi_df = df[mean_cols]
        else:
            drop_cols = [c for c in df.columns if str(c).lower() in ["file", "sub-brick", "subbrick", "brick", "index"]]
            roi_df = df.drop(columns=drop_cols, errors="ignore").apply(pd.to_numeric, errors="coerce")

        roi_df = roi_df.dropna(axis=0, how="all")
        X = roi_df.to_numpy(dtype=float)

    # Enforce the documented 2D contract for binary formats too, not only parsed text.
    if X.ndim != 2:
        raise ValueError(f"Loaded time series must be 2D, got shape {X.shape} from {p}")
    return X

def ensure_N_by_M(X: np.ndarray, expected_N: int = EXPECTED_N) -> np.ndarray:
    """
    Ensure output is (N, M).

    Accepts X in (N, M) or (M, N) and returns (N, M). When both dimensions equal
    expected_N the input is returned unchanged (the first axis is taken as N).

    Raises:
      ValueError: X is not 2D, or neither dimension equals expected_N.
    """
    # Guard the rank first: a 1-D input would otherwise fail on shape[1] with an
    # unhelpful IndexError, or be returned unchanged when its length equals expected_N.
    if X.ndim != 2:
        raise ValueError(f"Expected a 2D array with one dim == N={expected_N}, got shape {X.shape}")
    if X.shape[0] == expected_N:
        return X
    if X.shape[1] == expected_N:
        return X.T
    raise ValueError(f"Expected one dim == N={expected_N}, got shape {X.shape}")

def truncate_to_paper_length(X_nm: np.ndarray, site: str, *, drop_if_shorter: bool = DROP_IF_SHORTER) -> Optional[np.ndarray]:
    """
    Cut an (N, M) series down to the length configured for `site` in PAPER_M.

    Returns:
      - X_nm unchanged when the site has no entry in PAPER_M;
      - the first `target` timepoints when the series is long enough;
      - None when the series is too short and drop_if_shorter is True,
        otherwise the too-short series unchanged.
    """
    paper_len = PAPER_M.get(site)
    if paper_len is None:
        return X_nm

    n_timepoints = X_nm.shape[1]
    if n_timepoints < paper_len:
        if drop_if_shorter:
            return None
        return X_nm
    return X_nm[:, :paper_len]

In [10]:
# =========================
# 4) Window + save
# =========================
def build_and_save_windows(df: pd.DataFrame, split: str) -> pd.DataFrame:
    """
    Window every subject listed in `df` and save one .npy tensor per subject.

    For each row: load the time series from row[PATH_COL], orient it to (N, M),
    truncate it to the site's configured length, cut it into windows with
    partition_time_series, and save the (T, N, L) tensor to
    OUT_ROOT/<split>/<site>/<subject_id>.npy.

    Subjects for which truncation returns None are skipped and counted. A subject
    that raises is reported with a [WARN] line, counted, and skipped; it does not
    abort the run.

    Params:
      - df: subject table with at least "site", "subject_id", "label" and PATH_COL
      - split: name of the split; used as output sub-directory and manifest value

    Returns:
      DataFrame with one row per saved subject (split, site, subject_id, label,
      T, N, L, M_used, dropped_tail, windows_path relative to PROJECT_ROOT).
    """
    rows = []
    n = len(df)
    dropped_short = 0
    failed = 0

    # Count rows by position: the iterrows() key is the index *label*, which is
    # only a usable counter when the frame happens to carry a default RangeIndex.
    for pos, (_, row) in enumerate(df.iterrows(), start=1):
        site = row["site"]
        subj = row["subject_id"]

        try:
            label = int(row["label"])

            X = load_subject_timeseries(row[PATH_COL])
            X_nm = ensure_N_by_M(X, expected_N=EXPECTED_N)
            X_nm = truncate_to_paper_length(X_nm, site, drop_if_shorter=DROP_IF_SHORTER)

            if X_nm is None:
                dropped_short += 1
            else:
                res = partition_time_series(
                    X_nm,
                    L=L,
                    step=STEP,
                    time_axis=-1,
                    drop_incomplete=True,
                    return_metadata=True,
                )
                windows = res.windows.astype(DTYPE, copy=False)  # (T, N, L)
                if not (windows.shape[1] == EXPECTED_N and windows.shape[2] == L):
                    raise ValueError(f"Unexpected windows shape: {windows.shape}")

                out_dir = OUT_ROOT / split / site
                out_dir.mkdir(parents=True, exist_ok=True)
                out_path = out_dir / f"{subj}.npy"
                np.save(out_path, windows)

                rows.append({
                    "split": split,
                    "site": site,
                    "subject_id": subj,
                    "label": label,
                    "T": int(windows.shape[0]),
                    "N": int(windows.shape[1]),
                    "L": int(windows.shape[2]),
                    "M_used": int(res.M),
                    "dropped_tail": int(res.dropped),
                    "windows_path": str(out_path.relative_to(PROJECT_ROOT)),
                })

        except Exception as e:
            # Best effort: one unreadable subject must not abort the whole split.
            failed += 1
            print(f"[WARN] {split} | {site} | {subj} | {e}")

        # Report progress for every row, including dropped and failed ones.
        if pos % PROGRESS_EVERY == 0:
            print(f"[{split}] processed {pos}/{n} | saved {len(rows)}")

    print(f"[{split}] dropped_shorter_than_paper: {dropped_short} / {n}")
    if failed:
        print(f"[{split}] failed: {failed} / {n}")
    return pd.DataFrame(rows)

# =========================
# 5) Run + save manifest
# =========================
# Window each split; each call returns the per-subject manifest rows for that split.
windows_train = build_and_save_windows(df_train, "train")
windows_val = build_and_save_windows(df_val, "val")

# Merge both splits into a single manifest and write it next to the window folders.
OUT_MANIFEST = DATA_PROCESSED / "windows_manifest.csv"
windows_manifest = pd.concat((windows_train, windows_val), ignore_index=True)
windows_manifest.to_csv(OUT_MANIFEST, index=False)

print("Saved:", OUT_MANIFEST)
print("Train windows saved:", len(windows_train))
print("Val windows saved  :", len(windows_val))
print("Output root:", OUT_ROOT)

[train] processed 50/408 | saved 50
[train] processed 100/408 | saved 100
[train] processed 150/408 | saved 150
[train] processed 200/408 | saved 200
[train] processed 250/408 | saved 249
[train] processed 300/408 | saved 299
[train] processed 350/408 | saved 349
[train] processed 400/408 | saved 399
[train] dropped_shorter_than_paper: 1 / 408
[val] processed 50/103 | saved 50
[val] processed 100/103 | saved 100
[val] dropped_shorter_than_paper: 0 / 103
Saved: /Users/mariaborca/Documents/AI_2023-2026/Semestrul_5/KBS/Report_3/adhd-tcn-replication/data/processed/windows_manifest.csv
Train windows saved: 407
Val windows saved  : 103
Output root: /Users/mariaborca/Documents/AI_2023-2026/Semestrul_5/KBS/Report_3/adhd-tcn-replication/data/processed/windows
