In [1]:
import subprocess, os, time, re, csv, sys

import pandas as pd
import nibabel as nib
import numpy as np
import matplotlib.pyplot as plt

from pathlib import Path
from typing import List, Tuple
from collections import defaultdict

root_dir = "/fs5/p_masi/zuol1/data/cibs_brain2/analysis"
dest_root_dir = "/nfs2/harmonization/BIDS/CIBS_BRAIN2"

In [22]:
def dump(path_list, outfile):
    with open(outfile, "w") as f:
        for p in sorted(path_list):
            f.write(str(p) + "\n")
    print(f"Cached {len(path_list)} paths to '{outfile}'")

In [51]:
lines = []
for dirpath, dirnames, filenames in os.walk(Path(root_dir)):
    if dirnames:
        continue
    file_type = Path(dirpath).name
    session = Path(dirpath).parent.name
    if session not in {"00", "12"} or file_type != "nii":
        continue
    for fname in filenames:
        if fname.endswith(".nii") or fname.endswith(".nii.gz"):
            lines.append(str(Path(dirpath) / fname))
dump(lines, "nifti_all.txt")

Cached 692 paths to 'nifti_all.txt'


In [5]:
if os.path.exists("dataset_cache/nifti_all.txt"):
    with open("dataset_cache/nifti_all.txt", "r") as f:
        lines = [line.strip() for line in f if line.strip()]
    print(f"Loaded {len(lines)} paths from nifti_all.txt")

Loaded 692 paths from nifti_all.txt


In [54]:
depths = [len(Path(p).parts) for p in lines]

# Check whether the depths of all addresses are same
print(len(set(depths)) == 1)

True


In [55]:
subject_sessions = defaultdict(set)

for fp in lines:
    p = Path(fp)
    if len(p.parents) < 3:
        continue
    session = p.parents[1].name
    file_type = p.parents[0].name
    if session not in ["00", "12"] or file_type != "nii":
        continue
    subject_dir = p.parents[2]
    subject_sessions[subject_dir].add(session)

only_00 = [p for p, s in subject_sessions.items() if s == {"00"}]
both_00_12 = [p for p, s in subject_sessions.items() if s == {"00", "12"}]
only_12 = [p for p, s in subject_sessions.items() if s == {"12"}]

dump(only_00, "subject_only_00.txt")
dump(both_00_12, "subject_00_and_12.txt")
dump(only_12, "subject_only_12.txt")

print(f"Only baseline data (00 only): {len(only_00)}")
print(f"Valid data (00 + 12): {len(both_00_12)}")
print(f"Only 12-month data: {len(only_12)}")

Cached 77 paths to 'subject_only_00.txt'
Cached 34 paths to 'subject_00_and_12.txt'
Cached 21 paths to 'subject_only_12.txt'
Only baseline data (00 only): 77
Valid data (00 + 12): 34
Only 12-month data: 21


In [57]:
valid_images = []

for fp in lines:
    p = Path(fp)
    if len(p.parents) < 3:
        continue
    subj_dir = p.parents[2]
    if subj_dir in both_00_12:
        valid_images.append(p)

dump(valid_images, "nifti_00_and_12.txt")

Cached 284 paths to 'nifti_00_and_12.txt'


In [65]:
def extract_modality(fname: str) -> str:
    stem = fname
    for ext in (".nii.gz", ".nii"):
        if stem.endswith(ext):
            stem = stem[: -len(ext)]
            break
    try:
        seq_part = stem.split("_BRAIN_")[1]
    except IndexError:
        return ""
    tokens = seq_part.split("-")
    return "-".join(tokens[:2]) if len(tokens) >= 2 else ""

In [66]:
MODALITY_NEED = "T1-3D"
session_modalities = defaultdict(set)
for fp in lines:
    p = Path(fp)
    if len(p.parents) < 3:
        continue
    session_dir = p.parents[1]
    key = session_dir
    modality = extract_modality(p.name)
    if modality:
        session_modalities[key].add(modality)

sessions_with = [
    key for key, mods in session_modalities.items() if MODALITY_NEED in mods
]
sessions_without = [
    key for key, mods in session_modalities.items() if MODALITY_NEED not in mods
]

print(f"Total subject-session pairs: {len(session_modalities)}")
print(f"Pairs containing {MODALITY_NEED}: {len(sessions_with)}")
print(f"Pairs missing {MODALITY_NEED}: {len(sessions_without)}")

out_file = f"sessions_without_{MODALITY_NEED}.txt"
dump(sessions_without, out_file)

Total subject-session pairs: 166
Pairs containing T1-3D: 165
Pairs missing T1-3D: 1
Cached 1 paths to 'sessions_without_T1-3D.txt'


In [82]:
def parse_path(file_path: str) -> dict | None:
    p = Path(file_path)
    parts = p.parts
    if len(parts) != 11:
        raise RuntimeError(f"File address not valid.\nAddress: {file_path}")
    subject = parts[7]
    session = parts[8]
    scan_type = extract_modality(parts[10])
    if scan_type not in {"T1-3D"}:
        return None
    return {
        "filepath": p.resolve(),
        "scan_type": scan_type,
        "subject_id": f"sub-{subject}",
        "session_id": f"ses-{session}",
    }


rows = [row for fp in valid_images if (row := parse_path(fp))]
df = pd.DataFrame(rows)

In [83]:
df["run"] = (
    df.groupby(["scan_type", "subject_id", "session_id"])
    .cumcount()
    .add(1)
    .astype("string")
)

mask = (
    df.groupby(["scan_type", "subject_id", "session_id"])["run"].transform("size").eq(1)
)
df.loc[mask, "run"] = ""

In [84]:
def make_links(row):
    prefix = f"{row.subject_id}_{row.session_id}"
    if row.run:
        prefix += f"_run-{row.run}"
    if row.scan_type == "T1-3D":
        fname = f"{prefix}_T1w"
        subdir = "anat"
    else:
        raise ValueError(f"Unknown scan type: {row.scan_type}")
    base = f"{dest_root_dir}/{row.subject_id}/{row.session_id}/{subdir}/{fname}"
    row["nii_link"] = base + ".nii.gz"
    return row


df = df.apply(make_links, axis=1)

In [85]:
df.sort_values(["scan_type", "subject_id", "session_id", "run"], inplace=True)

In [6]:
mask = (
    (df["scan_type"] == "T1-3D")
    & df["run"].notna()
    & (df["run"] != "")
    # & (df["run"] == "")
)

if mask.any():
    print(df.loc[mask])
else:
    print("Nothing here.")

    Unnamed: 0                                           filepath scan_type  \
10          10  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
11          11  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
13          13  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
14          14  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
15          15  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
16          16  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
27          27  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
28          28  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
29          29  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
30          30  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
34          34  /fs5/p_masi/zuol1/data/cibs_brain2/raw/CIBS_BR...     T1-3D   
35          35  /fs5/p_masi/zuol1/data/cibs_brain2/r

In [87]:
df.to_csv("data.tsv", sep="\t")

In [4]:
def load_dataframe(path: str) -> pd.DataFrame:
    if path.endswith(".pkl"):
        return pd.read_pickle(path)
    return pd.read_csv(path, sep=None, engine="python")


def _collect_link_pairs(df: pd.DataFrame) -> List[Tuple[str, str]]:
    colmap = {
        "filepath": "nii_link",
    }

    present_pairs = [
        (src, dst)
        for src, dst in colmap.items()
        if src in df.columns and dst in df.columns
    ]
    if not present_pairs:
        raise ValueError("No recognised source/target column pairs found.")

    commands = []
    for _, row in df.iterrows():
        for src, dst in present_pairs:
            s, d = row[src], row[dst]
            if pd.notna(s) and pd.notna(d) and str(s).strip() and str(d).strip():
                commands.append(f'mkdir -p "{Path(d).parent}"')
                commands.append(f'ln -s "{s}" "{d}"')
    return commands


def write_link_commands(
    df: pd.DataFrame,
    output_txt: str | Path,
    *,
    overwrite: bool = True,
) -> None:
    output_txt = Path(output_txt)
    mode = "w" if overwrite else "a"

    commands = _collect_link_pairs(df)

    with output_txt.open(mode, encoding="utf-8") as f:
        for cmd in commands:
            f.write(cmd + "\n")

    print(f"{len(commands)} link commands written to {output_txt.resolve()}")

In [5]:
df = load_dataframe("dataset_cache/data.tsv")

In [91]:
write_link_commands(df, "data_link_command.txt")

188 link commands written to /nfs/ForHenry/brain_ventricle/data_link_command.txt


In [None]:
mask = (
    (df["scan_type"] == "T1-3D")
    & df["run"].notna()
    & (df["run"] != "")
    # & (df["run"] == "")
)

if mask.any():
    print(df.loc[mask])
else:
    print("Nothing here.")