# Explore FeatureStore & Data Preparation Pipeline

This notebook demonstrates how to:
1. Run the data preparation pipeline
2. Explore **long format** data (raw parcellator output)
3. Load **wide format** features per metric
4. Use TIV (Total Intracranial Volume) for normalization
5. Prepare data for regional brain age modeling

In [1]:
import os
from pathlib import Path

import pandas as pd
import numpy as np
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# NeuroAlign imports
from neuroalign.data.preprocessing import (
    FeatureStore,
    DataPreparationPipeline,
    PipelineConfig,
    DataPaths,
    ModalityConfig,
    OutputConfig,
    ANATOMICAL_METRICS,  # Available metrics: volume_mm3, mean, std, median, sum, etc.
)
from neuroalign.data.loaders import AnatomicalLoader, DiffusionLoader

# Show the summary statistics exported as ANATOMICAL_METRICS.
print("Available anatomical metrics:", ANATOMICAL_METRICS)

Available anatomical metrics: ['volume_mm3', 'mean', 'std', 'median', 'sum', 'robust_std', 'mad_median']


## 1. Configuration from Environment

In [2]:
# Helper to get paths from env
def get_path(var: str) -> Path | None:
    val = os.getenv(var)
    return Path(val).expanduser() if val else None

# Echo the path settings resolved from the environment; unset variables print as None.
print("Current Configuration:")
for env_name in (
    "SESSIONS_CSV",
    "CAT12_ROOT",
    "CAT12_ATLAS_ROOT",
    "QSIPARC_PATH",
    "QSIRECON_PATH",
):
    print(f"  {env_name}: {get_path(env_name)}")
# ATLAS_NAME is a plain string (not a path) and falls back to the default atlas.
print(f"  ATLAS_NAME: {os.getenv('ATLAS_NAME', '4S456Parcels')}")

Current Configuration:
  SESSIONS_CSV: /home/galkepler/Downloads/linked_sessions.csv
  CAT12_ROOT: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new
  CAT12_ATLAS_ROOT: /mnt/62/Processed_Data/derivatives/qsirecon/atlases
  QSIPARC_PATH: /mnt/62/Processed_Data/derivatives/qsiparc
  QSIRECON_PATH: /mnt/62/Processed_Data/derivatives/qsirecon
  ATLAS_NAME: 4S456Parcels


In [3]:
# Build a small, reproducible test subset of the linked-sessions table.
# The input location comes from SESSIONS_CSV (see .env) instead of a hard-coded,
# user-specific absolute path; the subset is written next to it as "<stem>_test<suffix>".
sessions_csv = get_path("SESSIONS_CSV")
if sessions_csv is None:
    raise RuntimeError("SESSIONS_CSV is not set - define it in .env before running this cell")
sessions_test_csv = sessions_csv.with_name(f"{sessions_csv.stem}_test{sessions_csv.suffix}")

sessions = pd.read_csv(sessions_csv)

# DataFrame.sample raises when n exceeds the number of rows, so cap the request.
sessions_test = sessions.sample(n=min(10, len(sessions)), random_state=42)

# Write session_id as a plain digit string. The explicit 64-bit cast matters because
# the 12-digit IDs do not fit in 32 bits, which is what `int` means on some platforms.
sessions_test = sessions_test.assign(
    session_id=sessions_test["session_id"].astype("int64").astype(str)
)
sessions_test.to_csv(sessions_test_csv, index=False)

## 2. Run the Data Preparation Pipeline

Running the pipeline loads all data and saves it to a FeatureStore. If the store already contains data, only new sessions are loaded (unless `force=True`, which reloads everything).

In [4]:
# Configure the pipeline
OUTPUT_DIR = Path("../data/processed")

# The test-subset CSV is expected next to SESSIONS_CSV as "<stem>_test<suffix>".
# Deriving it from the environment avoids a hard-coded, user-specific absolute path.
sessions_csv = get_path("SESSIONS_CSV")
if sessions_csv is None:
    raise RuntimeError("SESSIONS_CSV is not set - define it in .env before running this cell")
sessions_test_csv = sessions_csv.with_name(f"{sessions_csv.stem}_test{sessions_csv.suffix}")

config = PipelineConfig(
    paths=DataPaths(
        # Pass `sessions_csv` here instead to process the full session list.
        sessions_csv=sessions_test_csv,
        cat12_root=get_path("CAT12_ROOT"),
        atlas_root=get_path("CAT12_ATLAS_ROOT"),
        qsiparc_path=get_path("QSIPARC_PATH"),
        qsirecon_path=get_path("QSIRECON_PATH"),
        output_dir=OUTPUT_DIR,
    ),
    modalities=ModalityConfig(
        anatomical=True,
        diffusion=True,
        gray_matter=True,
        white_matter=True,
        cortical_thickness=True,
    ),
    atlas_name=os.getenv("ATLAS_NAME", "4S456Parcels"),
    n_jobs=4,  # Parallel workers for anatomical loading
    force=False,  # Set to True to reload all data
)

print("Pipeline configured")
print(f"  Output: {config.paths.output_dir}")
print(f"  Force reload: {config.force}")

Pipeline configured
  Output: ../data/processed
  Force reload: False


In [18]:
from neuroalign.data.loaders.anatomical import _select_xml

# Debug aid: for every sampled session, build the expected CAT12 "anat" directory and
# report which XML file `_select_xml` returns for it (None means nothing was selected).
for _, record in sessions_test.iterrows():
    subject = record["subject_code"]
    session = record["session_id"]
    # Numeric identifiers are converted to digit strings; string identifiers pass through.
    if isinstance(session, (float, int)):
        session = str(int(session))
    if isinstance(subject, (float, int)):
        digits = str(int(subject))
        for unwanted in ("_", "-", "\t"):
            digits = digits.replace(unwanted, "")
        # Numeric subject codes are left-padded with zeros to four characters.
        subject = digits.zfill(4)
    candidate_dir = config.paths.cat12_root / f"sub-{subject}" / f"ses-{session}" / "anat"
    candidate_xml = _select_xml(candidate_dir, subject, session)
    print(f"Subject: {subject}, Session: {session}, XML: {candidate_xml}")

Subject: BJJL213, Session: 202507041402, XML: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new/sub-BJJL213/ses-202507041402/anat/cat_sub-BJJL213_ses-202507041402_ce-corrected_T1w.xml
Subject: BJJL97, Session: 202410201758, XML: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new/sub-BJJL97/ses-202410201758/anat/cat_sub-BJJL97_ses-202410201758_ce-corrected_T1w.xml
Subject: FNCL06, Session: 202505180901, XML: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new/sub-FNCL06/ses-202505180901/anat/cat_sub-FNCL06_ses-202505180901_ce-corrected_T1w.xml
Subject: GYML25, Session: 202512300905, XML: None
Subject: 991, Session: 202307031706, XML: None
Subject: YBH10074, Session: 202601071013, XML: None
Subject: 107, Session: 201911281751, XML: None
Subject: BAL39, Session: 201902071703, XML: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new/sub-BAL39/ses-201902071703/anat/cat_sub-BAL39_ses-201902071703_ce-corrected_T1w.xml
Subject: BJJL253, Session: 202508271151, XML

In [5]:
# Run the data preparation pipeline with the configuration built above.
# This may take a while depending on data size.

pipeline = DataPreparationPipeline(config)
result = pipeline.run()

# Report how many sessions were added, how many were skipped, and the store total.
print("\nPipeline complete!")
print("  New sessions:", result.n_new_sessions)
print("  Skipped:", result.n_skipped_sessions)
print("  Total in store:", result.metadata["n_sessions"])

Failed to load anatomical data: No CAT12 outputs found under /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new. Run CAT12 first or check the path.
Loading diffusion data: 100%|██████████| 4/4 [00:01<00:00,  3.87it/s]
Failed to load diffusion data: No sessions successfully loaded


ValueError: No data was saved - check your data paths

## 3. Explore the FeatureStore

The FeatureStore organizes data in two formats:
- **Long format**: Raw parcellator output with ALL columns (volume_mm3, mean, std, median, etc.)
- **Wide format**: One Parquet file per metric for efficient loading

It also stores TIV (Total Intracranial Volume) separately for volume normalization.

In [6]:
# Open the feature store under OUTPUT_DIR and print its summary, if it exists.
store = FeatureStore(OUTPUT_DIR)

if not store.exists():
    print("FeatureStore not found - run the pipeline first!")
else:
    print("FeatureStore found!")
    summary = store.summary()

    print("\nSummary:")
    summary_rows = (
        ("Sessions", "n_sessions"),
        ("Subjects", "n_subjects"),
        ("Atlas", "atlas_name"),
        ("Has TIV", "has_tiv"),
        ("Created", "created_at"),
    )
    for caption, key in summary_rows:
        print(f"  {caption}: {summary[key]}")

    long_formats = summary["long_formats"]
    print(f"\nLong formats available: {len(long_formats)}")
    for long_name in long_formats:
        print(f"  - {long_name}")

    print(f"\nWide features available: {summary['n_wide_features']}")

FeatureStore found!

Summary:
  Sessions: 6
  Subjects: 6
  Atlas: 4S456Parcels
  Has TIV: False
  Created: 2026-01-15T10:25:10.817243

Long formats available: 7
  - anatomical_gm
  - anatomical_wm
  - anatomical_ct
  - diffusion_DSIStudio
  - diffusion_DIPYDKI
  - diffusion_DIPYMAPMRI
  - diffusion_AMICONODDI

Wide features available: 72


In [7]:
def _show_feature(feat):
    """Print the region and session counts the store reports for one wide-format feature."""
    feat_info = store.get_feature_info(feat)
    print(f"  - {feat}: {feat_info.n_regions} regions, {feat_info.n_sessions} sessions")


# Every anatomical feature is listed; the diffusion list is cut off after ten entries.
print("Available wide-format features:")
print("\nAnatomical:")
for feat in store.list_features(modality="anatomical"):
    _show_feature(feat)

print("\nDiffusion (first 10):")
diff_feats = store.list_features(modality="diffusion")
for feat in diff_feats[:10]:
    _show_feature(feat)

n_not_shown = len(diff_feats) - 10
if n_not_shown > 0:
    print(f"  ... and {n_not_shown} more")

Available wide-format features:

Anatomical:
  - ct_mad_median: 456 regions, 6 sessions
  - ct_mean: 456 regions, 6 sessions
  - ct_median: 456 regions, 6 sessions
  - ct_robust_std: 456 regions, 6 sessions
  - ct_std: 456 regions, 6 sessions
  - ct_sum: 456 regions, 6 sessions
  - ct_volume_mm3: 456 regions, 6 sessions
  - gm_mad_median: 456 regions, 6 sessions
  - gm_mean: 456 regions, 6 sessions
  - gm_median: 456 regions, 6 sessions
  - gm_robust_std: 456 regions, 6 sessions
  - gm_std: 456 regions, 6 sessions
  - gm_sum: 456 regions, 6 sessions
  - gm_volume_mm3: 456 regions, 6 sessions
  - wm_mad_median: 456 regions, 6 sessions
  - wm_mean: 456 regions, 6 sessions
  - wm_median: 456 regions, 6 sessions
  - wm_robust_std: 456 regions, 6 sessions
  - wm_std: 456 regions, 6 sessions
  - wm_sum: 456 regions, 6 sessions
  - wm_volume_mm3: 456 regions, 6 sessions

Diffusion (first 10):
  - AMICONODDI_noddi_icvf: 456 regions, 2 sessions
  - AMICONODDI_noddi_isovf: 456 regions, 2 session

## 4. Explore Long Format Data

Long format preserves ALL columns from the parcellator output. This is useful for:
- Exploring different summary statistics (mean, median, std, etc.)
- Custom aggregations not available in wide format
- Quality control and debugging

In [9]:
# Read the gray-matter table in long format and list its columns.
gm_long = store.load_long("anatomical_gm")

gm_column_names = gm_long.columns.tolist()
# Keep only the ANATOMICAL_METRICS entries that are present as columns in this table.
gm_metric_names = [name for name in ANATOMICAL_METRICS if name in gm_long.columns]

print(f"GM Long Format: {gm_long.shape}")
print(f"\nAll columns: {gm_column_names}")
print(f"\nAvailable metrics: {gm_metric_names}")
gm_long.head()

GM Long Format: (2736, 80)

All columns: ['index', 'label', 'network_label', 'label_7network', 'index_17network', 'label_17network', 'network_label_17network', 'atlas_name', 'network_id', 'volume_mm3', 'voxel_count', 'z_filtered_mean', 'z_filtered_std', 'iqr_filtered_mean', 'iqr_filtered_std', 'robust_mean', 'robust_std', 'mad_median', 'mean', 'std', 'median', 'sum', 'ScanID', 'Status', 'Lab', 'Name', 'ID', 'Cellular No.', 'Email', 'Gender', 'DOB', 'ScanDate', 'Age@Scan', 'Weight', 'Height', 'Protocol', 'Study', 'Group', 'Unnamed: 16', 'ScanTag', 'SubjectCode', 'HebrewName', 'No of Scan', 'PrivacyStatement', 'UID', 'session_id', 'subject_code', 'dicom_path', 'match_type', 'qualitymeasures_SurfaceEulerNumber', 'qualitymeasures_SurfaceDefectArea', 'qualitymeasures_SurfaceDefectNumber', 'qualitymeasures_SurfaceIntensityRMSE', 'qualitymeasures_SurfacePositionRMSE', 'qualitymeasures_res_RMS', 'qualitymeasures_res_BB', 'qualitymeasures_res_ECR', 'qualitymeasures_contrast', 'qualitymeasures_c

Unnamed: 0,index,label,network_label,label_7network,index_17network,label_17network,network_label_17network,atlas_name,network_id,volume_mm3,...,qualityratings_SurfaceDefectNumber,qualityratings_SurfaceIntensityRMSE,qualityratings_SurfacePositionRMSE,qualityratings_SIQR,qualityratings_SIQR3rms2,qualityratings_SIQR4rms2,qualityratings_SIQR4rms8,qualityratings_IQR,modality,metric
0,1,LH_Vis_1,Vis,7Networks_LH_Vis_1,61.0,17Networks_LH_DorsAttnA_TempOcc_2,DorsAttnA,4S456,,2875.5,...,1.4,1.558172,1.500964,1.759072,1.759072,1.567678,1.705066,1.657367,gm,volume
1,2,LH_Vis_2,Vis,7Networks_LH_Vis_2,193.0,17Networks_LH_DefaultC_PHC_2,DefaultC,4S456,,3398.625,...,1.4,1.558172,1.500964,1.759072,1.759072,1.567678,1.705066,1.657367,gm,volume
2,3,LH_Vis_3,Vis,7Networks_LH_Vis_3,1.0,17Networks_LH_VisCent_ExStr_1,VisCent,4S456,,2592.0,...,1.4,1.558172,1.500964,1.759072,1.759072,1.567678,1.705066,1.657367,gm,volume
3,4,LH_Vis_4,Vis,7Networks_LH_Vis_4,13.0,17Networks_LH_VisPeri_ExStrInf_1,VisPeri,4S456,,3418.875,...,1.4,1.558172,1.500964,1.759072,1.759072,1.567678,1.705066,1.657367,gm,volume
4,5,LH_Vis_5,Vis,7Networks_LH_Vis_5,2.0,17Networks_LH_VisCent_ExStr_2,VisCent,4S456,,3395.25,...,1.4,1.558172,1.500964,1.759072,1.759072,1.567678,1.705066,1.657367,gm,volume


In [10]:
# Print min / max / mean of a few summary columns of the gray-matter long table.
print("Different summary statistics for Gray Matter:")
for metric in ("volume_mm3", "mean", "median"):
    if metric not in gm_long.columns:
        continue  # column absent from this table
    vals = gm_long[metric].dropna()
    lowest, highest, average = vals.min(), vals.max(), vals.mean()
    print(f"  {metric}: min={lowest:.3f}, max={highest:.3f}, mean={average:.3f}")

Different summary statistics for Gray Matter:
  volume_mm3: min=54.000, max=30982.500, mean=2851.055
  mean: min=0.001, max=0.896, mean=0.491
  median: min=0.000, max=0.996, mean=0.538


In [11]:
# Load one wide-format feature for modeling.
# Feature names follow the pattern {modality}_{metric}, e.g. gm_volume_mm3, ct_mean.
gm_wide = store.load_feature("gm_volume_mm3")
# Region names recorded by the store for the same feature.
regions = store.get_regions("gm_volume_mm3")

print(f"GM Volume Wide Format: {gm_wide.shape}")
print(f"Number of regions: {len(regions)}")
print(f"First 10 regions: {regions[:10]}")
gm_wide.head()

GM Volume Wide Format: (6, 459)
Number of regions: 456
First 10 regions: ['Cerebellar_Region1', 'Cerebellar_Region10', 'Cerebellar_Region2', 'Cerebellar_Region3', 'Cerebellar_Region4', 'Cerebellar_Region5', 'Cerebellar_Region6', 'Cerebellar_Region7', 'Cerebellar_Region8', 'Cerebellar_Region9']


Unnamed: 0,subject_code,session_id,Cerebellar_Region1,Cerebellar_Region10,Cerebellar_Region2,Cerebellar_Region3,Cerebellar_Region4,Cerebellar_Region5,Cerebellar_Region6,Cerebellar_Region7,...,RH_Vis_29,RH_Vis_3,RH_Vis_30,RH_Vis_4,RH_Vis_5,RH_Vis_6,RH_Vis_7,RH_Vis_8,RH_Vis_9,AGE
0,1047,202306260831,20557.125,11346.75,18623.25,10837.125,26240.625,20958.75,30969.0,11795.625,...,2953.125,2440.125,2109.375,3071.25,3756.375,4529.25,1144.125,2311.875,4549.5,33.91
1,BAL39,201902071703,20553.75,11346.75,18623.25,10837.125,26240.625,20904.75,30958.875,11795.625,...,2953.125,2440.125,2109.375,3071.25,3756.375,4529.25,1144.125,2311.875,4549.5,24.47
2,BJJL213,202507041402,20550.375,11346.75,18623.25,10830.375,26240.625,20955.375,30982.5,11795.625,...,2953.125,2440.125,2109.375,3071.25,3756.375,4529.25,1144.125,2311.875,4549.5,24.2
3,BJJL253,202508271151,20547.0,11346.75,18623.25,10833.75,26240.625,20904.75,30958.875,11795.625,...,2953.125,2440.125,2109.375,3071.25,3756.375,4529.25,1144.125,2311.875,4549.5,43.92
4,BJJL97,202410201758,20557.125,11346.75,18623.25,10833.75,26240.625,20935.125,30965.625,11795.625,...,2953.125,2440.125,2109.375,3071.25,3756.375,4529.25,1144.125,2311.875,4549.5,27.38


## 5. TIV (Total Intracranial Volume) for Normalization

TIV is stored separately so you can normalize volumetric features as needed.

In [12]:
# Load TIV data
if store.has_tiv():
    tiv = store.load_tiv()
    print(f"TIV data: {tiv.shape}")
    print("\nTIV statistics:")
    print(f"  Min: {tiv['tiv'].min():.1f} ml")
    print(f"  Max: {tiv['tiv'].max():.1f} ml")
    print(f"  Mean: {tiv['tiv'].mean():.1f} ml")
    # Only a cell's last top-level expression is auto-rendered, so a bare `tiv.head()`
    # inside this branch would show nothing; display the preview explicitly instead.
    display(tiv.head())
else:
    print("TIV not available - MATLAB/CAT12 not configured")
    print("Set MATLAB_BIN, SPM_PATH, CAT12_PATH, TIV_TEMPLATE in .env")

TIV not available - MATLAB/CAT12 not configured
Set MATLAB_BIN, SPM_PATH, CAT12_PATH, TIV_TEMPLATE in .env


In [13]:
# Load GM volume together with metadata and (when the store has it) a `tiv` column.
gm_with_tiv = store.load_feature("gm_volume_mm3", include_metadata=True, include_tiv=True)
print(f"GM with metadata and TIV: {gm_with_tiv.shape}")

known_meta = ['subject_code', 'session_id', 'AGE', 'tiv']
present_meta = [col for col in gm_with_tiv.columns if col in known_meta]
print(f"\nMetadata columns: {present_meta}")

# Example: divide every regional volume by the session's TIV.
# Skipped when no `tiv` column is present or every value in it is missing.
tiv_usable = "tiv" in gm_with_tiv.columns and gm_with_tiv["tiv"].notna().any()
if tiv_usable:
    region_cols = store.get_regions("gm_volume_mm3")
    gm_normalized = gm_with_tiv.copy()
    gm_normalized[region_cols] = gm_with_tiv[region_cols].div(gm_with_tiv["tiv"], axis=0)
    first_region = region_cols[0]
    print("\nNormalized GM values (first region):")
    print(f"  Raw: {gm_with_tiv[first_region].mean():.3f}")
    print(f"  TIV-normalized: {gm_normalized[first_region].mean():.6f}")

GM with metadata and TIV: (6, 459)

Metadata columns: ['subject_code', 'session_id', 'AGE']


## 6. Load Multiple Features for Modeling

In [14]:
# Load multiple anatomical features
# Note: Feature names now include the metric (e.g., gm_volume_mm3, ct_mean)
anat_features = ["gm_volume_mm3", "wm_volume_mm3", "ct_mean"]

# Query the store once and test membership against a set, instead of calling
# store.list_features() again for every candidate feature.
stored_features = set(store.list_features())
available = [f for f in anat_features if f in stored_features]
print(f"Loading: {available}")

multi_data = store.load_features(available, include_metadata=True)
print(f"\nCombined shape: {multi_data.shape}")

# Separate the metadata columns from everything else (treated here as feature columns).
meta_cols = ["subject_code", "session_id", "AGE"]
feature_cols = [c for c in multi_data.columns if c not in meta_cols]
print(f"Metadata columns: {[c for c in meta_cols if c in multi_data.columns]}")
print(f"Feature columns: {len(feature_cols)}")

Loading: ['gm_volume_mm3', 'wm_volume_mm3', 'ct_mean']

Combined shape: (6, 1371)
Metadata columns: ['subject_code', 'session_id', 'AGE']
Feature columns: 1368


In [15]:
# Check for missing values
gm = store.load_feature("gm_volume_mm3", include_metadata=True)
region_cols = store.get_regions("gm_volume_mm3")

# Missing age: when there is no AGE column at all, every session counts as missing.
missing_age = gm["AGE"].isna().sum() if "AGE" in gm.columns else len(gm)
print(f"Missing AGE: {missing_age} / {len(gm)} sessions")

# Missing feature values
missing_features = gm[region_cols].isna().sum().sum()
total_cells = len(gm) * len(region_cols)
# Guard against a store with no sessions or no regions, which would divide by zero.
missing_pct = 100 * missing_features / total_cells if total_cells else 0.0
print(f"Missing feature values: {missing_features} / {total_cells} ({missing_pct:.2f}%)")

# Age distribution
if "AGE" in gm.columns:
    ages = gm["AGE"].dropna()
    print("\nAge Statistics:")
    print(f"  Min: {ages.min():.1f}")
    print(f"  Max: {ages.max():.1f}")
    print(f"  Mean: {ages.mean():.1f}")
    print(f"  Std: {ages.std():.1f}")

Missing AGE: 0 / 6 sessions
Missing feature values: 0 / 2736 (0.00%)

Age Statistics:
  Min: 24.2
  Max: 43.9
  Mean: 30.0
  Std: 7.7


## 7. Prepare Data for Regional Brain Age Modeling

In [16]:
# Example: assemble X / y arrays for one region's brain age model.

feature_name = "gm_volume_mm3"
target_region = "LH_Vis_1"  # First visual cortex region

df = store.load_feature(feature_name)
all_regions = store.get_regions(feature_name)

# Every region except the target one becomes a feature column.
feature_regions = list(filter(lambda region: region != target_region, all_regions))

X = df[feature_regions].values
if "AGE" in df.columns:
    y = df["AGE"].values
else:
    # No AGE column: fall back to an all-zero target of matching length.
    y = np.zeros(len(df))

# Keep only rows where neither the target nor any feature value is NaN.
row_has_nan = np.isnan(y) | np.isnan(X).any(axis=1)
mask = ~row_has_nan
X_clean, y_clean = X[mask], y[mask]

for report_line in (
    f"Target region: {target_region}",
    f"Number of features (other regions): {len(feature_regions)}",
    f"Samples: {len(X_clean)}",
    f"X shape: {X_clean.shape}",
    f"y shape: {y_clean.shape}",
):
    print(report_line)

Target region: LH_Vis_1
Number of features (other regions): 455
Samples: 6
X shape: (6, 455)
y shape: (6,)


In [17]:
# Quick example: standardise the features and fit a ridge regression.
# NOTE(review): the recorded run used only 6 samples, so the scores it printed
# are not meaningful; treat this cell as a wiring check rather than an evaluation.
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Hold out 20% of the cleaned samples; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_clean,
    y_clean,
    test_size=0.2,
    random_state=42,
)

steps = [
    ("scaler", StandardScaler()),
    ("ridge", Ridge(alpha=1.0)),
]
model = Pipeline(steps)

# Two-fold cross-validation on the training split, scored with R2.
cv_scores = cross_val_score(model, X_train, y_train, cv=2, scoring="r2")
cv_mean = cv_scores.mean()
cv_spread = cv_scores.std()*2
print(f"Cross-validation R2: {cv_mean:.3f} (+/- {cv_spread:.3f})")

# Refit on the whole training split and score both splits.
model.fit(X_train, y_train)
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(f"Train R2: {train_score:.3f}")
print(f"Test R2: {test_score:.3f}")

Cross-validation R2: -112.893 (+/- 220.526)
Train R2: 0.995
Test R2: -14.529


## 8. Direct Loader Access (for debugging)

In [8]:
# Direct access to anatomical loader (for debugging/exploration)
anat_loader = AnatomicalLoader(
    cat12_root=get_path("CAT12_ROOT"),
    atlas_root=get_path("CAT12_ATLAS_ROOT"),
    # Read the atlas from ATLAS_NAME (same default as the pipeline config) so this
    # debugging loader cannot silently drift from the atlas the pipeline uses.
    atlas_name=os.getenv("ATLAS_NAME", "4S456Parcels"),
)

# Load a single session
test_subject = "0048R"
test_session = "202303161349"

session_data = anat_loader.load_session(test_subject, test_session)
if session_data is not None:
    print(f"Loaded session: {len(session_data)} rows")
    print(f"Modalities: {session_data['modality'].unique()}")
    print(f"Metrics: {session_data['metric'].unique()}")

    # Show which ANATOMICAL_METRICS entries are present as columns in this session table
    summary_cols = [c for c in session_data.columns if c in ANATOMICAL_METRICS]
    print(f"\nAvailable summary columns: {summary_cols}")
else:
    print("Session not found")

Loaded session: 1368 rows
Modalities: ['gm' 'wm' 'ct']
Metrics: ['volume' 'thickness']

Available summary columns: ['volume_mm3', 'robust_std', 'mad_median', 'mean', 'std', 'median', 'sum']


In [15]:
# Render the session table loaded by the anatomical loader for test_subject / test_session.
# As the cell's last expression it is shown by the notebook's rich display.
session_data

Unnamed: 0,index,label,network_label,label_7network,index_17network,label_17network,network_label_17network,atlas_name,network_id,volume_mm3,...,robust_std,mad_median,mean,std,median,sum,modality,metric,subject_code,session_id
0,1,LH_Vis_1,Vis,7Networks_LH_Vis_1,61.0,17Networks_LH_DorsAttnA_TempOcc_2,DorsAttnA,4S456,,2875.500,...,0.174347,0.096446,0.499111,0.200660,0.578435,425.242668,gm,volume,0048R,202303161349
1,2,LH_Vis_2,Vis,7Networks_LH_Vis_2,193.0,17Networks_LH_DefaultC_PHC_2,DefaultC,4S456,,3398.625,...,0.215922,0.116727,0.588974,0.230079,0.678658,593.096686,gm,volume,0048R,202303161349
2,3,LH_Vis_3,Vis,7Networks_LH_Vis_3,1.0,17Networks_LH_VisCent_ExStr_1,VisCent,4S456,,2592.000,...,0.205142,0.110522,0.474460,0.205142,0.548793,364.385081,gm,volume,0048R,202303161349
3,4,LH_Vis_4,Vis,7Networks_LH_Vis_4,13.0,17Networks_LH_VisPeri_ExStrInf_1,VisPeri,4S456,,3418.875,...,0.233215,0.132762,0.539623,0.233215,0.626174,546.638369,gm,volume,0048R,202303161349
4,5,LH_Vis_5,Vis,7Networks_LH_Vis_5,2.0,17Networks_LH_VisCent_ExStr_2,VisCent,4S456,,3395.250,...,0.235117,0.179390,0.422563,0.235117,0.481146,425.098838,gm,volume,0048R,202303161349
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1363,452,Cerebellar_Region6,,,,,,Cerebellum,,28535.625,...,0.876409,0.551500,1.618177,0.876409,1.512700,14856.485625,ct,thickness,0048R,202303161349
1364,453,Cerebellar_Region7,,,,,,Cerebellum,,11194.875,...,0.358251,0.280800,1.518926,0.499398,1.561700,5308.646766,ct,thickness,0048R,202303161349
1365,454,Cerebellar_Region8,,,,,,Cerebellum,,16031.250,...,0.621113,0.381800,1.957049,0.714323,2.103200,9630.639757,ct,thickness,0048R,202303161349
1366,455,Cerebellar_Region9,,,,,,Cerebellum,,8322.750,...,0.796211,0.505000,1.915157,0.796211,1.885400,4985.154074,ct,thickness,0048R,202303161349


In [12]:
from neuroalign.data.loaders.anatomical import _select_xml

# Debug aid: show which XML file `_select_xml` returns for the single test session.
subject, session = test_subject, test_session
cat12_dir = anat_loader.get_cat12_directory(subject, session)
_select_xml(cat12_dir=cat12_dir, subject=test_subject, session=test_session)

PosixPath('/media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new/sub-0048R/ses-202303161349/anat/cat_sub-0048R_ses-202303161349_ce-corrected_T1w.xml')

In [19]:
# Direct access to the diffusion loader (for debugging/exploration).
diff_loader = DiffusionLoader(
    qsiparc_path=get_path("QSIPARC_PATH"),
    qsirecon_path=get_path("QSIRECON_PATH"),
)

print(f"Available workflows: {diff_loader.workflows}")

# Load the same single session that was used for the anatomical loader.
diff_data = diff_loader.load_session(test_subject, test_session)
if diff_data is None:
    print("Session not found")
else:
    print(f"\nLoaded session: {len(diff_data)} rows")
    # Distinct values of each descriptor column, in the order the original cell printed them.
    for caption, column in (("Workflows", "workflow"), ("Models", "model"), ("Params", "param")):
        print(f"{caption}: {diff_data[column].unique()}")
    print(f"\nColumns: {diff_data.columns.tolist()}")

Available workflows: ['DSIStudio', 'MRtrix3_act-HSVS', 'DIPYDKI', 'DIPYMAPMRI', 'AMICONODDI']

Loaded session: 27816 rows
Workflows: ['DSIStudio' 'MRtrix3_act-HSVS' 'DIPYDKI' 'DIPYMAPMRI' 'AMICONODDI']
Models: ['tensor' 'gqi' 'noddi' 'dki' 'dkimicro' 'mapmri']
Params: ['tzz' 'ha' 'txz' 'tyz' 'rd' 'iso' 'rd2' 'fa' 'rd1' 'md' 'txx' 'ad' 'gfa'
 'txy' 'tyy' 'qa' 'icvf' 'rmse' 'od' 'nrmse' 'tf' 'isovf' 'ak' 'mkt'
 'sphericity' 'tortuosity' 'axonald' 'linearity' 'mk' 'kfa' 'rk' 'ade'
 'trace' 'rde' 'planarity' 'awf' 'mapcoeffs' 'msd' 'rtpp' 'rtop' 'lapnorm'
 'qiv' 'rtap']

Columns: ['index', 'label', 'network_label', 'label_7network', 'index_17network', 'label_17network', 'network_label_17network', 'atlas_name', 'network_id', 'volume_mm3', 'voxel_count', 'z_filtered_mean', 'z_filtered_std', 'iqr_filtered_mean', 'iqr_filtered_std', 'robust_mean', 'robust_std', 'mad_median', 'mean', 'std', 'median', 'workflow', 'model', 'param', 'desc', 'subject_code', 'session_id']


## Notes on the Data Structure

### Long Format
- Stored in `data/processed/long/`
- One file per modality (anatomical_gm.parquet, anatomical_ct.parquet, etc.)
- Preserves ALL parcellator columns: volume_mm3, mean, std, median, sum, robust_std, mad_median

### Wide Format  
- Stored in `data/processed/wide/`
- One file per metric: `{modality}_{metric}.parquet` (e.g., gm_volume_mm3.parquet)
- Ready for modeling: one row per session, regions as columns

### TIV
- Stored separately in `data/processed/tiv.parquet`
- Use `include_tiv=True` when loading features to get TIV for normalization
- Requires MATLAB/CAT12 configuration for calculation

In [None]:
# Space for experimentation
# Example: Compare different metrics for CT
# ct_long = store.load_long("anatomical_ct")
# for metric in ["mean", "median", "robust_std"]:
#     if metric in ct_long.columns:
#         print(f"{metric}: {ct_long[metric].describe()}")

Unnamed: 0,ScanID,Status,Lab,Name,ID,Cellular No.,Email,Gender,DOB,ScanDate,...,ScanTag,SubjectCode,HebrewName,No of Scan,PrivacyStatement,UID,session_id,subject_code,dicom_path,match_type
655,20250704_1402,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],07/04/2025,...,,BJJ_L_213,[REDACTED],1.0,07/04/2025: Yes,S003351,202507041402,BJJL213,/mnt/62/Raw_Data/20250704_1402,exact
1479,20241020_1758,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],10/20/2024,...,,BJJ_L_97,[REDACTED],1.0,10/20/2024: Yes,S002760,202410201758,BJJL97,/mnt/62/Raw_Data/20241020_1758,exact
802,20250518_0901,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],5/18/2025,...,Post,FNC_L_06,[REDACTED],2.0,02/20/2025: Yes,S003051,202505180901,FNCL06,/mnt/62/Raw_Data/20250518_0901,exact
69,20251230_0905,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],12/30/2025,...,Post,GYM_L_25,[REDACTED],4.0,02/25/2025: Yes,S003060,202512300905,GYML25,,missing
2561,20230703_1706,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],7/3/2023,...,Post,991,[REDACTED],2.0,05/15/2023: Yes,S001829,202307031706,991,/mnt/62/Raw_Data/20230703_1706,exact
33,20260107_1013,Performed,YBH,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Female,[REDACTED],01/07/2026,...,,YBH10074,[REDACTED],1.0,Folder not found,,202601071013,YBH10074,,missing
4359,20191128_1751,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Female,[REDACTED],11/28/2019,...,,107,[REDACTED],1.0,11/28/2019: No,S000529,201911281751,107,/mnt/62/Raw_Data/20191128_1751,exact
4489,20190207_1703,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],2/7/2019,...,Post,BAL39,[REDACTED],2.0,"02/07/2019: No, 02/07/2019: No",S000397,201902071703,BAL39,,missing
471,20250827_1151,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],08/27/2025,...,Pre,BJJ_L_253,[REDACTED],1.0,Folder not found,S003465,202508271151,BJJL253,/mnt/62/Raw_Data/20250827_1151,exact
2584,20230626_0831,Performed,YA,[REDACTED],[REDACTED],[REDACTED],[REDACTED],Male,[REDACTED],6/26/2023,...,Pre,1047,[REDACTED],2.0,06/26/2023: Yes,S001927,202306260831,1047,/mnt/62/Raw_Data/20230626_0831,exact
