# Anatomical Loader Debug Notebook

This notebook allows manual inspection and debugging of the AnatomicalLoader.

## What it loads:
- Gray Matter (GM) volume from CAT12 parcellated TSVs
- White Matter (WM) volume from CAT12 parcellated TSVs
- Cortical Thickness (CT) from CAT12 parcellated TSVs
- Total Intracranial Volume (TIV) from CAT12 XML files

In [1]:
import os
from pathlib import Path
from dotenv import load_dotenv
import pandas as pd
import numpy as np

# Load environment variables
load_dotenv()

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', None)

## 1. Configuration

Set up paths from environment variables or override manually.

In [2]:
# Load paths from environment (or override below)
CAT12_ROOT = os.getenv("CAT12_ROOT")
CAT12_PARCELLATED_ROOT = os.getenv("CAT12_PARCELLATED_ROOT")
SESSIONS_CSV = os.getenv("SESSIONS_CSV")
ATLAS_NAME = os.getenv("ATLAS_NAME", "4S456Parcels")

# Optionally override paths here:
# CAT12_ROOT = "/path/to/cat12"
# CAT12_PARCELLATED_ROOT = "/path/to/cat12_parcellated"
# SESSIONS_CSV = "/path/to/sessions.csv"

print("Configuration:")
print(f"  CAT12_ROOT: {CAT12_ROOT}")
print(f"  CAT12_PARCELLATED_ROOT: {CAT12_PARCELLATED_ROOT}")
print(f"  SESSIONS_CSV: {SESSIONS_CSV}")
print(f"  ATLAS_NAME: {ATLAS_NAME}")

Configuration:
  CAT12_ROOT: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new/
  CAT12_PARCELLATED_ROOT: /media/storage/yalab-dev/BIDS/derivatives/cat12_parcellated
  SESSIONS_CSV: /home/galkepler/Downloads/linked_sessions.csv
  ATLAS_NAME: 4S456Parcels


## 2. Verify Paths Exist

In [3]:
def check_path(path, name):
    if path is None:
        print(f"  {name}: NOT SET")
        return False
    p = Path(path)
    exists = p.exists()
    print(f"  {name}: {'EXISTS' if exists else 'MISSING'} - {p}")
    return exists

print("Path verification:")
check_path(CAT12_ROOT, "CAT12_ROOT")
check_path(CAT12_PARCELLATED_ROOT, "CAT12_PARCELLATED_ROOT")
check_path(SESSIONS_CSV, "SESSIONS_CSV")

Path verification:
  CAT12_ROOT: EXISTS - /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new
  CAT12_PARCELLATED_ROOT: EXISTS - /media/storage/yalab-dev/BIDS/derivatives/cat12_parcellated
  SESSIONS_CSV: EXISTS - /home/galkepler/Downloads/linked_sessions.csv


True

## 3. Explore Directory Structure

In [4]:
# Explore CAT12 parcellated directory structure
if CAT12_PARCELLATED_ROOT:
    root = Path(CAT12_PARCELLATED_ROOT)
    print("CAT12 Parcellated Root Structure:")
    print(f"Root: {root}")
    
    # List top-level directories
    if root.exists():
        for item in sorted(root.iterdir())[:10]:
            print(f"  {item.name}/" if item.is_dir() else f"  {item.name}")
        
        # Try to find subject directories
        cat12_dir = root / "cat12"
        if cat12_dir.exists():
            subjects = sorted([d.name for d in cat12_dir.iterdir() if d.is_dir()])[:5]
            print(f"\nFirst 5 subjects in cat12/: {subjects}")

CAT12 Parcellated Root Structure:
Root: /media/storage/yalab-dev/BIDS/derivatives/cat12_parcellated
  cat12/

First 5 subjects in cat12/: ['sub-0048R', 'sub-0069R', 'sub-0110R', 'sub-0313R', 'sub-0396R']


In [5]:
# Explore CAT12 root (for XML files)
if CAT12_ROOT:
    root = Path(CAT12_ROOT)
    print("CAT12 Root Structure (for XML/TIV):")
    print(f"Root: {root}")
    
    if root.exists():
        subjects = sorted([d.name for d in root.iterdir() if d.is_dir() and d.name.startswith("sub-")])[:5]
        print(f"\nFirst 5 subjects: {subjects}")
        
        # Look at one subject's structure
        if subjects:
            first_sub = root / subjects[0]
            print(f"\nStructure of {subjects[0]}:")
            for item in sorted(first_sub.rglob("*"))[:20]:
                rel_path = item.relative_to(first_sub)
                print(f"  {rel_path}")

CAT12 Root Structure (for XML/TIV):
Root: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new

First 5 subjects: ['sub-0001', 'sub-0010', 'sub-0011', 'sub-0012', 'sub-0013']

Structure of sub-0001:
  ses-201902270901
  ses-201902270901/anat
  ses-201902270901/anat/catROI_sub-0001_ses-201902270901_ce-corrected_T1w.mat
  ses-201902270901/anat/catROI_sub-0001_ses-201902270901_ce-corrected_T1w.xml
  ses-201902270901/anat/cat_sub-0001_ses-201902270901_ce-corrected_T1w.mat
  ses-201902270901/anat/cat_sub-0001_ses-201902270901_ce-corrected_T1w.xml
  ses-201902270901/anat/catlog_sub-0001_ses-201902270901_ce-corrected_T1w.txt
  ses-201902270901/anat/catreport_sub-0001_ses-201902270901_ce-corrected_T1w.pdf
  ses-201902270901/anat/catreportj_sub-0001_ses-201902270901_ce-corrected_T1w.jpg
  ses-201902270901/anat/ctsub-0001_ses-201902270901_ce-corrected_T1w.nii
  ses-201902270901/anat/lh.central.sub-0001_ses-201902270901_ce-corrected_T1w.gii
  ses-201902270901/anat/lh.pbt.sub-0001_ses-201902

## 4. Load Sessions CSV

In [6]:
if SESSIONS_CSV and Path(SESSIONS_CSV).exists():
    sessions = pd.read_csv(SESSIONS_CSV, dtype={"subject_code": str, "session_id": str})
    print(f"Sessions CSV loaded: {len(sessions)} rows")
    print(f"\nColumns: {list(sessions.columns)}")
    print(f"\nFirst 10 sessions:")
    display(sessions.head(10))
else:
    print("Sessions CSV not found")
    sessions = None

Sessions CSV loaded: 4630 rows

Columns: ['ScanID', 'Status', 'Lab', 'Name', 'ID', 'Cellular No.', 'Email', 'Gender', 'DOB', 'ScanDate', 'Age@Scan', 'Weight', 'Height', 'Protocol', 'Study', 'Group', 'Unnamed: 16', 'ScanTag', 'SubjectCode', 'HebrewName', 'No of Scan', 'PrivacyStatement', 'UID', 'session_id', 'subject_code', 'dicom_path', 'match_type']

First 10 sessions:


Unnamed: 0,ScanID,Status,Lab,Name,ID,Cellular No.,Email,Gender,DOB,ScanDate,Age@Scan,Weight,Height,Protocol,Study,Group,Unnamed: 16,ScanTag,SubjectCode,HebrewName,No of Scan,PrivacyStatement,UID,session_id,subject_code,dicom_path,match_type
0,20260120_1926,Performed,YBH,Maria Kamar,322393505.0,543391042,Maria.kamar2012@gmail.com,Female,05/23/2001,01/20/2026,24.66256,47.6,1.523,SNBB_YBH,YBH_Research,,1356-14,,YBH10096,מריה קמר,1.0,,,202601201926,YBH10096,,missing
1,20260120_1649,Performed,YA,Shalev Guli,21607742.0,542815084,Shalevguli@gmail.com,Male,09/12/1985,01/20/2026,40.355921,66.8,1.751,SNBB_YA_Plasticity,YA_Music,Professional,1356-14,Pre,YA1393,שלו גולי,1.0,Folder not found,,202601201649,YA1393,,missing
2,20260120_1256,Performed,YA,Or Bareli,207447038.0,508323783,orbar91@gmail.com,Male,08/06/1998,01/20/2026,27.457906,86.5,1.814,SNBB_YA_Plasticity,YA_Music,Professional,1356-14,Pre,YA1378,אור בראלי,1.0,Folder not found,,202601201256,YA1378,,missing
3,20260120_1148,Performed,YA,Jonathan Katzir,200122885.0,544353735,Mailyonik@gmail.com,Male,06/28/1987,01/20/2026,38.565366,80.0,1.802,SNBB_YA_Plasticity,YA_Ageing,Learner,1356-14,Pre,AGN_40,יהונתן קציר,1.0,12/09/2021: No,,202601201148,AGN40,,missing
4,20260120_0845,Performed,SNBB,Daniel Lerer,314857368.0,524696866,Daniellerer445@gmail.com,Male,03/16/2000,01/20/2026,25.848049,60.1,1.724,SNBB_DB,DB_SNBB,,1356-14,,BB01648,דניאל לרר,1.0,,,202601200845,BB01648,,missing
5,20260119_1859,Performed,SNBB,Netanel Burbia,213359102.0,506957055,netanelburbia@gmail.com,Male,03/19/2003,01/19/2026,22.839151,66.7,1.73,SNBB_DB,DB_SNBB,,1356-14,,BB01643,נתנאל בורביע,1.0,,,202601191859,BB01643,,missing
6,20260119_1759,Performed,SNBB,Alma Talker,325511574.0,506080318,Talker20744@gmail.com,Female,10/12/2003,01/19/2026,22.272416,52.8,1.541,SNBB_DB,DB_SNBB,,1356-14,,BB01640,אלמה טלקר,1.0,,,202601191759,BB01640,,missing
7,20260119_1658,Performed,SNBB,Ilanit Metsger,211342449.0,549171922,ilanit2610@gmail.com,Female,10/26/2000,01/19/2026,25.232033,65.85,1.695,SNBB_DB,DB_SNBB,,1356-14,,BB01637,אילנית מצגר,1.0,,,202601191658,BB01637,,missing
8,20260119_1340,Performed,YBH,Yael Tovah Michaelson,213466097.0,544864744,Yaelmichaelson@gmail.com,Female,01/05/2003,01/19/2026,23.039014,51.0,1.63,SNBB_YBH,YBH_Research,,1356-14,,YBH10068,יעל טובה מיכלסון,1.0,,,202601191340,YBH10068,,missing
9,20260119_1204,Performed,YA,Ittai Shamir,200655223.0,544333941,Ittaisham@gmail.com,Male,04/25/1988,01/19/2026,37.735797,86.0,1.835,SNBB_YA_Plasticity,YA_Pottery,Learner,1356-14,Post,PTR_L_04,איתי שמיר,2.0,12/08/2025: Yes,,202601191204,PTRL04,,missing


## 5. Initialize AnatomicalLoader

In [7]:
from neuroalign_preprocessing.loaders import AnatomicalLoader

loader = AnatomicalLoader(
    cat12_root=CAT12_ROOT,
    cat12_parcellated_root=CAT12_PARCELLATED_ROOT,
    atlas_name=ATLAS_NAME,
    n_jobs=1  # Use serial for debugging
)

print(f"AnatomicalLoader initialized:")
print(f"  cat12_root: {loader.paths.cat12_root}")
print(f"  cat12_parcellated_root: {loader.paths.cat12_parcellated_root}")
print(f"  atlas_name: {loader.paths.atlas_name}")

AnatomicalLoader initialized:
  cat12_root: /media/storage/yalab-dev/BIDS/derivatives/CAT12.9_2577.new
  cat12_parcellated_root: /media/storage/yalab-dev/BIDS/derivatives/cat12_parcellated
  atlas_name: 4S456Parcels


## 6. Test Single Session Loading

Load a single session to inspect the data structure.

In [None]:
# Pick the first session from the CSV
if sessions is not None and len(sessions) > 0:
    test_subject = "0048R"
    test_session = "202303161349"
    print(f"Testing with: sub-{test_subject}_ses-{test_session}")
else:
    # Manual override if no sessions CSV
    test_subject = "001"
    test_session = "01"
    print(f"Using manual test subject: sub-{test_subject}_ses-{test_session}")

Testing with: sub-YBH10096_ses-202601201926


In [9]:
# Check if session directory exists
session_dir = loader.get_session_directory(test_subject, test_session)
print(f"Session directory: {session_dir}")

if session_dir:
    print(f"\nFiles in session directory:")
    for f in sorted(session_dir.iterdir()):
        print(f"  {f.name}")

Session directory: None


In [None]:
# Check CAT12 directory for XML files
cat12_dir = loader.get_cat12_directory(test_subject, test_session)
print(f"CAT12 directory: {cat12_dir}")

if cat12_dir:
    print(f"\nXML files in CAT12 directory:")
    for f in sorted(cat12_dir.glob("*.xml")):
        print(f"  {f.name}")

In [None]:
# Load single session
single_session_df = loader.load_session(
    subject=test_subject,
    session=test_session,
    include_metadata=True,
    include_tiv=True
)

if single_session_df is not None:
    print(f"Single session loaded: {len(single_session_df)} rows")
    print(f"\nColumns: {list(single_session_df.columns)}")
    print(f"\nModalities: {single_session_df['modality'].unique().tolist()}")
    print(f"Metrics: {single_session_df['metric'].unique().tolist()}")
    
    if 'tiv' in single_session_df.columns:
        print(f"\nTIV: {single_session_df['tiv'].iloc[0]:.2f} mL")
else:
    print("Failed to load session")

In [None]:
# Display sample data for each modality
if single_session_df is not None:
    for modality in single_session_df['modality'].unique():
        print(f"\n{'='*60}")
        print(f"MODALITY: {modality}")
        print(f"{'='*60}")
        mod_df = single_session_df[single_session_df['modality'] == modality]
        print(f"Rows: {len(mod_df)}")
        display(mod_df.head(10))

## 7. Inspect Raw TSV Files

Look at the raw TSV files to verify data integrity.

In [None]:
# Load and display raw TSV files
if session_dir:
    for tissue in ["GM", "WM", "CT"]:
        tsv_files = list(session_dir.glob(f"*_tissue-{tissue}_parc.tsv"))
        if tsv_files:
            print(f"\n{'='*60}")
            print(f"RAW TSV: {tissue}")
            print(f"File: {tsv_files[0].name}")
            print(f"{'='*60}")
            raw_df = pd.read_csv(tsv_files[0], sep="\t")
            print(f"Shape: {raw_df.shape}")
            print(f"Columns: {list(raw_df.columns)}")
            display(raw_df.head(5))
        else:
            print(f"\nNo {tissue} TSV file found")

## 8. Inspect TIV Extraction from XML

In [None]:
import xml.etree.ElementTree as ET

if cat12_dir:
    xml_files = list(cat12_dir.glob("cat_*.xml"))
    if xml_files:
        xml_path = xml_files[0]
        print(f"Inspecting XML: {xml_path.name}")
        
        tree = ET.parse(xml_path)
        root = tree.getroot()
        
        print(f"\nRoot tag: {root.tag}")
        print(f"\nAll vol_TIV elements found:")
        for i, vol_tiv in enumerate(root.iter("vol_TIV")):
            parent = None
            for p in root.iter():
                if vol_tiv in p:
                    parent = p.tag
                    break
            text = vol_tiv.text[:50] if vol_tiv.text else "None"
            print(f"  {i+1}. text='{text}...'")
        
        # Extract TIV using the loader's method
        from neuroalign_preprocessing.loaders.anatomical import _extract_tiv_from_xml
        tiv = _extract_tiv_from_xml(xml_path)
        print(f"\nExtracted TIV: {tiv} mL")

## 9. Load Multiple Sessions

In [None]:
# Load a small batch of sessions for testing
if sessions is not None:
    # Create a temporary CSV with first N sessions
    n_test = min(5, len(sessions))
    test_sessions = sessions.head(n_test)
    temp_csv = Path("/tmp/test_sessions.csv")
    test_sessions.to_csv(temp_csv, index=False)
    
    print(f"Testing batch load with {n_test} sessions...")
    
    batch_df = loader.load_sessions(
        sessions_csv=temp_csv,
        n_jobs=1,
        progress=True,
        calculate_tiv=True
    )
    
    print(f"\nBatch loaded: {len(batch_df)} rows")
    print(f"Unique sessions: {batch_df[['subject_code', 'session_id']].drop_duplicates().shape[0]}")

In [None]:
# Summary statistics
if 'batch_df' in dir() and batch_df is not None:
    print("Summary by modality:")
    summary = batch_df.groupby(['modality', 'metric']).agg({
        'label': 'nunique',
        'subject_code': 'nunique'
    }).rename(columns={'label': 'n_regions', 'subject_code': 'n_subjects'})
    display(summary)
    
    print("\nTIV statistics:")
    if 'tiv' in batch_df.columns:
        tiv_stats = batch_df.groupby(['subject_code', 'session_id'])['tiv'].first()
        print(f"  Mean: {tiv_stats.mean():.2f} mL")
        print(f"  Std: {tiv_stats.std():.2f} mL")
        print(f"  Range: {tiv_stats.min():.2f} - {tiv_stats.max():.2f} mL")

## 10. Data Quality Checks

In [None]:
if 'batch_df' in dir() and batch_df is not None:
    print("Data Quality Checks:")
    print("="*60)
    
    # Check for missing values
    print("\n1. Missing values per column:")
    missing = batch_df.isnull().sum()
    missing_pct = (missing / len(batch_df) * 100).round(2)
    missing_df = pd.DataFrame({'missing': missing, 'pct': missing_pct})
    display(missing_df[missing_df['missing'] > 0])
    
    # Check for negative values in volume columns
    print("\n2. Negative values check:")
    numeric_cols = batch_df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        neg_count = (batch_df[col] < 0).sum()
        if neg_count > 0:
            print(f"  WARNING: {col} has {neg_count} negative values")
    print("  Done (no warnings = good)")
    
    # Check for duplicate entries
    print("\n3. Duplicate check:")
    dup_cols = ['subject_code', 'session_id', 'modality', 'label']
    dups = batch_df.duplicated(subset=dup_cols, keep=False).sum()
    print(f"  Duplicate rows: {dups}")
    
    # Check region counts are consistent
    print("\n4. Region count consistency:")
    region_counts = batch_df.groupby(['subject_code', 'session_id', 'modality'])['label'].nunique()
    print(f"  Regions per modality per session:")
    print(region_counts.groupby('modality').agg(['min', 'max', 'mean']).to_string())

## 11. Visualize Sample Data

In [None]:
# Simple histogram of GM volumes
if 'batch_df' in dir() and batch_df is not None:
    import matplotlib.pyplot as plt
    
    gm_df = batch_df[batch_df['modality'] == 'gm']
    if 'volume_mm3' in gm_df.columns and len(gm_df) > 0:
        fig, ax = plt.subplots(1, 1, figsize=(10, 4))
        gm_df['volume_mm3'].hist(bins=50, ax=ax)
        ax.set_xlabel('Volume (mm3)')
        ax.set_ylabel('Count')
        ax.set_title('Distribution of Gray Matter Regional Volumes')
        plt.tight_layout()
        plt.show()

## 12. Debug Specific Issues

Use this section to debug specific issues you encounter.

In [None]:
# Debug a specific subject/session
debug_subject = ""  # Fill in subject code
debug_session = ""  # Fill in session ID

if debug_subject and debug_session:
    print(f"Debugging: sub-{debug_subject}_ses-{debug_session}")
    
    # Check paths
    sess_dir = loader.get_session_directory(debug_subject, debug_session)
    cat12_dir = loader.get_cat12_directory(debug_subject, debug_session)
    
    print(f"\nSession dir exists: {sess_dir is not None}")
    print(f"CAT12 dir exists: {cat12_dir is not None}")
    
    if sess_dir:
        print(f"\nFiles in session dir:")
        for f in sess_dir.iterdir():
            print(f"  {f.name}")
    
    # Try loading
    debug_df = loader.load_session(debug_subject, debug_session)
    if debug_df is not None:
        display(debug_df.head())
    else:
        print("\nFailed to load session data")