# Preparing the Scan Files for Exploration and Analysis

### Libraries

In [1]:
from nilearn import plotting, image
from nilearn.input_data import NiftiMasker
import nibabel as nib
from bids import BIDSLayout, BIDSValidator
import nibabel as nib
import numpy as np
import matplotlib.pyplot as plt
from sklearn.decomposition import FastICA
from nilearn.image import index_img
from nipype.interfaces import fsl
from nipype.pipeline.engine import Workflow, Node
from nipype import Workflow, Node
from nipype.interfaces.fsl import MCFLIRT
import os
from scipy.stats import ttest_ind
import pandas as pd
import collections
from ydata_profiling import ProfileReport
import random



In [2]:
base_dir = '/Volumes/Expansion/ADHD/yomo-20231204_124111'

In [None]:
# Not the nicest function, but it works to extract every nifti files from the dataset
# .. despite the lack of standardization in the folder structure
# i.e., some records have multiple rest files
def load_fmri_data_with_metadata(base_dir):
    fmri_records = []
    for subject_folder in os.listdir(base_dir):
        subject_path = os.path.join(base_dir, subject_folder)
        if os.path.isdir(subject_path):
            # Search for any subdirectories that contain 'anat', 'rest', or 'func'
            for dir_name in os.listdir(subject_path):
                if any(scan_type in dir_name for scan_type in ['anat', 'rest', 'func']):
                    scan_path = os.path.join(subject_path, dir_name)
                    nifti_path = os.path.join(scan_path, 'NIfTI')
                    if os.path.isdir(nifti_path):
                        for file in os.listdir(nifti_path):
                            if file.endswith('.nii.gz') and not file.startswith('.'):
                                file_path = os.path.join(nifti_path, file)
                                fmri_records.append({
                                    "subject_id": subject_folder,
                                    "scan_type": dir_name,
                                    "file_path": file_path
                                })
    return pd.DataFrame(fmri_records)

fmri_data_df = load_fmri_data_with_metadata(base_dir)

In [4]:
fmri_data_df

Unnamed: 0,subject_id,scan_type,file_path
0,NeuroIMAGE_4020830_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
1,NeuroIMAGE_4020830_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
2,Peking_3248920_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...
3,Peking_3248920_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...
4,Peking_3767334_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...
...,...,...,...
2383,NYU_10032_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
2384,NYU_10032_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
2385,NYU_10030_1,rest_2,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
2386,NYU_10030_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...


In [5]:
# Split 'subject_id' into three parts
splits = fmri_data_df['subject_id'].str.split('_', expand=True)

# Assign the split parts to new columns
fmri_data_df['site'] = splits[0]
fmri_data_df['ID'] = splits[1]
fmri_data_df['session'] = splits[2]

In [6]:
# Reorder columns
fmri_data_df = fmri_data_df[['ID', 'site', 'session', 'subject_id', 'scan_type', 'file_path']]

# Rename 'subject_id' to 'combinedData'
fmri_data_df = fmri_data_df.rename(columns={'subject_id': 'combinedData'})
fmri_data_df.head()

Unnamed: 0,ID,site,session,combinedData,scan_type,file_path
0,4020830,NeuroIMAGE,1,NeuroIMAGE_4020830_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
1,4020830,NeuroIMAGE,1,NeuroIMAGE_4020830_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...
2,3248920,Peking,1,Peking_3248920_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...
3,3248920,Peking,1,Peking_3248920_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...
4,3767334,Peking,1,Peking_3767334_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...


In [7]:
# A total of 2388 scans from 973 unique subjects
fmri_data_df.shape

(2388, 6)

In [8]:
labels_df = pd.read_csv('/Volumes/Expansion/ADHD/yomo-20231204_124111/adhd200_preprocessed_phenotypics.tsv', sep='\t')

In [9]:
labels_df['DX'].value_counts()

DX
0          585
1          212
3          137
pending     26
2           13
Name: count, dtype: int64

In [10]:
# For simplicity, we will only consider ADHD vs. Healthy controls and not the subtypes
def binary_label(dx_value):
    if dx_value == '0':
        return 0
    elif dx_value in ['1', '2', '3']:
        return 1
    elif dx_value == 'pending':
        return np.nan
    else:
        return None  # or some default value

labels_df['diagnosis'] = labels_df['DX'].map(binary_label)

In [11]:
labels_df['diagnosis'].value_counts()

diagnosis
0.0    585
1.0    362
Name: count, dtype: int64

In [12]:
labels_df.head()

Unnamed: 0,ScanDir ID,Site,Gender,Age,Handedness,DX,Secondary Dx,ADHD Measure,ADHD Index,Inattentive,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status,QC_Athena,QC_NIAK,diagnosis
0,2371032,3,0.0,10.73,1,0,,2,47,55,43,1.0,121.0,119.0,,122.0,1,1.0,1.0,0.0
1,2026113,3,0.0,12.99,1,1,,2,90,89,78,1.0,122.0,108.0,,106.0,1,1.0,1.0,1.0
2,3434578,3,0.0,8.12,1,0,,2,42,42,43,1.0,85.0,98.0,,89.0,1,1.0,1.0,0.0
3,8628223,3,0.0,10.81,1,0,Simple phobia,2,42,49,49,1.0,85.0,86.0,,97.0,1,1.0,1.0,0.0
4,1623716,3,0.0,12.65,1,1,,2,87,90,90,1.0,89.0,88.0,,89.0,1,1.0,1.0,1.0


In [13]:
labels_df.shape

(973, 20)

In [14]:
fmri_data_df.shape

(2388, 6)

In [15]:
fmri_data_df['ID'] = fmri_data_df['ID'].astype('int64')

In [16]:
# Merging the labels and pheno data
merged_df = pd.merge(fmri_data_df, labels_df, left_on='ID', right_on='ScanDir ID', how='left')
merged_df.head()

Unnamed: 0,ID,site,session,combinedData,scan_type,file_path,ScanDir ID,Site,Gender,Age,...,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status,QC_Athena,QC_NIAK,diagnosis
0,4020830,NeuroIMAGE,1,NeuroIMAGE_4020830_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...,4020830,4,0.0,18.19,...,,,,,,,,1.0,1.0,0.0
1,4020830,NeuroIMAGE,1,NeuroIMAGE_4020830_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...,4020830,4,0.0,18.19,...,,,,,,,,1.0,1.0,0.0
2,3248920,Peking,1,Peking_3248920_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...,3248920,1,1.0,12.17,...,10.0,3.0,-999.0,-999.0,,-999.0,1.0,1.0,1.0,0.0
3,3248920,Peking,1,Peking_3248920_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...,3248920,1,1.0,12.17,...,10.0,3.0,-999.0,-999.0,,-999.0,1.0,1.0,1.0,0.0
4,3767334,Peking,1,Peking_3767334_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...,3767334,1,1.0,8.33,...,25.0,3.0,112.0,104.0,-999.0,109.0,1.0,1.0,1.0,1.0


In [17]:
merged_df.columns

Index(['ID', 'site', 'session', 'combinedData', 'scan_type', 'file_path',
       'ScanDir ID', 'Site', 'Gender', 'Age', 'Handedness', 'DX',
       'Secondary Dx', 'ADHD Measure', 'ADHD Index', 'Inattentive',
       'Hyper/Impulsive', 'IQ Measure', 'Verbal IQ', 'Performance IQ',
       'Full2 IQ', 'Full4 IQ', 'Med Status', 'QC_Athena', 'QC_NIAK',
       'diagnosis'],
      dtype='object')

In [18]:
merged_df.shape

(2388, 26)

In [19]:
# For some records we have more than 1 rest session for some reason (not discloused in the documentation)
merged_df['scan_type'].value_counts()

scan_type
rest_1    980
anat_1    973
rest_2    307
rest_3    114
rest_4      6
rest_6      4
rest_5      4
Name: count, dtype: int64

In [20]:
# Find the IDs for each rest category
ids_rest_2 = set(merged_df[merged_df['scan_type'] == 'rest_2']['ID'])
ids_rest_3 = set(merged_df[merged_df['scan_type'] == 'rest_3']['ID'])
ids_rest_4 = set(merged_df[merged_df['scan_type'] == 'rest_4']['ID'])
ids_rest_5 = set(merged_df[merged_df['scan_type'] == 'rest_5']['ID'])
ids_rest_6 = set(merged_df[merged_df['scan_type'] == 'rest_6']['ID'])

# Combine IDs for rest_3, rest_4, rest_5, and rest_6
ids_rest_3_to_6 = ids_rest_3.union(ids_rest_4, ids_rest_5, ids_rest_6)

In [21]:
# Double check if ids_rest_2 includes all ids from rest_3 to rest_6
# Logically, yes, as we won't have rest 2-6 without rest 2, but just to be sure
if ids_rest_3_to_6.issubset(ids_rest_2):
    print("All IDs from rest_3, rest_4, rest_5, and rest_6 are included in rest_2 IDs.")
else:
    print("There are some IDs in rest_3 to rest_6 that are not included in rest_2 IDs.")


All IDs from rest_3, rest_4, rest_5, and rest_6 are included in rest_2 IDs.


In [22]:
# Filter the DataFrame
filtered_for_diagnosis = merged_df[merged_df['ID'].isin(ids_rest_2)]

# Display the ID and diagnosis columns
print(filtered_for_diagnosis[['ID', 'diagnosis']])


           ID  diagnosis
8     1340333        1.0
9     1340333        1.0
10    1340333        1.0
11    1340333        1.0
18    3560456        1.0
...       ...        ...
2381  3470141        1.0
2382  3470141        1.0
2385    10030        1.0
2386    10030        1.0
2387    10030        1.0

[1047 rows x 2 columns]


In [23]:
filtered_for_diagnosis

Unnamed: 0,ID,site,session,combinedData,scan_type,file_path,ScanDir ID,Site,Gender,Age,...,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status,QC_Athena,QC_NIAK,diagnosis
8,1340333,OHSU,1,OHSU_1340333_1,rest_3,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,1340333,6,0.0,7.42,...,90,2.0,,,,98.0,2,1.0,1.0,1.0
9,1340333,OHSU,1,OHSU_1340333_1,rest_2,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,1340333,6,0.0,7.42,...,90,2.0,,,,98.0,2,1.0,1.0,1.0
10,1340333,OHSU,1,OHSU_1340333_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,1340333,6,0.0,7.42,...,90,2.0,,,,98.0,2,1.0,1.0,1.0
11,1340333,OHSU,1,OHSU_1340333_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,1340333,6,0.0,7.42,...,90,2.0,,,,98.0,2,1.0,1.0,1.0
18,3560456,OHSU,1,OHSU_3560456_1,rest_3,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,3560456,6,0.0,7.83,...,63,2.0,,,,126.0,-999,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2381,3470141,OHSU,1,OHSU_3470141_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,3470141,6,1.0,8.50,...,90,2.0,,,,104.0,2,1.0,1.0,1.0
2382,3470141,OHSU,1,OHSU_3470141_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/O...,3470141,6,1.0,8.50,...,90,2.0,,,,104.0,2,1.0,1.0,1.0
2385,10030,NYU,1,NYU_10030_1,rest_2,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...,10030,5,1.0,12.41,...,72,2.0,100.0,105.0,,103.0,-999,1.0,1.0,1.0
2386,10030,NYU,1,NYU_10030_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...,10030,5,1.0,12.41,...,72,2.0,100.0,105.0,,103.0,-999,1.0,1.0,1.0


In [24]:
# Get unique IDs and their diagnosis
unique_diagnosis_original = merged_df.drop_duplicates(subset='ID')[['ID', 'diagnosis']]

# Get diagnosis distribution
diagnosis_distribution_original = unique_diagnosis_original['diagnosis'].value_counts()
print("Diagnosis distribution in original dataset:")
print(diagnosis_distribution_original)

Diagnosis distribution in original dataset:
diagnosis
0.0    585
1.0    362
Name: count, dtype: int64


In [25]:
# Remove IDs with more than one rest session
cleaned = merged_df[~merged_df['ID'].isin(ids_rest_2)]

# Get unique IDs and their diagnosis in the cleaned dataset
unique_diagnosis_cleaned = cleaned.drop_duplicates(subset='ID')[['ID', 'diagnosis']]

# Get diagnosis distribution in the cleaned dataset
diagnosis_distribution_cleaned = unique_diagnosis_cleaned['diagnosis'].value_counts()
print("Diagnosis distribution in cleaned dataset:")
print(diagnosis_distribution_cleaned)

Diagnosis distribution in cleaned dataset:
diagnosis
0.0    412
1.0    231
Name: count, dtype: int64


In [26]:
cleaned = merged_df[~merged_df['ID'].isin(ids_rest_2)]
cleaned.shape

(1341, 26)

In [27]:
cleaned['session'] = cleaned['session'].astype('int64')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaned['session'] = cleaned['session'].astype('int64')


In [28]:
filtered_rows = cleaned[cleaned['session'] > 1]
print(filtered_rows)


         ID   site  session   combinedData scan_type  \
1967  15022  WashU        2  WashU_15022_2    rest_1   
1970  15021  WashU        2  WashU_15021_2    rest_1   
1973  15024  WashU        2  WashU_15024_2    rest_1   
1978  15023  WashU        2  WashU_15023_2    rest_1   
2034  15011  WashU        2  WashU_15011_2    rest_1   
2042  15013  WashU        2  WashU_15013_2    rest_1   
2061  15014  WashU        2  WashU_15014_2    rest_1   
2066  15017  WashU        2  WashU_15017_2    rest_1   
2068  15016  WashU        2  WashU_15016_2    rest_1   
2069  15016  WashU        2  WashU_15016_2    anat_1   
2072  15019  WashU        2  WashU_15019_2    rest_1   

                                              file_path  ScanDir ID  Site  \
1967  /Volumes/Expansion/ADHD/yomo-20231204_124111/W...       15022     8   
1970  /Volumes/Expansion/ADHD/yomo-20231204_124111/W...       15021     8   
1973  /Volumes/Expansion/ADHD/yomo-20231204_124111/W...       15024     8   
1978  /Volumes/Expa

In [29]:
unique_vals = filtered_rows['ID'].nunique()

print(f"Number of unique values in 'ID': {unique_vals}")

Number of unique values in 'ID': 10


In [30]:
print(cleaned[cleaned['site']=='WashU'])

         ID   site  session   combinedData scan_type  \
1925  15038  WashU        1  WashU_15038_1    rest_1   
1926  15038  WashU        1  WashU_15038_1    anat_1   
1965  15022  WashU        1  WashU_15022_1    rest_1   
1966  15022  WashU        1  WashU_15022_1    anat_1   
1967  15022  WashU        2  WashU_15022_2    rest_1   
1968  15021  WashU        1  WashU_15021_1    rest_1   
1969  15021  WashU        1  WashU_15021_1    anat_1   
1970  15021  WashU        2  WashU_15021_2    rest_1   
1971  15024  WashU        1  WashU_15024_1    rest_1   
1972  15024  WashU        1  WashU_15024_1    anat_1   
1973  15024  WashU        2  WashU_15024_2    rest_1   
1976  15023  WashU        1  WashU_15023_1    rest_1   
1977  15023  WashU        1  WashU_15023_1    anat_1   
1978  15023  WashU        2  WashU_15023_2    rest_1   
1988  15025  WashU        1  WashU_15025_1    rest_1   
1989  15025  WashU        1  WashU_15025_1    anat_1   
2033  15011  WashU        1  WashU_15011_1    an

In [31]:
nan_rows = cleaned[pd.isna(cleaned['diagnosis'])]
print(nan_rows)


        ID   site  session   combinedData scan_type  \
22   26005  Brown        1  Brown_26005_1    rest_1   
23   26005  Brown        1  Brown_26005_1    anat_1   
26   26009  Brown        1  Brown_26009_1    rest_1   
27   26009  Brown        1  Brown_26009_1    anat_1   
30   26002  Brown        1  Brown_26002_1    rest_1   
31   26002  Brown        1  Brown_26002_1    anat_1   
32   26001  Brown        1  Brown_26001_1    rest_1   
33   26001  Brown        1  Brown_26001_1    anat_1   
34   26004  Brown        1  Brown_26004_1    rest_1   
35   26004  Brown        1  Brown_26004_1    anat_1   
44   26017  Brown        1  Brown_26017_1    rest_1   
45   26017  Brown        1  Brown_26017_1    anat_1   
50   26016  Brown        1  Brown_26016_1    rest_1   
51   26016  Brown        1  Brown_26016_1    anat_1   
63   26015  Brown        1  Brown_26015_1    rest_1   
64   26015  Brown        1  Brown_26015_1    anat_1   
69   26014  Brown        1  Brown_26014_1    rest_1   
70   26014

In [32]:
unique_values_count = nan_rows['ID'].nunique()

print(f"Number of unique values in 'ID': {unique_values_count}")


Number of unique values in 'ID': 26


In [33]:
filtered_rows['site'].value_counts()

site
WashU    11
Name: count, dtype: int64

We need to eliminate from the data frame the subjects that have no diagnosis/ label, and the ones that have had more than one scan session. 

In [34]:
# we started off with 1953 rows
cleaned.shape

(1341, 26)

In [35]:
cleaned.head()

Unnamed: 0,ID,site,session,combinedData,scan_type,file_path,ScanDir ID,Site,Gender,Age,...,Hyper/Impulsive,IQ Measure,Verbal IQ,Performance IQ,Full2 IQ,Full4 IQ,Med Status,QC_Athena,QC_NIAK,diagnosis
0,4020830,NeuroIMAGE,1,NeuroIMAGE_4020830_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...,4020830,4,0.0,18.19,...,,,,,,,,1.0,1.0,0.0
1,4020830,NeuroIMAGE,1,NeuroIMAGE_4020830_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/N...,4020830,4,0.0,18.19,...,,,,,,,,1.0,1.0,0.0
2,3248920,Peking,1,Peking_3248920_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...,3248920,1,1.0,12.17,...,10.0,3.0,-999.0,-999.0,,-999.0,1.0,1.0,1.0,0.0
3,3248920,Peking,1,Peking_3248920_1,anat_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...,3248920,1,1.0,12.17,...,10.0,3.0,-999.0,-999.0,,-999.0,1.0,1.0,1.0,0.0
4,3767334,Peking,1,Peking_3767334_1,rest_1,/Volumes/Expansion/ADHD/yomo-20231204_124111/P...,3767334,1,1.0,8.33,...,25.0,3.0,112.0,104.0,-999.0,109.0,1.0,1.0,1.0,1.0


In [36]:
# removing 52 rows with missign diagnosis
cleaned_df = cleaned.dropna(subset=['diagnosis'])

In [37]:
cleaned_df.shape

(1289, 26)

In [38]:
ids_with_multiple_sessions = cleaned_df[cleaned_df['session'] > 1]['ID'].unique()
print(len(ids_with_multiple_sessions))
ids_with_multiple_sessions

10


array([15022, 15021, 15024, 15023, 15011, 15013, 15014, 15017, 15016,
       15019])

In [39]:
# Filter the DataFrame to only include rows with these IDs
rows_with_multiple_sessions = cleaned_df[cleaned_df['ID'].isin(ids_with_multiple_sessions)]


In [40]:
rows_with_multiple_sessions['site'].value_counts()

site
WashU    29
Name: count, dtype: int64

In [41]:
# Remove rows with these IDs from the cleaned_df
cleaned_df = cleaned_df[~cleaned_df['ID'].isin(ids_with_multiple_sessions)]

In [42]:
# Cleaned data end result
cleaned_df.shape

(1260, 26)

In [43]:
cleaned_df['scan_type'].value_counts()

scan_type
anat_1    633
rest_1    627
Name: count, dtype: int64

In [44]:
# Unique count of health controls and ADHD cases in the cleaned dataset
unique_IDs = cleaned_df.drop_duplicates(subset='ID')

# Count of unique health controls (diagnosis = 0) and ADHD (diagnosis = 1)
unique_health_controls_count = unique_IDs[unique_IDs['diagnosis'] == 0].shape[0]
unique_adhd_count = unique_IDs[unique_IDs['diagnosis'] == 1].shape[0]

print("Unique health controls count:", unique_health_controls_count)
print("Unique ADHD count:", unique_adhd_count)

Unique health controls count: 402
Unique ADHD count: 231


In [48]:
# Export the results to use in the next steps
cleaned_df.to_csv('/Volumes/Expansion/ADHD/yomo-20231204_124111/cleaned_df.csv', index=False)