In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Locations of the two OASIS-3 exports used throughout this notebook.
scan_csv = Path('OASIS3_MR_scans.csv')
clinical_csv = Path('OASIS3_UDSd1_diagnoses.csv')

# Load the MR session table and the clinical (UDS D1 diagnoses) table.
scan_df, clinical_df = (pd.read_csv(csv_path) for csv_path in (scan_csv, clinical_csv))

# Summarise the size of each table and how many distinct subjects it covers.
size_report = [
    ("Number of scans: ", len(scan_df)),
    ("Number of clinical visits: ", len(clinical_df)),
    ("Number of unique subjects in scans: ", scan_df['Subject'].nunique()),
    ("Number of unique subjects in clinical: ", clinical_df['OASISID'].nunique()),
]
for label, count in size_report:
    print(label, count)


Number of scans:  2844
Number of clinical visits:  8499
Number of unique subjects in scans:  1377
Number of unique subjects in clinical:  1340


In [3]:
# The baseline visit is the clinical record taken at day 0.
is_baseline_visit = clinical_df['days_to_visit'] == 0
base_clinical_df = clinical_df[is_baseline_visit]
print("Number of base clinical visits: ", len(base_clinical_df))

# Every subject must contribute at most one baseline row, otherwise the
# later join on subject ID would duplicate scans.
assert len(base_clinical_df) == base_clinical_df['OASISID'].nunique()

# Keep only the subject ID and the age recorded at the baseline visit.
baseline_id_age = base_clinical_df.loc[:, ['OASISID', 'age at visit']]

print("Number of missing OASISID: ", baseline_id_age['OASISID'].isna().sum())
print("Number of missing age at visit: ", baseline_id_age['age at visit'].isna().sum())

# rename() returns a new frame, so base_clinical_df is left untouched.
clinical_df_to_merge = baseline_id_age.rename(columns={'age at visit': 'base_age'})

clinical_df_to_merge.head()

Number of base clinical visits:  1323
Number of missing OASISID:  0
Number of missing age at visit:  0


Unnamed: 0,OASISID,base_age
0,OAS30001,65.19
15,OAS30002,67.25
23,OAS30003,58.81
35,OAS30004,55.13
42,OAS30005,48.06


In [4]:
# Keep only sessions whose 'Scans' summary lists a T1-weighted acquisition.
# na=False makes rows with a missing 'Scans' entry count as non-matches, and
# regex=False matches the literal substring 'T1w'.
has_t1w = scan_df['Scans'].str.contains('T1w', regex=False, na=False)
t1_scan_df = scan_df.loc[has_t1w].copy()
print("Number of T1-weighted scans: ", len(t1_scan_df))


def parse_days_to_visit(row):
    """Return the integer day offset encoded at the end of a row's 'MR ID'.

    The last underscore-separated token of the ID is taken, its first
    character is dropped, and the remainder is parsed as an int
    (e.g. 'OAS30001_MR_d0129' -> 129).
    """
    return int(row['MR ID'].split('_')[-1][1:])


t1_scan_df['days_to_visit'] = t1_scan_df.apply(parse_days_to_visit, axis=1)

# Attach each subject's baseline age. The inner join drops scans whose subject
# has no baseline clinical row; validate= makes the join fail loudly if a
# subject ever appears more than once in the baseline table, instead of
# silently duplicating scan rows.
t1_scan_df = t1_scan_df.merge(
    clinical_df_to_merge,
    left_on='Subject',
    right_on='OASISID',
    how='inner',
    validate='many_to_one',
)

# Age at scan time = baseline age + elapsed days expressed in years.
# NOTE(review): a 365-day year is used here (leap days ignored); kept as-is so
# the ages match previously generated results.
t1_scan_df['Age'] = (t1_scan_df['base_age'] + t1_scan_df['days_to_visit'] / 365).round(2)

# Keep only the columns used downstream (this also discards the helper
# columns OASISID, base_age and days_to_visit).
t1_scan_df = t1_scan_df[['MR ID', 'Subject', 'Age', 'Scanner', 'Scans']]

print("Number of T1-weighted scans with precise age: ", len(t1_scan_df))

t1_scan_df.head()

Number of T1-weighted scans:  2832
Number of T1-weighted scans with precise age:  2748


Unnamed: 0,MR ID,Subject,Age,Scanner,Scans
0,OAS30001_MR_d0129,OAS30001,65.54,3.0T,"T1w(2), T2w(2), bold(3)"
1,OAS30001_MR_d0757,OAS30001,67.26,3.0T,"T1w(2), T2star(1), T2w(2), bold(2), dwi(1), mi..."
2,OAS30001_MR_d2430,OAS30001,71.85,3.0T,"FLAIR(1), T1w(1), T2star(1), T2w(1), angio(1),..."
3,OAS30001_MR_d3132,OAS30001,73.77,3.0T,"T1w(1), T2star(1), T2w(2), asl(2), bold(2), dw..."
4,OAS30001_MR_d3746,OAS30001,75.45,,"FLAIR(1), T1w(1), T2star(1), T2w(1), angio(1),..."


In [5]:
# Count missing entries per column of the T1-weighted scan table.
null_counts_per_column = t1_scan_df.isna().sum()

print("Number of null values for each column:")
print(null_counts_per_column)

Number of null values for each column:
MR ID        0
Subject      0
Age          0
Scanner    708
Scans        0
dtype: int64


### Map each scan to its nearest clinical visit (within a 1-year tolerance) to determine whether the subject was cognitively normal at scan time

In [6]:
# Preview the per-subject baseline ages (first five rows).
clinical_df_to_merge.head(n=5)

Unnamed: 0,OASISID,base_age
0,OAS30001,65.19
15,OAS30002,67.25
23,OAS30003,58.81
35,OAS30004,55.13
42,OAS30005,48.06


In [9]:
# merge_asof requires both frames to be sorted by the asof key itself
# ('Age' on the left, 'age at visit' on the right). Per-subject grouping is
# handled by left_by/right_by, not by the sort order.
mr_sorted = t1_scan_df.sort_values(by=['Age', 'Subject']).reset_index(drop=True)

# Rows without an age cannot be matched to any scan, and merge_asof rejects
# null merge keys, so drop them before sorting.
udsd_sorted = (
    clinical_df
    .dropna(subset=['age at visit'])
    .sort_values(by=['age at visit', 'OASISID'])
    .reset_index(drop=True)
)
udsd_sorted = udsd_sorted[['OASISID', 'age at visit', 'NORMCOG']]

# For every scan, pick the same subject's clinical visit whose age is closest
# to the scan age, provided the two are at most `tolerance` years apart.
# Scans with no visit inside the window get NaN in the clinical columns.
matched_all = pd.merge_asof(
    mr_sorted,      # sorted by ["Age", "Subject"]
    udsd_sorted,    # sorted by ["age at visit", "OASISID"]
    left_by="Subject",
    right_by="OASISID",
    left_on="Age",
    right_on="age at visit",
    direction="nearest",
    tolerance=1.0   # years
)

print("Number of scans with missing NORMCOG: ", (matched_all['NORMCOG'].isna()).sum())
print("Number of cognitively abnormal scans: ", (matched_all['NORMCOG'] == 0.0).sum())
print("Number of cognitively normal scans: ", (matched_all['NORMCOG'] == 1.0).sum())

print("Using only cognitively normal scans")

# Build the filtered frame without in-place mutation: calling inplace methods
# on a boolean-filtered slice can trigger SettingWithCopyWarning, and
# inplace=True brings no benefit here.
matched = (
    matched_all.loc[matched_all['NORMCOG'] == 1.0]
    .drop(columns=['OASISID', 'age at visit'])
    .sort_values(by=['Subject', 'Age'])
)
matched.head()


Number of scans with missing NORMCOG:  216
Number of cognitively abnormal scans:  500
Number of cognitively normal scans:  2032
Using only cognitively normal scans


Unnamed: 0,MR ID,Subject,Age,Scanner,Scans,NORMCOG
705,OAS30001_MR_d0129,OAS30001,65.54,3.0T,"T1w(2), T2w(2), bold(3)",1.0
872,OAS30001_MR_d0757,OAS30001,67.26,3.0T,"T1w(2), T2star(1), T2w(2), bold(2), dwi(1), mi...",1.0
1447,OAS30001_MR_d2430,OAS30001,71.85,3.0T,"FLAIR(1), T1w(1), T2star(1), T2w(1), angio(1),...",1.0
1696,OAS30001_MR_d3132,OAS30001,73.77,3.0T,"T1w(1), T2star(1), T2w(2), asl(2), bold(2), dw...",1.0
1891,OAS30001_MR_d3746,OAS30001,75.45,,"FLAIR(1), T1w(1), T2star(1), T2w(1), angio(1),...",1.0


In [14]:
# Age window (in years, both ends inclusive) kept for the SFCN subset.
sfcn_min_age, sfcn_max_age = 44, 80

# Series.between is inclusive on both ends by default.
in_sfcn_range = matched['Age'].between(sfcn_min_age, sfcn_max_age)
sfcn_age = matched.loc[in_sfcn_range].copy()

print("Number of scans in SFCN age range: ", len(sfcn_age))
print("Number of unique subjects in SFCN age range: ", sfcn_age['Subject'].nunique())

# CSV export is currently disabled:
# sfcn_age.to_csv('sfcn_age.csv', index=False)

sfcn_age.head()


Number of scans in SFCN age range:  1792
Number of unique subjects in SFCN age range:  862


Unnamed: 0,MR ID,Subject,Age,Scanner,Scans,NORMCOG
705,OAS30001_MR_d0129,OAS30001,65.54,3.0T,"T1w(2), T2w(2), bold(3)",1.0
872,OAS30001_MR_d0757,OAS30001,67.26,3.0T,"T1w(2), T2star(1), T2w(2), bold(2), dwi(1), mi...",1.0
1447,OAS30001_MR_d2430,OAS30001,71.85,3.0T,"FLAIR(1), T1w(1), T2star(1), T2w(1), angio(1),...",1.0
1696,OAS30001_MR_d3132,OAS30001,73.77,3.0T,"T1w(1), T2star(1), T2w(2), asl(2), bold(2), dw...",1.0
1891,OAS30001_MR_d3746,OAS30001,75.45,,"FLAIR(1), T1w(1), T2star(1), T2w(1), angio(1),...",1.0
