In [1]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import numpy as np
from statistics import median

f_tab_adni_tau = './data/ADNI_Tau_Amyloid_SUVR_amyloid_tau_status_dems.csv'
f_tab_pet_data = './data/Tabular_Info_PET.csv'

df_adni_tau = pd.read_csv(f_tab_adni_tau, low_memory=False)
df_pet_data = pd.read_csv(f_tab_pet_data, low_memory=False)

FileNotFoundError: [Errno 2] No such file or directory: './data/Tabular_Info_PET.csv'

### ADNI_Tau_Amyloid_SUVR_amyloid_tau_status_dems

Change the strings in the session column 'ses' to datetime objects to facilitate computing the time differences

In [None]:
df_adni_tau['ses'] = df_adni_tau['ses'].apply(lambda x: datetime.strptime(x, 'ses-%Y-%m-%d'))

#### Number of subjects

In [None]:
unique_id_list = df_adni_tau['ID'].unique()
print(f'Lenth of the DataFrame:    {len(df_adni_tau)}')
print(f'Number of unique subjects:    {len(unique_id_list)}')


#### Diagnosis change

In the following we can observe that there are subjects for which the diagnosis 'DX' changed between different assesments. 

The subsequent dataframe lists 'DX' for a single patient in chronological order. It stands out that there was a diagnosis "CN" after the patient has been diagnosed as "MCI". Also interesting is, that there is another "CN" diagnosis two years later.

In [None]:
grp_id = df_adni_tau.groupby('ID')
s59 = grp_id.get_group('sub-0059').sort_values(by='ses')[['ID', 'age', 'group', 'pet.modality', 'DX', 'ses']]
s59

### Plot the frequency of time differences between 2 pet scans from the same person and the same pet modality

In [None]:
time_diff_days = []

grp_id_mod = df_adni_tau.groupby(['ID', 'pet.modality'])
for name, group in grp_id_mod:
    group = group.sort_values(by='ses')[['ID', 'age', 'group', 'pet.modality', 'DX', 'ses']]
    group['delta'] = (group['ses']-group['ses'].shift())
    group['delta'] = group['delta'].apply(lambda x: x.days)
    
    list_td = list(group['delta'])
    list_td = list(filter(lambda x: str(x) != 'nan', list_td))
    time_diff_days += list_td


In [None]:
print(len(time_diff_days))
sns.histplot(time_diff_days, stat='percent', cumulative=True, binwidth=10)
sns.histplot(time_diff_days, stat='percent', binwidth=10)

In the plot above can be seen, that two sessions are usually one or two years apart.

## Plot the class distributions over the different modalities 

We can observe that we have a class-imbalance problem since the 'Dementia' class is clearly underrepresented

In [None]:
df_adni_tau['pet.modality'].unique()

In [None]:
filt_av45 = df_adni_tau['pet.modality'] == 'pet-AV45'
filt_av1451 = df_adni_tau['pet.modality'] == 'pet-AV1451'
filt_fbb = df_adni_tau['pet.modality'] == 'pet-FBB'
df_pet_av45 = df_adni_tau.loc[filt_av45]
df_pet_av1451 = df_adni_tau.loc[filt_av1451]
df_pet_fbb = df_adni_tau.loc[filt_fbb]


In [None]:
sns.countplot(data=df_pet_av45, x='DX')

In [None]:
sns.countplot(data=df_pet_av1451, x='DX')

In [None]:
sns.countplot(data=df_pet_fbb, x='DX')

### Overview of keys and values

In [None]:
print(print('\n'.join(list(df_adni_tau.keys()))))

### Usually AD patients have In-Clinic-Visits more frequently 
(3 per year which does not exactly coincide with what we see below)

In [None]:
#print(df_adni_tau.head(10))
#print(df_adni_tau[['ID', 'ses', 'ADAS13', 'Phase']].head(10))
filt_ad = df_adni_tau['DX'] == 'Dementia'
df_ad = df_adni_tau.loc[filt_ad]
df_ad[['ID', 'ses', 'DX']].head(30).sort_values(by=['ID', 'ses'])

In [None]:
df_adni_tau

### Subject study entry statistics

In [None]:
# Drop all but the first appearance of each subject
df_subject_entry = df_adni_tau.sort_values('ses').drop_duplicates(subset=['ID'])

In [None]:
sns.histplot(df_subject_entry, x='age', discrete=True)

In [None]:
sns.countplot(data=df_subject_entry, x='sex')

In [None]:
sns.countplot(data=df_subject_entry, x='group')

In [None]:
# Add column with year of first session
df_subject_entry['acq.year'] = pd.to_datetime(df_subject_entry['acq.date']).dt.year

sns.histplot(data=df_subject_entry, x='acq.year', discrete=True)

In [None]:
sns.histplot(data=df_subject_entry, x='MMSE', discrete=True)
filt_missing_mmse = pd.isnull(df_subject_entry['MMSE'])
print(f'Missing Values: {filt_missing_mmse.sum()} ({filt_missing_mmse.sum()/len(df_subject_entry)*100:.2f}%)')

In [None]:
sns.histplot(data=df_subject_entry, x='MMSE', hue='DX', element="step")#,stat="density")

Uncommenting the next cell will display the data for samples where MMSE is missing.
However this did not give meaningful insights.

In [None]:
#df_subject_entry.loc[filt_missing_mmse][['ID','group', 'pet.modality', 'DX']].head(47)

In [None]:
sns.histplot(data=df_subject_entry, x='ADAS13', discrete=True)
filt_missing_adas = pd.isnull(df_subject_entry['ADAS13'])
print(f'Missing Values: {filt_missing_adas.sum()} ({(filt_missing_adas.sum()/len(df_subject_entry))*100:.2f}%)')

In [None]:
sns.histplot(data=df_subject_entry, x='ADAS13', hue='DX', element="step")

In [None]:
sns.countplot(data=df_subject_entry, x='Phase')

In [None]:
sns.countplot(data=df_subject_entry, x='apoe')

In [None]:
sns.histplot(data=df_subject_entry, x='centiloid')

In [None]:
sns.histplot(data=df_subject_entry, x='tau.global.SUVR')

In [None]:
sns.histplot(data=df_subject_entry, x='mPACCtrailsB')

In [None]:
sns.countplot(data=df_subject_entry, x='SITEID')

In [None]:
sns.countplot(data=df_subject_entry, x='DX')

In [None]:
sns.histplot(data=df_subject_entry, x='PTEDUCAT')

In [None]:
sns.histplot(data=df_subject_entry, x='ADNI_MEM')

In [None]:
sns.histplot(data=df_subject_entry, x='ADNI_EF')

In [None]:
sns.histplot(data=df_subject_entry, x='ADNI_LAN')

In [None]:
sns.histplot(data=df_subject_entry, x='ADNI_VS')

In [None]:
sns.histplot(data=df_subject_entry, x='t.diff.uwn.psych.pet.yrs')

In [None]:
sns.histplot(data=df_subject_entry, x='t.diff.adas.pet.yrs')

In [None]:
sns.histplot(data=df_subject_entry, x='t.diff.diagnosis.pet.yrs')

### Intra subject change per year

In [None]:
# Select only entries with pet-AV1251 and group them by ID
grp_id = df_adni_tau[df_adni_tau['pet.modality'] == 'pet-AV1451'].groupby(['ID'])

In [None]:
list_delta_MMSE = []

# Calculate median change of MMSE per year for each subject
for _, group in grp_id:
    group = group.sort_values(by='ses')[['ID', 'DX', 'ses', 'MMSE', 'ADAS13', 'ADNI_MEM', 'ADNI_EF', 'ADNI_LAN', 'ADNI_VS']]
    group['delta'] = (group['ses']-group['ses'].shift())
    group['delta'] = group['delta'].apply(lambda x: x.days)
    
    group['delta_MMSE_per_ses'] = (group['MMSE']-group['MMSE'].shift())
    group['delta_MMSE_per_year'] = group['delta_MMSE_per_ses']/group['delta']*365
    
    list_delta_MMSE_sub = list(group['delta_MMSE_per_year'])
    list_delta_MMSE_sub = list(filter(lambda x: str(x) != 'nan', list_delta_MMSE_sub))
    if list_delta_MMSE_sub:
        list_delta_MMSE += [round(median(list_delta_MMSE_sub))]
    
sns.histplot(list_delta_MMSE, stat='percent', discrete=True)

In [None]:
list_delta_ADAS13 = []

# Calculate median change of ADAS13 per year for each subject
for _, group in grp_id:
    group = group.sort_values(by='ses')[['ID', 'DX', 'ses', 'MMSE', 'ADAS13', 'ADNI_MEM', 'ADNI_EF', 'ADNI_LAN', 'ADNI_VS']]
    group['delta'] = (group['ses']-group['ses'].shift())
    group['delta'] = group['delta'].apply(lambda x: x.days)
    
    group['delta_ADAS13_per_ses'] = (group['ADAS13']-group['ADAS13'].shift())
    group['delta_ADAS13_per_year'] = group['delta_ADAS13_per_ses']/group['delta']*365
    
    list_delta_sub = list(group['delta_ADAS13_per_year'])
    list_delta_sub = list(filter(lambda x: str(x) != 'nan', list_delta_sub))
    if list_delta_sub:
        list_delta_ADAS13 += [median(list_delta_sub)]
    
sns.histplot(list_delta_ADAS13, stat='percent')

### Non redundancy reduced statistics

In [None]:
sns.countplot(data=df_adni_tau, x='pet.modality')

### Tabular_Info_PET

In [None]:
print(print('\n'.join(list(df_pet_data.keys()))))

In [5]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import numpy as np
from statistics import median
mri_t1w_path = './data/train_path_data_labels.csv'

df_mri_t1w = pd.read_csv(mri_t1w_path, low_memory=False)
df_mri_t1w

Unnamed: 0,ID,age,sex,group,pet.modality,modality.description,image.id,ses,acq.date,SUVR.Schaefer200.ROI.idx.1,...,t.diff.uwn.psych.pet.yrs,ADAS13,MMSE,mPACCtrailsB,t.diff.adas.pet.yrs,PHASE,DX,SITEID,t.diff.diagnosis.pet.yrs,Phase
0,sub-0021,80,F,CN,pet-AV45,"AV45 Coreg, Avg, Standardized Image and Voxel ...",I338512,ses-2012-10-03,2012-10-03,0.912861,...,0.000000,1.00,28.0,1.299940,0.000000,,CN,8,0.021918,ADNI2
1,sub-0021,85,F,CN,pet-AV1451,"AV1451 Coreg, Avg, Standardized Image and Voxe...",I964024,ses-2018-02-02,2018-02-02,1.126000,...,-0.183562,12.33,27.0,-2.151210,-0.183562,,CN,8,0.065753,ADNI3
2,sub-0021,83,F,CN,pet-AV45,"AV45 Coreg, Avg, Standardized Image and Voxel ...",I557331,ses-2015-11-05,2015-11-05,0.899505,...,-0.024658,5.00,27.0,-0.816517,-0.024658,,CN,8,0.013699,ADNI2
3,sub-0021,78,F,CN,pet-AV45,"AV45 Coreg, Avg, Standardized Image and Voxel ...",I196544,ses-2010-10-08,2010-10-08,1.007220,...,-0.002740,4.00,29.0,0.355272,-0.002740,,CN,8,0.027397,ADNIGO
4,sub-0021,85,F,CN,pet-AV45,"AV45 Coreg, Avg, Standardized Image and Voxel ...",I943941,ses-2017-11-28,2017-11-28,0.877004,...,-0.002740,12.33,27.0,-2.151210,-0.002740,,CN,8,0.246575,ADNI3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2958,sub-6887,73,F,MCI,pet-AV1451,ADNI3-TAU 2 (AC),I1360700,ses-2020-10-27,2020-10-27,1.218620,...,,22.33,26.0,-8.167770,-0.060274,,MCI,26,-0.049315,ADNI3
2959,sub-6888,64,F,MCI,pet-AV1451,PET1 Tau,I1346966,ses-2020-10-07,2020-10-07,1.126700,...,,15.67,26.0,-3.761820,0.000000,,MCI,45,-0.016438,ADNI3
2960,sub-6889,68,M,MCI,pet-AV1451,BRAIN ADNI Tau,I1350177,ses-2020-10-16,2020-10-16,1.183060,...,,29.00,28.0,-5.300670,0.019178,,MCI,24,-0.057534,ADNI3
2961,sub-6890,66,M,MCI,pet-AV1451,AV-1451 PET BRAIN TOF Tau,I1380834,ses-2020-12-01,2020-12-01,1.087770,...,,12.00,29.0,-1.200640,0.000000,,MCI,16,0.035616,ADNI3


In [4]:
sns.countplot(data=df_mri_t1w, x='label')

ValueError: Could not interpret input 'label'