In [None]:
import pandas as pd
import seaborn as sns
from datetime import datetime
import numpy as np
from statistics import median

# Locations of the two tabular inputs, relative to the notebook's working directory.
f_tab_adni_tau = './data/ADNI_Tau_Amyloid_SUVR_amyloid_tau_status_dems.csv'
f_tab_pet_data = './data/Tabular_Info_PET.csv'

# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df_adni_tau, df_pet_data = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in (f_tab_adni_tau, f_tab_pet_data)
)

### ADNI_Tau_Amyloid_SUVR_amyloid_tau_status_dems

Change the strings in the session column 'ses' to datetime objects to facilitate computing the time differences

In [None]:
# Parse the 'ses-YYYY-MM-DD' session labels into datetimes in one vectorised call.
# Unlike a row-wise strptime, this is safe to re-run: an already-converted
# datetime column is passed through unchanged, and missing labels become NaT.
df_adni_tau['ses'] = pd.to_datetime(df_adni_tau['ses'], format='ses-%Y-%m-%d')

#### Number of subjects

In [None]:
# Compare the number of rows with the number of distinct subject IDs; a gap
# between the two means subjects appear in more than one row.
unique_id_list = df_adni_tau['ID'].unique()
print(f'Length of the DataFrame:    {len(df_adni_tau)}')
print(f'Number of unique subjects:    {len(unique_id_list)}')


#### Diagnosis change

In the following we can observe that there are subjects for whom the diagnosis 'DX' changed between different assessments.

The following dataframe lists 'DX' for a single patient in chronological order. Notably, the patient received a "CN" diagnosis after having already been diagnosed as "MCI". It is also interesting that there is another "CN" diagnosis two years later.

In [None]:
# Group all rows by subject, then show one subject's history ordered by
# session date, restricted to the columns relevant for the diagnosis timeline.
grp_id = df_adni_tau.groupby('ID')
s59 = (
    grp_id.get_group('sub-0059')
    .sort_values('ses')
    .loc[:, ['ID', 'age', 'group', 'pet.modality', 'DX', 'ses']]
)
s59

### Plot the frequency of time differences between 2 pet scans from the same person and the same pet modality

In [None]:
# Days between consecutive sessions of the same subject and PET modality.
# Sorting by (ID, modality, session) first lets a grouped diff() produce every
# gap in one vectorised pass; the first session of each group has no
# predecessor, yields NaT/NaN and is dropped.
ses_sorted = df_adni_tau.sort_values(['ID', 'pet.modality', 'ses'])
gap_days = ses_sorted.groupby(['ID', 'pet.modality'])['ses'].diff().dt.days

# Plain list of floats, ordered by (ID, modality, session).
time_diff_days = gap_days.dropna().tolist()


In [None]:
# Number of collected gaps between consecutive sessions (values are in days).
print(len(time_diff_days))
# Two histograms drawn one after the other in the same cell: first the
# cumulative percentage, then the per-bin percentage; bins are 10 days wide.
sns.histplot(time_diff_days, stat='percent', cumulative=True, binwidth=10)
sns.histplot(time_diff_days, stat='percent', binwidth=10)

The plot above shows that two consecutive sessions are usually one or two years apart.

## Plot the class distributions over the different modalities 

We can observe that we have a class-imbalance problem since the 'Dementia' class is clearly underrepresented

In [None]:
# Distinct values of 'pet.modality'; the per-modality filters further down
# compare against these labels.
df_adni_tau['pet.modality'].unique()

In [None]:
# Boolean row masks, one per PET modality label.
pet_modality = df_adni_tau['pet.modality']
filt_av45 = pet_modality.eq('pet-AV45')
filt_av1451 = pet_modality.eq('pet-AV1451')
filt_fbb = pet_modality.eq('pet-FBB')

# Per-modality sub-frames selected with the masks above.
df_pet_av45 = df_adni_tau[filt_av45]
df_pet_av1451 = df_adni_tau[filt_av1451]
df_pet_fbb = df_adni_tau[filt_fbb]


In [None]:
# Number of rows per 'DX' value among the pet-AV45 rows.
sns.countplot(data=df_pet_av45, x='DX')

In [None]:
# Number of rows per 'DX' value among the pet-AV1451 rows.
sns.countplot(data=df_pet_av1451, x='DX')

In [None]:
# Number of rows per 'DX' value among the pet-FBB rows.
sns.countplot(data=df_pet_fbb, x='DX')

### Overview of keys and values

In [None]:
# List every column name, one per line. A single print() is enough: wrapping
# it in a second print() also printed the inner call's return value, "None".
print('\n'.join(df_adni_tau.keys()))

In [None]:
# Show the table as the cell's output.
df_adni_tau

### Subject study entry statistics

In [None]:
# One row per subject: order the rows by session date, then keep only the
# first row encountered for each ID (i.e. its earliest session).
df_subject_entry = (
    df_adni_tau
    .sort_values(by='ses')
    .drop_duplicates(subset='ID', keep='first')
)

In [None]:
# Distribution of 'age' in the one-row-per-subject entry table;
# discrete=True uses unit-width bins centred on the values.
sns.histplot(df_subject_entry, x='age', discrete=True)

In [None]:
# Number of subjects per 'sex' value (one row per subject).
sns.countplot(data=df_subject_entry, x='sex')

In [None]:
# Number of subjects per 'group' value (one row per subject).
sns.countplot(data=df_subject_entry, x='group')

In [None]:
# Add a column with the calendar year parsed from 'acq.date'.
# NOTE(review): this is meant to be the year of the first session, but the value
# is taken from 'acq.date' rather than 'ses' — confirm the two columns agree.
df_subject_entry['acq.year'] = pd.to_datetime(df_subject_entry['acq.date']).dt.year

# discrete=True uses unit-width bins, i.e. one bar per year.
sns.histplot(data=df_subject_entry, x='acq.year', discrete=True)

In [None]:
# Distribution of 'MMSE' at each subject's entry row; unit-width bins.
sns.histplot(data=df_subject_entry, x='MMSE', discrete=True)

In [None]:
# Distribution of 'ADAS13' at each subject's entry row; unit-width bins.
sns.histplot(data=df_subject_entry, x='ADAS13', discrete=True)

In [None]:
# Number of subjects per 'Phase' value at their entry row.
sns.countplot(data=df_subject_entry, x='Phase')

In [None]:
# Number of subjects per 'apoe' value at their entry row.
sns.countplot(data=df_subject_entry, x='apoe')

In [None]:
# Distribution of 'centiloid' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='centiloid')

In [None]:
# Distribution of 'tau.global.SUVR' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='tau.global.SUVR')

In [None]:
# Distribution of 'mPACCtrailsB' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='mPACCtrailsB')

In [None]:
# Number of subjects per 'SITEID' value at their entry row.
sns.countplot(data=df_subject_entry, x='SITEID')

In [None]:
# Number of subjects per 'DX' value at their entry row.
sns.countplot(data=df_subject_entry, x='DX')

In [None]:
# Distribution of 'PTEDUCAT' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='PTEDUCAT')

In [None]:
# Distribution of 'ADNI_MEM' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='ADNI_MEM')

In [None]:
# Distribution of 'ADNI_EF' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='ADNI_EF')

In [None]:
# Distribution of 'ADNI_LAN' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='ADNI_LAN')

In [None]:
# Distribution of 'ADNI_VS' at each subject's entry row (automatic binning).
sns.histplot(data=df_subject_entry, x='ADNI_VS')

In [None]:
# Distribution of 't.diff.uwn.psych.pet.yrs' at each subject's entry row.
# NOTE(review): presumably a time offset in years, judging by the column name — confirm.
sns.histplot(data=df_subject_entry, x='t.diff.uwn.psych.pet.yrs')

In [None]:
# Distribution of 't.diff.adas.pet.yrs' at each subject's entry row.
# NOTE(review): presumably a time offset in years, judging by the column name — confirm.
sns.histplot(data=df_subject_entry, x='t.diff.adas.pet.yrs')

In [None]:
# Distribution of 't.diff.diagnosis.pet.yrs' at each subject's entry row.
# NOTE(review): presumably a time offset in years, judging by the column name — confirm.
sns.histplot(data=df_subject_entry, x='t.diff.diagnosis.pet.yrs')

### Intra subject change per year

In [None]:
# Select only entries with pet-AV1451 and group them by ID.
# A scalar key (rather than a one-element list) keeps the group labels plain
# ID values, consistent with the groupby('ID') used earlier in the notebook.
grp_id = df_adni_tau[df_adni_tau['pet.modality'] == 'pet-AV1451'].groupby('ID')

In [None]:
list_delta_MMSE = []

# Median change of MMSE per year for each subject, rounded to an integer.
for _, group in grp_id:
    group = group.sort_values(by='ses')

    # Gap to the previous session in days, and the MMSE change over that gap
    # scaled to a 365-day year.
    days_between = group['ses'].diff().dt.days
    mmse_per_year = group['MMSE'].diff() / days_between * 365

    # Keep only finite rates: the first session has no predecessor (NaN), and
    # two sessions on the same day give a 0-day gap (+/-inf or NaN), which
    # would otherwise make round() raise OverflowError.
    mmse_per_year = mmse_per_year[np.isfinite(mmse_per_year)]
    if not mmse_per_year.empty:
        list_delta_MMSE.append(round(median(mmse_per_year.tolist())))

sns.histplot(list_delta_MMSE, stat='percent', discrete=True)

In [None]:
list_delta_ADAS13 = []

# Median change of ADAS13 per year for each subject (not rounded).
for _, group in grp_id:
    group = group.sort_values(by='ses')

    # Gap to the previous session in days, and the ADAS13 change over that gap
    # scaled to a 365-day year.
    days_between = group['ses'].diff().dt.days
    adas13_per_year = group['ADAS13'].diff() / days_between * 365

    # Keep only finite rates: the first session has no predecessor (NaN), and
    # two sessions on the same day give a 0-day gap (+/-inf or NaN), which
    # must not leak into the histogram.
    adas13_per_year = adas13_per_year[np.isfinite(adas13_per_year)]
    if not adas13_per_year.empty:
        list_delta_ADAS13.append(median(adas13_per_year.tolist()))

sns.histplot(list_delta_ADAS13, stat='percent')

### Statistics without redundancy reduction (all rows)

In [None]:
# Number of rows per PET modality over the full table (all sessions, not one row per subject).
sns.countplot(data=df_adni_tau, x='pet.modality')

### Tabular_Info_PET

In [None]:
# List every column name, one per line. A single print() is enough: wrapping
# it in a second print() also printed the inner call's return value, "None".
print('\n'.join(df_pet_data.keys()))