# Make ADNI data

In [1]:
import pandas as pd

## Load data

In [2]:
# Read the raw ADNI table; low_memory=False makes pandas parse the file in one
# pass instead of chunk by chunk.
adni_csv_path = '../data/adni_data.csv'
adni = pd.read_csv(adni_csv_path, low_memory=False)

## Select features

In [3]:
# Filter out visits without a CDR-SB measurement.
assert adni['CDRSB.bl'].notnull().all()
has_cdrsb = adni.CDRSB.notnull()
adni_filtered = adni[has_cdrsb]

# Filter out visits without a diagnosis.
has_dx = adni_filtered.DX.notnull()
adni_filtered = adni_filtered[has_dx]

# Keep, for each patient, only the leading run of visits that are at most
# 6 months apart: the first visit separated by more than 6 months from the
# previous one is dropped together with every later visit of that patient
# (cummin turns all following flags to False). A patient's first remaining
# visit has no predecessor (diff is NaN -> 0), so it is always kept.
# NOTE(review): diff() works on the current row order, so this presumably
# relies on rows being sorted by M within each RID -- confirm for new exports.
months_between_visits = adni_filtered.groupby('RID')['M'].diff().fillna(0)
is_regular_and_continuous = (months_between_visits <= 6.0).groupby(adni_filtered.RID).cummin()
adni_filtered = adni_filtered[is_regular_and_continuous]

# Filter out patients with less than 2 visits.
visit_counts = adni_filtered.groupby('RID').size()
patients_with_at_least_two_visits = visit_counts[visit_counts >= 2].index
# Take an explicit copy: later cells add columns to `adni_filtered`, and
# assigning into a boolean-indexed slice of `adni` raises
# SettingWithCopyWarning (pandas without Copy-on-Write).
adni_filtered = adni_filtered[adni_filtered.RID.isin(patients_with_at_least_two_visits)].copy()

# Disabled: keep only visits on or before 2020-10-02 (ICLR 2021 submission deadline).
#is_before_iclr_deadline = pd.to_datetime(adni_filtered.EXAMDATE) <= '2020-10-02'
#adni_filtered = adni_filtered[is_before_iclr_deadline]

In [4]:
# In Pace et al. (2022), 1626 patients were included in the dataset...

# Number of distinct patients (RIDs) left after filtering.
adni_filtered['RID'].nunique()

1605

In [5]:
# ... and the patients had a median of three visits.

# Count visits per patient, then tabulate how many patients share each count.
visits_per_patient = adni_filtered.groupby('RID').size()
visits_per_patient.value_counts()

3    1108
5     285
2     178
4      34
dtype: int64

In [6]:
# Distribution of the baseline diagnosis (DX.bl). The counts are taken over
# visit rows, so a patient contributes once per retained visit.
# Abbreviations:
# - LMCI = late mild cognitive impairment
# - CN = cognitively normal
# - AD = Alzheimer's disease
# - EMCI = early mild cognitive impairment
# - SMC = subjective memory concerns

baseline_dx = adni_filtered['DX.bl']
baseline_dx.value_counts()

LMCI    2185
CN      1157
AD       873
EMCI     810
SMC      216
Name: DX.bl, dtype: int64

In [7]:
# The task consists of predicting whether an MRI scan was ordered.

# 1 when a hippocampal volume is recorded for the visit, 0 when it is missing.
hippocampus_missing = adni_filtered['Hippocampus'].isnull()
adni_filtered['MRI_ordered'] = (~hippocampus_missing).astype(int)

In [8]:
# From Pace et al. (2022): "Patient observations consist of CDR-SB on a
# severity scale following O'Bryant et al (2008)..."

# From Hüyük et al. (2021): "The CDR-SB result is categorized as: 
# {'normal', 'questionable impairment', 'mild/severe dementia'}."

# From O'Bryant et al. (2008): "Optimal ranges of CDR-SOB scores corresponding
# to the global CDR scores were 0.5 to 4.0 for a global score of 0.5, 
# 4.5 to 9.0 for a global score of 1.0, 9.5 to 15.5 for a global score of 2.0,
# and 16.0 to 18.0 for a global score of 3.0."

# From https://en.wikipedia.org/wiki/Clinical_Dementia_Rating:
# global score of 0.5 = questionable impairment
# global score of 1.0 = mild impairment
# global score of 2.0 = moderate impairment
# global score of 3.0 = severe impairment

# Left-closed bins: [0, 0.5) -> normal, [0.5, 4.5) -> questionable,
# [4.5, 18.5) -> severe. Values outside [0, 18.5) become NaN.
cdrsb_bin_edges = [0, 0.5, 4.5, 18.5]
cdrsb_labels = ["CDR-SB normal", "CDR-SB questionable", "CDR-SB severe"]
adni_filtered['CDRSB_cat'] = pd.cut(
    adni_filtered['CDRSB'],
    bins=cdrsb_bin_edges,
    labels=cdrsb_labels,
    right=False,
)

In [9]:
# From Pace et al. (2022): "... and the MRI outcome of the previous visit, 
# categorized into four possibilities (no MRI scan; below average, average and 
# above average hippocampal volume)."

# Mean and standard deviation of the hippocampal volume over all retained
# visits; missing volumes are skipped (pandas' default skipna=True).
hippocampus_volumes = adni_filtered['Hippocampus']
hippocampus_mean = hippocampus_volumes.mean()
hippocampus_std = hippocampus_volumes.std()

def mri_outcome(volume, mean=None, std=None):
    """Categorize a hippocampal volume relative to a reference mean and std.

    Parameters
    ----------
    volume : float
        Hippocampal volume of one visit; null/NaN means no MRI result.
    mean, std : float, optional
        Reference mean and standard deviation. When omitted, the notebook
        globals `hippocampus_mean` and `hippocampus_std` are used, so existing
        one-argument calls (e.g. via `Series.apply`) behave as before.

    Returns
    -------
    str
        "No MRI" for a missing volume, "Vh low" below mean - 0.5 * std,
        "Vh high" above mean + 0.5 * std, and "Vh average" otherwise
        (both boundaries count as average).
    """
    if pd.isnull(volume):
        return "No MRI"

    # Fall back to the notebook-level statistics only when none are supplied.
    if mean is None:
        mean = hippocampus_mean
    if std is None:
        std = hippocampus_std

    lower_bound = mean - 0.5 * std
    upper_bound = mean + 0.5 * std
    if volume < lower_bound:
        return "Vh low"
    if volume > upper_bound:
        return "Vh high"
    return "Vh average"

# Label every visit with its own MRI outcome category.
adni_filtered["MRI_outcome"] = adni_filtered["Hippocampus"].apply(mri_outcome)

# Outcome of the patient's previous retained visit; a patient's first visit
# has no predecessor and is filled with "No MRI".
outcome_by_patient = adni_filtered.groupby('RID')["MRI_outcome"]
adni_filtered["MRI_previous_outcome"] = outcome_by_patient.shift(1, fill_value="No MRI")

### Compare with the code from Hüyük et al. (2021)

In [10]:
# Work on a copy so the comparison code below never modifies `adni`.
df = adni.copy()
# The reference code labels cognitively normal patients 'NL' instead of 'CN'.
# Assign the replaced column back rather than calling replace(inplace=True) on
# `df.DX`: that chained in-place update raises a FutureWarning on recent pandas
# and does not modify `df` once Copy-on-Write is enabled.
df['DX'] = df['DX'].replace('CN', 'NL')

In [11]:
# https://github.com/vanderschaarlab/mlforhealthlabpub/blob/main/alg/interpole/adni/data.py
# Reference preprocessing taken from the repository linked above. It builds one
# trajectory per patient; the following cell counts the trajectories with at
# least two visits.

# Diagnosis labels: the three stable states followed by the six "X to Y"
# transition labels, each mapped to its position in the list (0..8).
state = ['NL', 'MCI', 'Dementia']
state += ['{} to {}'.format(s0, s1) for s0 in state for s1 in state if s0 != s1]
state_dict = {s:i for s,i in zip(state,range(len(state)))}

# Drop visits with a missing diagnosis or a missing CDR-SB score.
#df = pd.read_csv('data/adni.csv', low_memory=False)
df = df[~df.DX.isna()]
df = df[~df.CDRSB.isna()]

# Visit codes on a 6-month grid: 'bl', 'm06', 'm12', ..., 'm114'.
visc = ['bl', 'm06'] + ['m{}'.format(k*6) for k in range(2,20)]
# rids[i] starts as the patients that have visit visc[i]; the loop then keeps
# only those also present in rids[i-1], so rids[i] ends up holding the patients
# with every visit from 'bl' through visc[i].
rids = [df[df.VISCODE == vis].RID.unique() for vis in visc]
for i in range(1,len(rids)):
    rids[i] = [rid for rid in rids[i] if rid in rids[i-1]]

# Keep only visits on the grid, and for each visit code keep a row only if its
# patient is in the matching rids list (i.e. no earlier visit is missing).
df = df[df.VISCODE.isin(visc)]
for vis, rid in zip(visc, rids):
    df = df[df.RID.isin(rid) | (df.VISCODE != vis)]

# Build one trajectory per patient that has a 'bl' visit.
data = list()
for rid in rids[0]:

    # 's': state indices (starting with a None placeholder), 'a': actions,
    # 'z': observations, 'tau': number of visits in the trajectory.
    traj = dict()
    traj['s'] = list([None])
    traj['a'] = list()
    traj['z'] = list()
    traj['tau'] = 0

    df1 = df[df.RID == rid]
    for vis in visc:

        # Stop at the first visit code this patient does not have.
        df2 = df1[df1.VISCODE == vis]
        if df2.empty:
            break

        # Only the first row of the visit is read (values[0]).
        # s: index of the diagnosis label in `state`.
        s = state_dict[df2.DX.values[0]]
        # a: 1 when a hippocampal volume is recorded for the visit, else 0.
        a = 0 if df2.Hippocampus.isna().values[0] else 1
        # z0: CDR-SB bucket -- 0 for a score of exactly 0, 1 up to 2.5, else 2.
        # NOTE(review): this 2.5 cutoff differs from the 4.5 bin edge used for
        # CDRSB_cat earlier in this notebook.
        z0 = 0 if df2.CDRSB.values[0] == 0 else 1 if df2.CDRSB.values[0] <= 2.5 else 2
        # z1: 0 when the volume is missing, otherwise 1/2/3 for below, within
        # or above 6642 +/- 0.5 * 1225 (presumably a fixed mean and standard
        # deviation of the hippocampal volume -- confirm against the source).
        z1 = 0 if df2.Hippocampus.isna().values[0] else 1 if df2.Hippocampus.values[0] < 6642-.5*1225 else 2 if df2.Hippocampus.values[0] <= 6642+.5*1225 else 3

        traj['s'].append(s)
        traj['a'].append(a)
        # Combine both buckets into a single observation index (0..11).
        traj['z'].append(4*z0+z1)
        traj['tau'] += 1

    # Keep the trajectory only if its last state is one of the three stable
    # labels ('NL', 'MCI', 'Dementia'), not a transition label.
    if traj['s'][-1] == 0 or traj['s'][-1] == 1 or traj['s'][-1] == 2:
        data.append(traj)
        #print('n = {}, tau = {}'.format(len(data), traj['tau']))

In [12]:
# Number of reference trajectories with at least two visits.
sum(1 for trajectory in data if trajectory['tau'] >= 2)

1605

## Save data

In [13]:
# Columns written to the output file: patient id, the two categorical
# observations, the MRI_ordered target and the static patient covariates.
feature_columns = [
    'RID',
    'CDRSB_cat',
    'MRI_previous_outcome',
    'MRI_ordered',
    'AGE',
    'PTGENDER',
    'PTMARRY',
    'PTEDUCAT',
    'APOE4',
]
adni_filtered = adni_filtered[feature_columns]
adni_filtered.head()

Unnamed: 0,RID,CDRSB_cat,MRI_previous_outcome,MRI_ordered,AGE,PTGENDER,PTMARRY,PTEDUCAT,APOE4
0,2,CDR-SB normal,No MRI,1,74.3,Male,Married,16,0.0
1,2,CDR-SB normal,Vh high,0,74.3,Male,Married,16,0.0
16,3,CDR-SB severe,No MRI,1,81.3,Male,Married,18,1.0
17,3,CDR-SB severe,Vh low,1,81.3,Male,Married,18,1.0
18,3,CDR-SB questionable,Vh low,1,81.3,Male,Married,18,1.0


In [14]:
# Explicit dtype for every saved column.
column_dtypes = {
    'RID': 'object',
    'CDRSB_cat': 'category',
    'MRI_previous_outcome': 'category',
    'MRI_ordered': 'int64',
    'AGE': 'float64',
    'PTGENDER': 'category',
    'PTMARRY': 'category',
    'PTEDUCAT': 'float64',
    'APOE4': 'category',
}
adni_filtered = adni_filtered.astype(column_dtypes)

In [15]:
# Sanity check: no missing value may remain in any saved column.
assert not adni_filtered.isnull().any().any()

In [16]:
# Persist the final feature table as a pickle.
output_pickle_path = '../data/adni_Xgy.pkl'
adni_filtered.to_pickle(output_pickle_path)