# ADNIMERGE Table

## Data Import, Exploration, and Cleaning

In [1]:
# import the relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from datetime import timedelta

# run line magics
%matplotlib inline

In [2]:
# Read every column as the generic object dtype so that inconsistent raw
# values do not cause errors on import; the columns that are needed get an
# explicit dtype in later cells.
adni_full = pd.read_csv(
    'ADNIMERGE.csv',
    dtype='object',
)

In [3]:
# Preview the first five rows of the raw table.
adni_full.head(n=5)

Unnamed: 0,RID,PTID,VISCODE,SITE,COLPROT,ORIGPROT,EXAMDATE,DX_bl,AGE,PTGENDER,...,TAU_bl,PTAU_bl,FDG_bl,PIB_bl,AV45_bl,Years_bl,Month_bl,Month,M,update_stamp
0,2,011_S_0002,bl,11,ADNI1,ADNI1,9/8/2005,CN,74.3,Male,...,,,1.36665,,,0.0,0.0,0,0,58:27.0
1,3,011_S_0003,bl,11,ADNI1,ADNI1,9/12/2005,AD,81.3,Male,...,239.7,22.83,1.08355,,,0.0,0.0,0,0,58:27.0
2,3,011_S_0003,m06,11,ADNI1,ADNI1,3/13/2006,AD,81.3,Male,...,239.7,22.83,1.08355,,,0.498289,5.96721,6,6,58:27.0
3,3,011_S_0003,m12,11,ADNI1,ADNI1,9/12/2006,AD,81.3,Male,...,239.7,22.83,1.08355,,,0.999316,11.9672,12,12,58:27.0
4,3,011_S_0003,m24,11,ADNI1,ADNI1,9/12/2007,AD,81.3,Male,...,239.7,22.83,1.08355,,,1.99863,23.9344,24,24,58:27.0


In [4]:
# (rows, columns) of the raw table -- 14036 rows when this cell was last run

adni_full.shape

(14036, 113)

In [5]:
# Number of rows per DX label (rows with a missing DX are not counted).
adni_full['DX'].value_counts()

MCI         4410
CN          3310
Dementia    2230
Name: DX, dtype: int64

In [6]:
# DX labels, listed in the order used for the ordered DX categorical
# that is built when the column dtypes are set.
DX = [
    'CN',
    'MCI',
    'Dementia',
]

In [7]:
# Number of rows per DX_bl label (rows with a missing DX_bl are not counted).
adni_full['DX_bl'].value_counts()

LMCI    4886
CN      4233
EMCI    2553
AD      1600
SMC      741
Name: DX_bl, dtype: int64

In [9]:
# initialize an empty dataframe; the typed columns selected from adni_full
# are added to it one at a time in the following cells

adni = pd.DataFrame()

In [10]:
# Copy the key columns from the raw table into the working frame, giving each
# one an explicit dtype. Columns are added in the same order as before.

# RID as an integer
adni['RID'] = adni_full['RID'].astype('int')

# EXAMDATE and EXAMDATE_bl parsed from strings into datetimes
adni['EXAMDATE'] = pd.to_datetime(adni_full.EXAMDATE)
adni['EXAMDATE_bl'] = pd.to_datetime(adni_full.EXAMDATE_bl)

# PTGENDER as a plain categorical; DX as an ordered categorical whose levels
# follow the DX label list defined earlier
adni['PTGENDER'] = pd.Categorical(adni_full['PTGENDER'])
adni['DX'] = pd.Categorical(adni_full['DX'], categories=DX, ordered=True)

# PTEDUCAT as an integer
adni['PTEDUCAT'] = adni_full['PTEDUCAT'].astype('int')

In [11]:
# Columns to be stored with a float dtype, collected in one list so they can
# be converted in a single loop.
# ABETA, TAU and PTAU are deliberately left out: they can't be converted to float.
floats = [
    'AGE',
    'CDRSB',
    'ADAS11',
    'ADAS13',
    'MMSE',
    'RAVLT_immediate',
    'Hippocampus',
    'Ventricles',
    'WholeBrain',
    'Entorhinal',
    'MidTemp',
    'FDG',
    'AV45',
]

In [12]:
# Walk the float column list: cast each column to float and copy it into the
# working frame, immediately followed by its '_bl' counterpart.
# AGE is the one entry that is copied without a '_bl' counterpart.

for col in floats:
    adni[col] = adni_full[col].astype('float')
    if col != 'AGE':
        col_bl = col + '_bl'
        adni[col_bl] = adni_full[col_bl].astype('float')

In [13]:
# Check that PTGENDER was stored as a categorical.
adni['PTGENDER'].dtype

CategoricalDtype(categories=['Female', 'Male'], ordered=False)

In [15]:
# Check that a '_bl' column from the float list was converted to float.
adni['ADAS13_bl'].dtype

dtype('float64')

In [16]:
# Check that a column from the float list was converted to float.
adni['ADAS13'].dtype

dtype('float64')

In [14]:
# NaN handling: count the missing RAVLT_immediate values
# (4177 NaN values when this cell was last run)
adni['RAVLT_immediate'].isna().sum()

4177