# MRI and Alzheimer's
## BIOF509 Spring 2019 Final Project

## Import Dataset

In [None]:
# This project tries to find the best method for predicting whether a patient will develop
# dementia/Alzheimer's from the available data.
# The datasets should not be combined; each dataset should be used individually.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Load only the OASIS longitudinal dataset; the cross-sectional file
# ('oasis_cross-sectional.csv') is not read or merged in this cell.
mri_long = pd.read_csv('oasis_longitudinal.csv')
mri_long.head()

In [None]:
# Count rows for every (CDR, Group) pair, then pivot Group into columns so
# each CDR score can be compared against the diagnostic labels.
rows_per_cdr_group = mri_long.groupby(['CDR', 'Group']).size()
rows_per_cdr_group.unstack()

In [None]:
# Missing-value count for each column of the raw longitudinal table
mri_long.isna().sum()

## Pre-Process

In [None]:
# Keep only the rows recorded at the first visit (Visit == 1).
# NOTE(review): the original note said "no converted patients included", but
# nothing here filters on 'Group' — confirm whether 'Converted' rows remain.
# The ID columns, 'Hand' and 'MR Delay' (original note: 0 at this visit) are
# dropped; 'Visit' is constant after the filter, so it is dropped as well.
is_first_visit = mri_long['Visit'] == 1
df = mri_long.loc[is_first_visit].drop(
    columns = ['Subject ID', 'MRI ID', 'Visit', 'Hand', 'MR Delay']
)
display(df.head(), df.shape)

In [None]:
# Missing values remaining after restricting to the initial visit
# (original note: SES, i.e. socioeconomic status, is the affected column)
df.isna().sum()

In [None]:
from sklearn.preprocessing import OrdinalEncoder

ode = OrdinalEncoder()

# CDR is treated as an ordinal variable.
# Double brackets keep a 2-D frame, which OrdinalEncoder expects as input.
cdr = df[["CDR"]]
cdr_code = ode.fit_transform(cdr)

# fit_transform returns a plain ndarray, which has no `categories_` attribute;
# the learned categories are stored on the fitted encoder itself.
ode.categories_

In [None]:
# MMSE Scores Mini-Mental State Examination - Ordinal
# Ref: https://www.heartandstroke.ca/-/media/pdf-files/canada/clinical-update/allen-huang-cognitive-screening-toolkit.ashx?la=en&hash=631B35521724C28268D0C2130D07A401E33CDBB0
# 25-30 (Questionably significant); 20-25 (Mild); 10-20 (Moderate); 0-10 (Severe)
def mmse_group(col):
    """Map a single MMSE score to its severity band.

    Bands (upper bounds inclusive):
        score <= 10        -> "Severe"
        10 < score <= 20   -> "Moderate"
        20 < score <= 25   -> "Mild"
        25 < score <= 30   -> "Questionably significant"

    Returns None when no band matches, i.e. for scores above 30 or for NaN
    (every comparison against NaN is False).
    """
    # Ordered (inclusive upper bound, label) pairs; the first match wins, so
    # each entry implicitly starts just above the previous bound.
    upper_bounds = (
        (10, "Severe"),
        (20, "Moderate"),
        (25, "Mild"),
        (30, "Questionably significant"),
    )
    for limit, label in upper_bounds:
        if col <= limit:
            return label
    return None

# Bucket every MMSE score into its severity band, then tabulate band sizes
# (value_counts sorts by frequency, largest first, by default).
df['MMSE Group'] = df['MMSE'].apply(mmse_group)
df['MMSE Group'].value_counts()

In [None]:
from sklearn.preprocessing import LabelEncoder

lbe = LabelEncoder()

# LabelEncoder assigns integer codes in sorted label order.
# M/F: F = 0, M = 1
df['Gender_code'] = lbe.fit_transform(df['M/F'])

# Group: Demented = 0, Nondemented = 1
# 'Converted' is folded into 'Demented' before encoding; otherwise the encoder
# would see three classes and the codes would not match the binary mapping
# stated above. The 'Group' column itself is left unchanged by this cell.
group_binary = df['Group'].replace({'Converted': 'Demented'})
df['Group_code'] = lbe.fit_transform(group_binary)

## Data Exploration

In [None]:
# Class balance of the diagnostic label at each patient's initial visit
# (original note: Demented > ND)
sns.countplot(data = df, x = 'Group')

In [None]:
# Relabel 'Converted' patients as 'Demented', then redraw the class balance
df['Group'] = df['Group'].replace({'Converted': 'Demented'})
sns.countplot(data = df, x = 'Group')

In [None]:
# Number of rows at each CDR score
sns.countplot(data = df, x = 'CDR')

In [None]:
# Pairwise relationship grid of the columns, coloured by Group
# (original note: multicollinearity exists)
p1 = sns.pairplot(data = df, hue = 'Group')

In [None]:
# 'Group' against the other predictors, one at a time — starting with gender (M/F).
# Original observations: demented males outnumber non-demented males, while
# non-demented females outnumber demented females, i.e. females appear less
# likely to be demented than males.
sns.countplot(data = df, x = 'M/F', hue = 'Group')

In [None]:
# Age vs Group: one filled density curve of Age per diagnostic group.
# `fill` replaces the deprecated `shade` argument, and `hue` splits the curve
# by Group so the plot shows the relationship named in the heading.
sns.kdeplot(data = df, x = 'Age', hue = 'Group', fill = True)

#### Remove na & impute

In [None]:
# Complete-case copy: discard every row that has at least one missing value
df_rmna = df.dropna(axis = 0, how = 'any')

# TODO: imputation alternative not implemented yet
# df_ipna = df.

In [None]:
# recode categorical variables
# Group, M/F nominal
# CDR ordinal



## Modeling