In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from importlib import reload
import seaborn as sns
import os, sys

sys.path.append(os.path.join("../", "utils"))
import load_data

In [2]:
# Re-import the local helper module so that edits to it are picked up
# without restarting the kernel.
reload(load_data)

# Load the three tables and the two feature-map tables.
uds, mri, csf = (
    load_data.load_uds(),
    load_data.load_mri(),
    load_data.load_csf(),
)
uds_dict, mri_dict = load_data.load_feature_map()

# Show the (rows, columns) shape of each table on one line.
print(*(table.shape for table in (uds, mri, csf)))

(45100, 94) (11273, 172) (3017, 23)


# Data Preprocessing
## Drop missing
- Drop features (columns) that are missing for more than 50% of participants.
- Drop individuals (rows) that have more than 50% of their MRI features missing.



In [3]:
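## Drop missing
# - drop features (columns) that are missing for more than 50% of participants
# - drop individuals (rows) that have more than 50% of their MRI features missing
# (50% is the default; the calls below relax the column threshold for the MRI and CSF tables)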

def drop_missing(df, row_thres=0.5, col_thres=0.5):
    """Keep only the rows and columns of `df` that are sufficiently complete.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to filter; it is not modified.
    row_thres : float
        A row is kept when its fraction of missing cells is strictly below
        this value.
    col_thres : float
        A column is kept when its fraction of missing cells is strictly
        below this value.

    Returns
    -------
    pandas.DataFrame
        The selection of `df` restricted to the retained rows and columns.
        Both fractions are computed on the full input table, i.e. the row
        fractions still count columns that end up being dropped.
    """
    is_missing = df.isna()
    keep_rows = is_missing.mean(axis=1) < row_thres
    keep_cols = is_missing.mean(axis=0) < col_thres
    return df.loc[keep_rows, keep_cols]

# Filter each table; the MRI and CSF tables tolerate sparser columns
# (column thresholds 0.6 and 0.8) than the default of 0.5 used for UDS.
uds, mri, csf = (
    drop_missing(uds),
    drop_missing(mri, col_thres=0.6),
    drop_missing(csf, col_thres=0.8),
)

In [None]:
# Impute missing with column mean (numeric) or mode (categorical)
def impute_missing(df, skip_cols=('NACCID', 'datetime', 'NACCUDSD'), max_categories=20):
    """Return a copy of `df` with missing values filled column by column.

    Columns with at most `max_categories` distinct non-missing values are
    treated as categorical and filled with their mode (the largest value when
    several tie, because ``Series.mode`` returns its result sorted). Every
    other column is filled with its mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to impute. It is NOT modified; the work is done on a copy so
        that frames obtained by slicing another frame can be passed safely.
    skip_cols : collection of str
        Column names that are never imputed.
    max_categories : int
        Largest number of distinct values for which the mode is used.

    Returns
    -------
    pandas.DataFrame
        The imputed copy. A column that cannot be imputed (e.g. a non-numeric
        column with many distinct values, or a column with no observed value)
        is reported on stdout and left unchanged.
    """
    imputed = df.copy()
    for c in imputed.columns:
        if c in skip_cols:
            continue
        column = imputed[c]
        try:
            if column.nunique() <= max_categories:
                # mode() is empty for an all-missing column -> IndexError below
                fill_value = column.mode().values[-1]
            else:
                fill_value = column.mean()
            imputed[c] = column.fillna(value=fill_value)
        except (TypeError, ValueError, IndexError):
            # Only imputation failures are swallowed; anything else (including
            # KeyboardInterrupt) propagates.
            print("Cannot convert column ", c)
    return imputed

# Fill the remaining gaps in every table.
uds, mri, csf = (impute_missing(table) for table in (uds, mri, csf))

# NACCUDSD is the prediction target further down; rows without it are dropped.
uds = uds.dropna(subset=['NACCUDSD'])
print(Counter(uds['NACCUDSD']))

# Fold target value 2 into value 1.
uds.loc[uds['NACCUDSD'] == 2, 'NACCUDSD'] = 1

# NACCVNUM is removed from both tables; the MRI table is first restricted to
# the rows where NACCVNUM == 1.
uds = uds.drop(columns='NACCVNUM')
mri = mri.loc[mri['NACCVNUM'] == 1].drop(columns='NACCVNUM')
print(*(table.shape for table in (uds, mri, csf)))

In [5]:
# Display the UDS table (a bare last expression renders the DataFrame).
# NOTE(review): the saved output still shows NaN cells and a NACCVNUM column,
# so it appears to predate the imputation cell above (which has no execution
# count) — re-run the notebook top to bottom to refresh it.
uds

Unnamed: 0,NACCID,NACCADC,NACCAGE,NACCVNUM,EDUC,SEX,NACCAPOE,NACCUDSD,NACCALZP,MEMORY,...,ANIMALS,VEG,TRAILA,TRAILARR,TRAILALI,TRAILB,TRAILBRR,TRAILBLI,BOSTON,datetime
0,NACC020208,186,69,1,16.0,1,,3,1,1.0,...,16.0,11.0,49.0,0.0,24.0,183.0,1.0,24.0,,2020-06-09
1,NACC107305,186,74,1,18.0,2,,1,8,0.0,...,30.0,16.0,30.0,0.0,24.0,48.0,0.0,24.0,,2021-12-01
2,NACC151065,186,86,1,14.0,2,,3,7,0.0,...,10.0,9.0,64.0,0.0,24.0,300.0,2.0,23.0,,2021-12-21
3,NACC187327,186,68,1,14.0,2,,1,8,0.5,...,22.0,11.0,32.0,1.0,24.0,68.0,0.0,24.0,,2021-09-14
4,NACC188799,186,78,1,14.0,2,,3,7,0.5,...,13.0,10.0,36.0,0.0,24.0,11.0,1.0,24.0,,2022-02-07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45095,NACC993286,9661,75,1,21.0,1,1.0,2,7,0.0,...,22.0,15.0,27.0,,,37.0,,,27.0,2006-08-04
45096,NACC994463,9661,66,1,16.0,1,2.0,4,1,1.0,...,27.0,17.0,31.0,0.0,24.0,126.0,0.0,24.0,28.0,2012-01-27
45097,NACC995870,9661,53,1,16.0,2,1.0,1,8,0.0,...,17.0,12.0,16.0,0.0,24.0,44.0,0.0,24.0,,2019-11-21
45098,NACC998475,9661,70,1,13.0,2,1.0,3,7,0.0,...,10.0,7.0,,,,,,,20.0,2006-10-20


In [6]:
# Number of missing values in each column of the UDS table.
uds.isna().sum()

NACCID          0
NACCADC         0
NACCAGE         0
NACCVNUM        0
EDUC          294
            ...  
TRAILB       7600
TRAILBRR    19636
TRAILBLI    19640
BOSTON      13857
datetime        0
Length: 75, dtype: int64

### Random Forest Imputation

# PCA feature dimension reduction

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Notebook-level transformers shared by pca_transform() (which refits both for
# every feature category) and by the CSF cell (which reuses `pca`).
scaler = StandardScaler()
# 5 components: pca_transform() only processes categories that have more than
# 5 columns, so there are always enough features to fit this.
pca = PCA(n_components=5)

def pca_transform(df, var_dict, pca_thres = 0.8):
    """Replace each feature category of `df` by its leading principal components.

    For every category in ``var_dict['Category']`` except ``'DEMO'``, the
    columns of `df` listed for that category in ``var_dict['VariableName']``
    are standardised with the notebook-level ``scaler`` and projected with the
    notebook-level ``pca``. Categories with five or fewer matching columns
    are skipped.

    The number of components kept for a category is the larger of

    * the number of components whose individual explained-variance ratio
      exceeds `pca_thres`, and
    * one plus the position of the first drop larger than 0.1 between
      consecutive explained-variance ratios (1 when there is no such drop).

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table; must contain no missing values in the used columns.
    var_dict : pandas.DataFrame
        Table with at least the columns ``'Category'`` and ``'VariableName'``.
    pca_thres : float
        Per-component explained-variance threshold described above.

    Returns
    -------
    pandas.DataFrame
        One column ``"<category>_<k>"`` (k starting at 1) per retained
        component, indexed 0..n-1. Empty when no category qualifies.

    Side effects
    ------------
    Refits the shared ``scaler`` and ``pca`` objects and prints, per
    category, the explained-variance ratios and the number of kept components.
    """
    components = []
    for cat in var_dict['Category'].unique():
        if cat == 'DEMO':
            continue
        wanted = set(var_dict.loc[var_dict['Category'] == cat, 'VariableName'].values)
        # Use a list in `df` column order: pandas >= 2.0 rejects sets as
        # indexers, and a set has no stable column order anyway.
        var_names = [name for name in df.columns if name in wanted]
        if len(var_names) <= 5:
            continue

        pca_transformed = pca.fit_transform(scaler.fit_transform(df.loc[:, var_names]))
        ratios = pca.explained_variance_ratio_
        num_selected_1 = int(np.sum(ratios > pca_thres))
        # argmax over a boolean array gives the first True (0 if none is True)
        num_selected_2 = int(np.argmax(-np.diff(ratios, n=1) > 0.1)) + 1
        num_selected = max(num_selected_1, num_selected_2)

        temp = pd.DataFrame(
            pca_transformed[:, :num_selected],
            columns=["{}_{}".format(cat, i+1) for i in range(num_selected)],
        )
        print(cat, ratios, num_selected)
        components.append(temp)

    if not components:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(components, axis=1)

In [None]:
# PCA UDS
# Project the UDS features onto per-category principal components; the
# identifier, date, target and NACCADC columns are excluded from the PCA input.
uds_pca = pca_transform(uds.drop(['NACCID', 'datetime', 'NACCUDSD', 'NACCADC'], axis=1), uds_dict, pca_thres=0.2)

# 'DEMO' columns are skipped by pca_transform and appended unprojected.
demo_features = set(uds.columns).intersection(set(uds_dict[uds_dict['Category'] == 'DEMO']['VariableName'].values))
# Index with a list in column order: pandas >= 2.0 rejects sets as indexers,
# and a set gives no reproducible column order.
demo_columns = [name for name in uds.columns if name in demo_features and name != 'NACCADC']
uds_pca = pd.concat([uds_pca, uds[demo_columns].reset_index(drop=True)], axis=1)
uds_pca['NACCID'] = uds.reset_index()['NACCID']
print(uds_pca.shape)

plt.figure(figsize = (12,8))
# NACCID is a string identifier; correlate numeric/bool columns only, since
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(uds_pca.select_dtypes(include=['number', 'bool']).corr(), annot=True, fmt='.2f', cmap='RdYlGn')

In [None]:
# PCA MRI data
# The identifier and the MRIMO / MRIDY / MRIYR columns are excluded from the PCA input.
mri_pca = pca_transform(mri.drop(['NACCID', 'MRIMO', 'MRIDY', 'MRIYR'], axis=1), mri_dict, pca_thres=0.2)
mri_pca['NACCID'] = mri.reset_index()['NACCID']
# Inner join: only NACCIDs present in both tables are kept.
uds_mri_merged = uds_pca.merge(mri_pca, on='NACCID', how='inner')
print(uds_mri_merged.shape)

plt.figure(figsize = (15, 10))
# NACCID is a string identifier; correlate numeric/bool columns only, since
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(uds_mri_merged.select_dtypes(include=['number', 'bool']).corr(), annot=True, fmt='.2f', cmap='RdYlGn')

In [None]:
# describe() only reports numeric columns, so this selects the numeric CSF
# features (NACCADC excluded).
csf_var = csf.drop("NACCADC", axis=1).describe().columns
# NOTE(review): unlike pca_transform(), the inputs are not standardised before
# the PCA here — confirm this is intended.
csf_pca = pca.fit_transform(csf[csf_var])
# Keep the first two components.
csf_pca = pd.DataFrame(csf_pca[:,:2])
# NOTE(review): these names are 0-based (CSF_0, CSF_1) whereas pca_transform()
# numbers its components from 1.
csf_pca.columns = ['CSF_{}'.format(i) for i in range(csf_pca.shape[1])]
csf_pca['NACCID'] = csf.reset_index()['NACCID']
# Inner join: only NACCIDs present in all three tables are kept.
uds_mri_cfs_merged = uds_mri_merged.merge(csf_pca, on='NACCID', how='inner')
print(uds_mri_cfs_merged.shape)

plt.figure(figsize = (15, 10))
# NACCID is a string identifier; correlate numeric/bool columns only, since
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
sns.heatmap(uds_mri_cfs_merged.select_dtypes(include=['number', 'bool']).corr(), annot=True, fmt='.2f', cmap='RdYlGn')

# Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

## UDS Only

In [None]:
# Elastic-net logistic regression on the UDS-only feature table.
clf = LogisticRegression(
    random_state=48,
    max_iter=1000,
    solver='saga',
    penalty='elasticnet',
    C=0.4,
    l1_ratio=0.3,
)

# 70/30 split; identifier and the NACCALZP / NACCUDSD columns are not used as
# predictors, NACCUDSD is the target.
X_train, X_test, y_train, y_test = train_test_split(
    uds_pca.drop(['NACCID', 'NACCALZP', 'NACCUDSD'], axis=1),
    uds_pca['NACCUDSD'],
    test_size=0.3,
    random_state=48,
)
clf = clf.fit(X_train, y_train)

# Predict once per split and reuse the result for both metrics.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Training Score:  ")
print(metrics.confusion_matrix(y_train, y_train_pred))
print(metrics.accuracy_score(y_train, y_train_pred))
print()
print("Testing Score:  ")
print(metrics.confusion_matrix(y_test, y_test_pred))
print(metrics.accuracy_score(y_test, y_test_pred))

## UDS + MRI

In [None]:
# Elastic-net logistic regression on the merged UDS + MRI feature table.
clf = LogisticRegression(
    random_state=48,
    max_iter=1000,
    solver='saga',
    penalty='elasticnet',
    C=0.4,
    l1_ratio=0.3,
)

# 70/30 split; identifier and the NACCALZP / NACCUDSD columns are not used as
# predictors, NACCUDSD is the target.
X_train, X_test, y_train, y_test = train_test_split(
    uds_mri_merged.drop(['NACCID', 'NACCALZP', 'NACCUDSD'], axis=1),
    uds_mri_merged['NACCUDSD'],
    test_size=0.3,
    random_state=48,
)
clf = clf.fit(X_train, y_train)

# Predict once per split and reuse the result for both metrics.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Training Score:  ")
print(metrics.confusion_matrix(y_train, y_train_pred))
print(metrics.accuracy_score(y_train, y_train_pred))
print()
print("Testing Score:  ")
print(metrics.confusion_matrix(y_test, y_test_pred))
print(metrics.accuracy_score(y_test, y_test_pred))

## UDS + MRI + CSF

In [None]:
# Elastic-net logistic regression on the merged UDS + MRI + CSF feature table
# (weaker regularisation and more iterations than the two models above).
clf = LogisticRegression(
    random_state=48,
    max_iter=2000,
    solver='saga',
    penalty='elasticnet',
    C=0.8,
    l1_ratio=0.3,
)

# 80/20 split; identifier and the NACCALZP / NACCUDSD columns are not used as
# predictors, NACCUDSD is the target.
X_train, X_test, y_train, y_test = train_test_split(
    uds_mri_cfs_merged.drop(['NACCID', 'NACCALZP', 'NACCUDSD'], axis=1),
    uds_mri_cfs_merged['NACCUDSD'],
    test_size=0.2,
    random_state=48,
)
clf = clf.fit(X_train, y_train)

# Predict once per split and reuse the result for both metrics.
y_train_pred = clf.predict(X_train)
y_test_pred = clf.predict(X_test)

print("Training Score:  ")
print(metrics.confusion_matrix(y_train, y_train_pred))
print(metrics.accuracy_score(y_train, y_train_pred))
print()
print("Testing Score:  ")
print(metrics.confusion_matrix(y_test, y_test_pred))
print(metrics.accuracy_score(y_test, y_test_pred))
