# Trying to do fusion using a deep learning method 

#### importing data + choosing imputing method

In [56]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from importlib import reload

import torch
from torch import optim, nn
import torch.utils.data as Data
from torch.nn import functional as F
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


import pydot
import tensorflow as tf 

# Import necessary modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt


# Keras specific
from tf import keras
from tf.keras import layers
from tf.keras.models import Sequential
from tf.keras.utils import plot_model
from tf.keras.layers import Dense
from tf.keras.utils import to_categorical 

ModuleNotFoundError: No module named 'tf'

In [2]:
def seed_torch(seed=0):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)#as reproducibility docs
    torch.manual_seed(seed)# as reproducibility docs
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False# as reproducibility docs
    torch.backends.cudnn.deterministic = True# as reproducibility docs

In [9]:
def load_data(impute_method = 'RF'):
    uds = pd.read_csv("../data/data_imputed/{}/uds.csv".format(impute_method))
    uds['datetime'] = pd.to_datetime(uds['datetime'])
    uds = uds.dropna(subset=['EDUC'])
    
    mri = pd.read_csv("../data/data_imputed/{}/mri.csv".format(impute_method))
    mri['datetime'] = pd.to_datetime(mri['datetime'])
    
    csf = pd.read_csv("../data/data_imputed/{}/csf.csv".format(impute_method))
    return uds, mri, csf

uds_dict = pd.read_csv("../data/data_dictionary/uds_feature_dictionary_cleaned.csv")
mri_dict = pd.read_csv("../data/data_dictionary/mri_feature_dictionary_cleaned.csv") 

uds_drop_columns = ['NACCID', 'NACCADC', 'NACCVNUM', 'datetime', 'NACCUDSD', 'NACCALZP', 'NACCAD3', 'NACCAD5']
mri_drop_columns = ['NACCID', 'NACCVNUM', 'datetime', 'datetime_UDS', 'timediff', 'within-a-year']
csf_drop_columns = ['NACCID', 'CSFABMD', 'CSFTTMD', 'CSFPTMD']

uds, mri, csf = load_data()
print(uds.shape, mri.shape, csf.shape)

(44740, 89) (2873, 161) (2180, 7)


In [4]:
uds[['NACCID','NACCAD3']]

Unnamed: 0,NACCID,NACCAD3
0,NACC020208,MCI-AD
1,NACC107305,Healthy
2,NACC151065,
3,NACC187327,Healthy
4,NACC188799,
...,...,...
45095,NACC993286,
45096,NACC994463,Dementia-AD
45097,NACC995870,Healthy
45098,NACC998475,


#### add classifying variable (UDS/ALZP) to the MRI and CSF data sets

Need to add the class for each of the people in each data set so that we can do the initial step of the deep neural net. 

In [65]:
#uds_sub_mri = uds[uds['NACCID'].isin(mri['NACCID'])]
#uds_sub_mri
mri = pd.merge(mri, uds[["NACCID",'NACCAD3']], on="NACCID", how="inner")
mri

Unnamed: 0,NACCID,NACCVNUM,datetime,datetime_UDS,timediff,within-a-year,NACCICV,NACCBRNV,NACCWMVL,CSFVOL,...,RSUPFRM,RSUPPAR,RSUPPARM,RSUPTEM,RSUPTEMM,RSUPMAR,RSUPMARM,RTRTEM,RTRTEMM,NACCAD3
0,NACC914950,11,2017-03-02,2006-10-31,3775,False,1535.13000,1081.63,504.80000,407.37,...,2.11,10.53,1.61,15.71,2.02,7.24,1.89,0.720,1.2100,Healthy
1,NACC550785,10,2015-06-02,2006-03-28,3353,False,1571.92000,1210.39,516.57000,358.48,...,2.47,13.20,1.64,13.90,2.01,10.37,1.89,0.750,1.9000,Healthy
2,NACC129206,10,2015-12-09,2006-04-18,3522,False,1553.60000,1086.93,425.40000,464.06,...,2.47,13.44,1.62,16.53,2.32,10.38,2.02,1.534,2.2308,Healthy
3,NACC943925,12,2017-02-21,2006-01-24,4046,False,1496.83000,1072.43,440.16000,410.38,...,2.41,11.10,1.84,13.68,2.15,10.43,2.08,0.970,1.4800,Healthy
4,NACC540727,6,2016-11-11,2012-01-03,1774,False,1177.33334,916.72,396.32110,276.96,...,2.40,10.53,1.79,10.79,1.86,9.72,1.90,0.770,1.6600,MCI-AD
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2333,NACC159647,1,2016-05-11,2016-02-18,83,True,1336.42000,981.58,444.05000,351.36,...,2.26,8.84,1.53,11.88,2.24,8.98,1.86,0.790,1.5600,Healthy
2334,NACC732291,1,2016-06-13,2016-04-05,69,True,1339.20000,959.04,419.74000,378.69,...,2.65,9.04,1.52,11.57,1.95,6.99,2.07,0.910,1.7000,Healthy
2335,NACC650247,1,2016-05-06,2016-04-07,29,True,1343.77000,934.67,394.78000,398.83,...,2.39,8.37,1.33,12.00,2.36,6.78,1.70,0.710,1.5700,Healthy
2336,NACC050273,1,2016-05-24,2016-04-12,42,True,1437.06000,1058.00,435.41000,376.57,...,2.36,7.40,1.30,15.12,2.33,9.02,1.84,0.850,1.2500,Healthy


In [6]:
mri['NACCAD3'].value_counts()

Healthy        1563
Dementia-AD     469
MCI-AD          306
Name: NACCAD3, dtype: int64

In [90]:
csf = pd.merge(csf, uds[["NACCID",'NACCAD3']], on="NACCID", how="inner")
csf['NACCAD3'].value_counts()

Healthy        1074
Dementia-AD     600
MCI-AD          144
Name: NACCAD3, dtype: int64

## first layer - individual DNNs 

### UDS data set 

In [51]:
#recode the response variable to numeric
uds['NACCAD3_num'] = uds['NACCAD3'].map({'Healthy': 0, 'MCI-AD': 1, 'Dementia-AD':2})

#get rid of rows where classification has not been possible 
uds = uds.dropna(subset=['NACCAD3'])

#designate which column we are trying to predict
target_column = ['NACCAD3_num'] 

predictors = list(set(list(uds.columns))-set(target_column)- set(uds_drop_columns)-set(['NACCAD3']))
uds[predictors] = uds[predictors]/uds[predictors].max()
uds.describe()

Unnamed: 0,NACCADC,NACCVNUM,SEX,NACCAGE,EDUC,NACCUDSD,NACCALZP,NACCAPOE,MEMORY,ORIENT,...,TRAILBRR,TRAILBLI,BOSTON,MINTTOTS,CRAFTDRE,DIGFORCT,DIGFORSL,DIGBACCT,DIGBACLS,NACCAD3_num
count,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,...,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0,34025.0
mean,4746.609082,1.0,0.799412,0.678852,0.50678,2.310389,4.663424,0.284276,0.190066,0.146147,...,0.281043,0.931021,0.813427,0.836087,0.441801,0.602934,0.691367,0.529287,0.545845,0.829802
std,2935.33922,0.0,0.245072,0.098408,0.114638,1.39691,3.472523,0.165859,0.245286,0.240192,...,0.164168,0.191816,0.229031,0.20382,0.318059,0.208374,0.165779,0.216123,0.182476,0.916342
min,186.0,1.0,0.5,0.169811,0.0,1.0,1.0,0.166667,0.0,0.0,...,0.166667,0.173913,0.137931,0.225806,0.047619,0.25,0.0,0.090909,0.0,0.0
25%,2125.0,1.0,0.5,0.622642,0.433333,1.0,1.0,0.166667,0.0,0.0,...,0.166667,1.0,0.724138,0.774194,0.047619,0.5,0.666667,0.363636,0.5,0.0
50%,4967.0,1.0,1.0,0.688679,0.533333,1.0,8.0,0.166667,0.166667,0.0,...,0.18,1.0,0.896552,0.903226,0.47619,0.583333,0.666667,0.545455,0.5,0.0
75%,6713.0,1.0,1.0,0.745283,0.6,4.0,8.0,0.333333,0.333333,0.333333,...,0.353333,1.0,1.0,1.0,0.714286,0.75,0.777778,0.636364,0.625,2.0
max,9661.0,1.0,1.0,1.0,1.0,4.0,8.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [38]:
#predictors

In [52]:
X = uds[predictors].values
y = uds[target_column].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05,  random_state=None)
print(X_train.shape); print(X_test.shape)

(32323, 81)
(1702, 81)


In [53]:
#change it so that we have an ecoded variable for our classification
# one hot encode outputs
y_train = keras.utils.to_categorical(y_train)
y_test = keras.utils.to_categorical(y_test)

count_classes = y_test.shape[1]
print(count_classes)

3


In [58]:
model_uds = Sequential()
model_uds.add(Dense(500, activation='relu', input_dim=X_train.shape[1]))
model_uds.add(Dense(100, activation='relu'))
model_uds.add(Dense(50, activation='relu'))
model_uds.add(Dense(3, activation='softmax'))

# Compile the model
model_uds.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [60]:
model_uds.fit(X_train, y_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x163f18feeb0>

In [86]:
pred_train_uds= model_uds.predict(X_train)
scores_uds = model_uds.evaluate(X_train, y_train, verbose=0)
print('UDS Accuracy on training data: {}% \n Error on training data: {}'.format(scores_uds[1], 1 - scores_uds[1]))   
 
pred_test= model_uds.predict(X_test)
scores2_uds = model_uds.evaluate(X_test, y_test, verbose=0)
print('UDS Accuracy on test data: {}% \n Error on test data: {}'.format(scores2_uds[1], 1 - scores2_uds[1]))    

UDS Accuracy on training data: 0.9128793478012085% 
 Error on training data: 0.0871206521987915
UDS Accuracy on test data: 0.9036427736282349% 
 Error on test data: 0.09635722637176514


## MRI data set

In [68]:
#recode the response variable to numeric
mri['NACCAD3_num'] = mri['NACCAD3'].map({'Healthy': 0, 'MCI-AD': 1, 'Dementia-AD':2})

#get rid of rows where classification has not been possible 
mri = mri.dropna(subset=['NACCAD3'])

#designate which column we are trying to predict
target_column = ['NACCAD3_num'] 

predictors = list(set(list(mri.columns))-set(target_column)- set(mri_drop_columns)-set(['NACCAD3']))
mri[predictors] = mri[predictors]/mri[predictors].max()
mri.describe()

Unnamed: 0,NACCVNUM,timediff,NACCICV,NACCBRNV,NACCWMVL,CSFVOL,GRAYVOL,WHITEVOL,WMHVOL,HIPPOVOL,...,RSUPFRM,RSUPPAR,RSUPPARM,RSUPTEM,RSUPTEMM,RSUPMAR,RSUPMARM,RTRTEM,RTRTEMM,NACCAD3_num
count,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,...,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0,2338.0
mean,1.938409,488.200171,0.793191,0.7749,0.740383,0.683765,0.781875,0.735411,0.114419,0.760308,...,0.703367,0.707194,0.65937,0.732344,0.685173,0.672373,0.707151,0.563094,0.649373,0.532079
std,1.821181,735.174823,0.081205,0.086098,0.096959,0.122871,0.085708,0.100299,0.16627,0.113893,...,0.101785,0.119975,0.121159,0.103692,0.108851,0.122543,0.107135,0.160782,0.167292,0.806503
min,1.0,0.0,0.613803,0.588238,0.536588,0.445858,0.573811,0.520115,0.000164,0.474329,...,0.426187,0.424082,0.35984,0.494474,0.409836,0.403382,0.421809,0.236832,0.238345,0.0
25%,1.0,33.0,0.736181,0.714926,0.670143,0.588653,0.72302,0.663407,0.016782,0.68889,...,0.64723,0.626226,0.581779,0.663362,0.619538,0.584781,0.643181,0.449804,0.533026,0.0
50%,1.0,98.0,0.789271,0.770738,0.735758,0.673924,0.779363,0.731321,0.05421,0.764189,...,0.704494,0.703779,0.663379,0.727106,0.681691,0.668735,0.710826,0.547588,0.669603,0.0
75%,2.0,730.75,0.846906,0.831046,0.803284,0.765814,0.837048,0.801656,0.14104,0.842945,...,0.759195,0.787728,0.734639,0.797056,0.747892,0.754503,0.774859,0.664765,0.771786,1.0
max,12.0,4220.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0


In [71]:
X_mri = mri[predictors].values
y_mri = mri[target_column].values

X_mri_train, X_mri_test, y_mri_train, y_mri_test = train_test_split(X_mri, y_mri, test_size=0.05,  random_state=None)
print(X_mri_train.shape); print(X_mri_test.shape)

(2221, 155)
(117, 155)


In [72]:
#change it so that we have an ecoded variable for our classification
# one hot encode outputs
y_mri_train = keras.utils.to_categorical(y_mri_train)
y_mri_test = keras.utils.to_categorical(y_mri_test)

count_classes = y_mri_test.shape[1]
print(count_classes)

3


In [82]:
model_mri = Sequential()
model_mri.add(Dense(500, activation='relu', input_dim=X_mri_train.shape[1]))
model_mri.add(Dense(100, activation='relu'))
model_mri.add(Dense(100, activation='relu'))
model_mri.add(Dense(50, activation='relu'))
model_mri.add(Dense(3, activation='softmax'))

# Compile the model
model_mri.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [84]:
model_mri.fit(X_mri_train, y_mri_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x163f9bc1fa0>

In [88]:
pred_train_mri= model_mri.predict(X_mri_train)
scores_mri = model_mri.evaluate(X_mri_train, y_mri_train, verbose=0)
print('MRI Accuracy on training data: {}% \n Error on training data: {}'.format(scores_uds[1], 1 - scores_uds[1]))   
 
pred_test_mri= model_mri.predict(X_mri_test)
scores2_mri = model_mri.evaluate(X_mri_test, y_mri_test, verbose=0)
print('MRI Accuracy on test data: {}% \n Error on test data: {}'.format(scores2_mri[1], 1 - scores2_mri[1]))    

MRI Accuracy on training data: 0.9128793478012085% 
 Error on training data: 0.0871206521987915
MRI Accuracy on test data: 0.7777777910232544% 
 Error on test data: 0.2222222089767456


### CSF data set 

In [91]:
#recode the response variable to numeric
csf['NACCAD3_num'] = csf['NACCAD3'].map({'Healthy': 0, 'MCI-AD': 1, 'Dementia-AD':2})

#get rid of rows where classification has not been possible 
csf = csf.dropna(subset=['NACCAD3'])

#designate which column we are trying to predict
target_column = ['NACCAD3_num'] 

predictors = list(set(list(csf.columns))-set(target_column)- set(csf_drop_columns)-set(['NACCAD3']))
csf[predictors] = csf[predictors]/csf[predictors].max()
csf.describe()

Unnamed: 0,CSFABMD,CSFTTMD,CSFPTMD,CSFABETA,CSFTTAU,CSFPTAU,NACCAD3_num
count,1818.0,1768.0,1673.0,1818.0,1818.0,1818.0,1818.0
mean,4.277228,4.417986,4.561267,0.335674,0.206897,0.279243,0.739274
std,3.199845,3.152429,3.18038,0.211998,0.185926,0.201749,0.923733
min,1.0,1.0,1.0,0.05298,0.009105,0.037071,0.0
25%,1.0,2.0,2.0,0.170081,0.064548,0.144084,0.0
50%,2.0,2.0,2.0,0.291046,0.155061,0.213955,0.0
75%,8.0,8.0,8.0,0.451459,0.286132,0.345854,2.0
max,8.0,8.0,8.0,1.0,1.0,1.0,2.0


In [92]:
X_csf = csf[predictors].values
y_csf = csf[target_column].values

X_csf_train, X_csf_test, y_csf_train, y_csf_test = train_test_split(X_csf, y_csf, test_size=0.05,  random_state=None)
print(X_csf_train.shape); print(X_csf_test.shape)

(1727, 3)
(91, 3)


In [93]:
#change it so that we have an ecoded variable for our classification
# one hot encode outputs
y_csf_train = keras.utils.to_categorical(y_csf_train)
y_csf_test = keras.utils.to_categorical(y_csf_test)

count_classes = y_csf_test.shape[1]
print(count_classes)

3


In [94]:
model_csf = Sequential()
model_csf.add(Dense(500, activation='relu', input_dim=X_csf_train.shape[1]))
model_csf.add(Dense(100, activation='relu'))
model_csf.add(Dense(50, activation='relu'))
model_csf.add(Dense(3, activation='softmax'))

# Compile the model
model_csf.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [96]:
model_csf.fit(X_csf_train, y_csf_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x16383a375e0>

In [97]:
pred_train_csf= model_csf.predict(X_csf_train)
scores_csf = model_csf.evaluate(X_csf_train, y_csf_train, verbose=0)
print('csf Accuracy on training data: {}% \n Error on training data: {}'.format(scores_uds[1], 1 - scores_uds[1]))   
 
pred_test_csf= model_csf.predict(X_csf_test)
scores2_csf = model_csf.evaluate(X_csf_test, y_csf_test, verbose=0)
print('csf Accuracy on test data: {}% \n Error on test data: {}'.format(scores2_csf[1], 1 - scores2_csf[1]))    

csf Accuracy on training data: 0.9128793478012085% 
 Error on training data: 0.0871206521987915
csf Accuracy on test data: 0.8461538553237915% 
 Error on test data: 0.1538461446762085


In [51]:
plot_model(model_csf)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [47]:
import pydotplus

In [48]:
import tensorflow as tf 

In [43]:
tf.keras.utils.plot_model(model_csf, to_file='model_test.png', show_shapes=True, show_layer_names=True)

You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) for plot_model/model_to_dot to work.


In [49]:
from keras.utils.vis_utils import plot_model

In [54]:
#from plot_model import plot_model
plot_model(model_uds)

ImportError: cannot import name 'wrappers' from 'tensorflow.python.keras.layers' (C:\ProgramData\Anaconda3\lib\site-packages\tensorflow\python\keras\layers\__init__.py)