# Trying to do fusion using a deep learning method 

#### importing data + choosing imputing method

In [1]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from importlib import reload

import torch
from torch import optim, nn
import torch.utils.data as Data
from torch.nn import functional as F
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")


import pydot
import tensorflow as tf 

# Import necessary modules
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt


# Keras specific
import tensorflow_addons as tfa
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import plot_model
from tensorflow.keras.layers import Dense
from tensorflow.keras.utils import to_categorical 

In [2]:
def seed_torch(seed=0):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)#as reproducibility docs
    torch.manual_seed(seed)# as reproducibility docs
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False# as reproducibility docs
    torch.backends.cudnn.deterministic = True# as reproducibility docs

In [3]:
def load_data(impute_method = 'RF'):
    uds = pd.read_csv("../data/data_imputed/{}/uds.csv".format(impute_method))
    uds['datetime'] = pd.to_datetime(uds['datetime'])
    uds = uds.dropna(subset=['EDUC'])
    
    mri = pd.read_csv("../data/data_imputed/{}/mri.csv".format(impute_method))
    mri['datetime'] = pd.to_datetime(mri['datetime'])
    
    csf = pd.read_csv("../data/data_imputed/{}/csf.csv".format(impute_method))
    return uds, mri, csf

uds_dict = pd.read_csv("../data/data_dictionary/uds_feature_dictionary_cleaned.csv")
mri_dict = pd.read_csv("../data/data_dictionary/mri_feature_dictionary_cleaned.csv") 

uds_drop_columns = ['NACCID', 'NACCADC', 'NACCVNUM', 'datetime', 'NACCUDSD', 'NACCALZP', 'NACCAD3', 'NACCAD5']
mri_drop_columns = ['NACCID', 'NACCVNUM', 'datetime', 'datetime_UDS', 'timediff', 'within-a-year']
csf_drop_columns = ['NACCID', 'CSFABMD', 'CSFTTMD', 'CSFPTMD']

uds, mri, csf = load_data()
print(uds.shape, mri.shape, csf.shape)

(44740, 89) (2873, 161) (2180, 7)


In [4]:
uds[['NACCID','NACCAD3']]

Unnamed: 0,NACCID,NACCAD3
0,NACC020208,MCI-AD
1,NACC107305,Healthy
2,NACC151065,
3,NACC187327,Healthy
4,NACC188799,
...,...,...
45095,NACC993286,
45096,NACC994463,Dementia-AD
45097,NACC995870,Healthy
45098,NACC998475,


#### add classifying variable (UDS/ALZP) to the MRI and CSF data sets

Need to add the class for each of the people in each data set so that we can do the initial step of the deep neural net. 

In [5]:
#uds_sub_mri = uds[uds['NACCID'].isin(mri['NACCID'])]
#uds_sub_mri
mri = pd.merge(mri, uds[["NACCID",'NACCAD3']], on="NACCID", how="inner")
mri

Unnamed: 0,NACCID,NACCVNUM,datetime,datetime_UDS,timediff,within-a-year,NACCICV,NACCBRNV,NACCWMVL,CSFVOL,...,RSUPFRM,RSUPPAR,RSUPPARM,RSUPTEM,RSUPTEMM,RSUPMAR,RSUPMARM,RTRTEM,RTRTEMM,NACCAD3
0,NACC914950,11,2017-03-02,2006-10-31,3775,False,1535.13000,1081.63,504.80000,407.37,...,2.11,10.53,1.61,15.71,2.02,7.24,1.89,0.720,1.2100,Healthy
1,NACC388999,11,2016-06-24,2006-02-21,3776,False,1314.57000,1001.09,437.70000,312.44,...,2.70,10.17,2.00,13.07,2.17,8.92,2.09,0.630,1.6100,
2,NACC550785,10,2015-06-02,2006-03-28,3353,False,1571.92000,1210.39,516.57000,358.48,...,2.47,13.20,1.64,13.90,2.01,10.37,1.89,0.750,1.9000,Healthy
3,NACC321645,9,2015-12-03,2006-01-03,3621,False,1417.97000,1043.73,431.46000,372.57,...,2.33,12.02,1.68,14.07,2.06,8.38,1.97,0.980,1.5800,
4,NACC129206,10,2015-12-09,2006-04-18,3522,False,1553.60000,1086.93,425.40000,464.06,...,2.47,13.44,1.62,16.53,2.32,10.38,2.02,1.534,2.2308,Healthy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2855,NACC159647,1,2016-05-11,2016-02-18,83,True,1336.42000,981.58,444.05000,351.36,...,2.26,8.84,1.53,11.88,2.24,8.98,1.86,0.790,1.5600,Healthy
2856,NACC732291,1,2016-06-13,2016-04-05,69,True,1339.20000,959.04,419.74000,378.69,...,2.65,9.04,1.52,11.57,1.95,6.99,2.07,0.910,1.7000,Healthy
2857,NACC650247,1,2016-05-06,2016-04-07,29,True,1343.77000,934.67,394.78000,398.83,...,2.39,8.37,1.33,12.00,2.36,6.78,1.70,0.710,1.5700,Healthy
2858,NACC050273,1,2016-05-24,2016-04-12,42,True,1437.06000,1058.00,435.41000,376.57,...,2.36,7.40,1.30,15.12,2.33,9.02,1.84,0.850,1.2500,Healthy


In [6]:
mri['NACCAD3'].value_counts()

Healthy        1563
Dementia-AD     469
MCI-AD          306
Name: NACCAD3, dtype: int64

In [7]:
csf = pd.merge(csf, uds[["NACCID",'NACCAD3']], on="NACCID", how="inner")
csf['NACCAD3'].value_counts()

Healthy        1074
Dementia-AD     600
MCI-AD          144
Name: NACCAD3, dtype: int64

## first layer - individual DNNs 

### UDS data set 

In [11]:
#recode the response variable to numeric
uds['NACCAD3_num'] = uds['NACCAD3'].map({'Healthy':0, 'MCI-AD':1,  'Dementia-AD':2})

#get rid of rows where classification has not been possible 
uds = uds.dropna(subset=['NACCAD3'])

#designate which column we are trying to predict
target_column = ['NACCAD3_num'] 

predictors = list(set(list(uds.columns))-set(target_column)- set(uds_drop_columns)-set(['NACCAD3']))
uds[predictors] = uds[predictors]/uds[predictors].max()
uds.describe()
uds.shape

(34025, 90)

In [13]:
#predictors
uds[['NACCID']].values

array([['NACC020208'],
       ['NACC107305'],
       ['NACC187327'],
       ...,
       ['NACC993141'],
       ['NACC994463'],
       ['NACC995870']], dtype=object)

In [14]:
X_uds = np.concatenate((uds[['NACCID']].values,uds[predictors].values), axis = 1 )
y_uds = uds[target_column].values

#change it so that we have an ecoded variable for our classification
# one hot encode outputs 
y_uds = keras.utils.to_categorical(y_uds, num_classes=3)

X_uds_train, X_uds_test, y_uds_train, y_uds_test = train_test_split(X_uds, y_uds, test_size=0.05,  random_state=20)
print(X_uds_train.shape); print(y_uds_train.shape)

(32323, 82)
(32323, 3)


In [15]:
#want a record of which IDs were used in training and testing so can keep the right ones when 
#we merge with output from other models 
#split X_train into a vector of everything but IDs and the IDs 
X_uds_train_ID = X_uds_train[:,0]
X_uds_train = np.delete(X_uds_train,0, 1 )
X_uds_train = X_uds_train.astype(float)
X_uds_test_ID = X_uds_test[:,0]
X_uds_test = np.delete(X_uds_test,0, 1 )
X_uds_test = X_uds_test.astype(float)

In [17]:
count_classes = y_uds_test.shape[1]
print(count_classes)

3


In [18]:
#also need training data with the ID in so that we can merge it later on 
y_uds_ID = np.column_stack((X_uds[:,0], y_uds))
y_uds_ID.shape

(34025, 4)

### UDS Model set up / compile / run 

In [19]:
f1_ma = tfa.metrics.F1Score(num_classes=3, average='macro')
f1_mi = tfa.metrics.FBetaScore(num_classes=3, average='micro') #this seems to be the same as 
#the accuracy so i think somehting is wrong 

In [20]:
y_uds_train.shape

(32323, 3)

In [21]:
model_uds = Sequential()
model_uds.add(Dense(500, activation='relu', input_dim=X_uds_train.shape[1]))
model_uds.add(Dense(250, activation='relu'))
model_uds.add(Dense(50, activation='relu'))
model_uds.add(Dense(3, activation='softmax'))

# Compile the model
model_uds.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', f1_ma])
#f1_score is macro F1
#fbeta_score is micro F1

In [22]:
model_uds.fit(X_uds_train, y_uds_train, epochs=20)
#am prettuy sure these results are wrong and the numbers shouldn't be that good ??? 

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x15700329e80>

In [23]:
#pred_train_uds= model_uds.predict(X_uds_train)
#scores_uds = model_uds.evaluate(X_uds_train, y_uds_train, verbose=0)
#print('UDS Accuracy on training data: {}% \n Error on training data: {}'.format(scores_uds[1], 1 - scores_uds[1]))   
 
pred_test= model_uds.predict(X_uds_test)
scores2_uds = model_uds.evaluate(X_uds_test, y_uds_test, verbose=0)
print('UDS Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.
      format(round(scores2_uds[1],4), round(1 - scores2_uds[1],4), round(scores2_uds[2],4)))  

#henry was getting f1 scores of like 90 on the test set for whatever he was doing 
#also does it mean i'm over fitting to the data if the accuracy and the f1 are much higher on the 
#training set than the testing set???

UDS Accuracy on test data: 0.8942% 
 Error on test data: 0.1058 
 F1-macro on test: 0.832


In [24]:
#after training the network with our training test and testing set will put the whole data set into the model to give us 
#our softmax output for the next model 
uds_all_output = model_uds.predict(np.delete(X_uds,0, 1 ).astype(float))
uds_all_output_ID = np.column_stack((X_uds[:,0], uds_all_output))



In [25]:
#softmax classification on the full (ish) UDS data set
#pred_train_uds_IDs =  np.column_stack((X_uds_train_ID, pred_train_uds))

In [26]:
print(uds_all_output_ID.shape) ; print(y_uds_ID)

(34025, 4)
[['NACC020208' 0.0 1.0 0.0]
 ['NACC107305' 1.0 0.0 0.0]
 ['NACC187327' 1.0 0.0 0.0]
 ...
 ['NACC993141' 1.0 0.0 0.0]
 ['NACC994463' 0.0 0.0 1.0]
 ['NACC995870' 1.0 0.0 0.0]]


## MRI data set

In [22]:
#recode the response variable to numeric
mri['NACCAD3_num'] = mri['NACCAD3'].map({'Healthy': 0, 'MCI-AD':1 ,'Dementia-AD':2})

#get rid of rows where classification has not been possible 
mri = mri.dropna(subset=['NACCAD3'])

#designate which column we are trying to predict
target_column = ['NACCAD3_num'] 

predictors = list(set(list(mri.columns))-set(target_column)- set(mri_drop_columns)-set(['NACCAD3']))
mri[predictors] = mri[predictors]/mri[predictors].max()
mri.describe()
mri.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


(2338, 163)

In [23]:
X_mri =np.concatenate((mri[['NACCID']].values,mri[predictors].values), axis = 1 )
y_mri = mri[target_column].values

#change it so that we have an ecoded variable for our classification
# one hot encode outputs 
y_mri = keras.utils.to_categorical(y_mri)

X_mri_train, X_mri_test, y_mri_train, y_mri_test = train_test_split(X_mri, y_mri, test_size=0.05,  random_state=20)
print(X_mri_train.shape); print(X_mri_test.shape)

(2221, 156)
(117, 156)


In [24]:
#want a record of which IDs were used in training and testing so can keep the right ones when 
#we merge with output from other models 
#split X_train into a vector of everything but IDs and the IDs 
X_mri_train_ID = X_mri_train[:,0]
X_mri_train = np.delete(X_mri_train,0, 1 )
X_mri_train = X_mri_train.astype(float)
X_mri_test_ID = X_mri_test[:,0]
X_mri_test = np.delete(X_mri_test,0, 1 )
X_mri_test = X_mri_test.astype(float)

In [25]:
#also need training data with the ID in so that we can merge it later on 
y_mri_ID = np.column_stack((X_mri[:,0], y_mri))
y_mri_ID

array([['NACC914950', 1.0, 0.0, 0.0],
       ['NACC550785', 1.0, 0.0, 0.0],
       ['NACC129206', 1.0, 0.0, 0.0],
       ...,
       ['NACC650247', 1.0, 0.0, 0.0],
       ['NACC050273', 1.0, 0.0, 0.0],
       ['NACC635044', 1.0, 0.0, 0.0]], dtype=object)

In [26]:
y_mri_ID.shape

(2338, 4)

In [27]:
count_classes = y_mri_test.shape[1]
print(count_classes)

3


### MRI model

In [32]:
model_mri = Sequential()
model_mri.add(Dense(500, activation='relu', input_dim=X_mri_train.shape[1]))
model_mri.add(Dense(300, activation='relu'))
model_mri.add(Dense(150, activation='relu'))
model_mri.add(Dense(50, activation='relu'))
model_mri.add(Dense(3, activation='softmax'))

# Compile the model
model_mri.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', f1_ma])

In [33]:
model_mri.fit(X_mri_train, y_mri_train, epochs=15)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x1e6cff71a30>

In [34]:
#pred_train_mri= model_mri.predict(X_mri_train)
#scores_mri = model_mri.evaluate(X_mri_train, y_mri_train, verbose=0)
#print('MRI Accuracy on training data: {}% \n Error on training data: {}'.format(scores_mri[1], 1 - scores_mri[1]))   
 
pred_test_mri= model_mri.predict(X_mri_test)
scores2_mri = model_mri.evaluate(X_mri_test, y_mri_test, verbose=0)
print('MRI Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.format(round(scores2_mri[1],4),
                                                                        round(1 - scores2_mri[1],4), round(scores2_mri[2],4)))  

MRI Accuracy on test data: 0.7863% 
 Error on test data: 0.2137 
 F1-macro on test: 0.5198


In [35]:
#after training the network with our training test and testing set will put the whole data set into the model to give us 
#our softmax output for the next model 
mri_all_output = model_mri.predict(np.delete(X_mri,0, 1 ).astype(float))
mri_all_output_ID = np.column_stack((X_mri[:,0], mri_all_output))



In [36]:
mri_all_output_ID.shape

(2338, 4)

### CSF data set 

In [37]:
#recode the response variable to numeric
csf['NACCAD3_num'] = csf['NACCAD3'].map({'Healthy': 0,  'MCI-AD':1 , 'Dementia-AD':2})

#get rid of rows where classification has not been possible 
csf = csf.dropna(subset=['NACCAD3'])

#designate which column we are trying to predict
target_column = ['NACCAD3_num'] 

predictors = list(set(list(csf.columns))-set(target_column)- set(csf_drop_columns)-set(['NACCAD3']))
csf[predictors] = csf[predictors]/csf[predictors].max()
#csf.describe()

In [128]:
csf.shape

(1818, 9)

In [38]:
X_csf =  np.concatenate((csf[['NACCID']].values,csf[predictors].values), axis = 1 )
y_csf = csf[target_column].values

#change it so that we have an ecoded variable for our classification
# one hot encode outputs
y_csf = keras.utils.to_categorical(y_csf)

X_csf_train, X_csf_test, y_csf_train, y_csf_test = train_test_split(X_csf, y_csf, test_size=0.05,  random_state=20)
print(X_csf_train.shape); print(X_csf_test.shape)

(1727, 4)
(91, 4)


In [39]:
count_classes = y_csf_test.shape[1]
print(count_classes)

3


### CSF Model

In [40]:
#want a record of which IDs were used in training and testing so can keep the right ones when 
#we merge with output from other models 
#split X_train into a vector of everything but IDs and the IDs 
X_csf_train_ID = X_csf_train[:,0]
X_csf_train = np.delete(X_csf_train,0, 1 )
X_csf_train = X_csf_train.astype(float)
X_csf_test_ID = X_csf_test[:,0]
X_csf_test = np.delete(X_csf_test,0, 1 )
X_csf_test = X_csf_test.astype(float)



In [41]:
#also need training data with the ID in so that we can merge it later on 
y_csf_ID = np.column_stack((X_csf[:,0], y_csf))
y_csf_ID

array([['NACC000441', 1.0, 0.0, 0.0],
       ['NACC001235', 0.0, 0.0, 1.0],
       ['NACC001634', 0.0, 0.0, 1.0],
       ...,
       ['NACC998175', 0.0, 1.0, 0.0],
       ['NACC998324', 1.0, 0.0, 0.0],
       ['NACC999002', 0.0, 0.0, 1.0]], dtype=object)

In [47]:
X_csf

array([['NACC000441', 0.13675977653631285, 0.4918893129770992,
        0.12825043885313048],
       ['NACC001235', 0.15379888268156425, 0.6999045801526718,
        0.45985371562317146],
       ['NACC001634', 0.10986964618249534, 0.4532442748091603,
        0.07899356348741954],
       ...,
       ['NACC998175', 0.3859217877094972, 0.2538167938931298,
        0.2924224692802809],
       ['NACC998324', 0.7157045313469894, 0.17032442748091606,
        0.15330602691632533],
       ['NACC999002', 0.5524518932340161, 0.6665076335877862,
        0.564072557050907]], dtype=object)

In [57]:
model_csf = Sequential()
model_csf.add(Dense(150, activation='relu', input_dim=X_csf_train.shape[1]))
model_csf.add(Dense(80, activation='relu'))
model_csf.add(Dense(50, activation='relu'))
model_csf.add(Dense(3, activation='softmax'))

# Compile the model
model_csf.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', f1_ma])

In [112]:
model_csf.fit(X_csf_train, y_csf_train, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x1e6eeefc5b0>

In [113]:
#pred_train_csf= model_csf.predict(X_csf_train)
#scores_csf = model_csf.evaluate(X_csf_train, y_csf_train, verbose=0)
#print('csf Accuracy on training data: {}% \n Error on training data: {}'.format(scores_csf[1], 1 - scores_csf[1]))   
 
pred_test_csf= model_csf.predict(X_csf_test)
scores2_csf = model_csf.evaluate(X_csf_test, y_csf_test, verbose=0)
print('CSF Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.format(round(scores2_csf[1],4),
                                                                        round(1 - scores2_csf[1],4), round(scores2_csf[2],4))) 

CSF Accuracy on test data: 0.7033% 
 Error on test data: 0.2967 
 F1-macro on test: 0.4194


In [None]:
#plot_model(model_uds)

In [60]:
#after training the network with our training test and testing set will put the whole data set into the model to give us 
#our softmax output for the next model 
csf_all_output = model_csf.predict(np.delete(X_csf,0, 1 ).astype(float))
csf_all_output_ID = np.column_stack((X_csf[:,0], csf_all_output))



In [61]:
print(y_csf_ID.shape) ; print(csf_all_output.shape)

(1818, 4)
(1818, 3)


## Merging the outputs for 2 stage of the model

In [62]:
#want to find a way to inner join the ID values and the softmax values from UDS model and MRI model / other tuples 
#do a concat with NACCID and the output and then do inner join?? 
#make data frames so can do pandas inner join then will change back to array 
pred_uds_IDs_df = pd.DataFrame(uds_all_output_ID, columns = ["NACCID", "UDS_C1", "UDS_C2", "UDS_C3" ])
pred_mri_IDs_df = pd.DataFrame(mri_all_output_ID, columns = ["NACCID", "MRI_C1", "MRI_C2", "MRI_C3"])
pred_csf_IDs_df = pd.DataFrame(csf_all_output_ID, columns = ["NACCID", "CSF_C1", "CSF_C2", "CSF_C3"])

In [63]:
#data frames with the softmax classifiers from the first stage of models 
X_uds_mri_ID = pd.merge(pred_uds_IDs_df, pred_mri_IDs_df, on="NACCID", how="inner")
X_uds_csf_ID = pd.merge(pred_uds_IDs_df, pred_csf_IDs_df, on="NACCID", how="inner")
X_mri_csf_ID = pd.merge(pred_mri_IDs_df, pred_csf_IDs_df, on="NACCID", how="inner")

In [64]:
#want to find a way to inner join the ID values and the softmax values from UDS model and MRI model / other tuples 
#do a concat with NACCID and the output and then do inner join?? 
#make data frames so can do pandas inner join then will change back to array 
y_uds_IDs_df = pd.DataFrame(y_uds_ID, columns = ["NACCID", "ind_1", "ind_2", "ind_3"])
y_mri_IDs_df = pd.DataFrame(y_mri_ID, columns = ["NACCID", "ind_1", "ind_2", "ind_3"])
y_csf_IDs_df = pd.DataFrame(y_csf_ID, columns = ["NACCID", "ind_1", "ind_2", "ind_3"])

In [65]:
#data frames with the softmax classifiers from the first stage of models 
y_uds_mri_ID = pd.merge(y_uds_IDs_df, y_mri_IDs_df[['NACCID']], on="NACCID", how="inner")
y_uds_csf_ID = pd.merge(y_uds_IDs_df, y_csf_IDs_df[['NACCID']], on="NACCID", how="inner")
y_mri_csf_ID = pd.merge(y_mri_IDs_df, y_csf_IDs_df[['NACCID']], on="NACCID", how="inner")

In [129]:
X_mri_csf_ID.shape

(264, 7)

## Stage 2 

### UDS / MRI 

In [66]:
#split data into test and train for this model
X_uds_mri_train, X_uds_mri_test, y_uds_mri_train, y_uds_mri_test = train_test_split(X_uds_mri_ID,
                    y_uds_mri_ID, test_size=0.05,  random_state=20)
print(X_uds_mri_train.shape); print(y_uds_mri_train.shape)

(2221, 7)
(2221, 4)


In [67]:
#convert inputs to float arrays 
X_uds_mri_train = pd.DataFrame.to_numpy(X_uds_mri_train.iloc[: , 1:]) 
X_uds_mri_train = X_uds_mri_train.astype(float)
y_uds_mri_train = pd.DataFrame.to_numpy(y_uds_mri_train.iloc[: , 1:]) 
y_uds_mri_train = y_uds_mri_train.astype(float)

#convert inputs to float arrays 
X_uds_mri_test = pd.DataFrame.to_numpy(X_uds_mri_test.iloc[: , 1:]) 
X_uds_mri_test = X_uds_mri_test.astype(float)
y_uds_mri_test = pd.DataFrame.to_numpy(y_uds_mri_test.iloc[: , 1:]) 
y_uds_mri_test = y_uds_mri_test.astype(float)

In [68]:
model_uds_mri = Sequential()
model_uds_mri.add(Dense(12, activation='relu', input_dim=X_uds_mri_train.shape[1]))
model_uds_mri.add(Dense(5, activation='relu'))
model_uds_mri.add(Dense(3, activation='softmax'))

# Compile the model
model_uds_mri.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', f1_ma])

In [69]:
model_uds_mri.fit(X_uds_mri_train, y_uds_mri_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e6e81115b0>

In [71]:
#pred_train_uds_mri= model_uds_mri.predict(X_uds_mri_train)
#scores_uds_mri = model_uds_mri.evaluate(X_uds_mri_train, y_uds_mri_train, verbose=0)
#print('uds mri Accuracy on training data: {}% \n Error on training data: {}'.format(scores_uds_mri[1], 1 - scores_uds_mri[1]))   
 
pred_test_uds_mri= model_uds_mri.predict(X_uds_mri_test)
scores2_uds_mri = model_uds_mri.evaluate(X_uds_mri_test, y_uds_mri_test, verbose=0)
print('UDS MRI Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.format(round(scores2_uds_mri[1],4),
                                                                round(1 - scores2_uds_mri[1],4), round(scores2_uds_mri[2],4))) 

UDS MRI Accuracy on test data: 0.9658% 
 Error on test data: 0.0342 
 F1-macro on test: 0.9182


In [72]:
#change all data to numpy arrays 
X_uds_mri = pd.DataFrame.to_numpy(X_uds_mri_ID.iloc[: , 1:]) 
X_uds_mri = X_uds_mri.astype(float)
                    
               
#after training the network with our training test and testing set will put the whole data set into the model to give us 
#our softmax output for the next model 
uds_mri_all_output = model_uds_mri.predict(X_uds_mri)
uds_mri_all_output_ID = np.column_stack((X_uds_mri_ID.iloc[: , 0], uds_mri_all_output))



In [73]:
print(uds_mri_all_output_ID.shape) ; print(y_uds_mri_ID.shape)

(2338, 4)
(2338, 4)


### UDS / CSF 

In [74]:
#split data into test and train for this model
X_uds_csf_train, X_uds_csf_test, y_uds_csf_train, y_uds_csf_test = train_test_split(X_uds_csf_ID,
                    y_uds_csf_ID, test_size=0.05,  random_state=20)
print(X_uds_csf_train.shape); print(y_uds_csf_train.shape)

(1727, 7)
(1727, 4)


In [75]:
#convert inputs to float arrays 
X_uds_csf_train = pd.DataFrame.to_numpy(X_uds_csf_train.iloc[: , 1:]) 
X_uds_csf_train = X_uds_csf_train.astype(float)
y_uds_csf_train = pd.DataFrame.to_numpy(y_uds_csf_train.iloc[: , 1:]) 
y_uds_csf_train = y_uds_csf_train.astype(float)

#convert inputs to float arrays 
X_uds_csf_test = pd.DataFrame.to_numpy(X_uds_csf_test.iloc[: , 1:]) 
X_uds_csf_test = X_uds_csf_test.astype(float)
y_uds_csf_test = pd.DataFrame.to_numpy(y_uds_csf_test.iloc[: , 1:]) 
y_uds_csf_test = y_uds_csf_test.astype(float)

In [76]:
model_uds_csf = Sequential()
model_uds_csf.add(Dense(12, activation='relu', input_dim=X_uds_csf_train.shape[1]))
model_uds_csf.add(Dense(6, activation='relu'))
model_uds_csf.add(Dense(3, activation='softmax'))

# Compile the model
model_uds_csf.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', f1_ma])

In [77]:
model_uds_csf.fit(X_uds_csf_train, y_uds_csf_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e6e9528730>

In [114]:
#pred_train_uds_csf= model_uds_csf.predict(X_uds_csf_train)
#scores_uds_csf = model_uds_csf.evaluate(X_uds_csf_train, y_uds_csf_train, verbose=0)
#print('uds csf Accuracy on training data: {}% \n Error on training data: {}'.format(scores_uds_csf[1], 1 - scores_uds_csf[1]))   
 
pred_test_uds_csf= model_uds_csf.predict(X_uds_csf_test)
scores2_uds_csf = model_uds_csf.evaluate(X_uds_csf_test, y_uds_csf_test, verbose=0)
print('UDS CSF Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.format(round(scores2_uds_csf[1],4),
                                                                        round(1 - scores2_uds_csf[1],4), round(scores2_uds_csf[2],4)))    

UDS CSF Accuracy on test data: 0.9231% 
 Error on test data: 0.0769 
 F1-macro on test: 0.8158


In [79]:
#change all data to numpy arrays 
X_uds_csf = pd.DataFrame.to_numpy(X_uds_csf_ID.iloc[: , 1:]) 
X_uds_csf = X_uds_csf.astype(float)
                    
               
#after training the network with our training test and testing set will put the whole data set into the model to give us 
#our softmax output for the next model 
uds_csf_all_output = model_uds_csf.predict(X_uds_csf)
uds_csf_all_output_ID = np.column_stack((X_uds_csf_ID.iloc[: , 0], uds_csf_all_output))



In [80]:
print(uds_csf_all_output_ID.shape) ; print(y_uds_csf_ID.shape)

(1818, 4)
(1818, 4)


### MRI / CSF

In [81]:
#split data into test and train for this model
X_mri_csf_train, X_mri_csf_test, y_mri_csf_train, y_mri_csf_test = train_test_split(X_mri_csf_ID,
                    y_mri_csf_ID, test_size=0.05,  random_state=20)
print(X_mri_csf_train.shape); print(y_mri_csf_train.shape)

(250, 7)
(250, 4)


In [82]:
#convert inputs to float arrays 
X_mri_csf_train = pd.DataFrame.to_numpy(X_mri_csf_train.iloc[: , 1:]) 
X_mri_csf_train = X_mri_csf_train.astype(float)
y_mri_csf_train = pd.DataFrame.to_numpy(y_mri_csf_train.iloc[: , 1:]) 
y_mri_csf_train = y_mri_csf_train.astype(float)

#convert inputs to float arrays 
X_mri_csf_test = pd.DataFrame.to_numpy(X_mri_csf_test.iloc[: , 1:]) 
X_mri_csf_test = X_mri_csf_test.astype(float)
y_mri_csf_test = pd.DataFrame.to_numpy(y_mri_csf_test.iloc[: , 1:]) 
y_mri_csf_test = y_mri_csf_test.astype(float)

In [115]:
model_mri_csf = Sequential()
model_mri_csf.add(Dense(100, activation='relu', input_dim=X_mri_csf_train.shape[1]))
model_mri_csf.add(Dense(50, activation='relu'))
model_mri_csf.add(Dense(20, activation='relu'))
model_mri_csf.add(Dense(3, activation='softmax'))

# Compile the model
model_mri_csf.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy', f1_ma])

In [117]:
model_mri_csf.fit(X_mri_csf_train, y_mri_csf_train, epochs=30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x1e6f00440d0>

In [118]:
#pred_train_mri_csf= model_mri_csf.predict(X_mri_csf_train)
#scores_mri_csf = model_mri_csf.evaluate(X_mri_csf_train, y_mri_csf_train, verbose=0)
#print('mri csf Accuracy on training data: {}% \n Error on training data: {}'.format(scores_mri_csf[1], 1 - scores_mri_csf[1]))   
 
pred_test_mri_csf= model_mri_csf.predict(X_mri_csf_test)
scores2_mri_csf = model_mri_csf.evaluate(X_mri_csf_test, y_mri_csf_test, verbose=0)
print('MRI CSF Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.format(round(scores2_mri_csf[1],4),
                                                                        round(1 - scores2_mri_csf[1],4), round(scores2_mri_csf[2],4)))

MRI CSF Accuracy on test data: 0.7857% 
 Error on test data: 0.2143 
 F1-macro on test: 0.5359


In [119]:
#change all data to numpy arrays 
X_mri_csf = pd.DataFrame.to_numpy(X_mri_csf_ID.iloc[: , 1:]) 
X_mri_csf = X_mri_csf.astype(float)
                    
               
#after training the network with our training test and testing set will put the whole data set into the model to give us 
#our softmax output for the next model 
mri_csf_all_output = model_mri_csf.predict(X_mri_csf)
mri_csf_all_output_ID = np.column_stack((X_mri_csf_ID.iloc[: , 0], mri_csf_all_output))



In [120]:
print(mri_csf_all_output_ID.shape) ; print(y_mri_csf_ID.shape)

(264, 4)
(264, 4)


## Stage 3 - UDS/ MRI/ CSF 

### Input data

In [121]:
#want to inner join all the output vectors based on the IDs 
#note that the output files are arrays and the y data is dataw frame
#we want the input to the model to be an array (I think) so need to merge the inputs and outputs 
#then change them all to arrays 
pred_uds_mri_IDs_df = pd.DataFrame(uds_mri_all_output_ID, columns = ["NACCID", "UM_1", "UM_C2", "UM_C3"])
pred_mri_csf_IDs_df = pd.DataFrame(mri_csf_all_output_ID, columns = ["NACCID", "MC_C1", "MC_C2", "MC_C3"])
pred_uds_csf_IDs_df = pd.DataFrame(uds_csf_all_output_ID, columns = ["NACCID", "UC_C1", "UC_C2", "UC_C3"])

#data frames with the softmax classifiers from the first stage of models 
X_UMC_ID = pd.merge(pd.merge(pred_mri_csf_IDs_df,pred_uds_csf_IDs_df,on="NACCID", how="inner"),pred_uds_mri_IDs_df, on="NACCID", how="inner")
#X_UMC_ID

In [130]:
X_UMC_ID.shape

(264, 10)

In [122]:
y_UMC_ID = pd.merge(pd.merge(y_mri_csf_ID,y_uds_csf_ID,on="NACCID", how="inner"),y_uds_mri_ID, on="NACCID", how="inner")
y_UMC_ID = y_UMC_ID.iloc[: , :4]
#y_UMC_ID

### UDS/ MRI/ CSF Model

In [123]:
#split data into test and train for this model
X_UMC_train, X_UMC_test, y_UMC_train, y_UMC_test = train_test_split(X_UMC_ID,
                    y_UMC_ID, test_size=0.05,  random_state=20)
print(X_mri_csf_train.shape); print(y_mri_csf_train.shape)

(250, 6)
(250, 3)


In [124]:
#convert inputs to float arrays 
X_UMC_train = pd.DataFrame.to_numpy(X_UMC_train.iloc[: , 1:]) 
X_UMC_train = X_UMC_train.astype(float)
y_UMC_train = pd.DataFrame.to_numpy(y_UMC_train.iloc[: , 1:]) 
y_UMC_train = y_UMC_train.astype(float)

#convert inputs to float arrays 
X_UMC_test = pd.DataFrame.to_numpy(X_UMC_test.iloc[: , 1:]) 
X_UMC_test = X_UMC_test.astype(float)
y_UMC_test = pd.DataFrame.to_numpy(y_UMC_test.iloc[: , 1:]) 
y_UMC_test = y_UMC_test.astype(float)

In [131]:
model_UMC = Sequential()
model_UMC.add(Dense(75, activation='relu', input_dim=X_UMC_train.shape[1]))
model_UMC.add(Dense(40, activation='relu'))
model_UMC.add(Dense(10, activation='relu'))
model_UMC.add(Dense(3, activation='softmax'))

# Compile the model
model_UMC.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy',f1_ma])

In [132]:
model_UMC.fit(X_UMC_train, y_UMC_train, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e6f1258c40>

In [133]:
#pred_train_UMC= model_UMC.predict(X_UMC_train)
#scores_UMC = model_UMC.evaluate(X_UMC_train, y_UMC_train, verbose=0)
#print('mri csf Accuracy on training data: {}% \n Error on training data: {}'.format(scores_UMC[1], 1 - scores_UMC[1]))   
 
pred_test_UMC= model_UMC.predict(X_UMC_test)
scores2_UMC = model_UMC.evaluate(X_UMC_test, y_UMC_test, verbose=0)
print('UDS/MRI/CSF Accuracy on test data: {}% \n Error on test data: {} \n F1-macro on test: {}'.format(round(scores2_UMC[1],4),
                                                                        round(1 - scores2_UMC[1],4), round(scores2_UMC[2],4)))

UDS/MRI/CSF Accuracy on test data: 1.0% 
 Error on test data: 0.0 
 F1-macro on test: 1.0
