# AutoEncoder Fusion

This notebook uses AutoEncoder to impute the missing modalities. Take patients MRI data source as an example. We first model MRI->MRI AE to find the latent space for MRI features for those who have MRI records. However, for those who don't, we utilize the UDS->MRI AE to project UDS onto the shared latent space between UDS and MRI. As a result, we can have the complete dataset for MRI. But following issues also need to be considered:
- The AutoEncoder needs a lot hyperparameter tuning. Even though, the latent representation might not be accurate.
- The decoder dimension is high-dimensional (UDS-89, MRI-155). AE might be unstable.
- The distribution of MRI might be different depending on whether a patient has a MRI record or not.

In [1]:
import os, sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter, defaultdict
from importlib import reload
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
from torch import optim, nn
import torch.utils.data as Data
from torch.nn import functional as F
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

def seed_torch(seed=0):
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)#as reproducibility docs
    torch.manual_seed(seed)# as reproducibility docs
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False# as reproducibility docs
    torch.backends.cudnn.deterministic = True# as reproducibility docs

In [21]:
def load_data(impute_method = 'Mean-Mode'):
    uds = pd.read_csv("../data/data_imputed/{}/uds.csv".format(impute_method))
    uds['datetime'] = pd.to_datetime(uds['datetime'])
    uds = uds.dropna(subset=['EDUC'])
    
    mri = pd.read_csv("../data/data_imputed/{}/mri.csv".format(impute_method))
    mri['datetime'] = pd.to_datetime(mri['datetime'])
    
    csf = pd.read_csv("../data/data_imputed/{}/csf.csv".format(impute_method))
    return uds, mri, csf

uds_dict = pd.read_csv("../data/data_dictionary/uds_feature_dictionary_cleaned.csv")
mri_dict = pd.read_csv("../data/data_dictionary/mri_feature_dictionary_cleaned.csv") 

uds_drop_columns = ['NACCID', 'NACCADC', 'NACCVNUM', 'datetime', 'NACCUDSD', 'NACCALZP', 'NACCAD3', 'NACCAD5']
mri_drop_columns = ['NACCID', 'NACCVNUM', 'datetime', 'datetime_UDS', 'timediff', 'within-a-year']
csf_drop_columns = ['NACCID', 'CSFABMD', 'CSFTTMD', 'CSFPTMD']

uds, mri, csf = load_data()
print(uds.shape, mri.shape, csf.shape)

(44740, 89) (2873, 161) (2180, 7)


## AutoEncoder for Dimension Reduction - UDS and MRI

In [3]:
class AutoEncoder(torch.nn.Module):
    def __init__(self, e_dim, d_dim, seed=48):
        super(AutoEncoder, self).__init__()
        seed_torch(seed)
        e_list = [nn.Linear(e_dim[0], e_dim[1])]
        for i in range(2, len(e_dim)):
            e_list.append(nn.ReLU())
#             e_list.append(nn.Dropout(0.1))
            e_list.append(nn.Linear(e_dim[i-1], e_dim[i]))
        d_list = [nn.Linear(d_dim[0], d_dim[1])]
        for i in range(2, len(d_dim)):
            d_list.append(nn.ReLU())
#             d_list.append(nn.Dropout(0.1))
            d_list.append(nn.Linear(d_dim[i-1], d_dim[i]))
        self.encoder_layers = nn.Sequential(*e_list)
        self.decoder_layers = nn.Sequential(*d_list)
    def encode(self, X):
        return self.encoder_layers(X)
    def decode(self, Z):
        return self.decoder_layers(Z)
    def loss(self, X1, X2):
        Z = self.encoder_layers(X1)
        X_hat = self.decoder_layers(Z)
        return torch.mean((X2-X_hat)**2)

In [75]:
def train_test_split_data(X, test_size = 0.2, seed=48):
    X_train, X_test = train_test_split(X, test_size=test_size, random_state=seed)
    X_val, X_test = train_test_split(X_test, test_size=test_size, random_state=seed)
    X_train = torch.tensor(X_train).float().to(device)
    X_val = torch.tensor(X_val).float().to(device)
    X_test = torch.tensor(X_test).float().to(device)
    return X_train, X_val, X_test

model = None

def trainAE(X1, e_dim, d_dim, X2=None, test_size = 0.2, 
            lr=1e-2, epochs=200, weight_decay=1e-5, T_max=200, small=1e-7, 
            display_intvl=20, seed=48):
    global model

    model = AutoEncoder(e_dim, d_dim, seed=seed).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,  T_max=T_max)
    if X2 is None:
        X2 = X1
    X1tr, X1v, X1te = train_test_split_data(X1, test_size = test_size, seed=seed)
    X2tr, X2v, X2te = train_test_split_data(X2, test_size = test_size, seed=seed)
    train_dataset = Data.TensorDataset(X1tr, X2tr)        
    train_loader = Data.DataLoader(train_dataset, batch_size=len(X1tr) // 3, shuffle=False) 
    train_loss, last_loss = 0, 0
    for epoch in range(epochs):
        for batch_idx, (X1b, X2b, ) in enumerate(train_loader):
            loss = model.loss(X1b, X2b)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss
        if (epoch % display_intvl == 0 or torch.abs(last_loss - train_loss) < small):
            val_loss = model.loss(X1v, X2v)
            print('Epoch {} (lr: {:11f}):  Loss: {:.3f} (Val-Loss: {:.3f})'.format(
                epoch, scheduler.get_last_lr()[0], train_loss.item(), val_loss.item()))
        if torch.abs(last_loss - train_loss) < small:
            break
        last_loss = train_loss
        train_loss = 0
        scheduler.step()
    print("Testing MSE:  ", model.loss(X1te, X2te))
    return model

In [76]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
scaler = StandardScaler()
pca = None

def pca_transform(df, var_dict, n_component, pca_thres = 0.8):
    global pca
    pca = PCA(n_components=n_component)
    to_ret = pd.DataFrame()
    for cat in var_dict['Category'].unique():
        if cat != 'DEMO':
            var_names = var_dict[var_dict['Category'] == cat]['VariableName'].values
            var_names = set(var_names).intersection(set(df.columns))
            if len(var_names) > n_component:
                pca_transformed = pca.fit_transform(scaler.fit_transform(df.loc[:,var_names]))
                num_selected_1 = np.sum(pca.explained_variance_ratio_ > pca_thres)
                num_selected_2 = np.argmax(-np.diff(pca.explained_variance_ratio_, n=1) > 0.1) + 1
                num_selected = max(num_selected_1, num_selected_2)
                temp = pd.DataFrame(pca_transformed[:,:num_selected])
                temp.columns = ["{}_{}".format(cat, i+1) for i in range(num_selected)]
                print(cat, pca.explained_variance_ratio_, num_selected)
                to_ret = pd.concat([to_ret, temp], axis=1)
    return to_ret

## UDS -> UDS

In [77]:
uds_pca = pca_transform(uds.drop(uds_drop_columns, axis=1), uds_dict, 5, pca_thres=0.2)
uds_pca = pd.concat([uds[['SEX', 'NACCAGE', 'EDUC', 'NACCAPOE']].reset_index(drop=True), uds_pca], axis=1)
print(uds_pca.shape)

CDF [0.76768548 0.10899955 0.03684562 0.03143016 0.01722622] 1
GDS [0.33769396 0.07255963 0.06801303 0.05805296 0.05455118] 1
FAS [0.75274188 0.05874551 0.04165319 0.03237024 0.02882413] 1
NPI [0.30584755 0.07569686 0.07295629 0.06332827 0.06178203] 1
NEURO [0.32660678 0.12672186 0.08952474 0.07479331 0.05000349] 1
(44740, 9)


In [80]:
uds_sub = uds.drop(uds_drop_columns, axis=1)
uds_scaler = StandardScaler()
uds_model = trainAE(uds_scaler.fit_transform(uds_sub), [81, 32, 16, 9], [9,32, 81], 
                    lr=1e-2, epochs=300, weight_decay=1e-5, T_max=60, small=1e-7)

Epoch 0 (lr:    0.010000):  Loss: 3.552 (Val-Loss: 0.928)
Epoch 20 (lr:    0.007500):  Loss: 1.667 (Val-Loss: 0.543)
Epoch 40 (lr:    0.002500):  Loss: 1.518 (Val-Loss: 0.499)
Epoch 60 (lr:    0.000000):  Loss: 1.502 (Val-Loss: 0.494)
Epoch 80 (lr:    0.002500):  Loss: 1.484 (Val-Loss: 0.488)
Epoch 100 (lr:    0.007500):  Loss: 1.440 (Val-Loss: 0.469)
Epoch 120 (lr:    0.010000):  Loss: 1.465 (Val-Loss: 0.471)
Epoch 140 (lr:    0.007500):  Loss: 1.332 (Val-Loss: 0.442)
Epoch 160 (lr:    0.002500):  Loss: 1.288 (Val-Loss: 0.423)
Epoch 180 (lr:    0.000000):  Loss: 1.277 (Val-Loss: 0.422)
Epoch 200 (lr:    0.002500):  Loss: 1.273 (Val-Loss: 0.420)
Epoch 220 (lr:    0.007500):  Loss: 1.262 (Val-Loss: 0.417)
Epoch 240 (lr:    0.010000):  Loss: 1.352 (Val-Loss: 0.453)
Epoch 260 (lr:    0.007500):  Loss: 1.241 (Val-Loss: 0.408)
Epoch 280 (lr:    0.002500):  Loss: 1.217 (Val-Loss: 0.402)
Testing MSE:   tensor(0.4021, device='cuda:0', grad_fn=<MeanBackward0>)


## MRI -> MRI

In [81]:
mri_scaler = StandardScaler()
mri_sub = mri.drop(mri_drop_columns, axis=1)
mri_model = trainAE(mri_scaler.fit_transform(mri_sub), [155,64,16,5], [5,64,155], 
                    lr=1e-2, epochs=5000, weight_decay=1e-5, T_max=2500, small=1e-7, display_intvl=50)

Epoch 0 (lr:    0.010000):  Loss: 3.003 (Val-Loss: 0.879)
Epoch 50 (lr:    0.009990):  Loss: 1.137 (Val-Loss: 0.371)
Epoch 100 (lr:    0.009961):  Loss: 1.095 (Val-Loss: 0.373)
Epoch 150 (lr:    0.009911):  Loss: 1.077 (Val-Loss: 0.380)
Epoch 200 (lr:    0.009843):  Loss: 1.062 (Val-Loss: 0.384)
Epoch 250 (lr:    0.009755):  Loss: 1.054 (Val-Loss: 0.390)
Epoch 300 (lr:    0.009649):  Loss: 1.044 (Val-Loss: 0.392)
Epoch 350 (lr:    0.009524):  Loss: 1.045 (Val-Loss: 0.395)
Epoch 400 (lr:    0.009382):  Loss: 1.039 (Val-Loss: 0.398)
Epoch 450 (lr:    0.009222):  Loss: 1.031 (Val-Loss: 0.402)
Epoch 500 (lr:    0.009045):  Loss: 1.028 (Val-Loss: 0.405)
Epoch 550 (lr:    0.008853):  Loss: 1.030 (Val-Loss: 0.407)
Epoch 600 (lr:    0.008645):  Loss: 1.026 (Val-Loss: 0.407)
Epoch 650 (lr:    0.008423):  Loss: 1.022 (Val-Loss: 0.410)
Epoch 700 (lr:    0.008187):  Loss: 1.023 (Val-Loss: 0.412)
Epoch 750 (lr:    0.007939):  Loss: 1.022 (Val-Loss: 0.413)
Epoch 800 (lr:    0.007679):  Loss: 1.017 (

In [82]:
mri_pca = pca_transform(mri.drop(mri_drop_columns, axis=1), mri_dict, 5, pca_thres=0.1)
print(mri_pca.shape)

Regional gray matter volumes [0.56327925 0.17272831 0.06163145 0.04907454 0.0346227 ] 2
Regional cortical thicknesses [0.44952865 0.05045401 0.03774226 0.03147718 0.02400231] 1
PET scan data [0.37569421 0.12838299 0.07782488 0.0606391  0.02956345] 2
(2873, 5)


# UDS -> MRI

In [83]:
uds_mri_scaler, mri_mri_scaler = StandardScaler(), StandardScaler()
uds_mri_id = set(uds['NACCID']).intersection(mri['NACCID'])
uds_mri_sub = uds[uds['NACCID'].isin(uds_mri_id)].sort_values('NACCID').drop(uds_drop_columns, axis=1)
mri_mri_sub = mri[mri['NACCID'].isin(uds_mri_id)].sort_values('NACCID').drop(mri_drop_columns, axis=1)
assert(uds_mri_sub.shape[0] == mri_mri_sub.shape[0] )

In [84]:
uds_mri_model = trainAE(uds_mri_scaler.fit_transform(uds_mri_sub), [81, 32, 16, 5], [5, 32, 64, 155], 
                        X2=mri_mri_scaler.fit_transform(mri_mri_sub), 
                        lr=1e-2, epochs=5000, weight_decay=1e-5, T_max=3000, small=1e-9, display_intvl=50)

Epoch 0 (lr:    0.010000):  Loss: 3.729 (Val-Loss: 0.973)
Epoch 50 (lr:    0.009993):  Loss: 2.055 (Val-Loss: 0.894)
Epoch 100 (lr:    0.009973):  Loss: 1.910 (Val-Loss: 1.012)
Epoch 150 (lr:    0.009938):  Loss: 1.820 (Val-Loss: 0.966)
Epoch 200 (lr:    0.009891):  Loss: 1.731 (Val-Loss: 1.054)
Epoch 250 (lr:    0.009830):  Loss: 1.752 (Val-Loss: 1.026)
Epoch 300 (lr:    0.009755):  Loss: 1.676 (Val-Loss: 1.069)
Epoch 350 (lr:    0.009668):  Loss: 1.690 (Val-Loss: 1.097)
Epoch 400 (lr:    0.009568):  Loss: 1.621 (Val-Loss: 1.161)
Epoch 450 (lr:    0.009455):  Loss: 1.703 (Val-Loss: 1.229)
Epoch 500 (lr:    0.009330):  Loss: 1.611 (Val-Loss: 1.199)
Epoch 550 (lr:    0.009193):  Loss: 1.607 (Val-Loss: 1.122)
Epoch 600 (lr:    0.009045):  Loss: 1.597 (Val-Loss: 1.115)
Epoch 650 (lr:    0.008886):  Loss: 1.563 (Val-Loss: 1.174)
Epoch 700 (lr:    0.008716):  Loss: 1.553 (Val-Loss: 1.170)
Epoch 750 (lr:    0.008536):  Loss: 1.665 (Val-Loss: 1.147)
Epoch 800 (lr:    0.008346):  Loss: 1.547 (

# UDS -> CSF

In [87]:
uds_csf_scaler, csf_csf_scaler = StandardScaler(), StandardScaler()
uds_csf_id = set(uds['NACCID']).intersection(csf['NACCID'])
uds_csf_sub = uds[uds['NACCID'].isin(uds_csf_id)].sort_values('NACCID').drop(uds_drop_columns, axis=1)
csf_csf_sub = csf[csf['NACCID'].isin(uds_csf_id)].sort_values('NACCID').drop(['NACCID', 'CSFABMD', 'CSFTTMD', 'CSFPTMD'], axis=1)
assert(uds_csf_sub.shape[0] == csf_csf_sub.shape[0] )

In [90]:
uds_csf_model = trainAE(uds_csf_scaler.fit_transform(uds_csf_sub), [81, 32, 16, 3], [3, 8, 3], 
                        X2=csf_csf_scaler.fit_transform(csf_csf_sub), 
                        lr=1e-2, epochs=5000, weight_decay=1e-5, T_max=3000, small=1e-9, display_intvl=50)

Epoch 0 (lr:    0.010000):  Loss: 3.223 (Val-Loss: 1.158)
Epoch 50 (lr:    0.009993):  Loss: 1.050 (Val-Loss: 1.242)
Epoch 100 (lr:    0.009973):  Loss: 0.749 (Val-Loss: 1.406)
Epoch 150 (lr:    0.009938):  Loss: 0.627 (Val-Loss: 1.559)
Epoch 200 (lr:    0.009891):  Loss: 0.562 (Val-Loss: 1.653)
Epoch 250 (lr:    0.009830):  Loss: 0.501 (Val-Loss: 1.812)
Epoch 300 (lr:    0.009755):  Loss: 0.502 (Val-Loss: 1.875)
Epoch 350 (lr:    0.009668):  Loss: 0.481 (Val-Loss: 1.920)
Epoch 400 (lr:    0.009568):  Loss: 0.451 (Val-Loss: 2.045)
Epoch 450 (lr:    0.009455):  Loss: 0.440 (Val-Loss: 2.079)
Epoch 500 (lr:    0.009330):  Loss: 0.439 (Val-Loss: 2.024)
Epoch 550 (lr:    0.009193):  Loss: 0.449 (Val-Loss: 2.141)
Epoch 600 (lr:    0.009045):  Loss: 0.421 (Val-Loss: 2.044)
Epoch 650 (lr:    0.008886):  Loss: 0.414 (Val-Loss: 2.163)
Epoch 700 (lr:    0.008716):  Loss: 0.394 (Val-Loss: 2.211)
Epoch 750 (lr:    0.008536):  Loss: 0.390 (Val-Loss: 2.163)
Epoch 800 (lr:    0.008346):  Loss: 0.387 (

# Construct Full Dataset

In [131]:
Z_uds = uds_model.encode(torch.tensor(uds_scaler.fit_transform(uds_sub)).float().to(device)).detach().cpu().numpy()
uds_new = pd.DataFrame(Z_uds)
uds_new['NACCID'] = uds['NACCID'].values
print(uds_new.shape)

(44740, 10)


In [132]:
Z_mri = mri_model.encode(torch.tensor(mri_scaler.fit_transform(mri_sub)).float().to(device)).detach().cpu().numpy()
mri_new = pd.DataFrame(Z_mri)
mri_new['NACCID'] = mri['NACCID'].values
mri_new['MRI-imputed'] = 0

uds_missing_mri_sub = uds[~uds['NACCID'].isin(uds_mri_id)]

Z_uds_mri = uds_mri_model.encode(torch.tensor(
    uds_mri_scaler.fit_transform(uds_missing_mri_sub.drop(uds_drop_columns, axis=1))
).float().to(device)).detach().cpu().numpy()
missing_mri = pd.DataFrame(Z_uds_mri)
missing_mri['NACCID'] = uds_missing_mri_sub['NACCID'].values
missing_mri['MRI-imputed'] = 1

mri_new = pd.concat([mri_new, missing_mri])
mri_new = mri_new[mri_new['NACCID'].isin(uds_new['NACCID'])]
print(mri_new.shape)

(44740, 7)


In [168]:
csf_new = csf.drop(['CSFABMD', 'CSFTTMD', 'CSFPTMD'], axis=1)
csf_new[['CSFABETA', 'CSFTTAU', 'CSFPTAU']] = csf_csf_scaler.transform(csf_new[['CSFABETA', 'CSFTTAU', 'CSFPTAU']])

uds_missing_csf_sub = uds[~uds['NACCID'].isin(uds_csf_id)]
Z_uds_csf = uds_csf_model.encode(torch.tensor(
    uds_csf_scaler.fit_transform(uds_missing_csf_sub.drop(uds_drop_columns, axis=1))
).float().to(device)).detach().cpu().numpy()
missing_csf = pd.DataFrame(Z_uds_csf, columns=csf_new.columns[1:])
missing_csf['NACCID'] = uds_missing_csf_sub['NACCID'].values
missing_csf['csf-imputed'] = 1

csf_new['csf-imputed'] = 0
csf_new = pd.concat([csf_new, missing_csf], axis=0)
print(csf_new.shape)

(44748, 5)


# Fit Baseline Model

In [162]:
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics

import warnings
warnings.filterwarnings('ignore')

def print_summary(clf, Xtr, ytr, Xte, yte, confusion_metrix = False):
    if confusion_metrix:
        print(metrics.confusion_matrix(ytr, clf.predict(Xtr)))
        print(metrics.confusion_matrix(yte, clf.predict(Xte)))
    acctr = metrics.accuracy_score(ytr, clf.predict(Xtr))
    auctr = metrics.roc_auc_score(ytr, clf.predict_proba(Xtr), average='macro', multi_class='ovo')
    f1tr_macro = metrics.f1_score(ytr, clf.predict(Xtr), average='macro')
    f1tr_micro = metrics.f1_score(ytr, clf.predict(Xtr), average='micro')
    
    accte = metrics.accuracy_score(yte, clf.predict(Xte))
    aucte = metrics.roc_auc_score(yte, clf.predict_proba(Xte), average='macro', multi_class='ovo')
    f1te_macro = metrics.f1_score(yte, clf.predict(Xte), average='macro')
    f1te_micro = metrics.f1_score(yte, clf.predict(Xte), average='micro')
 
    metrics_df = pd.DataFrame.from_dict({"Train": {"Acc": acctr, "AUC": auctr, "F1-macro": f1tr_macro, "F1-micro": f1tr_micro}, 
                                         "Test": {"Acc": accte, "AUC": aucte, "F1-macro": f1te_macro, "F1-micro": f1te_micro}}, 
                                         orient='Index')
    return metrics_df.round(3)

def train_logistic(df, target, C=0.4, l1_ratio=0.3, df_name=None):
    clf = LogisticRegression(random_state=48, max_iter=1500, solver='saga', penalty='elasticnet', C=C, l1_ratio=l1_ratio)
    class_n = df.groupby(target)['NACCID'].count()
    df = df.loc[df[target].isin(class_n[class_n>=5].index), ]
    if df_name is not None and len(class_n) != len(class_n[class_n>5].index):
        print("{}: {} label(s) have samples size less than 5".format(df_name, len(class_n[class_n<5])))
    X_train, X_test, y_train, y_test = train_test_split(df.drop(['NACCID', target], axis=1), 
                                                        df[target], stratify=df[target], test_size = 0.3, random_state=48)
    clf = clf.fit(X_train, y_train)
    return print_summary(clf, X_train, y_train, X_test, y_test)

## UDS

In [165]:
X = uds_new.merge(uds[['NACCID', 'NACCAD3']], on='NACCID', how='inner')
train_logistic(X, 'NACCAD3')

Unnamed: 0,Acc,AUC,F1-macro,F1-micro
Train,0.875,0.94,0.776,0.875
Test,0.876,0.939,0.778,0.876


## UDS + MRI

In [166]:
X = uds_new.merge(uds[['NACCID', 'NACCAD3']], on='NACCID', how='inner')
X = X.merge(mri_new, on='NACCID', how='inner')
train_logistic(X, 'NACCAD3')

Unnamed: 0,Acc,AUC,F1-macro,F1-micro
Train,0.877,0.941,0.779,0.877
Test,0.878,0.94,0.782,0.878


## UDS + MRI + CSF

In [167]:
X = uds_new.merge(uds[['NACCID', 'NACCAD3']], on='NACCID', how='inner')
X = X.merge(mri_new, on='NACCID', how='inner')
X = X.merge(csf_new, on='NACCID', how='inner')
train_logistic(X, 'NACCAD3')

Unnamed: 0,Acc,AUC,F1-macro,F1-micro
Train,0.881,0.945,0.789,0.881
Test,0.881,0.945,0.789,0.881
