# Read the input dataset :

1. usable_samples_ADNI.json : stores the IID (index) for each row of PRS_feature_matrix.npy
2. PRS_feature_matrix.npy : PR Score for different features
3. Covar_FILE_bigger_dataset : for reading covar such as age, gender
4. Final_Samples.json : contains ID and output for each data

In [96]:
if True:
    def warn(*args, **kwargs):
        pass
    import warnings
    warnings.warn = warn

import pandas as pd
import numpy as np
import json
import random
import math

import os
import time
import sys

import torch as t
from torch import nn
from torch.autograd import Variable
import torch
from torch.utils import data

from sklearn.metrics import precision_recall_fscore_support, roc_auc_score, precision_recall_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold 

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DEVICE = "cpu"

import shap
from copy import deepcopy

import pickle
from tqdm import tqdm

import matplotlib
import matplotlib.pyplot as plt

import seaborn as sns

import tpot

from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.over_sampling import ADASYN

In [97]:
random_seed = 42 # or any of your favorite number 
torch.manual_seed(random_seed)
torch.cuda.manual_seed(random_seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

np.random.seed(random_seed)
random.seed(random_seed)

In [98]:
yrs = '4yrs'
model_suffix="_cn_progression"
dataset_suffix="from_cn"

Covar for ADNI Plink

# Alter parameters :
    1. Number of features
    2. Number of Hidden Layers 
    3. Dimension of Hidden Layers

In [99]:
# num_features = 52
hidden = 4
hidden_dimension = 32

# Indices of features to consider

# Functions & Classes

In [100]:
# save_in_file: saves model accuracy in a text file
#     args : model_name : name of model with layers and dimensions
#            accuracy : accuracy  score
def save_in_file(model_name, accuracy):
    model_file = open("model_details.txt","a")
    model_file.write(model_name + " -> accuracy : " + str(accuracy) + "\n" )
    model_file.close() 

**Modifications**
1. Added relu in the hidden layers and sigmoid in the output layer as activation functions
2. Added dropout in the hidden layers

In [101]:
class simple_model(nn.Module):
    def __init__(self, num_features, hidden_dim= hidden_dimension, drop_probab=.5):
        super(simple_model, self).__init__()
        
        ####
        num_hidden = hidden
        hidden_dim = hidden_dimension
        self.fc1 = nn.Linear(num_features, hidden_dim)
        self.fc_hidden = nn.ModuleList([nn.Linear(hidden_dim, hidden_dim) for i in range(num_hidden)])
        self.fc2 = nn.Linear(hidden_dim, 8)
        self.outLayer = nn.Linear(8, 1)
#         self.softmax = nn.Softmax(-1)
        self.sigmoid = nn.Sigmoid()
        self.relu = nn.ReLU()
        self.num_hidden = num_hidden
        self.drop_probab = drop_probab
        self.dropout = nn.functional.dropout
        ####

    def forward(self, features):
        features = self.fc1(features)
        features = self.dropout(features, p=self.drop_probab)
        for i in range(self.num_hidden):
            features = self.fc_hidden[i](features)
            # added by Mashiat
            features = self.dropout(features, p=self.drop_probab)
            features = self.relu( features )
            ####################
        features = self.fc2(features)
        features = self.dropout(features, p=self.drop_probab)
        logit = self.outLayer(features)
#         print(features.shape, features)
        probab = self.sigmoid(logit)
        return probab
    

# Converting Pandas Dataframe to Dataset class

overriding the constructor, getitem, len function of the original class

In [102]:
class df_dataSet(data.Dataset):
    def __init__(self, X, y):
        
        self.features = torch.tensor(X.values, dtype=torch.float32)
        self.labels = torch.tensor(y.values, dtype=torch.float32)
        
    def __getitem__(self, index):
        return self.features[index], self.labels[index]
    
        
        
    def __len__(self):
        return len(self.labels)

In [103]:
# epoch function : runs an epoch of a model
#                 args :
#                         model : neural network model
#                         optimizer :
#                         criterion :
#                         is_training : train - true or test - false
#                         loader : torch dataset
#                 returns :
#                         different accuracy score for the dataset of per epoch
def epoch(model, optimizer, criterion, is_training, loader):
    pred = []
    true = []
    total_loss = 0.
#     print(loader)
    for batch_idx, (features, label) in enumerate(loader):
        features = torch.autograd.Variable(features.to(DEVICE).float())
        label = torch.autograd.Variable(label.to(DEVICE).float())
        label = torch.reshape(label, (label.shape[0], 1))
        probab = model(features)
        if is_training:  
#             print(probab.shape, label.shape)
            loss = criterion(probab, label)
            ## compute gradient and do SGD step 
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
#             print(batch_idx, ':', loss) 
        pred += probab.detach().cpu().numpy().tolist()
        true += label.detach().cpu().numpy().tolist()
    
    pred, true, total_loss = np.array(pred).reshape([-1]), np.array(true).reshape([-1]), total_loss
    pred_binary = (pred > .5).astype(float)
    precision, recall, fscore, support = precision_recall_fscore_support(true, pred_binary)
    auroc = roc_auc_score(true, pred)
    p, r, thresholds = precision_recall_curve(true, pred)
    auprc = auc(r, p)
    acc = (pred_binary==true).mean()
    
    return precision[1], recall[1], fscore[1], support, auroc, auprc, acc, total_loss, pred, pred_binary, true
#     return None, None, None, None, None, None, acc, total_loss, pred, pred_binary, true
    


**epoch function for LOOCV**

Without precision, recall, ROC, AUC 

# Read dataset and column priorities

In [104]:
shuffled = pd.read_csv('adni_shuffled_balanced_' + yrs + '_'+dataset_suffix+'.csv')

# Read from the output file and store keys in a list
input_file = "column_importance_ADNI"+dataset_suffix+".txt"
col_imp = []

with open(input_file, "r") as f:
    for line in f:
        key = line.strip()  # Remove newline character
        col_imp.append(key)

# print("Keys read from the file:", col_imp)

# Selected Column Number

In [105]:
f = open("ADNI_feature_count.txt", "r")
count = f.read()
print( int(count) )
cols_to_take = int(count)
selected_col = col_imp[:cols_to_take]
print(selected_col)

5
['Non-cancer illness code, self-reported: type 2 diabetes', 'Cigarettes per Day', 'Age completed full time education', 'Non-cancer illness code, self-reported: depression', "Illnesses of mother: Alzheimer's disease/dementia"]


# Add AD

In [106]:
if 'Alzheimer\'s Disease' not in selected_col:
    selected_col.append('Alzheimer\'s disease' )

selected_col.append( 'output' )
print(selected_col)

['Non-cancer illness code, self-reported: type 2 diabetes', 'Cigarettes per Day', 'Age completed full time education', 'Non-cancer illness code, self-reported: depression', "Illnesses of mother: Alzheimer's disease/dementia", "Alzheimer's disease", 'output']


In [107]:
shuffled = shuffled[selected_col]

print( shuffled.shape )
# print( shuffled.head() )

# dropping last / output column in df
shuffled_X = shuffled.iloc[: , :-1]
shuffled_Y =  shuffled.iloc[: , -1]

(128, 7)


In [108]:
def printScores(avg_acc,avg_prec,avg_rec,avg_fsc,avg_roc,avg_prc):
    print("accuracy:",avg_acc)
    print("precision:",avg_prec)
    print("recall:",avg_rec)
    print("fscore:",avg_fsc)
    print("auroc:",avg_roc)
    print("auprc:",avg_prc)

In [109]:
import csv
import os

# CSV file path
def write_out_to_csv(datatype, model, years, acc, prec, rec, auprc, auroc, fscore):
    csv_file_path = "scores.csv"

    # Check if the CSV file already exists
    if not os.path.exists(csv_file_path):
        # Create a new CSV file and write header
        with open(csv_file_path, "w", newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(["Datatype", "Model", "Year", "Size", "Features","Average Accuracy", "Average Precision", "Average Recall", "Average F-Score", "Average ROC AUC", "Average PR AUC"])

    # Append data to the CSV file
    with open(csv_file_path, "a", newline="") as csvfile:
        csvwriter = csv.writer(csvfile)
        csvwriter.writerow([datatype, model, years, shuffled.shape[0], shuffled_X.shape[1], acc, prec, rec, fscore, auroc, auprc])

    print("Average scores have been appended to the CSV file.")

# Model training

https://www.analyticsvidhya.com/blog/2021/09/a-comprehensive-guide-on-neural-networks-performance-optimization/

In [110]:


GENERATE_SHAP = True
total_epochs = 500 #250(ideal)
num_features = shuffled_X.shape[1]
# random_integers = [2, 6, 108, 90, 5]
random_seed = random_seed#, 92, 0, 87, 73, 82, 54]

total_folds = 10#[37*2]

avg_val_acc = []

shap_values_list = []
# for num_features in num_features_list:
print(f'NF:{num_features}')
global_best_acc_val = 0.
precision_avg = 0
recall_avg = 0
auprc_avg = 0
auroc_avg = 0
fscore_avg = 0
print(f'\n#F{total_folds}')
accuracies = []
accuracies_val = []
temp_shap_values = np.zeros(shuffled_X.shape)

kf = KFold(n_splits = total_folds, random_state=None)
acc_score = []

for train_index , test_index in kf.split(shuffled):
    print("Split :")
    X_train , X_test = shuffled_X.iloc[train_index,:], shuffled_X.iloc[test_index,:]
    y_train , y_test = shuffled_Y[train_index] , shuffled_Y[test_index]

    train_dataset = df_dataSet( X_train, y_train )
    valid_dataset = df_dataSet( X_test, y_test )

    train_batch_size = train_dataset.__len__()
    val_batch_size = valid_dataset.__len__()


    train_loader = torch.utils.data.DataLoader(train_dataset, batch_size = train_batch_size, shuffle = False, num_workers = 0)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size = val_batch_size, shuffle = False, num_workers = 0)

    model = simple_model(num_features = shuffled_X.shape[1], hidden_dim = hidden_dimension)
    model = model.to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = torch.nn.BCEWithLogitsLoss() 
    best_acc_val = 0.
    model_best = None

    for epoch_num in range(total_epochs):
        model.train()
        precision, recall, fscore, support, auroc, auprc, acc_train, total_loss, pred, pred_binary, true = epoch(model=model, optimizer=optimizer, 
                                                                                 criterion=criterion, is_training=True, 
                                                                               loader=train_loader)
        model.eval()
        precision, recall, fscore, support, auroc, auprc, acc_val, total_loss, pred, pred_binary, true = epoch(model=model, 
                                                                                 optimizer=optimizer, 
                                                                                 criterion=criterion, is_training=False, 
                                                                                loader=valid_loader)
        if acc_val > best_acc_val:
            best_acc_val = acc_val
            if acc_val > global_best_acc_val:
                global_best_acc_val = acc_val

            torch.save(model.state_dict(), 'PRS_model'+model_suffix+'.pt')

    model_best = simple_model(num_features= shuffled_X.shape[1], hidden_dim = hidden_dimension, drop_probab=.0)
    model_best.load_state_dict(torch.load('PRS_model'+model_suffix+'.pt'))
    model_best = model_best.to(DEVICE)
    model_best.eval()
    precision, recall, fscore, support, auroc, auprc, acc_test, total_loss, pred, pred_binary, true = epoch(model=model_best, 
                                                                             optimizer=optimizer, 
                                                                             criterion=criterion, is_training=False, 
                                                                             loader=valid_loader)
    accuracies += [acc_test]
    accuracies_val += [best_acc_val]
#                 print("precision : ", precision, " ; recall : ", recall)
    precision_avg += precision
    recall_avg += recall
    auprc_avg += auprc
    auroc_avg += auroc
    fscore_avg += fscore


    print(total_folds, ':')

print(f'random_seed:{random_seed}:', np.mean(accuracies), np.std(accuracies), 
      np.mean(accuracies_val), np.std(accuracies_val), 'train acc:', acc_train)
avg_val_acc += [np.mean(accuracies_val)]
print("accuraacies of validation: ", accuracies_val)
print(f'global_best_acc_val:{global_best_acc_val}')
precision_avg = precision_avg * 1.0 / total_folds
recall_avg = recall_avg * 1.0 / total_folds
auprc_avg = auprc_avg * 1.0 / total_folds
auroc_avg = auroc_avg * 1.0 / total_folds
fscore_avg = fscore_avg * 1.0 / total_folds
print( "precision avg : ", precision_avg )
print( "recall avg : ", recall_avg )
print( "AUPRC avg : ", auprc_avg )
print( "AUROC avg : ", auroc_avg )
print( "FScore avg : ", fscore_avg )
avg_val_acc = np.array(avg_val_acc)
printScores(avg_val_acc,precision_avg,recall_avg,fscore_avg,auroc_avg,auprc_avg)
write_out_to_csv("ADNI", "NN", yrs, avg_val_acc, precision_avg,recall_avg,fscore_avg,auroc_avg,auprc_avg)

 

# # usable_features = usable_features.cpu().detach().numpy().astype(np.float64)

# # print(avg_val_acc.max(), avg_val_acc.min(), avg_val_acc.mean(), avg_val_acc.std())


NF:6

#F10
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
Split :
10 :
random_seed:42: 0.6243589743589744 0.08620305371289354 0.7967948717948719 0.0787209763052622 train acc: 0.7413793103448276
accuraacies of validation:  [0.6923076923076923, 0.9230769230769231, 0.7692307692307693, 0.7692307692307693, 0.8461538461538461, 0.7692307692307693, 0.6923076923076923, 0.9230769230769231, 0.8333333333333334, 0.75]
global_best_acc_val:0.9230769230769231
precision avg :  0.4912820512820513
recall avg :  0.6375
AUPRC avg :  0.6090424697106245
AUROC avg :  0.5355158730158729
FScore avg :  0.5313419913419913
accuracy: [0.79679487]
precision: 0.4912820512820513
recall: 0.6375
fscore: 0.5313419913419913
auroc: 0.5355158730158729
auprc: 0.6090424697106245
Average scores have been appended to the CSV file.


# precision, recall, fscore, auroc, auprc,accuracy score

In [111]:
def GetScores(true,pred_binary,pred):
    print(true.shape,pred_binary.shape,pred.shape)
    precision, recall, fscore, support = precision_recall_fscore_support(true, pred_binary)
    auroc = roc_auc_score(true, pred)
    p, r, thresholds = precision_recall_curve(true, pred)
    auprc = auc(r, p)
    acc = (pred_binary==true).mean()


#     print("precision",precision[1],"recall", recall[1], "fscore",fscore[1], "auroc", auroc,"auprc", auprc,"accuracy" ,acc)
    return acc, precision[1], recall[1], fscore, auroc, auprc

# GradientBoosting

In [112]:
import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
# tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
# features = tpot_data.drop('target', axis=1)
# training_features, testing_features, training_target, testing_target = \
#             train_test_split(shuffled_X, shuffled_Y, random_state=1)
# print("training_features",X_train.shape)
# print("testing_features",X_test.shape)


# Average CV score on the training set was: 0.6719885773624091
exported_pipeline = GradientBoostingClassifier(learning_rate=0.01, max_depth=8, max_features=1,
                    min_samples_leaf=7, min_samples_split=6, n_estimators=100, random_state = random_seed )
# Fix random state in exported estimator
accuracies=[]
precisions=[]
recalls=[]
fscores=[]
aurocs=[]
auprcs=[]
if hasattr(exported_pipeline, 'random_state'):
    setattr(exported_pipeline, 'random_state', random_seed)
    
for train_index , test_index in kf.split(shuffled):
    X_train , X_test = shuffled_X.iloc[train_index,:], shuffled_X.iloc[test_index,:]
    y_train , y_test = shuffled_Y[train_index] , shuffled_Y[test_index]

    exported_pipeline.fit(X_train, y_train)
    y_pred = exported_pipeline.predict(X_test)

    true=y_test
    pred_binary=y_pred
    pred=exported_pipeline.predict_proba(X_test).T
    pred=pred[1]
    acc,precision, recall,fscore, auroc, auprc=GetScores(true,pred_binary,pred) 
    
    accuracies.append(acc)
    precisions.append(precision)
    recalls.append(recall)
    fscores.append(fscore)
    aurocs.append(auroc)
    auprcs.append(auprc)
avg_acc=np.mean(accuracies)
avg_prec=np.mean(precisions)
avg_rec=np.mean(recalls)
avg_fsc=np.mean(fscores)
avg_roc=np.mean(aurocs)
avg_prc=np.mean(auprcs)
printScores(avg_acc,avg_prec,avg_rec,avg_fsc,avg_roc,avg_prc)

(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(12,) (12,) (12,)
(12,) (12,) (12,)
accuracy: 0.5397435897435898
precision: 0.5085714285714286
recall: 0.47095238095238096
fscore: 0.5172213570742983
auroc: 0.5201587301587302
auprc: 0.5930983714257524


In [113]:
write_out_to_csv("ADNI", "XGBoost", yrs, avg_acc, avg_prec, avg_rec, avg_fsc, avg_roc, avg_prc)

Average scores have been appended to the CSV file.


# SVM

In [114]:
# X_train, X_test, y_train, y_test = train_test_split(shuffled_X, shuffled_Y, test_size=0.25, random_state=0)

# Preprocess the data
# from sklearn.preprocessing import StandardScaler
# sc = StandardScaler()
# X_train = sc.fit_transform(X_train)
# X_test = sc.transform(X_test)

# Create an SVM object
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=random_seed, probability=True)


accuracies=[]
precisions=[]
recalls=[]
fscores=[]
aurocs=[]
auprcs=[]
for train_index , test_index in kf.split(shuffled):
    X_train , X_test = shuffled_X.iloc[train_index,:], shuffled_X.iloc[test_index,:]
    y_train , y_test = shuffled_Y[train_index] , shuffled_Y[test_index]
   
    # Train the model
    classifier.fit(X_train, y_train)

    # Evaluate the model
    y_pred = classifier.predict(X_test)
    from sklearn.metrics import confusion_matrix
    cm = confusion_matrix(y_test, y_pred)
#     print(cm)

    true=y_test
    pred_binary=y_pred
    pred=classifier.predict_proba(X_test).T
    pred=pred[1]
    acc,precision, recall,fscore, auroc, auprc=GetScores(true,pred_binary,pred)
    accuracies.append(acc)
    precisions.append(precision)
    recalls.append(recall)
    fscores.append(fscore)
    aurocs.append(auroc)
    auprcs.append(auprc)
avg_acc=np.mean(accuracies)
avg_prec=np.mean(precisions)
avg_rec=np.mean(recalls)
avg_fsc=np.mean(fscores)
avg_roc=np.mean(aurocs)
avg_prc=np.mean(auprcs)
printScores(avg_acc,avg_prec,avg_rec,avg_fsc,avg_roc,avg_prc)
write_out_to_csv("ADNI", "SVM", yrs, avg_acc, avg_prec, avg_rec, avg_fsc, avg_roc, avg_prc)


(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(12,) (12,) (12,)
(12,) (12,) (12,)
accuracy: 0.5942307692307692
precision: 0.6112698412698412
recall: 0.5792460317460317
fscore: 0.5809796494355318
auroc: 0.6222321428571428
auprc: 0.6087364977394738
Average scores have been appended to the CSV file.


# Random Forest Classifier

In [115]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=random_seed)
accuracies=[]
precisions=[]
recalls=[]
fscores=[]
aurocs=[]
auprcs=[]
for train_index , test_index in kf.split(shuffled):
    X_train , X_test = shuffled_X.iloc[train_index,:], shuffled_X.iloc[test_index,:]
    y_train , y_test = shuffled_Y[train_index] , shuffled_Y[test_index]
    rfc.fit(X_train, y_train)
    y_pred_bin = rfc.predict(X_test)
    y_pred_frac= rfc.predict_proba(X_test).T[1]
    acc,precision, recall,fscore, auroc, auprc=GetScores(y_test,y_pred_bin,y_pred_frac)
    accuracies.append(acc)
    precisions.append(precision)
    recalls.append(recall)
    fscores.append(fscore)
    aurocs.append(auroc)
    auprcs.append(auprc)
avg_acc=np.mean(accuracies)
avg_prec=np.mean(precisions)
avg_rec=np.mean(recalls)
avg_fsc=np.mean(fscores)
avg_roc=np.mean(aurocs)
avg_prc=np.mean(auprcs)
printScores(avg_acc,avg_prec,avg_rec,avg_fsc,avg_roc,avg_prc)
write_out_to_csv("ADNI", "RF", yrs, avg_acc, avg_prec, avg_rec, avg_fsc, avg_roc, avg_prc)

(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(13,) (13,) (13,)
(12,) (12,) (12,)
(12,) (12,) (12,)
accuracy: 0.5391025641025642
precision: 0.4704761904761905
recall: 0.4575
fscore: 0.50953969233381
auroc: 0.5383333333333333
auprc: 0.6037539528923457
Average scores have been appended to the CSV file.
