# Fully connected Neural Network and Ensemble model

In [None]:
# Mount Google Drive at /content/drive so the dataset and output folders
# used by the cells below are reachable (Google Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import pandas as pd
import numpy as np
import os
import pickle


import torch
from torch.utils.data import DataLoader,Dataset
import torch.optim as optim


import torch.nn as nn

from sklearn.model_selection import StratifiedShuffleSplit, train_test_split

from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, f1_score
from scipy.stats import mode

In [None]:
# Load the filtered CC400 functional-connectivity dataset from Google Drive
df = pd.read_csv('/content/drive/MyDrive/Project/vectorised/fc_cc400_filtered.csv')

# If running on a local host, read the file from the relative path instead:
# df = pd.read_csv('vectorised/fc_cc400_filtered.csv')

In [None]:
# Preview the first rows of the loaded dataset
df.head()

Unnamed: 0,#5-#182,#5-#230,#5-#390,#8-#224,#9-#55,#10-#18,#10-#133,#10-#239,#10-#390,#11-#82,...,#310-#390,#315-#346,#317-#347,#318-#362,#323-#382,#323-#391,#335-#360,#343-#382,#356-#366,DX_GROUP
0,-0.465351,-0.038908,-0.303455,0.061997,0.285353,-0.302591,-0.575635,0.093608,-0.311941,0.502320,...,0.105564,0.281348,-0.078890,0.431270,-0.367768,0.088068,0.272958,-0.402212,0.349068,1
1,-0.009303,0.247179,0.019453,-0.089782,0.118145,0.045821,-0.036047,0.138405,0.046357,0.364785,...,0.006089,0.052901,-0.005255,0.166180,0.029733,-0.006374,0.513472,0.115737,0.519810,1
2,-0.131458,0.072728,0.090999,0.324135,0.235671,-0.226198,-0.300777,0.442815,-0.039074,0.477168,...,-0.112499,-0.089670,-0.184399,0.130349,0.320956,0.134839,0.131533,-0.027364,0.234810,1
3,0.155183,0.077839,0.129694,-0.299517,0.191523,-0.196390,-0.279142,0.075303,0.181871,0.709533,...,0.117378,-0.332721,-0.203306,0.292701,0.187288,-0.461183,0.044323,-0.016915,0.198380,1
4,-0.005694,-0.116649,0.046553,-0.101360,-0.211751,0.016708,-0.093205,0.279462,0.240016,0.256403,...,0.069215,-0.025669,-0.043941,0.474094,0.048160,-0.178296,0.238861,-0.083691,-0.080434,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
944,0.242785,0.333870,0.324636,-0.013359,0.227179,0.246499,0.280949,-0.473235,0.376639,0.405462,...,0.177930,-0.399635,-0.147217,0.157535,-0.168862,0.141573,0.085821,0.174607,0.022434,1
945,-0.190760,-0.086610,-0.045279,-0.212172,0.302947,0.140238,0.121516,-0.039889,0.031693,0.288423,...,-0.062911,0.255855,0.144863,0.246999,0.206618,-0.105380,-0.088745,0.283497,0.294862,1
946,0.138792,0.152113,0.253272,-0.215632,0.052178,-0.371396,-0.358475,-0.308918,0.345141,0.394936,...,-0.033570,-0.459476,0.112224,0.646445,-0.334266,-0.294695,0.314053,0.049902,-0.000490,1
947,-0.027725,0.081142,0.384957,-0.146337,-0.330887,0.068879,0.120493,0.010479,0.087033,0.419849,...,0.002085,0.024210,-0.080221,0.254440,-0.138745,0.110950,-0.319803,0.143677,-0.130846,1


### Neural Network

In [None]:
# Feature names: every column except the last one.
# NOTE(review): the preview of df shows an "Unnamed: 0" column, which looks
# like a saved row index read back as data - confirm whether it is meant to
# be used as a model feature.
features = df.iloc[:,:-1].columns.to_list()
# Name of the target (label) column
target = "DX_GROUP"

In [None]:

class VectorisedData(Dataset):
    """
    Custom Dataset class for tabular data held in a pandas DataFrame.

    Every column except "DX_GROUP" is treated as a predictor and "DX_GROUP"
    as the target. Rows are addressed by position, so the DataFrame does not
    need a default 0..n-1 index.
    """

    def __init__(self, csv_file):
        """
        csv_file: tabular dataset as a pandas DataFrame (the name is kept for
                  backward compatibility; a file path is not accepted)
        """
        self.df = csv_file

        # Name of the target column
        self.target = "DX_GROUP"

        # Save target and predictors
        self.X = self.df.drop(self.target, axis=1)
        self.y = self.df[self.target]

        # Predictor column names, kept consistent with the columns of self.X
        self.features = self.X.columns.to_list()

    def __len__(self):
        """Number of rows (samples) in the dataset."""
        return len(self.df)

    def __getitem__(self, idx):
        """
        Return [features, target] for the row(s) at position idx, both as
        float32 numpy values so they are compatible with torch.
        """
        # Convert idx from tensor to list/int due to pandas bug
        # (that arises when using pytorch's random_split)
        if isinstance(idx, torch.Tensor):
            idx = idx.tolist()

        # Use positional indexing for BOTH X and y. Label-based `self.y[idx]`
        # only works when the index happens to be 0..n-1 and otherwise raises
        # KeyError or silently pairs features with the wrong label.
        sample_x = self.X.iloc[idx].values.astype(np.float32)
        sample_y = self.y.iloc[idx].astype(np.float32)
        return [sample_x, sample_y]

In [None]:
# fully connected neural network for cc400 vectorised data
class CC400_NN1(nn.Module):
    """
    Three-layer fully connected binary classifier.

    Architecture: Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout ->
    Linear -> Sigmoid. The output is one value in [0, 1] per sample, which
    makes it suitable for use with nn.BCELoss.

    in_features: number of input features (default 260)
    hidden1: width of the first hidden layer (default 130)
    hidden2: width of the second hidden layer (default 60)
    dropout: dropout probability applied after each hidden layer (default 0.2)
    """

    def __init__(self, in_features=260, hidden1=130, hidden2=60, dropout=0.2):
        super().__init__()

        # Fully connected stack; the attribute name and layer order are kept
        # so that state_dict keys stay "seq_dense.<n>.*"
        self.seq_dense = nn.Sequential(
            nn.Linear(in_features, hidden1),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden1, hidden2),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            nn.Linear(hidden2, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        """Return the sigmoid output of the dense stack for input x."""
        return self.seq_dense(x)

In [None]:
def train_one_epoch_binary(model, loss_fn, optimiser, train_loader, device):
    """
    Train a binary classifier for one epoch.

    model: model class (pytorch module) whose output has shape (batch, 1)
    loss_fn: loss function (pytorch module)
    optimiser: optimiser (pytorch optimizer)
    train_loader: train set dataloader (iterable of (x, y) mini-batches)
    device: torch device the mini-batches are moved to

    Returns a tuple (running_loss, mean_accuracy): running_loss is the SUM of
    the mini-batch losses over the epoch and mean_accuracy is the mean of the
    per-mini-batch accuracies.
    """
    running_loss = 0
    epoch_accuracy = []
    for x_train, y in train_loader:
        optimiser.zero_grad()  # clear gradients left over from the previous mini-batch
        # move the labels to the device and add a trailing dimension so they
        # match the (batch, 1) shape of the model output
        y = y.to(device).unsqueeze(1)
        x_train = x_train.to(device)

        prediction = model(x_train)
        loss = loss_fn(prediction, y)

        # mini-batch accuracy: round the predicted probability to 0/1,
        # compare with the label and average the matches
        accuracy = (torch.round(prediction) == y).float().mean()

        loss.backward()  # backward propagation
        optimiser.step()

        # record each mini-batch exactly once (the loss used to be added
        # twice per batch, doubling the reported epoch loss)
        running_loss += loss.item()
        epoch_accuracy.append(accuracy.item())

    return running_loss, np.mean(epoch_accuracy)


def validate_one_epoch_binary(model, loss_fn, test_loader, device):
    """
    Evaluate a binary classifier for one pass over a validation/test set.

    model: model class (pytorch module) whose output has shape (batch, 1)
    loss_fn: loss function (pytorch module)
    test_loader: test set dataloader (iterable of (x, y) mini-batches)
    device: torch device the mini-batches are moved to

    Returns a tuple (test_loss_run, mean_accuracy): test_loss_run is the SUM
    of the mini-batch losses and mean_accuracy is the mean of the
    per-mini-batch accuracies. The caller is responsible for putting the
    model in eval mode (model.eval()) beforehand.
    """
    test_loss_run = 0
    test_acc_epoch = []
    # no gradients are needed for evaluation; skipping the autograd graph
    # saves memory and time
    with torch.no_grad():
        for x_test, y_test in test_loader:
            # move the labels to the device and add a trailing dimension so
            # they match the (batch, 1) shape of the model output
            y_test = y_test.to(device).unsqueeze(1)
            x_test = x_test.to(device)

            test_pred = model(x_test)
            test_loss = loss_fn(test_pred, y_test)

            # mini-batch accuracy: round the predicted probability to 0/1,
            # compare with the label and average the matches
            test_acc = (torch.round(test_pred) == y_test).float().mean()

            test_loss_run += test_loss.item()
            test_acc_epoch.append(test_acc.item())

    return test_loss_run, np.mean(test_acc_epoch)

In [None]:
# Use the first CUDA GPU if one is available, otherwise fall back to the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [None]:
def get_spec_sense(true_y, preds):
    """
    Compute specificity and sensitivity for a binary classification result.

    true_y: true class labels
    preds: predicted class labels

    Returns a tuple (specificity, sensitivity). The confusion matrix is
    unpacked into exactly four cells, so this assumes a two-class problem.
    """
    # sklearn lays the binary confusion matrix out as [[tn, fp], [fn, tp]];
    # flattening it gives the four counts in that order
    matrix = confusion_matrix(true_y, preds)
    true_neg, false_pos, false_neg, true_pos = matrix.ravel()

    # specificity = true-negative rate, sensitivity = true-positive rate
    return true_neg / (true_neg + false_pos), true_pos / (true_pos + false_neg)


The FCN model is trained alongside SVM and logistic regression (LR) models. The three models are then combined into a majority-vote ensemble, which is applied to the validation set to obtain predictions. These predictions are used to compute the validation accuracy, specificity, sensitivity and F1 score.

In [None]:
# Initialise a stratified shuffle split with 10 random train/validation splits
# (independent random splits, not k-fold partitions)
cv = StratifiedShuffleSplit(n_splits=10, random_state=0)
# ensemble predictions and true labels for every split
cv_preds = []
cv_true = []

# metrics for neural network from all cross-validation splits
cv_accuracy_nn = []
cv_sensitivity_nn = []
cv_specificity_nn = []
cv_f1_nn = []
# metrics for logistic regression model
cv_accuracy_lr = []
cv_sensitivity_lr = []
cv_specificity_lr = []
cv_f1_lr = []
# metrics for support vector machine
cv_accuracy_svm = []
cv_sensitivity_svm = []
cv_specificity_svm = []
cv_f1_svm = []
# metrics for ensemble model
cv_accuracy_ensemble = []
cv_sensitivity_ensemble = []
cv_specificity_ensemble = []
cv_f1_ensemble = []

# for each train/validation split
for train, test in cv.split(df[features], df[target]):
    # initialise a fresh neural network for this split and move it to the
    # device before the optimiser is built from its parameters
    model = CC400_NN1().to(device)
    # initialise Binary Cross Entropy loss function
    loss_fn = nn.BCELoss()
    # initialise AdamW optimiser with learning rate of 0.0001 and weight decay 1e-1
    optimiser = optim.AdamW(model.parameters(), lr=0.0001, weight_decay=1e-1)

    # training rows for this split; reset_index returns a new frame with a
    # 0..n-1 index, so the slice of df is never modified in place
    train_df = df.iloc[train, :].reset_index(drop=True)
    train_data = VectorisedData(train_df)
    # dataloader with a mini-batch size of 3
    train_dataloader = DataLoader(train_data, batch_size=3,
                                  shuffle=True,
                                  num_workers=0)

    train_loss_history = []
    train_acc_history = []
    # NOTE: the test histories are never populated in this loop; they are
    # kept only so the notebook namespace is unchanged
    test_loss_history = []
    test_acc_history = []
    # only train fcn for 10 epochs to prevent overfitting
    for epoch in range(10):
        # set fcn to training mode (enables dropout)
        model.train()
        # train one epoch, get train loss and train accuracy
        train_loss, train_acc = train_one_epoch_binary(model, loss_fn, optimiser, train_dataloader, device)
        # append to history lists
        train_loss_history.append(train_loss)
        train_acc_history.append(train_acc)

    # isolate train/validation X and y for svm and lr
    X_train = df.iloc[train, :-1]
    y_train = df.iloc[train, -1]
    X_val = df.iloc[test, :-1]
    y_val = df.iloc[test, -1]
    # initialise svm and lr classes
    svm = SVC(kernel='rbf')
    lr = LogisticRegression()
    # fit models
    svm.fit(X_train, y_train)
    lr.fit(X_train, y_train)

    # set network to evaluation mode (disables dropout) for validation predictions
    model.eval()
    # turn the validation X into a float tensor on the same device as the
    # model; Tensor.to() returns a new tensor, so the result must be kept
    nn_input = torch.Tensor(X_val.to_numpy()).to(device)
    # get network predictions without building an autograd graph
    with torch.no_grad():
        nn_out = model(nn_input)
    # send fcn prediction back to cpu, convert to numpy, round the
    # probabilities to 0/1, flatten and cast to integer
    nn_predictions = np.round(nn_out.cpu().numpy()).flatten().astype(dtype=int)
    # get predictions from svm and lr on validation X
    svm_predictions = svm.predict(X_val)
    lr_predictions = lr.predict(X_val)

    # accuracy for all models: fraction of predictions equal to the label
    nn_accuracy = (nn_predictions == y_val).mean()
    svm_accuracy = (svm_predictions == y_val).mean()
    lr_accuracy = (lr_predictions == y_val).mean()

    # use get_spec_sense() to get specificity and sensitivity for all models
    nn_specificity, nn_sensitivity = get_spec_sense(y_val, nn_predictions)
    svm_specificity, svm_sensitivity = get_spec_sense(y_val, svm_predictions)
    lr_specificity, lr_sensitivity = get_spec_sense(y_val, lr_predictions)
    # get f1 scores
    nn_f1 = f1_score(y_val, nn_predictions)
    svm_f1 = f1_score(y_val, svm_predictions)
    lr_f1 = f1_score(y_val, lr_predictions)

    # column-wise concatenate all model predictions: array of shape
    # (sample size, 3), each row holds the predictions from all three models
    all_pred = np.c_[nn_predictions, lr_predictions, svm_predictions]
    # majority vote per row: the most frequent value among the three models
    # is the ensemble model's prediction
    final_preds = []
    for row in all_pred:
        values, counts = np.unique(row, return_counts=True)
        final_preds.append(values[np.argmax(counts)])
    # store the ensemble model's predictions and the true labels
    cv_preds.append(final_preds)
    cv_true.append(y_val)
    # get accuracy etc. of the ensemble model
    final_accuracy = (np.asarray(final_preds) == y_val).mean()
    final_specificity, final_sensitivity = get_spec_sense(y_val, final_preds)
    final_f1 = f1_score(y_val, final_preds)

    # append calculated metrics from all models to the associated lists
    cv_accuracy_nn.append(nn_accuracy)
    cv_sensitivity_nn.append(nn_sensitivity)
    cv_specificity_nn.append(nn_specificity)
    cv_f1_nn.append(nn_f1)

    cv_accuracy_svm.append(svm_accuracy)
    cv_sensitivity_svm.append(svm_sensitivity)
    cv_specificity_svm.append(svm_specificity)
    cv_f1_svm.append(svm_f1)

    cv_accuracy_lr.append(lr_accuracy)
    cv_sensitivity_lr.append(lr_sensitivity)
    cv_specificity_lr.append(lr_specificity)
    # record the logistic regression F1 (this used to append nn_f1, so the
    # reported lr_f1 was a copy of the network's F1)
    cv_f1_lr.append(lr_f1)

    cv_accuracy_ensemble.append(final_accuracy)
    cv_sensitivity_ensemble.append(final_sensitivity)
    cv_specificity_ensemble.append(final_specificity)
    cv_f1_ensemble.append(final_f1)

In [None]:
# Gather every per-split metric list into one mapping, keyed by
# "<model>_<metric>", with the models in the order fcn, lr, svm, ensemble.
cv_metrics = dict(
    fcn_accuracy=cv_accuracy_nn,
    fcn_sensitivity=cv_sensitivity_nn,
    fcn_specificity=cv_specificity_nn,
    fcn_f1=cv_f1_nn,
    lr_accuracy=cv_accuracy_lr,
    lr_sensitivity=cv_sensitivity_lr,
    lr_specificity=cv_specificity_lr,
    lr_f1=cv_f1_lr,
    svm_accuracy=cv_accuracy_svm,
    svm_sensitivity=cv_sensitivity_svm,
    svm_specificity=cv_specificity_svm,
    svm_f1=cv_f1_svm,
    ensemble_accuracy=cv_accuracy_ensemble,
    ensemble_sensitivity=cv_sensitivity_ensemble,
    ensemble_specificity=cv_specificity_ensemble,
    ensemble_f1=cv_f1_ensemble,
)

In [None]:
# Turn the metric dictionary into a DataFrame: one column per model/metric
# pair, one row per cross-validation split. Note that this rebinds the name
# cv_metrics from a dict to a DataFrame.
cv_metrics = pd.DataFrame(cv_metrics)

In [None]:
# Mean of every metric across the cross-validation splits, for each model
cv_metrics.mean()

fcn_accuracy            0.797852
fcn_sensitivity         0.757778
fcn_specificity         0.834000
fcn_f1                  0.780173
lr_accuracy             0.780273
lr_sensitivity          0.744444
lr_specificity          0.812000
lr_f1                   0.780173
svm_accuracy            0.809570
svm_sensitivity         0.760000
svm_specificity         0.854000
svm_f1                  0.790150
ensemble_accuracy       0.811523
ensemble_sensitivity    0.764444
ensemble_specificity    0.854000
ensemble_f1             0.793252
dtype: float64

In [None]:
# Save the per-split metrics as a csv file in model_evaluation/ on Google
# Drive; index=False leaves the row index out of the file
cv_metrics.to_csv('/content/drive/MyDrive/Project/model_evaluation/ensemble_metrics.csv', index = False)