In [78]:
import pandas as pd
import numpy as np

from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split

df = pd.read_csv('Phenotypic_V1_0b_preprocessed1.csv')
print(df.shape)
df.head(5)

(1112, 13)


Unnamed: 0.1,Unnamed: 0,SUB_ID,X,subject,SITE_ID,FILE_ID,DX_GROUP,anat_cnr,anat_efc,anat_fber,anat_fwhm,anat_qi1,anat_snr
0,1,50002,1,50002,PITT,no_filename,1,10.201539,1.194664,16.223458,3.878,0.152711,12.072452
1,2,50003,2,50003,PITT,Pitt_0050003,1,7.165701,1.126752,10.460008,4.282238,0.161716,9.241155
2,3,50004,3,50004,PITT,Pitt_0050004,1,7.698144,1.226218,9.72575,3.881684,0.174186,9.323463
3,4,50005,4,50005,PITT,Pitt_0050005,1,9.071807,1.256278,11.198226,3.628667,0.119269,10.8142
4,5,50006,5,50006,PITT,Pitt_0050006,1,8.026798,1.407166,6.282055,3.674539,0.130647,10.123574


In [79]:
#Dropping empty columns
df['DX_GROUP'].replace(2, 0, inplace=True) #So sigmoid function gives right output. if you replace sigmoid, you can skip this

df['anat_cnr'].replace('', np.nan, inplace=True)
df['anat_efc'].replace('', np.nan, inplace=True)
df['anat_fber'].replace('', np.nan, inplace=True)
df['anat_fwhm'].replace('', np.nan, inplace=True)
df['anat_qi1'].replace('', np.nan, inplace=True)
df['anat_snr'].replace('', np.nan, inplace=True)

#Replacing null values in all relevant input columns
df.dropna(subset=['anat_cnr','anat_efc', 'anat_fber', 'anat_fwhm', 'anat_qi1', 'anat_snr'], inplace=True)

#Verifying number of null rows
print("Number of null values:")
print(df.isnull().sum())

Number of null values:
Unnamed: 0    0
SUB_ID        0
X             0
subject       0
SITE_ID       0
FILE_ID       0
DX_GROUP      0
anat_cnr      0
anat_efc      0
anat_fber     0
anat_fwhm     0
anat_qi1      0
anat_snr      0
dtype: int64


In [80]:
X = np.load("features.npz")['a']
y = np.load("labels.npz")['a']
y = np.select([y == 1, y == 2], [0, 1], y)

train_x,test_x,train_y,test_y = train_test_split(X,y,random_state=42)
print("\nTraining data samples:")
print(train_x.shape)


Training data samples:
(653, 2016)


In [81]:
from sklearn import preprocessing
import torch

#MinMaxscaler is used to scale all the features of Train & Test dataframes
scaler = preprocessing.MinMaxScaler()
x_train = scaler.fit_transform(train_x)
#x_test =  scaler.fit_transform(test_x.values)
x_test =  scaler.fit_transform(test_x)

print("Scaled values of Train set \n")
print(x_train)
print("\nScaled values of Test set \n")
print(x_test)

#Train and Test sets are converted into Tensors
x_tensor =  torch.from_numpy(x_train).float()
y_tensor =  torch.from_numpy(train_y.ravel()).float()
xtest_tensor =  torch.from_numpy(x_test).float()
ytest_tensor =  torch.from_numpy(test_y.ravel()).float()

print("\nTrain set Tensors \n")
print(x_tensor)
print(y_tensor)
print("\nTest set Tensors \n")
print(xtest_tensor)
print(ytest_tensor)

Scaled values of Train set 

[[0.62458682 0.25869528 0.77813539 ... 0.69973366 0.64220319 0.62623055]
 [0.36619258 0.15922842 0.55327651 ... 0.69319589 0.59513916 0.49256394]
 [0.44473889 0.55010733 0.29953039 ... 0.49220317 0.62189607 0.44785573]
 ...
 [0.69060952 0.77490686 0.76992131 ... 0.68014218 0.53279481 0.55429992]
 [0.63730504 0.43471942 0.68012454 ... 0.62819085 0.72711074 0.72213087]
 [0.9276913  0.7532871  0.6978907  ... 0.86303712 0.80951451 0.80104934]]

Scaled values of Test set 

[[0.17371594 0.67910485 0.50993032 ... 0.71507732 0.69312968 0.82817554]
 [0.30584733 0.         0.48060218 ... 0.83418068 0.41021101 0.69783373]
 [0.57094588 0.63241503 0.93818587 ... 0.84096935 0.62120692 0.73941233]
 ...
 [0.3940889  0.69572294 0.48047557 ... 0.7169222  0.73136255 0.72531619]
 [0.20635841 0.22775244 0.58127283 ... 0.8653163  0.48998645 0.32111185]
 [0.48215526 0.68322656 0.52951599 ... 0.8790218  0.65768344 0.61489296]]

Train set Tensors 

tensor([[0.6246, 0.2587, 0.7781, 

In [None]:
from torch.utils.data import TensorDataset, DataLoader

#Define batch size 
bs = 64
#x_train and y_train are combined to a single TensorDataset (easier to iterate over and slice)
y_tensor = y_tensor.unsqueeze(1)
train_ds = TensorDataset(x_tensor, y_tensor)
#DataLoader is responsible for managing batches, & makes it easier to iterate over batches
train_dl = DataLoader(train_ds, batch_size=bs)

#For the validation/test dataset
ytest_tensor = ytest_tensor.unsqueeze(1)
test_ds = TensorDataset(xtest_tensor, ytest_tensor)
test_loader = DataLoader(test_ds, batch_size=32)

In [None]:
from torch import nn

n_input_dim = train_x.shape[1]
#Layer size
n_hidden1 = 3  # Number of hidden nodes
n_hidden2 = 2
n_output =  1   # Number of output nodes for binary classifier

class ChurnModel(nn.Module):
    def __init__(self):
        super(ChurnModel, self).__init__()
        self.layer_1 = nn.Linear(n_input_dim, n_hidden1) 
        self.layer_2 = nn.Linear(n_hidden1, n_hidden2)
        self.layer_out = nn.Linear(n_hidden2, n_output) 
        
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid() #outputs probability between 0 and 1
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(n_hidden1)
        self.batchnorm2 = nn.BatchNorm1d(n_hidden2)
        
        
    def forward(self, inputs):
        x = self.relu(self.layer_1(inputs))
        x = self.batchnorm1(x)
        x = self.relu(self.layer_2(x))
        x = self.batchnorm2(x)
        x = self.dropout(x)
        x = self.sigmoid(self.layer_out(x))
        return x
    

model = ChurnModel()
print(model)

In [None]:
#Loss Computation
loss_func = nn.BCELoss()
#Optimizer
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.007, weight_decay= 1e-6)
epochs = 100

In [None]:
model.train()
train_loss, valid_loss = [], []
for epoch in range(epochs):
    #Within each epoch run the subsets of data = batch sizes.
    for xb, yb in train_dl:
        y_pred = model(xb)            # Forward Propagation
        loss = loss_func(y_pred, yb)  # Loss Computation
        optimizer.zero_grad()         # Clearing all previous gradients, setting to zero 
        loss.backward()               # Back Propagation
        optimizer.step()              # Updating the parameters 
    train_loss.append(loss.item())
    # evaluation part 
    model.eval()
    for xb, yb in test_loader:
        y_pred = model(xb)            # Forward Propagation
        loss = loss_func(y_pred, yb)
        valid_loss.append(loss.item())
print ("Epoch:", epoch, "Training Loss: ", np.mean(train_loss), "Valid Loss: ", np.mean(valid_loss))

In [None]:
import itertools

y_pred_list = []
model.eval()
#Model doesn't need to backpropagate the gradients in test set, so use torch.no_grad()
#reduces memory usage and speeds up computation
with torch.no_grad():
    for xb_test,yb_test  in test_loader:
        y_test_pred = model(xb_test)
        y_pred_tag = torch.round(y_test_pred)
        y_pred_list.append(y_pred_tag.detach().numpy())

#Takes arrays and makes them list of list for each batch        
y_pred_list = [a.squeeze().tolist() for a in y_pred_list]
#flattens the lists in sequence
ytest_pred = list(itertools.chain.from_iterable(y_pred_list))

In [None]:
plt.plot(train_loss)
plt.show()

In [None]:
plt.plot(valid_loss)
plt.show()

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

#Classification accuracy
y_true_test = test_y.ravel()
print(metrics.classification_report(y_true_test,ytest_pred))

print("Precision Score : ",precision_score(y_true_test,ytest_pred, pos_label='positive', average=None))
print("Recall Score : ",recall_score(y_true_test,ytest_pred, pos_label='positive', average=None))
print("F1 Score : ",f1_score(y_true_test,ytest_pred, pos_label='positive', average=None))