# Neural Network Model
Below, a 6-layer fully connected neural network is defined for binary classification of diabetes vs. no diabetes:

In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch import nn
from torch.utils.data import DataLoader, Dataset
import pandas as pd
import numpy as np
import random
from sklearn.metrics import confusion_matrix,classification_report

In [10]:
class FCN(nn.Module):
    """Fully connected classifier made of six linear layers.

    The layer widths are features -> 50 -> 20 -> 10 -> 25 -> 10 -> 2, with a
    ReLU after every linear layer except the last. The output is the raw
    2-value score vector; no softmax is applied here.
    """
    layers: nn.Sequential

    def __init__(self, features) :
        super().__init__()
        widths = [features, 50, 20, 10, 25, 10, 2]
        stack = []
        # Pair consecutive widths into (in, out) sizes; layers are created in
        # the same order as a hand-written stack would be.
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack.append(nn.Linear(fan_in, fan_out))
            stack.append(nn.ReLU())
        stack.pop()  # the output layer is not followed by an activation
        self.layers = nn.Sequential(*stack)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run `x` through the layer stack and return the 2-class scores."""
        scores = self.layers(x)
        return scores

In [11]:
# Class to load CSV data into PyTorch dataloaders:
class CSV_Data(Dataset):
    """Map-style dataset backed by a CSV file.

    The first column of the file is used as the label and all remaining
    columns as the features. Items are returned as NumPy values, not tensors.
    """

    def __init__(self, f):
        frame = pd.read_csv(f)
        label_column, feature_columns = frame.iloc[:, 0], frame.iloc[:, 1:]
        self.dat = frame
        self.y = label_column.to_numpy()
        self.X = feature_columns.to_numpy()

    def __len__(self):
        # Number of rows in the underlying frame.
        return self.dat.shape[0]

    def __getitem__(self, idx):
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

     

In [12]:
# FCN Accuracy Calculation

def tester(model, testloader):
    correct, total = 0, 0
    with torch.no_grad():
        for i, (features, label) in enumerate(testloader):
            features = features.type(torch.float32)
            output = model(features)
            pred = torch.argmax(output, 1)
            if(pred == min(1, label)):
                correct+=1
            total+=1
    print(f"Accuracy: {(correct*100)/total} ") 

In [13]:
def metrics(testloader, model):
    """Print a confusion matrix and classification report for `model`.

    Labels greater than 1 are collapsed to 1, so the report is binary.
    Every sample of every batch is scored, whatever the batch size.

    Args:
        testloader: iterable yielding ``(features, label)`` tensor batches.
        model: callable mapping a float32 feature batch to per-class scores;
            the predicted class is the argmax over dimension 1.

    Returns:
        None. Both reports are printed.
    """
    y_pred = []
    y_true = []
    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        for features, label in testloader:
            features = features.type(torch.float32)
            pred = torch.argmax(model(features), 1)
            # Record the whole batch, not just its first element.
            y_pred.extend(pred.tolist())
            y_true.extend(min(1, value) for value in label.reshape(-1).tolist())
    print("Confusion Matrix")
    print(confusion_matrix(y_true, y_pred))
    print("Classification Report")
    print(classification_report(y_true, y_pred))
    

In [14]:
def train(data, epochs, random_negative_drop, class_weights=(20.0, 1.0), lr=0.001):
    '''Train an FCN on a CSV dataset, printing accuracy and metrics as it goes.

    The dataset is split 75/25 into train and test parts. Accuracy on the
    test part is printed before training and after every epoch; confusion
    matrices and classification reports for both parts are printed at the end.
    Labels greater than 1 are collapsed to 1 for the loss.

    data: path to data csv file
    epochs: number of epochs to train for
    random_negative_drop: proportion of randomly dropped 0 labels
    class_weights: per-class loss weights, indexed by class label
        (index 0 weights label 0, index 1 weights label 1)
    lr: learning rate for the Adam optimizer

    Raises ValueError if the CSV file contains no rows.
    '''
    dataset = CSV_Data(data)
    if len(dataset) == 0:
        raise ValueError(f"no samples found in {data!r}")

    # 75/25 train/test split; the test part takes whatever rounding leaves over.
    n_train = int(0.75 * len(dataset))
    train_set, test_set = torch.utils.data.random_split(
        dataset, [n_train, len(dataset) - n_train]
    )
    trainloader = DataLoader(train_set, batch_size=1, shuffle=True)
    testloader = DataLoader(test_set, batch_size=1, shuffle=False)

    # Read the input width from one sample instead of materialising the
    # entire shuffled training loader just to look at its first batch.
    num_features = len(dataset[0][0])
    model = FCN(num_features)

    # NOTE(review): the default puts 20x weight on class 0. If the intent is to
    # counter the scarcity of class 1, the weights look reversed — confirm.
    criterion = nn.CrossEntropyLoss(
        weight=torch.tensor(class_weights, dtype=torch.float32)
    )
    optimizer = optim.Adam(model.parameters(), lr=lr)
    tester(model, testloader)

    for epoch in range(epochs):

        print(f"Epoch {epoch+1}")

        for features, label in trainloader:
            # batch_size is 1, so `label` holds exactly one value.
            # Negative (label 0) samples are skipped with probability
            # `random_negative_drop`; all other samples are always used.
            if label.item() == 0 and random.uniform(0.0, 1.0) <= random_negative_drop:
                continue
            features = features.type(torch.float32)
            target = torch.clamp(label, max=1).long()
            optimizer.zero_grad()
            logits = model(features)  # call the module so hooks run
            loss = criterion(logits, target)
            loss.backward()
            optimizer.step()
        tester(model, testloader)

    print("Train Metrics:")
    metrics(trainloader, model)
    print("Test Metrics:")
    metrics(testloader, model)

In [15]:
# Train on the original and the 50/50-balanced cleaned datasets, first using
# every sample and then randomly dropping 85% of the negative (label 0) samples.
for run_title, csv_path, negative_drop in (
    ("Original Cleaned data", 'cleaned_diabetes.csv', 0),
    ("5050 Cleaned Data", '5050_clean_diabetes_dataset.csv', 0),
    ("Original Cleaned Data Random Drop", 'cleaned_diabetes.csv', 0.85),
    ("5050 Cleaned Data Random Drop", '5050_clean_diabetes_dataset.csv', 0.85),
):
    print(run_title)
    train(csv_path, 5, negative_drop)



Original Cleaned data
Accuracy: 12.18895186203206 
Epoch 1
Accuracy: 87.81104813796794 
Epoch 2
Accuracy: 87.81104813796794 
Epoch 3
Accuracy: 87.81104813796794 
Epoch 4
Accuracy: 87.86699423511955 
Epoch 5
Accuracy: 87.84753472306681 
Train Metrics:
Confusion Matrix
[[108666     68]
 [ 14509     90]]
Classification Report
              precision    recall  f1-score   support

           0       0.88      1.00      0.94    108734
           1       0.57      0.01      0.01     14599

    accuracy                           0.88    123333
   macro avg       0.73      0.50      0.47    123333
weighted avg       0.85      0.88      0.83    123333

Test Metrics:
Confusion Matrix
[[36078    22]
 [ 4974    37]]
Classification Report
              precision    recall  f1-score   support

           0       0.88      1.00      0.94     36100
           1       0.63      0.01      0.01      5011

    accuracy                           0.88     41111
   macro avg       0.75      0.50      0.47   