In [10]:
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn import metrics
from tqdm import tqdm

## Build the Neural Network

In [63]:
class NN(nn.Module):
    """Single-hidden-layer binary classifier over 1024-dimensional inputs.

    Architecture: Linear(1024, 256) -> ReLU -> Linear(256, 1) -> Sigmoid, so
    ``forward`` yields one value in [0, 1] per input row. The loss used for
    training is binary cross-entropy applied to that sigmoid output.
    """

    def __init__(self, device):
        """Create the layers and remember the target device.

        Args:
            device: Device that batches are moved to inside ``train_model``
                and ``test``. The module's own parameters are not moved here;
                the caller is expected to do that (``NN(device).to(device)``).
        """
        super().__init__()

        self.device = device

        self.lin1 = nn.Linear(1024, 256)
        self.relu = nn.ReLU()
        self.lin2 = nn.Linear(256, 1)
        self.sigmoid = nn.Sigmoid()

        # Functional BCE: expects probabilities (post-sigmoid), not logits.
        self.loss = nn.functional.binary_cross_entropy

    def forward(self, X):
        """Map a float tensor of shape (..., 1024) to values of shape (..., 1)."""
        X = self.lin1(X)
        X = self.relu(X)
        X = self.lin2(X)
        X = self.sigmoid(X)
        return X

    def train_model(self, dataset, epochs):
        """Train this network with Adam (default hyper-parameters).

        Args:
            dataset: Iterable of ``(inputs, targets)`` tensor batches, e.g. a
                ``DataLoader``. ``targets`` must have the same shape as the
                model output for the loss and the accuracy comparison.
            epochs: Number of full passes over ``dataset``.

        The progress bar shows the loss and accuracy of the most recent batch
        only, not a running average over the epoch.
        """
        # Put *this* instance in training mode. The previous code called
        # ``model.train()``, which relied on a notebook-global named ``model``.
        self.train()
        optimizer = torch.optim.Adam(self.parameters())

        for epoch in range(epochs):
            with tqdm(dataset, unit="batch") as tepoch:
                for inputs, targets in tepoch:

                    inputs, targets = inputs.to(self.device), targets.to(self.device)
                    tepoch.set_description(f"Epoch {epoch + 1}")

                    # clear the gradients
                    optimizer.zero_grad()
                    # compute the model output
                    yhat = self(inputs.float())
                    # calculate accuracy (threshold at 0.5 via rounding)
                    correct = (torch.round(yhat) == targets).sum().item()
                    accuracy = correct / len(inputs)
                    # calculate loss
                    loss = self.loss(yhat, targets.float())
                    # credit assignment
                    loss.backward()
                    # update model weights
                    optimizer.step()

                    tepoch.set_postfix(loss=loss.item(), accuracy=100. * accuracy)

    def test(self, dataloader):
        """Evaluate on ``dataloader`` and print/return summary metrics.

        Args:
            dataloader: Iterable of ``(inputs, targets)`` tensor batches.

        Returns:
            Tuple ``(accuracy, f1, auroc, auprc)``. Specificity and
            sensitivity are printed but not returned.

        Raises:
            ValueError: If ``dataloader`` yields no batches.
        """
        # Put *this* instance in eval mode (previously the global ``model``).
        self.eval()
        predictions, actuals = [], []

        with torch.no_grad():
            for inputs, targets in dataloader:
                inputs = inputs.to(self.device)
                # evaluate the model on the test set
                yhat = self(inputs.float())
                yhat = yhat.cpu().detach().numpy()
                # .cpu() is a no-op for CPU tensors and keeps this working if
                # the loader ever hands back targets that live on the GPU
                actual = targets.cpu().numpy()
                # reshape for stacking
                actual = actual.reshape((len(actual), 1))
                yhat = yhat.reshape((len(yhat), 1))
                # store
                predictions.append(yhat)
                actuals.append(actual)

        if not predictions:
            # np.vstack([]) would raise a ValueError with a less helpful message
            raise ValueError("dataloader yielded no batches; nothing to evaluate")

        predictions, actuals = np.vstack(predictions), np.vstack(actuals)
        print("Predictions: ", predictions[:10])
        print("Real labels: ", actuals[:10])
        # hard labels for the threshold-based metrics
        pred_label = np.round(predictions)
        acc = metrics.accuracy_score(actuals, pred_label)
        f1 = metrics.f1_score(actuals, pred_label, average='binary', zero_division=0)
        # ranking metrics use the raw scores, not the rounded labels
        auroc = metrics.roc_auc_score(actuals, predictions)
        precision, recall, thresholds = metrics.precision_recall_curve(actuals, predictions)
        # specificity is recall of the negative class
        specificity = metrics.recall_score(actuals, pred_label, pos_label=0)
        sensitivity = metrics.recall_score(actuals, pred_label)
        auprc = metrics.auc(recall, precision)
        print(f"Test metrics: \n Accuracy: {float(acc):>6f}, F1 score: {float(f1):>6f}, AUROC: {float(auroc):>6f}, AUPRC: {float(auprc):>6f}, Specificity: {float(specificity):>6f}, Sensitivity: {float(sensitivity):>6f}\n")
        return acc, f1, auroc, auprc

## Prepare dataset

In [64]:
class EmbeddingDataset(Dataset):
    """Map-style dataset built from a DataFrame that has a 'Label' column.

    Every column other than 'Label' becomes a feature; the labels are stored
    as a column vector so each item's target has shape (1,).
    """

    def __init__(self, df):
        label_values = df['Label'].to_numpy()
        feature_frame = df.drop(columns=['Label'])

        self.X = feature_frame.to_numpy()
        # add a trailing axis: (n,) -> (n, 1)
        self.y = label_values[:, np.newaxis]
        self.len = df.shape[0]

    def __len__(self):
        """Number of rows in the source DataFrame."""
        return self.len

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label

In [65]:
# Read each split and discard the 'ID' and 'Sequence_part' columns.
# NOTE(review): presumably everything left is the 'Label' plus the embedding
# features - confirm against the CSV schema.
train_df = pd.read_csv('T3SE_clustered_embeddings_Nterm_train.csv').drop(
    columns=['ID', 'Sequence_part']
)

test_df = pd.read_csv('T3SE_clustered_embeddings_Nterm_test.csv').drop(
    columns=['ID', 'Sequence_part']
)

In [66]:
# Report the number of rows in each split, one per line.
print(
    f"Train dataset size: {train_df.shape[0]}",
    f"Test dataset size: {test_df.shape[0]}",
    sep="\n",
)

Train dataset size: 3013
Test dataset size: 754


In [67]:
# Training split: mini-batches of 32, reshuffled on every pass.
train_dset = EmbeddingDataset(df=train_df)
train_loader = DataLoader(dataset=train_dset, batch_size=32, shuffle=True)
# Test split: one sample per batch, in file order.
test_dset = EmbeddingDataset(df=test_df)
test_loader = DataLoader(dataset=test_dset, batch_size=1, shuffle=False)

In [68]:
# Prefer CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'Using {device} device')

Using cuda device


## Model training and testing

In [69]:
# Build the network, then move its parameters onto the selected device
# (nn.Module.to returns the module itself).
model = NN(device=device)
model = model.to(device)
print(model)

NN(
  (lin1): Linear(in_features=1024, out_features=256, bias=True)
  (relu): ReLU()
  (lin2): Linear(in_features=256, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)


In [70]:
# Train for 35 passes over the training loader.
model.train_model(dataset=train_loader, epochs=35)

Epoch 1: 100%|██████████| 95/95 [00:00<00:00, 260.25batch/s, accuracy=100, loss=0.142] 
Epoch 2: 100%|██████████| 95/95 [00:00<00:00, 292.64batch/s, accuracy=60, loss=0.959]  
Epoch 3: 100%|██████████| 95/95 [00:00<00:00, 310.90batch/s, accuracy=100, loss=0.0603]
Epoch 4: 100%|██████████| 95/95 [00:00<00:00, 244.86batch/s, accuracy=100, loss=0.0719]
Epoch 5: 100%|██████████| 95/95 [00:00<00:00, 397.50batch/s, accuracy=80, loss=0.397]  
Epoch 6: 100%|██████████| 95/95 [00:00<00:00, 234.23batch/s, accuracy=80, loss=0.357]  
Epoch 7: 100%|██████████| 95/95 [00:00<00:00, 236.66batch/s, accuracy=100, loss=0.0658] 
Epoch 8: 100%|██████████| 95/95 [00:00<00:00, 231.65batch/s, accuracy=100, loss=0.0696] 
Epoch 9: 100%|██████████| 95/95 [00:00<00:00, 236.32batch/s, accuracy=100, loss=0.0861] 
Epoch 10: 100%|██████████| 95/95 [00:00<00:00, 236.88batch/s, accuracy=100, loss=0.0574] 
Epoch 11: 100%|██████████| 95/95 [00:00<00:00, 254.60batch/s, accuracy=80, loss=0.321]   
Epoch 12: 100%|██████████

In [71]:
# Evaluate on the held-out split; the returned (accuracy, F1, AUROC, AUPRC)
# tuple is the cell's displayed value.
model.test(dataloader=test_loader)

Predictions:  [[0.02375989]
 [0.00854938]
 [0.5944409 ]
 [0.01201478]
 [0.00165575]
 [0.01242965]
 [0.00123633]
 [0.19943842]
 [0.10316078]
 [0.01323742]]
Real labels:  [[0]
 [0]
 [1]
 [0]
 [0]
 [0]
 [0]
 [1]
 [0]
 [0]]
Test metrics: 
 Accuracy: 0.937666, F1 score: 0.605042, AUROC: 0.920825, AUPRC: 0.700663, Specificity: 0.988218, Sensitivity: 0.480000



(0.9376657824933687,
 0.6050420168067226,
 0.9208247422680412,
 0.7006628756817251)

In [56]:
# Persist only the learned parameters (state_dict), not the whole module.
# NOTE(review): this cell's execution count (56) is lower than the training
# cells above (63-71), so the file on disk may predate the model trained in
# this session - re-run this cell after training.
torch.save(obj=model.state_dict(), f="T3SEembedding_simple_NN.pth")