# AUTHOR : MANAS MAHALE <<manas.mahale@bcp.edu.in>>

In [1]:
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader

# Seed the NumPy, Python `random`, and PyTorch global RNGs with the same value.
# torch.manual_seed stays last: as the cell's final expression, its return
# value (a torch Generator) is what the notebook displays for this cell.
np.random.seed(0)
random.seed(0)
torch.manual_seed(0)

<torch._C.Generator at 0x153cf7970>

In [2]:
# EPOCH: number of full passes the training loop makes over the training loader.
# BATCH_SIZE: samples per mini-batch, used for both the train and test loaders.
EPOCH      = 500
BATCH_SIZE = 500

In [3]:
# Read the four pre-split pickles and keep only their underlying NumPy arrays.
# NOTE(review): unpickling can execute arbitrary code -- only point these paths
# at files produced by a trusted pipeline.
X_train, y_train, X_test, y_test = (
    pd.read_pickle(pickle_path).values
    for pickle_path in (
        './Data/processed/X_train.pkl',
        './Data/processed/y_train.pkl',
        './Data/processed/X_test.pkl',
        './Data/processed/y_test.pkl',
    )
)

In [4]:
# Character vocabulary of the training strings.
# It is sorted so that every character receives the same index on every kernel
# start: iterating a bare `set` of strings follows hash order, which Python
# randomises per process, so the one-hot column layout (and with it any saved
# model weights) would otherwise not line up between sessions.
vocab = sorted({ch for smiles in X_train for ch in smiles})

In [5]:
# Lookup table: vocabulary entry -> its one-hot vector (float64, len(vocab) wide).
# Row k of the identity matrix is exactly the one-hot vector for vocab[k].
d = dict(zip(vocab, np.eye(len(vocab))))

In [6]:
def padded_one_hot_encode_smiles(smiles, pad_len = max([len(i) for i in X_train])):
    """One-hot encode a character sequence and zero-pad it to a fixed length.

    Args:
        smiles: Sequence of characters. Every character within the first
            `pad_len` positions must be a key of the module-level table `d`;
            an unknown character raises KeyError.
        pad_len: Number of rows in the result. The default is computed once,
            when this `def` statement executes, as the longest entry of
            `X_train` at that moment.

    Returns:
        np.ndarray holding `pad_len` rows of width `len(vocab)`: the one-hot
        rows of the first `pad_len` characters followed by all-zero rows.
        Input longer than `pad_len` is truncated.
    """
    # Encode only what fits; anything past pad_len is dropped.
    rows = [d[ch] for ch in smiles[:pad_len]]
    # Fill the remaining positions with all-zero "no character" rows.
    blank = np.zeros(len(vocab))
    rows.extend(blank for _ in range(pad_len - len(rows)))
    return np.array(rows)

In [7]:
# One-hot encode the first 3000 training strings and the first 1000 test
# strings, then downcast to float32.
X_train = np.array(
    list(map(padded_one_hot_encode_smiles, X_train[:3000]))
).astype(np.float32)
X_test = np.array(
    list(map(padded_one_hot_encode_smiles, X_test[:1000]))
).astype(np.float32)

In [8]:
# Sanity check: each encoded set is (samples, padded length, vocabulary size).
print("X_train shape :", X_train.shape)
print("X_test shape  :", X_test.shape)

X_train shape : (3000, 748, 46)
X_test shape  : (1000, 748, 46)


In [9]:
# Trim the labels to the same subsets that were one-hot encoded
# (first 3000 training samples, first 1000 test samples).
y_train = y_train[:3000]
# The test labels must come from y_test itself. Slicing y_train here would pair
# the test inputs with training labels and make every evaluation metric
# meaningless.
y_test = y_test[:1000]

In [10]:
# Pair each encoded sample with the label at the same position, as
# [features, label]. The resulting plain lists are what the DataLoaders consume.
train_data = [[X_train[k], y_train[k]] for k in range(len(X_train))]

test_data = [[X_test[k], y_test[k]] for k in range(len(X_test))]

In [11]:
# Mini-batch iterators over the paired samples; both reshuffle on every pass.
train_dataloader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(dataset=test_data, batch_size=BATCH_SIZE, shuffle=True)

# Human-readable names for label indices 0, 1 and 2.
classes = 'drug', 'drug_like', 'non_drug'

In [12]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier over flattened one-hot encoded sequences.

    The input is flattened from (batch, ...) to (batch, input_dim) and passed
    through Linear+ReLU layers followed by a final Linear layer that emits raw
    logits (no softmax; pair it with nn.CrossEntropyLoss).

    Args:
        input_dim: Number of features per sample after flattening. Defaults to
            748*46, the per-sample size used elsewhere in this notebook.
        hidden_dims: Output width of each hidden Linear layer, in order.
        n_classes: Number of output logits.

    With the default arguments the architecture, parameter initialisation
    order, and state_dict keys are identical to the previously hard-coded
    network.
    """

    def __init__(self, input_dim=748*46, hidden_dims=(5000, 2500, 500), n_classes=3):
        super().__init__()
        self.flatten = nn.Flatten()

        # Build Linear -> ReLU pairs in order so the Sequential indices (and
        # therefore checkpoint keys such as "linear_relu_stack.0.weight") do
        # not change for the default configuration.
        layers = []
        in_features = input_dim
        for width in hidden_dims:
            layers.append(nn.Linear(in_features, width))
            layers.append(nn.ReLU())
            in_features = width
        layers.append(nn.Linear(in_features, n_classes))
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Return class logits of shape (batch, n_classes) for a batch `x`."""
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

# Instantiate the classifier with freshly initialised weights.
net = NeuralNetwork()

In [13]:
# Cross-entropy loss applied to the raw logits returned by `net`.
criterion = nn.CrossEntropyLoss()
# SGD with momentum over all of `net`'s parameters.
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

In [None]:
# Per-batch loss history across all epochs (one entry per optimizer step).
total_loss = []
for epoch in range(EPOCH):
    print(f"Epoch {epoch + 1}")
    print('-'*10)
    for batch_idx, (inputs, labels) in enumerate(train_dataloader):
        # Gradients accumulate by default, so clear them before each step.
        optimizer.zero_grad()

        # Forward pass, loss, backward pass, parameter update.
        loss = criterion(net(inputs), labels)
        loss.backward()
        optimizer.step()

        # Report and record this batch's loss as a plain Python float.
        batch_loss = 0.0 + loss.item()
        print(f'Batch {batch_idx + 1} => Loss: {batch_loss}')
        total_loss.append(batch_loss)
    print("="*20)

print('Training Done !!')

Epoch 1
----------
Batch 1 => Loss: 1.086142897605896
Batch 2 => Loss: 1.086614727973938
Batch 3 => Loss: 1.087407112121582
Batch 4 => Loss: 1.0872570276260376
Batch 5 => Loss: 1.0863697528839111
Batch 6 => Loss: 1.0857099294662476
Epoch 2
----------
Batch 1 => Loss: 1.0852749347686768
Batch 2 => Loss: 1.0841617584228516
Batch 3 => Loss: 1.0832465887069702
Batch 4 => Loss: 1.082993745803833
Batch 5 => Loss: 1.0827317237854004
Batch 6 => Loss: 1.0808846950531006
Epoch 3
----------
Batch 1 => Loss: 1.0783129930496216


In [None]:
# Training curve. `total_loss` holds one value per mini-batch (not per epoch),
# so the x-axis counts optimizer steps across the whole run.
fig, ax = plt.subplots()
ax.plot(total_loss)
ax.set(
    title="Training loss",
    xlabel="Batch (cumulative over all epochs)",
    ylabel="Cross-entropy loss",
)
plt.show()

In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
PATH = './DNN.pth'
torch.save(net.state_dict(), PATH)

In [None]:
# To reuse the saved weights instead of retraining, uncomment the two lines below:
# net = NeuralNetwork()
# net.load_state_dict(torch.load(PATH))

In [None]:
# Overall accuracy over the test loader.
correct = total = 0

# Inference only: disable gradient tracking.
with torch.no_grad():
    for inputs, labels in test_dataloader:
        # The predicted class is the index of the largest logit in each row.
        predicted = torch.max(net(inputs), dim=1).indices
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the testset: %d %%' % (100 * correct / total))

In [None]:
# Per-class accuracy: tally hits and sample counts for every class name.
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}

# Inference only, so gradient tracking is switched off.
with torch.no_grad():
    for data in test_dataloader:
        images, labels = data
        outputs = net(images)
        _, predictions = torch.max(outputs, 1)
        # Count each sample under its true class, and as a hit when the
        # prediction matches.
        for label, prediction in zip(labels, predictions):
            if label == prediction:
                correct_pred[classes[label]] += 1
            total_pred[classes[label]] += 1


# Report accuracy per class.
for classname, correct_count in correct_pred.items():
    seen = total_pred[classname]
    if seen == 0:
        # A class with no samples in the evaluated data has no defined
        # accuracy; say so instead of raising ZeroDivisionError.
        print("Accuracy for class {:5s} is: n/a (no samples)".format(classname))
        continue
    accuracy = 100 * float(correct_count) / seen
    print("Accuracy for class {:5s} is: {:.1f} %".format(classname,
                                                   accuracy))