In [60]:
!pip install PyTDC
!pip install datasets

!pip install transformers



In [61]:
import tqdm
import numpy as np
import pandas as pd
import plotly.express as px
from tdc.multi_pred import DTI
from tdc.generation import MolGen
import plotly.figure_factory as ff

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [86]:
# Load the MOSES molecule-generation dataset and keep only its first
# 100,000 rows so that the rest of the notebook stays quick to run.
data = MolGen(name='MOSES').get_data()[:100000]

Found local copy...
Loading...
Done!


In [87]:
# Shuffle all rows and renumber the index from 0.
# A fixed `random_state` makes the shuffle (and therefore the train/test
# split built from it further down) reproducible across kernel restarts.
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

## Some Basic Analysis

In [88]:
# Character length of every SMILES string, stored as a new column.
data['l_smiles'] = data['smiles'].map(len)

In [89]:
# Summary statistics of the SMILES lengths, reported at every decile (10% .. 90%).
data.loc[:, 'l_smiles'].describe(percentiles=[decile / 10 for decile in range(1, 10)])

count    100000.000000
mean         35.150310
std           4.566592
min          15.000000
10%          29.000000
20%          31.000000
30%          33.000000
40%          34.000000
50%          35.000000
60%          36.000000
70%          38.000000
80%          39.000000
90%          41.000000
max          54.000000
Name: l_smiles, dtype: float64

## A solid preset maximum length for a drug SMILES string would be 50 (90% of the strings have at most 41 characters; the longest has 54)

In [90]:
def tokenize(input_string):
  """Convert a string into a list of integer tokens, one per character.

  Each token is the Unicode code point (`ord`) of the corresponding character.
  """
  return list(map(ord, input_string))
def encode(input_string,max_length=128,padding=True):
  """Tokenize a string and force it to at most `max_length` tokens.

  Args:
    input_string: text to encode (one token per character, see `tokenize`).
    max_length: longer inputs are truncated to this many tokens.
    padding: when truthy, shorter inputs are right-padded with token 0 up to
      `max_length`; when falsy, shorter inputs are returned at their own length.

  Returns:
    A list of integer tokens.
  """
  tokens = tokenize(input_string)[:max_length]
  # Logical `and`, not bitwise `&`: with `&`, a truthy non-bool flag such as
  # padding=2 evaluates `True & 2 == 0` and silently skips the padding.
  if padding and len(tokens) < max_length:
    tokens.extend([0] * (max_length - len(tokens)))
  return tokens
def decode(input_tokens):
  """Turn a sequence of integer tokens back into a string.

  Every token is mapped through `chr`; padding tokens (0) are not removed,
  so they come back as NUL characters.
  """
  return ''.join(chr(token) for token in input_tokens)

In [91]:
# Vocabulary size used by the embedding / output layers: one more than the
# token id of 'z', so every token id up to and including 'z' is a valid index.
l_tokenizer = 1 + encode('z', padding=False)[0]

In [92]:
class Drug_Dataset(Dataset):
    """Torch dataset that serves encoded SMILES strings from a DataFrame.

    Args:
        df: table with a 'smiles' column, indexed positionally via `.iloc`.
        drug_max_length: every SMILES string is truncated / zero-padded to
            this many tokens by `encode`.
    """

    def __init__(self, df,drug_max_length):
        self.df, self.dml = df, drug_max_length

    def __len__(self):
        """Number of rows in the wrapped table."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return `{'input_drug': tensor}` holding the token ids of row `idx`."""
        smiles = self.df.iloc[idx]['smiles']
        token_ids = encode(smiles, max_length=self.dml)
        return {'input_drug': torch.tensor(token_ids)}

In [109]:
# Fixed token length of every encoded SMILES string ("drug max length").
# NOTE(review): the layer sizes hard-coded in the Encoder/Decoder below appear
# to be derived for this value of 50 - confirm before changing it.
dml=50

In [110]:
# 80/20 positional split: the first 80% of the rows form the training set,
# the remaining rows form the test set.
l = int(0.8 * data.shape[0])
train_p = Drug_Dataset(data.iloc[:l], drug_max_length=dml)
test_p = Drug_Dataset(data.iloc[l:], drug_max_length=dml)

In [111]:
# Mini-batches of 32; only the training loader reshuffles each epoch.
train_loader = DataLoader(dataset=train_p, shuffle=True, batch_size=32)
test_loader = DataLoader(dataset=test_p, batch_size=32)

## Let's create the model

## This is Attention, as Torch's attention does not work with mask.

In [112]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# The model classes defined in this cell read this module-level name.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
class Encoder(nn.Module):
    """Convolutional VAE encoder: token ids -> sampled latent vector.

    The input is embedded, passed through three Conv1d + ReLU + MaxPool1d(2)
    stages, flattened, and projected by two *separate* linear heads to the
    mean and the log standard deviation of a diagonal Gaussian. A latent
    sample is drawn with the reparameterisation trick and the (scaled) KL
    divergence to the standard normal prior is stored in `self.kl`.

    Args:
        latent_dim: size of the latent vector.
        embed_dim: token embedding size; the conv stages reduce it to
            embed_dim//2, embed_dim//4 and embed_dim//8 channels.
        seq_len: token length of the inputs; only used to size the linear
            heads. The default of 50 reproduces the previously hard-coded
            flattened length of 5 positions.
        vocab_size: number of embedding rows. Defaults to the notebook-level
            `l_tokenizer` when left as None.

    Raises:
        ValueError: if `seq_len` is too short to survive the conv/pool stack.
    """

    def __init__(self, latent_dim=8,embed_dim=16,seq_len=50,vocab_size=None):
        super().__init__()
        if vocab_size is None:
            # Fall back to the notebook-wide vocabulary size.
            vocab_size = l_tokenizer
        self.latent_dim=latent_dim
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.conv1=nn.Conv1d(embed_dim,embed_dim//2,3)
        self.pool1=nn.MaxPool1d(2)
        self.conv2=nn.Conv1d(embed_dim//2,embed_dim//4,3)
        self.pool2=nn.MaxPool1d(2)
        self.conv3=nn.Conv1d(embed_dim//4,embed_dim//8,2)
        self.pool3=nn.MaxPool1d(2)

        # Length left after each (unpadded conv -> pool of 2) stage;
        # for seq_len=50 this is 50 -> 24 -> 11 -> 5.
        pooled_len = seq_len
        for kernel_size in (3, 3, 2):
            pooled_len = (pooled_len - kernel_size + 1) // 2
        if pooled_len < 1:
            raise ValueError(f"seq_len={seq_len} is too short for the conv/pool stack")
        flat_dim = (embed_dim//8) * pooled_len

        # Two independent heads. Using one layer for both (as before) forces
        # sigma == exp(mu), so the variance could never be learned on its own.
        self.linear3=nn.Linear(flat_dim, latent_dim)            # mean
        self.linear_log_sigma=nn.Linear(flat_dim, latent_dim)   # log std-dev

        self.kl = 0

    def forward(self, x):
        """Encode integer token ids of shape (batch, seq) into a latent sample.

        Side effect: `self.kl` is overwritten with the KL term of this batch.
        """
        batch_size, seq_len = x.size(0), x.size(1)
        h = self.embeddings(x)
        h = torch.transpose(h,1,2)  # Conv1d expects (batch, channels, length)
        h = self.pool1(F.relu(self.conv1(h)))
        h = self.pool2(F.relu(self.conv2(h)))
        h = self.pool3(F.relu(self.conv3(h)))
        h = h.reshape(batch_size,-1)

        mu = self.linear3(h)
        log_sigma = self.linear_log_sigma(h)
        sigma = torch.exp(log_sigma)

        # Reparameterisation trick; randn_like draws the noise directly on
        # mu's device/dtype, so no global `device` is needed here.
        z = mu + sigma*torch.randn_like(mu)

        # KL( N(mu, sigma^2) || N(0, 1) ) per latent dimension; it is exactly
        # zero when mu == 0 and sigma == 1.
        kl_per_dim = 0.5*(sigma**2 + mu**2 - 1) - log_sigma
        # Same down-weighting as before: averaged over the batch, then divided
        # by the sequence length and by 100.
        self.kl = kl_per_dim.sum()/batch_size/seq_len/100
        return z

class Decoder(nn.Module):
    """Transposed-convolution decoder: latent vector -> per-position token logits.

    Args:
        latent_dim: size of the incoming latent vector.
        dim1: channel count of the first transposed convolution; the second
            one produces 2*dim1 channels.
    """

    def __init__(self, latent_dim=8,dim1=16):
        super().__init__()
        self.linear = nn.Linear(latent_dim, 24)
        self.conv1 = nn.ConvTranspose1d(1, dim1, 3, stride=2)
        self.conv2 = nn.ConvTranspose1d(dim1, 2 * dim1, 2)
        self.linear2 = nn.Linear(2 * dim1, l_tokenizer)

    def forward(self, z):
        """Return logits with the vocabulary on the last axis."""
        hidden = F.relu(self.linear(z))
        # Treat the 24 hidden units as a single-channel sequence of length 24.
        hidden = hidden.view(-1, 1, 24)
        hidden = F.relu(self.conv1(hidden))   # length (24-1)*2+3 = 49
        hidden = F.relu(self.conv2(hidden))   # length 49+2-1 = 50
        # Channels last, so the linear layer scores every position separately.
        logits = self.linear2(hidden.transpose(1, 2))
        return logits

class VariationalAutoencoder(nn.Module):
    """Encoder/decoder pair forming the full VAE.

    Args:
        latent_dims: size of the latent vector shared by both halves.

    Both sub-modules are moved to the module-level `device` on construction.
    """

    def __init__(self, latent_dims=32):
        super().__init__()
        encoder = Encoder(latent_dims, 64)
        self.encoder = encoder.to(device)
        decoder = Decoder(latent_dims, 32)
        self.decoder = decoder.to(device)

    def forward(self, x):
        """Encode `x` to a latent sample and decode it back to token logits."""
        latent = self.encoder(x)
        reconstruction = self.decoder(latent)
        return reconstruction

In [None]:
# Training setup: compute device, model, loss and optimiser.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
autoencoder = VariationalAutoencoder()
# Token-level reconstruction loss over the vocabulary.
loss_fn = nn.CrossEntropyLoss()
opt = torch.optim.Adam(autoencoder.parameters(), lr=5e-4)
def acc_fn(y,y_hat):
  """Fraction of positions whose arg-max class (taken over dim 1) equals `y`.

  `y` holds the target class ids and `y_hat` the scores with the class axis at
  dim 1; every position counts equally towards the mean.
  """
  predicted = y_hat.argmax(dim=1)
  hits = predicted.eq(y)
  return hits.float().mean()
def _run_epoch(model, loader, optimizer=None):
    """Run one pass over `loader` and return mean (cross-entropy, KL, accuracy).

    When `optimizer` is given the model is put in training mode and updated
    after every batch; otherwise the pass is a gradient-free evaluation.
    The averages are taken over the number of batches.
    """
    training = optimizer is not None
    model.train(training)  # train(False) is equivalent to eval()
    ce_total = kl_total = acc_total = 0.0
    with torch.set_grad_enabled(training):
        for batch in tqdm.tqdm(loader):
            x = batch['input_drug'].to(device)
            if training:
                optimizer.zero_grad()
            # The model returns (batch, seq, vocab); CrossEntropyLoss wants the
            # class axis at dim 1, i.e. (batch, vocab, seq).
            logits = torch.transpose(model(x), 1, 2)
            ce = loss_fn(logits, x)
            kl = model.encoder.kl  # refreshed by the encoder on every forward pass
            if training:
                (ce + kl).backward()
                optimizer.step()
            ce_total += ce.item()
            kl_total += kl.item()
            acc_total += acc_fn(x, logits).item()
    n_batches = max(len(loader), 1)  # avoid dividing by zero on an empty loader
    return ce_total / n_batches, kl_total / n_batches, acc_total / n_batches


for epoch in range(100):
    train_loss1, train_loss2, train_acc = _run_epoch(autoencoder, train_loader, opt)
    test_loss1, test_loss2, test_acc = _run_epoch(autoencoder, test_loader)

    # The reconstruction term is a cross-entropy, so it is reported as "CE"
    # (it was previously mislabelled "SSE").
    print(f"TRAIN: EPOCH {epoch}: CE: {train_loss1}, KL_LOSS: {train_loss2}, ACC: {train_acc}   \nTEST: EPOCH {epoch}: CE: {test_loss1}, KL_LOSS: {test_loss2}, ACC: {test_acc}")