In [1]:
!pip install PyTDC
!pip install datasets
!pip install transformers

Installing collected packages: safetensors, tokenizers, transformers
Successfully installed safetensors-0.4.0 tokenizers-0.15.0 transformers-4.35.2


In [2]:
import tqdm
import numpy as np
import pandas as pd
from tdc.multi_pred import DTI


import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [3]:
# Load the KIBA drug-target interaction dataset through TDC (the recorded
# output shows it being downloaded) and take the split produced by
# get_split() with its default arguments. `split` is indexed with 'train'
# and 'valid' further down when the datasets are built.
data = DTI(name = 'KIBA')
split = data.get_split()

Downloading...
100%|██████████| 96.6M/96.6M [00:05<00:00, 17.0MiB/s]
Loading...
Done!


## Data Analysis

In [4]:
# Full table as returned by TDC; the analysis cells below read its
# 'Drug', 'Target' and 'Y' columns.
new_data=data.get_data()

In [5]:
# Character lengths of every drug string and target sequence, stored as two
# extra columns so they can be summarised next to the label Y.
for source_col, length_col in (('Drug', 'Drug_l'), ('Target', 'Target_l')):
    new_data[length_col] = new_data[source_col].apply(len)
new_data[['Drug_l', 'Target_l', 'Y']].describe()

Unnamed: 0,Drug_l,Target_l,Y
count,117657.0,117657.0,117657.0
mean,46.751566,730.593513,11.720685
std,13.839408,384.708142,0.834272
min,14.0,215.0,0.0
25%,39.0,454.0,11.2
50%,45.0,629.0,11.520216
75%,54.0,912.0,11.923909
max,532.0,4128.0,17.200179


In [6]:
# Peek at the last rows of the two raw string columns.
preview_columns = ['Drug', 'Target']
new_data[preview_columns].tail()

Unnamed: 0,Drug,Target
117652,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MATTVTCTRFTDEYQLYEDIGKGAFSVVRRCVKLCTGHEYAAKIIN...
117653,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MATTATCTRFTDDYQLFEELGKGAFSVVRRCVKKTSTQEYAAKIIN...
117654,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...
117655,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKT...
117656,CCCc1nc[nH]c1CNc1cc(Cl)c2ncc(C#N)c(Nc3ccc(F)c(...,MATITCTRFTEEYQLFEELGKGAFSVVRRCVKVLAGQEYAAKIINT...


## Let's create the tokenizer

In [7]:
def tokenize(input_string):
  """Return the Unicode code point of every character in ``input_string``."""
  return [ord(char) for char in input_string]
def encode(input_string,max_length=128,padding=True):
  """Convert a string into a list of at most ``max_length`` token ids.

  Args:
    input_string: Text to tokenize (one token per character).
    max_length: Longer inputs are truncated to this many tokens.
    padding: When truthy, shorter inputs are right-padded with 0 up to
      ``max_length``; when falsy the tokens are returned unpadded.

  Returns:
    A list of ints.
  """
  tokens=tokenize(input_string)[:max_length]
  # Logical `and`, not bitwise `&`: with `&` a truthy non-bool flag such as
  # padding=2 evaluated to 0 and silently skipped the padding.
  if padding and len(tokens)<max_length:
    tokens.extend([0]*(max_length-len(tokens)))
  return tokens
def decode(input_tokens,skip_padding=False):
  """Turn a sequence of token ids (code points) back into a string.

  Args:
    input_tokens: Iterable of integer code points.
    skip_padding: When True, tokens equal to 0 are dropped before decoding.
      The default (False) keeps them, so they decode to NUL characters.

  Returns:
    The decoded string.
  """
  if skip_padding:
    input_tokens=[token for token in input_tokens if token!=0]
  return ''.join(map(chr, input_tokens))

In [8]:
# Size of the embedding tables: one row per code point from 0 (the padding
# value written by encode) up to and including ord('z') == 122, i.e. 123 rows.
# Characters with a larger code point ('{', '~', non-ASCII, ...) would index
# past the end of an embedding of this size.
l_tokenizer = 1 + encode('z', padding=False)[0]

## Now, let's create the dataset object

In [9]:
class DTIA_Dataset(Dataset):
    """Dataset view over a drug-target DataFrame.

    Each item is a dict holding the encoded drug string, the encoded target
    string and the label, all as tensors.

    Args:
        df: Frame with 'Drug', 'Target' and 'Y' columns.
        drug_max_length: Length the drug tokens are truncated/padded to.
        target_max_length: Length the target tokens are truncated/padded to.
    """

    def __init__(self, df,drug_max_length,target_max_length):
        self.df = df
        self.dml=drug_max_length
        self.tml=target_max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Encode the row at position ``idx`` and return it as tensors."""
        record = self.df.iloc[idx]
        drug_ids = encode(record['Drug'], max_length=self.dml)
        target_ids = encode(record['Target'], max_length=self.tml)
        return {
            'input_drug': torch.tensor(drug_ids),
            'input_target': torch.tensor(target_ids),
            'y': torch.tensor(record['Y'], dtype=torch.float32),
        }

In [10]:
# Fixed sequence lengths used when truncating/padding the inputs; the
# BatchNorm1d layers of the two encoders are sized from these values too.
dml = 45   # drug length (the median Drug_l in the describe() table is 45)
tml = 700  # target length (the median Target_l is 629, the mean about 731)

In [11]:
# Take the lengths from dml/tml instead of repeating the literals: the model's
# BatchNorm1d layers are sized from the same constants, so the two must agree.
# Note that `test_p` wraps the *validation* split.
train_p=DTIA_Dataset(split['train'],drug_max_length=dml,target_max_length=tml)
test_p=DTIA_Dataset(split['valid'],drug_max_length=dml,target_max_length=tml)

In [12]:
# Batches of 32 samples; only the training loader shuffles its data.
train_loader = DataLoader(train_p, batch_size=32, shuffle=True)
test_loader = DataLoader(test_p, batch_size=32, shuffle=False)

## Now, the model

In [13]:
class Drug_Model(torch.nn.Module):
    """Encode a batch of tokenised drug strings into one vector per sample.

    Pipeline: embedding -> linear -> batch norm -> dropout -> ReLU -> mean
    over the sequence axis.

    Args:
        embed_dim: Width of the token embeddings.
        dim1: Width of the linear projection and of the returned vectors.
        vocab_size: Number of rows in the embedding table. ``None`` (default)
            falls back to the notebook-level ``l_tokenizer``.
        max_length: Length of the padded input sequences. ``None`` (default)
            falls back to the notebook-level ``dml``.
    """

    def __init__(self, embed_dim=64,dim1=32,vocab_size=None,max_length=None):
        super().__init__()
        # Resolve the notebook globals lazily so the class can also be built
        # with explicit sizes.
        if vocab_size is None:
            vocab_size = l_tokenizer
        if max_length is None:
            max_length = dml
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.dense1 = nn.Linear(embed_dim, dim1)
        # For a 3-D input BatchNorm1d normalises over dim 1, which here is the
        # sequence position, so inputs must have exactly `max_length` tokens.
        self.bn1=nn.BatchNorm1d(max_length)
        self.dp1=nn.Dropout(0.2)

    def forward(self, input_ids):
        """Map token ids of shape (batch, max_length) to (batch, dim1)."""
        input_embeddings = self.embeddings(input_ids)
        dense1_output = F.relu(self.dp1(self.bn1(self.dense1(input_embeddings))))
        # Unmasked mean over the sequence axis: padded positions count too.
        mean_outputs = dense1_output.mean(dim=1)
        return mean_outputs


class Target_Model(torch.nn.Module):
    """Encode a batch of tokenised target sequences into one vector per sample.

    Pipeline: embedding -> linear -> batch norm -> dropout -> ReLU -> mean
    over the sequence axis.

    Args:
        embed_dim: Width of the token embeddings.
        dim1: Width of the linear projection and of the returned vectors.
        vocab_size: Number of rows in the embedding table. ``None`` (default)
            falls back to the notebook-level ``l_tokenizer``.
        max_length: Length of the padded input sequences. ``None`` (default)
            falls back to the notebook-level ``tml``.
    """

    def __init__(self, embed_dim=64,dim1=32,vocab_size=None,max_length=None):
        super().__init__()
        # Resolve the notebook globals lazily so the class can also be built
        # with explicit sizes.
        if vocab_size is None:
            vocab_size = l_tokenizer
        if max_length is None:
            max_length = tml
        self.embeddings = nn.Embedding(vocab_size, embed_dim)
        self.dense1 = nn.Linear(embed_dim, dim1)
        # For a 3-D input BatchNorm1d normalises over dim 1, which here is the
        # sequence position, so inputs must have exactly `max_length` tokens.
        self.bn1=nn.BatchNorm1d(max_length)
        self.dp1=nn.Dropout(0.2)

    def forward(self, input_ids):
        """Map token ids of shape (batch, max_length) to (batch, dim1)."""
        input_embeddings = self.embeddings(input_ids)
        dense1_output = F.relu(self.dp1(self.bn1(self.dense1(input_embeddings))))
        # Unmasked mean over the sequence axis: padded positions count too.
        mean_outputs = dense1_output.mean(dim=1)
        return mean_outputs



class DTIA_Model(torch.nn.Module):
    """Two-tower regressor over a (drug, target) pair.

    The drug and target token ids go through their own encoders; the two
    encodings are concatenated along the last axis and mapped to a single
    value by a linear layer.

    Args:
        embed_dim: Embedding width passed to both encoders.
        dim1: Output width of each encoder; the head takes ``2 * dim1`` inputs.
    """

    def __init__(self, embed_dim=64,dim1=32):
        super().__init__()
        self.drug_encoder = Drug_Model(embed_dim=embed_dim, dim1=dim1)
        self.target_encoder = Target_Model(embed_dim=embed_dim, dim1=dim1)
        self.dense = nn.Linear(2 * dim1, 1)

    def forward(self, drug_inputs,target_inputs):
        """Return one prediction per sample, with a trailing axis of size 1."""
        encoded_pair = (
            self.drug_encoder(drug_inputs),
            self.target_encoder(target_inputs),
        )
        return self.dense(torch.concat(encoded_pair, dim=-1))

In [14]:
# Seed torch's global RNG before the parameters are created so that the
# initial weights are the same after every kernel restart.
torch.manual_seed(42)
model=DTIA_Model(embed_dim=32,dim1=32)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

DTIA_Model(
  (drug_encoder): Drug_Model(
    (embeddings): Embedding(123, 32)
    (dense1): Linear(in_features=32, out_features=32, bias=True)
    (bn1): BatchNorm1d(45, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dp1): Dropout(p=0.2, inplace=False)
  )
  (target_encoder): Target_Model(
    (embeddings): Embedding(123, 32)
    (dense1): Linear(in_features=32, out_features=32, bias=True)
    (bn1): BatchNorm1d(700, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (dp1): Dropout(p=0.2, inplace=False)
  )
  (dense): Linear(in_features=64, out_features=1, bias=True)
)

## Now, training.

In [17]:
def train(num_epochs=30):
    """Train the notebook-level ``model`` with an MSE objective.

    Reads the globals ``model``, ``device``, ``train_loader`` and
    ``test_loader``. After each epoch the model is evaluated on
    ``test_loader``; whenever that loss improves on the best value so far,
    the weights are saved to ``best_model.pt``. One summary line is printed
    per epoch.

    Args:
        num_epochs: Number of passes over ``train_loader``.
    """
    criterion= nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    best_test_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_total=0
        test_loss=0.0
        test_total=0

        for batch in tqdm.tqdm(train_loader):
            input_drug=batch['input_drug'].to(device)
            input_target=batch['input_target'].to(device)
            y=batch['y'].to(device).reshape(-1)
            optimizer.zero_grad()
            # The model returns (batch, 1) while the labels are (batch,).
            # Flatten both: otherwise MSELoss broadcasts them to
            # (batch, batch) and compares every prediction with every label.
            outputs = model(input_drug,input_target).reshape(-1)
            loss = criterion(outputs,y)
            loss.backward()
            optimizer.step()
            # Weight each batch mean by its size so that a shorter final
            # batch does not distort the epoch average.
            train_loss += loss.item()*outputs.size(0)
            train_total+=outputs.size(0)
        avg_train_loss = train_loss / train_total

        model.eval()
        with torch.no_grad():
            for batch in test_loader:
                input_drug=batch['input_drug'].to(device)
                input_target=batch['input_target'].to(device)
                y=batch['y'].to(device).reshape(-1)
                outputs = model(input_drug,input_target).reshape(-1)
                loss = criterion(outputs,y)
                test_loss += loss.item()*outputs.size(0)
                test_total+=outputs.size(0)
        avg_test_loss = test_loss / test_total

        # Keep only the weights with the lowest evaluation loss seen so far.
        if avg_test_loss < best_test_loss:
            best_test_loss = avg_test_loss
            torch.save(model.state_dict(), "best_model.pt")
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_test_loss:.4f}")

In [None]:
# Run the full training loop (30 epochs, the function's default).
train(num_epochs=30)

## FUTURE WORK


1.   Add figures for the training history and some evaluation metrics
2.   Improve the tokenizer
3.   Improve the model

