In [None]:
!pip install PyTDC
!pip install datasets

!pip install transformers

In [2]:
import tqdm
import numpy as np
import pandas as pd
import plotly.express as px
from tdc.multi_pred import DTI
import plotly.figure_factory as ff

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [None]:
data = DTI(name = 'Davis')
split = data.get_split()

## Data Provision

In [4]:
new_data=data.get_data()
# new_data['weights']=1/new_data['Y']
# new_data=new_data.sample(frac=0.5,weights='weights')
y_max=new_data['Y'].max()
y_min=new_data['Y'].min()
new_data['Y']=(new_data['Y']-y_min)/(y_max-y_min)
# new_data['Y']=-np.log10(new_data['Y']/1e9)
new_data=new_data.sample(frac=1)
new_data=new_data.reset_index(drop=True)

## Some basic Analysis

In [5]:
new_data['Drug_l']=new_data.Drug.apply(len)
new_data['Target_l']=new_data.Target.apply(len)
new_data[['Drug_l','Target_l','Y']].describe()

Unnamed: 0,Drug_l,Target_l,Y
count,25772.0,25772.0,25772.0
mean,54.176471,744.849604,0.755811
std,10.962637,372.813592,0.399002
min,32.0,244.0,0.0
25%,45.0,479.0,0.377499
50%,53.0,632.0,1.0
75%,61.25,954.0,1.0
max,81.0,2549.0,1.0


In [6]:
new_data[['Drug',"Target"]].tail()

Unnamed: 0,Drug,Target
25767,Cc1nc(Nc2ncc(C(=O)Nc3c(C)cccc3Cl)s2)cc(N2CCN(C...,MELRVGNKYRLGRKIGSGSFGDIYLGANIASGEEVAIKLECVKTKH...
25768,CCN1CCN(Cc2ccc(NC(=O)Nc3ccc(Oc4cc(NC)ncn4)cc3)...,MQSKVLLAVALWLCVETRAASVGLPSVSLDLPRLSIQKDILTIKAN...
25769,CCN(CC)CCNC(=O)c1c(C)[nH]c(C=C2C(=O)Nc3ccc(F)c...,MRHSKRTYCPDWDDKDWDYGKWRSSSSHKRRKRSHSSAQENKRCKY...
25770,COc1c(Cl)cc2c([nH]c3cnccc32)c1NC(=O)c1cccnc1C,MAAAAGNRASSSGFPGARATSPEAGGGGGALKASSAPAAAAGLLRE...
25771,Cc1cc(Nc2cc(N3CCN(C)CC3)nc(Sc3ccc(NC(=O)C4CC4)...,MRKGVLKDPEIADLFYKDDPEELFIGLHEIGHGSFGAVYFATNAHT...


In [7]:
fig = px.histogram(new_data['Y'], nbins=200,marginal="box")

# Show the plot
fig.show()

## Let's create the tokenizer

In [8]:
def tokenize(input_string):
  return [ord(char) for char in input_string]
def encode(input_string,max_length=128,padding=True):
  tokens=tokenize(input_string)
  if len(tokens)>max_length:
    tokens=tokens[:max_length]
  if (len(tokens)<max_length) & padding:
    tokens.extend([0 for _ in range(max_length-len(tokens))])
  return tokens
def decode(input_tokens):
  return ''.join(list(map(lambda x:chr(x), input_tokens)))

In [9]:
l_tokenizer=encode('z',padding=False)[0]+1

## Now, let's create the dataset object

In [10]:
class DTIA_Dataset(Dataset):
    def __init__(self, df,drug_max_length,target_max_length):
        self.df = df
        self.dml=drug_max_length
        self.tml=target_max_length
    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row=self.df.iloc[idx]
        input_drug=torch.tensor(encode(row['Drug'],max_length=self.dml))
        input_target=torch.tensor(encode(row['Target'],max_length=self.tml))
        y=torch.tensor(row['Y'],dtype=torch.float32)
        return {'input_drug':input_drug,
         'input_target': input_target,
         'y':y}

In [11]:
dml=45
tml=700

In [12]:
l=int(new_data.shape[0]*0.8)
train_p=DTIA_Dataset(new_data[:l],drug_max_length=45,target_max_length=700)
test_p=DTIA_Dataset(new_data[l:],drug_max_length=45,target_max_length=700)

In [13]:
train_loader=DataLoader(train_p,batch_size=32,shuffle=True)
test_loader=DataLoader(test_p,batch_size=32)

## Now a model

In [15]:


class Drug_Model(torch.nn.Module):
    def __init__(self, embed_dim=16,dim1=32):
        super(Drug_Model, self).__init__()
        self.embeddings = nn.Embedding(l_tokenizer, embed_dim)
        self.conv1=nn.Conv1d(embed_dim,dim1,kernel_size=4)
        self.bn1=nn.BatchNorm1d(dim1)

        self.conv2=nn.Conv1d(dim1,dim1*2,kernel_size=4)
        self.bn2=nn.BatchNorm1d(dim1*2)

        self.conv3=nn.Conv1d(dim1*2,dim1*4,kernel_size=4)
        self.bn3=nn.BatchNorm1d(dim1*4)

        self.fpool=nn.MaxPool1d(36)

    def forward(self, input_ids):
        x = self.embeddings(input_ids)
        x=torch.transpose(x,1,2)
        x=F.relu((self.bn1(self.conv1(x))))
        x=F.relu((self.bn2(self.conv2(x))))
        x=F.relu((self.bn3(self.conv3(x))))
        x = torch.squeeze(self.fpool(x),2)
        return x


class Target_Model(torch.nn.Module):
    def __init__(self, embed_dim=16,dim1=32):
        super(Target_Model, self).__init__()
        self.embeddings = nn.Embedding(l_tokenizer, embed_dim)
        self.conv1=nn.Conv1d(embed_dim,dim1,kernel_size=4)
        self.bn1=nn.BatchNorm1d(dim1)

        self.conv2=nn.Conv1d(dim1,dim1*2,kernel_size=4)
        self.bn2=nn.BatchNorm1d(dim1*2)

        self.conv3=nn.Conv1d(dim1*2,dim1*4,kernel_size=4)
        self.bn3=nn.BatchNorm1d(dim1*4)

        self.fpool=nn.MaxPool1d(691)

    def forward(self, input_ids):
        x = self.embeddings(input_ids)
        x=torch.transpose(x,1,2)
        x=F.relu((self.bn1(self.conv1(x))))
        x=F.relu((self.bn2(self.conv2(x))))
        x=F.relu((self.bn3(self.conv3(x))))
        x = torch.squeeze(self.fpool(x),2)
        return x

class DTIA_Model(torch.nn.Module):
    def __init__(self, embed_dim=32,dim1=64):
        super(DTIA_Model, self).__init__()
        self.drug_encoder=Drug_Model(embed_dim=embed_dim,dim1=dim1)
        self.target_encoder=Target_Model(embed_dim=embed_dim,dim1=dim1)
        # self.drug_encoder=Drug_Model_att(embed_dim=embed_dim,dim1=dim1)
        # self.target_encoder=Target_Model_att(embed_dim=embed_dim,dim1=dim1)
        self.dense1 = nn.Linear(dim1*8, 1024)
        # self.dense1 = nn.Linear(embed_dim*2, 1024)
        self.dp1=nn.Dropout(0.1)
        self.dense2 = nn.Linear(1024, 1024)
        self.dp2=nn.Dropout(0.1)
        self.dense3 = nn.Linear(1024, 512)
        self.dp3=nn.Dropout(0.1)
        self.fdense = nn.Linear(512, 1)


    def forward(self, drug_inputs,target_inputs):
        drug_encoded=self.drug_encoder(drug_inputs)
        target_encoded=self.target_encoder(target_inputs)
        x=torch.concat((drug_encoded,target_encoded),dim=-1)
        x = F.relu(self.dp1(self.dense1(x)))
        x = F.relu(self.dp2(self.dense2(x)))
        x = F.relu(self.dp3(self.dense3(x)))
        x=self.fdense(x)

        return x

In [16]:

model=DTIA_Model(embed_dim=128,dim1=64)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device='cpu'
model.to(device)

DTIA_Model(
  (drug_encoder): Drug_Model(
    (embeddings): Embedding(123, 128)
    (conv1): Conv1d(128, 64, kernel_size=(4,), stride=(1,))
    (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv1d(64, 128, kernel_size=(4,), stride=(1,))
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv3): Conv1d(128, 256, kernel_size=(4,), stride=(1,))
    (bn3): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (fpool): MaxPool1d(kernel_size=36, stride=36, padding=0, dilation=1, ceil_mode=False)
  )
  (target_encoder): Target_Model(
    (embeddings): Embedding(123, 128)
    (conv1): Conv1d(128, 64, kernel_size=(4,), stride=(1,))
    (bn1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (conv2): Conv1d(64, 128, kernel_size=(4,), stride=(1,))
    (bn2): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=T

## Now, training.

In [17]:
def train(num_epochs=10):
    criterion= nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    best_test_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_total=0.0
        test_loss=0.0
        test_total=0.0

        for batch in tqdm.tqdm(train_loader):
            input_drug=batch['input_drug'].to(device)
            input_target=batch['input_target'].to(device)
            y=batch['y'].to(device)
            optimizer.zero_grad()
            outputs = model(input_drug,input_target).view(-1,)
            loss = criterion(outputs,y)
            train_loss += loss.item()
            train_total+=outputs.size(0)
            loss.backward()
            optimizer.step()
        avg_train_loss = train_loss / len(train_loader)



        model.eval()
        with torch.no_grad():
            for batch in test_loader:
                input_drug=batch['input_drug'].to(device)
                input_target=batch['input_target'].to(device)
                y=batch['y'].to(device)
                outputs = model(input_drug,input_target).view(-1,)
                loss = criterion(outputs,y)
                test_total+=outputs.size(0)
                test_loss += loss.item()
        avg_test_loss = test_loss / len(test_loader)

        if avg_test_loss < best_test_loss:
            best_test_loss = avg_test_loss
            torch.save(model.state_dict(), "best_model.pt")
        print(f"Epoch {epoch+1}/{num_epochs}: "
              f"Train Loss: {avg_train_loss:.4f}, Valid Loss: {avg_test_loss:.4f}")

In [18]:
train()

100%|██████████| 645/645 [00:25<00:00, 25.39it/s]


Epoch 1/10: Train Loss: 0.1642, Valid Loss: 0.1144


100%|██████████| 645/645 [00:17<00:00, 36.52it/s]


Epoch 2/10: Train Loss: 0.1218, Valid Loss: 0.1210


100%|██████████| 645/645 [00:17<00:00, 36.92it/s]


Epoch 3/10: Train Loss: 0.1164, Valid Loss: 0.1062


100%|██████████| 645/645 [00:18<00:00, 35.19it/s]


Epoch 4/10: Train Loss: 0.1147, Valid Loss: 0.1100


100%|██████████| 645/645 [00:18<00:00, 34.83it/s]


Epoch 5/10: Train Loss: 0.1118, Valid Loss: 0.1054


100%|██████████| 645/645 [00:18<00:00, 35.46it/s]


Epoch 6/10: Train Loss: 0.1087, Valid Loss: 0.1064


100%|██████████| 645/645 [00:17<00:00, 37.12it/s]


Epoch 7/10: Train Loss: 0.1055, Valid Loss: 0.0999


100%|██████████| 645/645 [00:17<00:00, 37.13it/s]


Epoch 8/10: Train Loss: 0.1028, Valid Loss: 0.1049


100%|██████████| 645/645 [00:17<00:00, 36.57it/s]


Epoch 9/10: Train Loss: 0.1011, Valid Loss: 0.0971


100%|██████████| 645/645 [00:17<00:00, 36.94it/s]


Epoch 10/10: Train Loss: 0.1007, Valid Loss: 0.0980


## VISUAL EVALUATION

In [19]:
ys=[]
preds=[]
for batch in tqdm.tqdm(test_loader):
    input_drug=batch['input_drug'].to(device)
    input_target=batch['input_target'].to(device)
    y=batch['y'].to(device)
    outputs = model(input_drug,input_target).view(-1,)
    ys.extend(list(y.detach().cpu().numpy()))
    preds.extend(list(outputs.detach().cpu().numpy()))
temp_df=pd.DataFrame({'ys':ys,'preds':preds})
px.scatter(temp_df,x='ys',y='preds')

100%|██████████| 162/162 [00:03<00:00, 40.85it/s]


In [20]:
temp_df.corr()

Unnamed: 0,ys,preds
ys,1.0,0.615017
preds,0.615017,1.0


## FUTURE WORK


1.   Add figures for training history and some evaluations
2.   Improve the tokenizer
3.   Improve the model

