In [1]:
import pandas as pd
import torch
from torch import tensor
from torch.utils.data import DataLoader,TensorDataset,SequentialSampler,RandomSampler
import numpy as np
from sklearn.model_selection import train_test_split

RAMDOMSEED = 42

In [2]:
train = pd.read_parquet('../data/train/train.parquet')

In [3]:
# Apply log transformation to the target variable
train['IC50 (nM)'] = np.log1p(train['IC50 (nM)'])  

In [6]:
train['IC50 (nM)'].describe()

count    1.687796e+06
mean     6.071394e+00
std      3.347597e+00
min      0.000000e+00
25%      3.367296e+00
50%      6.021023e+00
75%      8.810012e+00
max      3.223619e+01
Name: IC50 (nM), dtype: float64

In [7]:
from sklearn.preprocessing import StandardScaler
def create_tensor_dataset(df, seq_col, mol_col, target_col, batch_size=1024):
    # Convert columns to tensors
    seq_scaler = StandardScaler()
    mol_scaler = StandardScaler()
    seq_tensor = tensor(seq_scaler.fit_transform(np.array(df[seq_col].tolist()))).to(torch.float32)
    mol_tensor = tensor(mol_scaler.fit_transform(np.array(df[mol_col].tolist()))).to(torch.float32)
    target_tensor = tensor(np.array(df[target_col])).to(torch.float32)
    
    # Create TensorDataset
    dataset = TensorDataset(mol_tensor, seq_tensor, target_tensor)
    
    # Create DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    return dataloader

In [8]:
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(train, test_size=0.2, random_state=RAMDOMSEED)

In [9]:
# Create DataLoaders for training and validation sets
train_dataloader = create_tensor_dataset(train_df, seq_col='proteins', mol_col='smiles', target_col='IC50 (nM)')
val_dataloader = create_tensor_dataset(val_df, seq_col='proteins', mol_col='smiles', target_col='IC50 (nM)')

In [10]:
step_mol, step_seq, step_label = next(iter(train_dataloader))

In [None]:
# 768 SMILES  embeddings
# 320 proteins embeddings

In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class resBlock(nn.Module):
    def __init__(self, in_channels, out_channels, use_conv1=False, strides=1, dropout=0.3):
        super().__init__()
        
        self.process = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=strides, padding=1),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(out_channels)
        )
        
        if use_conv1:
            self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=strides)
        else:
            self.conv1 = None
        
    def forward(self, x):
        left = self.process(x)
        right = x if self.conv1 is None else self.conv1(x)
        
        return F.relu(left + right)

class cnnModule(nn.Module):
    def __init__(self, in_channel, out_channel, hidden_channel=256, dropout=0.3):  # Increased hidden_channel
        super().__init__()
        
        self.head = nn.Sequential(
            nn.Conv1d(in_channel, hidden_channel, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm1d(hidden_channel),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.MaxPool1d(2)
        )
        
        self.cnn = nn.Sequential(
            resBlock(hidden_channel, out_channel, use_conv1=True, strides=1),
            resBlock(out_channel, out_channel, strides=1),
            resBlock(out_channel, out_channel, strides=1),
            resBlock(out_channel, out_channel, strides=1), 
            resBlock(out_channel, out_channel, strides=1),
            resBlock(out_channel, out_channel, strides=1),  # Added extra resBlock
            resBlock(out_channel, out_channel, strides=1)   # Added extra resBlock
        )
    
    def forward(self, x):
        x = self.head(x)
        x = self.cnn(x)
        
        return x

class DeepLPI(nn.Module):
    def __init__(self, molshape, seqshape, dropout=0.3):
        super().__init__()
        
        self.molshape = molshape
        self.seqshape = seqshape

        self.molcnn = cnnModule(1, 128)  # Increased out_channel
        self.seqcnn = cnnModule(1, 128)  # Increased out_channel
        
        self.pool = nn.AvgPool1d(5, stride=3)
        self.lstm = nn.LSTM(128, 128, num_layers=6, batch_first=True, bidirectional=True)  # Increased hidden size and num_layers
        
        self.mlp = nn.Sequential(
            nn.Linear(round(((molshape + seqshape) / 4 - 2) * 2 / 3) * 128, 8192),  # Increased hidden units
            nn.BatchNorm1d(8192),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            
            nn.Linear(8192, 4096),  # Increased hidden units
            nn.BatchNorm1d(4096),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            
            nn.Linear(4096, 1024),  # Added extra layer
            nn.BatchNorm1d(1024),
            nn.ReLU(),
            nn.Dropout(p=0.2),
            
            nn.Linear(1024, 1),
        )

    def forward(self, mol, seq):
        mol = self.molcnn(mol.reshape(-1, 1, self.molshape))
        seq = self.seqcnn(seq.reshape(-1, 1, self.seqshape))
        
        # Concatenate along the sequence dimension
        x = torch.cat((mol, seq), 2)
        x = self.pool(x)
        
        # Reshape for LSTM
        x = x.reshape(-1, round(((self.molshape + self.seqshape) / 4 - 2) / 3), 128)
        x, _ = self.lstm(x)
        
        # Fully connected layer
        x = self.mlp(x.flatten(1))
        
        x = x.flatten()
        
        return x

# Example usage
molshape = 768
seqshape = 320
model = DeepLPI(molshape, seqshape)

In [13]:
def initialize_weights(m):
    if isinstance(m, nn.Conv1d):
        nn.init.kaiming_uniform_(m.weight.data,nonlinearity='relu')
        if m.bias is not None:
            nn.init.constant_(m.bias.data, 0)

    elif isinstance(m, nn.BatchNorm1d):
        nn.init.constant_(m.weight.data, 1)
        nn.init.constant_(m.bias.data, 0)

    elif isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight.data)
        nn.init.constant_(m.bias.data, 0)

In [14]:
model.apply(initialize_weights)

DeepLPI(
  (molcnn): cnnModule(
    (head): Sequential(
      (0): Conv1d(1, 256, kernel_size=(7,), stride=(2,), padding=(3,), bias=False)
      (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Dropout(p=0.3, inplace=False)
      (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (cnn): Sequential(
      (0): resBlock(
        (process): Sequential(
          (0): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): Dropout(p=0.3, inplace=False)
          (4): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
          (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (conv1): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
      )
      (1): resBlock(
  

In [17]:
import torch
from tqdm import tqdm

def train_loop(model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, num_epochs):
    model = model.to("cuda")
    
    for epoch in range(num_epochs):
        model.train()
        loop_loss = 0
        train_loader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        
        for step, batch in enumerate(train_loader):
            step_mol, step_seq, step_label = batch
            step_mol, step_seq, step_label = step_mol.to("cuda"), step_seq.to("cuda"), step_label.to("cuda")
            # print("REAL ICp50", step_label)
            # print(step_mol.shape)
            # print(step_seq.shape)
            
            optimizer.zero_grad()
            logits = model(step_mol, step_seq)
            # print("MODELS OUTPUT", logits)
            # print("REAL ICp50", step_label)
            loss = loss_fn(logits, step_label)
            loss.backward()
            optimizer.step()
            loop_loss += loss.item()

            if step % 20 == 0:
                train_loader.set_postfix(loss=loss.item())
        
        avg_train_loss = loop_loss / len(train_dataloader)
        
        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for step, batch in enumerate(val_dataloader):
                step_mol, step_seq, step_label = batch
                step_mol, step_seq, step_label = step_mol.to("cuda"), step_seq.to("cuda"), step_label.to("cuda")
                
                logits = model(step_mol, step_seq)
                loss = loss_fn(logits, step_label)
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_dataloader)
        
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")
        
        # Step the scheduler based on validation loss
        scheduler.step(avg_val_loss)

In [18]:
import torch.optim as optim

loss_fn = nn.SmoothL1Loss()  # Using Huber Loss as an example
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.8, min_lr=0.00001)

In [19]:
num_epochs = 1000
train_loop(model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, num_epochs)

                                                                            

Epoch 1/1000 - Train Loss: 1.7944 - Val Loss: 1.6954


                                                                            

Epoch 2/1000 - Train Loss: 1.5251 - Val Loss: 1.5219


                                                                            

Epoch 3/1000 - Train Loss: 1.4077 - Val Loss: 1.5552


                                                                          

KeyboardInterrupt: 

In [20]:
# Save the model parameters
save_path = '../models/candidates/deeplpi_model.pth'
torch.save(model.state_dict(), save_path)
print(f"Model parameters saved to {save_path}")

Model parameters saved to ../models/candidates/deeplpi_model.pth


In [22]:
# Load the model parameters
model = DeepLPI(molshape, seqshape)
model.load_state_dict(torch.load(save_path))
model = model.to("cuda")
model.eval()

DeepLPI(
  (molcnn): cnnModule(
    (head): Sequential(
      (0): Conv1d(1, 256, kernel_size=(7,), stride=(2,), padding=(3,), bias=False)
      (1): BatchNorm1d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Dropout(p=0.3, inplace=False)
      (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (cnn): Sequential(
      (0): resBlock(
        (process): Sequential(
          (0): Conv1d(256, 128, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): Dropout(p=0.3, inplace=False)
          (4): Conv1d(128, 128, kernel_size=(3,), stride=(1,), padding=(1,))
          (5): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (conv1): Conv1d(256, 128, kernel_size=(1,), stride=(1,))
      )
      (1): resBlock(
  