In [1]:
import pandas as pd
import torch
from torch import tensor
from torch.utils.data import DataLoader,TensorDataset,SequentialSampler,RandomSampler
import numpy as np
from sklearn.model_selection import train_test_split

RAMDOMSEED = 42

In [2]:
train = pd.read_parquet('../data/train/train.parquet')

In [3]:
# Apply log transformation to the target variable + conver the measurament into micromolars
train['IC50 (nM)'] = np.log1p((train['IC50 (nM)']) / 1000)  

In [4]:
train['IC50 (nM)'].describe()

count    1.687796e+06
mean     1.183513e+00
std      1.617825e+00
min      0.000000e+00
25%      2.761517e-02
50%      3.442987e-01
75%      2.041220e+00
max      2.532844e+01
Name: IC50 (nM), dtype: float64

In [None]:
def create_tensor_dataset(df, seq_col, mol_col, target_col, batch_size=512):
    
    # Make Tensors
    seq_tensor = tensor(np.array(df[seq_col].tolist())).to(torch.float32)
    mol_tensor = tensor(np.array(df[mol_col].tolist())).to(torch.float32)
    target_tensor = tensor(np.array(df[target_col])).to(torch.float32)
    
    # Create TensorDataset
    dataset = TensorDataset(mol_tensor, seq_tensor, target_tensor)
    
    # Create DataLoader
    dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    
    return dataloader

In [5]:
# Split the dataset into training and validation sets
train_df, val_df = train_test_split(train, test_size=0.2, random_state=RAMDOMSEED)

In [6]:
# Create DataLoaders for training and validation sets
train_dataloader = create_tensor_dataset(train_df, seq_col='proteins', mol_col='smiles', target_col='IC50 (nM)')
val_dataloader = create_tensor_dataset(val_df, seq_col='proteins', mol_col='smiles', target_col='IC50 (nM)')

In [7]:
step_mol, step_seq, step_label = next(iter(train_dataloader))

In [None]:
# 768 SMILES  embeddings
# 320 proteins embeddings

In [8]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class resBlock(nn.Module):
    def __init__(self, in_channels, out_channels, use_conv1=False, strides=1, dropout=0.4):
        super().__init__()
        
        self.process = nn.Sequential(
            nn.Conv1d(in_channels, out_channels, kernel_size=3, stride=strides, padding=1),
            nn.BatchNorm1d(out_channels),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.Conv1d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm1d(out_channels)
        )
        
        if use_conv1:
            self.conv1 = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=strides)
        else:
            self.conv1 = None
        
    def forward(self, x):
        left = self.process(x)
        right = x if self.conv1 is None else self.conv1(x)
        
        return F.relu(left + right)

class cnnModule(nn.Module):
    def __init__(self, in_channel, out_channel, hidden_channel=128, dropout=0.4):
        super().__init__()
        
        self.head = nn.Sequential(
            nn.Conv1d(in_channel, hidden_channel, 7, stride=2, padding=3, bias=False),
            nn.BatchNorm1d(hidden_channel),
            nn.ReLU(inplace=True),
            nn.Dropout(p=dropout),
            nn.MaxPool1d(2)
        )
        
        self.cnn = nn.Sequential(
            resBlock(hidden_channel, out_channel, use_conv1=True, strides=1),
            resBlock(out_channel, out_channel, strides=1),
            resBlock(out_channel, out_channel, strides=1)
        )
    
    def forward(self, x):
        x = self.head(x)
        x = self.cnn(x)
        
        return x

class DeepLPI(nn.Module):
    def __init__(self, molshape, seqshape, dropout=0.4):
        super().__init__()
        
        self.molshape = molshape
        self.seqshape = seqshape

        self.molcnn = cnnModule(1, 64)  # Adjusted out_channel
        self.seqcnn = cnnModule(1, 64)  # Adjusted out_channel
        
        self.pool = nn.AvgPool1d(5, stride=3)
        self.lstm = nn.LSTM(64, 64, num_layers=3, batch_first=True, bidirectional=True)  # Adjusted hidden size and num_layers
        
        self.mlp = nn.Sequential(
            nn.Linear(round(((molshape + seqshape) / 4 - 2) * 2 / 3) * 64, 4096),  # Adjusted hidden units
            nn.BatchNorm1d(4096),
            nn.ReLU(),
            nn.Dropout(p=dropout),
            
            nn.Linear(4096, 2048),  # Adjusted hidden units
            nn.BatchNorm1d(2048),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            
            nn.Linear(2048, 512),  # Adjusted hidden units
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(p=0.3),
            
            nn.Linear(512, 1),
        )

    def forward(self, mol, seq):
        mol = self.molcnn(mol.reshape(-1, 1, self.molshape))
        seq = self.seqcnn(seq.reshape(-1, 1, self.seqshape))
        
        # Concatenate along the sequence dimension
        x = torch.cat((mol, seq), 2)
        x = self.pool(x)
        
        # Reshape for LSTM
        batch_size = x.size(0)
        x = x.reshape(batch_size, -1, 64)
        x, _ = self.lstm(x)
        
        # Fully connected layer
        x = self.mlp(x.flatten(1))
        
        x = x.flatten()
        
        return x

# Example usage
molshape = 768
seqshape = 320
model = DeepLPI(molshape, seqshape)

In [10]:
def initialize_weights(m):
    if isinstance(m, nn.Conv1d):
        nn.init.kaiming_uniform_(m.weight.data,nonlinearity='relu')
        if m.bias is not None:
            nn.init.constant_(m.bias.data, 0)

    elif isinstance(m, nn.BatchNorm1d):
        nn.init.constant_(m.weight.data, 1)
        nn.init.constant_(m.bias.data, 0)

    elif isinstance(m, nn.Linear):
        nn.init.kaiming_uniform_(m.weight.data)
        nn.init.constant_(m.bias.data, 0)

In [11]:
model.apply(initialize_weights)

DeepLPI(
  (molcnn): cnnModule(
    (head): Sequential(
      (0): Conv1d(1, 128, kernel_size=(7,), stride=(2,), padding=(3,), bias=False)
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Dropout(p=0.4, inplace=False)
      (4): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )
    (cnn): Sequential(
      (0): resBlock(
        (process): Sequential(
          (0): Conv1d(128, 64, kernel_size=(3,), stride=(1,), padding=(1,))
          (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (2): ReLU(inplace=True)
          (3): Dropout(p=0.4, inplace=False)
          (4): Conv1d(64, 64, kernel_size=(3,), stride=(1,), padding=(1,))
          (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        )
        (conv1): Conv1d(128, 64, kernel_size=(1,), stride=(1,))
      )
      (1): resBlock(
        

In [12]:
import torch
from tqdm import tqdm

def train_loop(model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, num_epochs, save_path):
    model = model.to("cuda")
    
    for epoch in range(num_epochs):
        model.train()
        loop_loss = 0
        train_loader = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", leave=False)
        
        for step, batch in enumerate(train_loader):
            step_mol, step_seq, step_label = batch
            step_mol, step_seq, step_label = step_mol.to("cuda"), step_seq.to("cuda"), step_label.to("cuda")
            
            optimizer.zero_grad()
            logits = model(step_mol, step_seq)
            loss = loss_fn(logits, step_label)
            loss.backward()
            optimizer.step()
            loop_loss += loss.item()

            if step % 20 == 0:
                train_loader.set_postfix(loss=loss.item())
        
        avg_train_loss = loop_loss / len(train_dataloader)
        
        # Validation step
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for step, batch in enumerate(val_dataloader):
                step_mol, step_seq, step_label = batch
                step_mol, step_seq, step_label = step_mol.to("cuda"), step_seq.to("cuda"), step_label.to("cuda")
                
                logits = model(step_mol, step_seq)
                loss = loss_fn(logits, step_label)
                val_loss += loss.item()
        
        avg_val_loss = val_loss / len(val_dataloader)
        
        print(f"Epoch {epoch+1}/{num_epochs} - Train Loss: {avg_train_loss:.4f} - Val Loss: {avg_val_loss:.4f}")
        
        # Step the scheduler based on validation loss
        scheduler.step(avg_val_loss)
        
        # Save the model parameters
        torch.save(model.state_dict(), save_path)
        print(f"Model parameters saved to {save_path}")

In [11]:
import torch.optim as optim

loss_fn = nn.SmoothL1Loss()  # Using Huber Loss 
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=0.0001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', factor=0.8, min_lr=0.00001)

In [14]:
num_epochs = 10
save_path = "../models/candidates/deeplpi_model.pth"
train_loop(model, train_dataloader, val_dataloader, loss_fn, optimizer, scheduler, num_epochs, save_path)

                                                                           

Epoch 1/10 - Train Loss: 0.6189 - Val Loss: 0.5237
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 2/10 - Train Loss: 0.5079 - Val Loss: 0.4884
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 3/10 - Train Loss: 0.4596 - Val Loss: 0.4314
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 4/10 - Train Loss: 0.4241 - Val Loss: 0.4113
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 5/10 - Train Loss: 0.3930 - Val Loss: 0.3910
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 6/10 - Train Loss: 0.3652 - Val Loss: 0.3778
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 7/10 - Train Loss: 0.3388 - Val Loss: 0.3488
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 8/10 - Train Loss: 0.3166 - Val Loss: 0.3373
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                           

Epoch 9/10 - Train Loss: 0.2950 - Val Loss: 0.3271
Model parameters saved to ../models/candidates/deeplpi_model.pth


                                                                            

Epoch 10/10 - Train Loss: 0.2762 - Val Loss: 0.3136
Model parameters saved to ../models/candidates/deeplpi_model.pth


In [9]:
# Load the model parameters
save_path = "../models/production/deeplpi_model_v2.pth"
model = DeepLPI(molshape, seqshape)
model.load_state_dict(torch.load(save_path))
model = model.to("cuda")

In [12]:
import torch
from sklearn.metrics import r2_score

def validate_model(model, val_dataloader, loss_fn):
    model.eval()
    val_loss = 0
    all_labels = []
    all_predictions = []
    
    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):
            step_mol, step_seq, step_label = batch
            step_mol, step_seq, step_label = step_mol.to("cuda"), step_seq.to("cuda"), step_label.to("cuda")
            
            print(step_mol.shape)
            print(step_seq.shape)
            logits = model(step_mol, step_seq)
            loss = loss_fn(logits, step_label)
            val_loss += loss.item()
            
            all_labels.extend(step_label.cpu().numpy())
            all_predictions.extend(logits.cpu().numpy())
    
    avg_val_loss = val_loss / len(val_dataloader)
    
    # Compute R² metric
    r2 = r2_score(all_labels, all_predictions)
    
    print(f"Validation Loss: {avg_val_loss:.4f} - R²: {r2:.4f}")
    
    return avg_val_loss, r2

avg_val_loss, r2 = validate_model(model, val_dataloader, loss_fn)

torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024, 320])
torch.Size([1024, 768])
torch.Size([1024

KeyboardInterrupt: 

In [None]:
# Validation Loss: 0.4063 - R²: 0.4774 version 1
# Validation Loss: 0.3160 - R²: 0.6218 version 2
# Validation Loss: 0.3205 - R²: 0.6154 version 3
# Add Normalization into embeddings ??? Validation Loss: 0.3135 - R²: 0.6276