In [65]:
import torch
import polars as pl
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.metrics import mean_squared_error, r2_score
import logging

In [2]:
df = pl.read_csv("../ADME.csv")

In [3]:
df.head(5)

Internal ID,Vendor ID,SMILES,CollectionName,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg)
str,str,str,str,f64,f64,f64,f64,f64,f64
"""Mol1""","""317714313""","""CNc1cc(Nc2cccn(-c3ccccn3)c2=O)…","""emolecules""",0.675687,1.493167,0.089905,0.991226,0.518514,1.392169
"""Mol2""","""324056965""","""CCOc1cc2nn(CCC(C)(C)O)cc2cc1NC…","""emolecules""",0.675687,1.04078,0.550228,0.099681,0.268344,1.02792
"""Mol3""","""304005766""","""CN(c1ncc(F)cn1)[C@H]1CCCNC1""","""emolecules""",0.675687,-0.358806,,2.0,2.0,1.02792
"""Mol4""","""194963090""","""CC(C)(Oc1ccc(-c2cnc(N)c(-c3ccc…","""emolecules""",0.675687,1.026662,1.657056,-1.158015,-1.403403,1.02792
"""Mol5""","""324059015""","""CC(C)(O)CCn1cc2cc(NC(=O)c3cccc…","""emolecules""",0.99638,1.010597,,1.015611,1.092264,1.629093


In [4]:
for column in df.columns:
    if df[column].dtype.is_numeric():
        df = df.with_columns(pl.col(column).fill_null(df[column].mean()))
    else:
        df = df.with_columns(pl.col(column).fill_null(df[column].mode()[0]))

In [5]:
df.null_count()

Internal ID,Vendor ID,SMILES,CollectionName,LOG HLM_CLint (mL/min/kg),LOG MDR1-MDCK ER (B-A/A-B),LOG SOLUBILITY PH 6.8 (ug/mL),LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound),LOG PLASMA PROTEIN BINDING (RAT) (% unbound),LOG RLM_CLint (mL/min/kg)
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0


In [6]:
df.columns

['Internal ID',
 'Vendor ID',
 'SMILES',
 'CollectionName',
 'LOG HLM_CLint (mL/min/kg)',
 'LOG MDR1-MDCK ER (B-A/A-B)',
 'LOG SOLUBILITY PH 6.8 (ug/mL)',
 'LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound)',
 'LOG PLASMA PROTEIN BINDING (RAT) (% unbound)',
 'LOG RLM_CLint (mL/min/kg)']

In [7]:
df = df.rename({"LOG HLM_CLint (mL/min/kg)":"log_hlm_clint", "LOG MDR1-MDCK ER (B-A/A-B)":"log_mdr1_mdck", "LOG SOLUBILITY PH 6.8 (ug/mL)":"log_solubility_ph", "LOG PLASMA PROTEIN BINDING (HUMAN) (% unbound)":"log_plasma_protein_human", "LOG PLASMA PROTEIN BINDING (RAT) (% unbound)":"log_plasma_protein_rat", "LOG RLM_CLint (mL/min/kg)":"log_rlm_clint"})

In [8]:
df.columns

['Internal ID',
 'Vendor ID',
 'SMILES',
 'CollectionName',
 'log_hlm_clint',
 'log_mdr1_mdck',
 'log_solubility_ph',
 'log_plasma_protein_human',
 'log_plasma_protein_rat',
 'log_rlm_clint']

In [9]:
smiles = df['SMILES'].to_list()

In [10]:
# SMILES Encoding
vocab = list(set("".join(smiles)))  # Get unique characters from SMILES strings
char_to_idx = {c: i for i, c in enumerate(vocab)}

In [11]:
def encode_smiles(smiles_list):
    encoded_smiles = []
    for smi in smiles_list:
        encoded_smi = [char_to_idx[c] for c in smi]
        encoded_smiles.append(encoded_smi)
    return encoded_smiles

In [12]:
# Pad sequences to have the same length (required for batch processing)
def pad_sequences(sequences, max_length=None):
    if max_length is None:
        max_length = max(len(seq) for seq in sequences)
    padded_sequences = []
    for seq in sequences:
        padded_seq = seq + [0] * (max_length - len(seq))
        padded_sequences.append(padded_seq)
    return torch.tensor(padded_sequences, dtype=torch.long)

In [13]:
encoded_smiles = pad_sequences(encode_smiles(smiles))

In [14]:
encoded_smiles

tensor([[17, 26, 16,  ...,  0,  0,  0],
        [17, 17, 15,  ...,  0,  0,  0],
        [17, 26, 23,  ...,  0,  0,  0],
        ...,
        [26, 17, 23,  ...,  0,  0,  0],
        [17, 17, 17,  ...,  0,  0,  0],
        [17, 16, 11,  ...,  0,  0,  0]])

In [15]:
adme_values = torch.tensor(df[["log_hlm_clint", "log_mdr1_mdck", "log_solubility_ph", "log_plasma_protein_human", "log_plasma_protein_rat", "log_rlm_clint"]].to_numpy(), dtype=torch.float32)

In [16]:
adme_values

tensor([[ 0.6757,  1.4932,  0.0899,  0.9912,  0.5185,  1.3922],
        [ 0.6757,  1.0408,  0.5502,  0.0997,  0.2683,  1.0279],
        [ 0.6757, -0.3588,  1.2599,  2.0000,  2.0000,  1.0279],
        ...,
        [ 0.8638,  0.3978,  1.2599,  0.7657,  0.7642,  2.2562],
        [ 0.8814,  0.3978,  1.2599,  0.7657,  0.7642,  2.2562],
        [ 1.5071,  0.3978,  1.2599,  0.7657,  0.7642,  2.2562]])

In [17]:
class ADMEDataset(Dataset):
    def __init__(self, encoded_smiles, adme_values):
        self.encoded_smiles = encoded_smiles
        self.adme_values = adme_values

    def __len__(self):
        return len(self.encoded_smiles)

    def __getitem__(self, idx):
        return self.encoded_smiles[idx], self.adme_values[idx]

In [18]:
# Create the dataset
dataset = ADMEDataset(encoded_smiles, adme_values)

In [19]:
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

In [23]:
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32)  # No need to shuffle validation data

In [44]:
class ADMEMT_DNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim=10, hidden_dim=64, num_tasks=6):
        super(ADMEMT_DNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim // 2, batch_first=True)
        self.dropout = nn.Dropout(0.2)
        self.task_heads = nn.ModuleList([nn.Linear(hidden_dim // 2, 1) for _ in range(num_tasks)])

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm1(x)
        x, _ = self.lstm2(x)
        x = self.dropout(x)
        x = x[:, -1, :]  # Take the last hidden state for each sequence
        outputs = torch.cat([head(x) for head in self.task_heads], dim=1)  # Concatenate outputs along the feature dimension
        return outputs

In [45]:
model = ADMEMT_DNN(len(vocab))

In [46]:
# Loss function and optimizer
criterion = nn.MSELoss()  # Mean Squared Error loss for regression tasks
optimizer = torch.optim.Adam(model.parameters())

In [67]:
# Training loop
num_epochs = 100
num_tasks = 6
for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    epoch_loss = 0.0

    for smiles_batch, adme_batch in train_loader:
        optimizer.zero_grad()
        outputs = model(smiles_batch)
        outputs = outputs.view(-1, num_tasks)

        loss = 0
        for i in range(num_tasks):
            loss += criterion(outputs[:, i], adme_batch[:, i])

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()  # Accumulate loss for the epoch

    # Print and log average epoch loss
    avg_loss = epoch_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")
    logging.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}")

    # Validation step (optional, but recommended)
    model.eval()  # Set model to evaluation mode
    val_loss = 0.0
    with torch.no_grad():
        for smiles_batch, adme_batch in val_loader:
            outputs = model(smiles_batch)
            outputs = outputs.view(-1, num_tasks)

            loss = 0
            for i in range(num_tasks):
                loss += criterion(outputs[:, i], adme_batch[:, i])
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Validation Loss: {avg_val_loss:.4f}")
    logging.info(f"Validation Loss: {avg_val_loss:.4f}")

Epoch [1/100], Loss: 1.5672
Validation Loss: 1.6031
Epoch [2/100], Loss: 1.5663
Validation Loss: 1.6053
Epoch [3/100], Loss: 1.5681
Validation Loss: 1.6064
Epoch [4/100], Loss: 1.5691
Validation Loss: 1.6009
Epoch [5/100], Loss: 1.5639
Validation Loss: 1.6038
Epoch [6/100], Loss: 1.5586
Validation Loss: 1.6038
Epoch [7/100], Loss: 1.5602
Validation Loss: 1.5999
Epoch [8/100], Loss: 1.5599
Validation Loss: 1.6035
Epoch [9/100], Loss: 1.5643
Validation Loss: 1.6037
Epoch [10/100], Loss: 1.5620
Validation Loss: 1.6038
Epoch [11/100], Loss: 1.5665
Validation Loss: 1.6008
Epoch [12/100], Loss: 1.5544
Validation Loss: 1.6007
Epoch [13/100], Loss: 1.5645
Validation Loss: 1.6000
Epoch [14/100], Loss: 1.5620
Validation Loss: 1.6024
Epoch [15/100], Loss: 1.5663
Validation Loss: 1.6032
Epoch [16/100], Loss: 1.5650
Validation Loss: 1.6021
Epoch [17/100], Loss: 1.5616
Validation Loss: 1.6089
Epoch [18/100], Loss: 1.5575
Validation Loss: 1.6031
Epoch [19/100], Loss: 1.5605
Validation Loss: 1.6049
Ep

In [68]:
model.eval()  # Set the model to evaluation mode
total_loss = 0
all_predictions = []
all_targets = []

with torch.no_grad():  # Disable gradient calculation during evaluation
    for smiles_batch, adme_batch in val_loader:
        outputs = model(smiles_batch)
        outputs = outputs.view(-1, num_tasks)  # Reshape outputs

        # Calculate loss
        loss = 0
        for i in range(num_tasks):
            loss += criterion(outputs[:, i], adme_batch[:, i])
        total_loss += loss.item()

        # Store predictions and targets for later metric calculation
        all_predictions.append(outputs)
        all_targets.append(adme_batch)

# Calculate average loss over all batches
avg_loss = total_loss / len(val_loader)

# Concatenate predictions and targets for metric calculation
all_predictions = torch.cat(all_predictions, dim=0).numpy()
all_targets = torch.cat(all_targets, dim=0).numpy()

# Calculate metrics for each task
for i in range(num_tasks):
    mse = mean_squared_error(all_targets[:, i], all_predictions[:, i])
    r2 = r2_score(all_targets[:, i], all_predictions[:, i])
    print(f"Task {i + 1}: MSE = {mse:.4f}, R^2 = {r2:.4f}")

Task 1: MSE = 0.3589, R^2 = -0.0048
Task 2: MSE = 0.3294, R^2 = -0.0024
Task 3: MSE = 0.2689, R^2 = -0.0009
Task 4: MSE = 0.0412, R^2 = -0.0001
Task 5: MSE = 0.0312, R^2 = -0.0060
Task 6: MSE = 0.5192, R^2 = -0.0084


In [69]:
smile_string = "CC(=O)Oc1ccccc1C(=O)O"  # Example SMILES
encoded_smile = pad_sequences([[char_to_idx[c] for c in smile_string]])

In [70]:
with torch.no_grad():  # Disable gradient calculation
    output = model(encoded_smile)

In [71]:
output[0].tolist()

[0.8276425004005432,
 0.16862380504608154,
 0.8602256178855896,
 0.7115737199783325,
 0.7278763055801392,
 1.098250389099121]