<a href="https://colab.research.google.com/github/Luly7/RT/blob/main/CNN_RT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

SMILES to CNN model for Retention Time Prediction

In [1]:
# 1. Install if missing
!pip install torch torchvision pandas scikit-learn rdkit-pypi
!pip install numpy==1.23.5
# 2. Import libraries
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from rdkit import Chem
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# 3. Load your data
csv_path = '/content/drive/MyDrive/CS6480/output_rt_fixed_smiles_unique.csv'  # <-- change if needed
data = pd.read_csv(csv_path)

# 4. Build SMILES to integer encoding
# Simple atom-level encoding (you can make it smarter later)
def smiles_to_int(smiles, max_len=100):
    vocab = {ch: idx+1 for idx, ch in enumerate(sorted(set(''.join(smiles))))}  # Unique characters
    encoded = [vocab.get(ch, 0) for ch in smiles]
    padded = encoded + [0] * (max_len - len(encoded))
    return padded[:max_len]

X = np.array([smiles_to_int(smi) for smi in data['fixed_smiles']])
y = np.array(data['rt'], dtype=np.float32)

# 5. Create Dataset and DataLoader
class SmilesDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.float32)

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

dataset = SmilesDataset(X, y)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32)

# 6. Define CNN Model
class CNN_RT(nn.Module):
    def __init__(self, vocab_size, emb_dim=32):
        super(CNN_RT, self).__init__()
        self.embedding = nn.Embedding(vocab_size + 1, emb_dim, padding_idx=0)
        self.conv1 = nn.Conv1d(emb_dim, 64, kernel_size=5, padding=2)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=5, padding=2)
        self.pool = nn.AdaptiveAvgPool1d(1)
        self.fc = nn.Linear(128, 1)

    def forward(self, x):
        x = self.embedding(x)  # (batch, seq_len, emb_dim)
        x = x.permute(0, 2, 1)  # (batch, emb_dim, seq_len) for Conv1d
        x = torch.relu(self.conv1(x))
        x = torch.relu(self.conv2(x))
        x = self.pool(x).squeeze(-1)
        out = self.fc(x)
        return out.view(-1)

# 7. Setup
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = max([max(seq) for seq in X])
model = CNN_RT(vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()

# 8. Training loop
n_epochs = 10
for epoch in range(n_epochs):
    model.train()
    train_loss = 0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        preds = model(inputs)
        loss = loss_fn(preds, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    print(f"Epoch {epoch+1}: Train Loss = {train_loss / len(train_loader):.4f}")

# 9. Evaluation
model.eval()
preds = []
trues = []
with torch.no_grad():
    for inputs, targets in test_loader:
        inputs = inputs.to(device)
        preds.append(model(inputs).cpu().numpy())
        trues.append(targets.numpy())

preds = np.concatenate(preds)
trues = np.concatenate(trues)

rmse = np.sqrt(mean_squared_error(trues, preds))
mae = mean_absolute_error(trues, preds)
r2 = r2_score(trues, preds)

print(f"\nTest RMSE: {rmse:.2f}")
print(f"Test MAE: {mae:.2f}")
print(f"Test R² Score: {r2:.4f}")

# 10. Save results
results_df = pd.DataFrame({'True_RT': trues, 'Predicted_RT': preds})
results_df.to_csv('/content/drive/MyDrive/CS6480/CNN/retention_time_predictions_cnn.csv', index=False)
print("\nPredictions saved to CSV ✅")


Epoch 1: Train Loss = 60358.0840
Epoch 2: Train Loss = 37471.7269
Epoch 3: Train Loss = 34918.0839
Epoch 4: Train Loss = 33607.6936
Epoch 5: Train Loss = 32611.6877
Epoch 6: Train Loss = 31746.4482
Epoch 7: Train Loss = 30688.1310
Epoch 8: Train Loss = 29210.2449
Epoch 9: Train Loss = 27993.0444
Epoch 10: Train Loss = 26917.2859

Test RMSE: 162.17
Test MAE: 119.02
Test R² Score: 0.3803

Predictions saved to CSV ✅
