RNN model testing

In [1]:
!pip install rdkit-pypi -qqq
!git clone https://github.com/molecularsets/moses.git


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m73.8 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into 'moses'...
remote: Enumerating objects: 1957, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 1957 (delta 0), reused 2 (delta 0), pack-reused 1953 (from 1)[K
Receiving objects: 100% (1957/1957), 164.05 MiB | 14.59 MiB/s, done.
Resolving deltas: 100% (1068/1068), done.
Filtering content: 100% (68/68), 323.72 MiB | 51.49 MiB/s, done.


In [2]:
# prompt: open train.csv

import pandas as pd

# Read the training CSV shipped in the cloned repository, then show the first
# rows followed by the (rows, columns) shape.
df = pd.read_csv('moses/data/train.csv')
print(df.head(), df.shape, sep='\n')

                                   SMILES  SPLIT
0  CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1  train
1    CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1  train
2     Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO  train
3        Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C  train
4          CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O  train
(1584663, 2)


In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

COC(=O)C1=C(C)NC(=O)NC1c1cc(C(F)(F)F)ccc1OC)c1nncn1CC)c1cccnc1C1CC1C)C(C)CC#N)C(C)C)C1CC1COCC1CC1)c1c
CCOc1ccccc1OC(=O)Cc1ccon1)c1cnn2cccnc12)c1ccco1)C1CC1C)C1CC1C)C1CC1CC1=O)CC1(C)CCOCC1)C1CC1OCCO1C)C(C
CCOc1ccccc1NS(=O)(=O)c1cc(C)sc1C(=O)OC)C(C)C1=O)c1ccccc1OCC(N)=O)c1cccnc1NC(C)=O)C(C)C)C(C)C)c1ccco1)
COc1ccc(OC)c(Cc2nc3cccc(C(=O)OC)c3[nH]2)c1OC(F)F)C(N)=O)c1ccccc1F)C1CC1CC1CC1CC1=O)CC1(C)Cl)c1nc[nH]n
CCn1cc(S(=O)(=O)N2CCCCC2)cc1C(=O)OCC#CCN1CCOCC1C1CC1)C(N)=O)c1ccccc1F)C(C)CC#N)C(C)(C)C)C(C)CC#N)C(C)
C(#CCC(=O)N1CCCC(C(F)(F)F)C1)c1ccccc1F)C1CCC1(=O)NC1CC1)C1CC1CC1=O)c1ccncc1F)C1CC1C1CC1(C)CC1F)C(=O)O
COc1cc(CNC(=O)Nc2cnn(CC3CCCO3)c2)ccn1C1CC1CC1CC1)C(N)=O)c1ccccc1F)C1CC1C1CC1)C(C)C)c1nscc1C#N)C(N)=O)
COc1ccc(NC(=O)NCc2cc(C3CC3)cn2C)cn1)c1ccccc1F)c1ccncc1F)C1CC1C)C1CC1(C)OC(C)(C)O1)C1CC1CC1C)C1(C)CC1(
CCCCc1ccc(C2CCCN2C(=O)C(C)NS(C)(=O)=O)cc1OCC(C)CC)c1COC(C)=O)c1ccccc1Cl)c1cccnc1OCCCC#N)C(C)C#N)C1CC1
COC(=O)CC1CCCCN1C(=O)Nc1cccnc1-n1cccn1)c1ccccc1F)C1CC1C1CC1)c1ccncc1Cl)c1ccncc1F)C

In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.*')  # Suppress RDKit warnings

# Function to add start and end tokens
def process_smiles(smiles_list):
    """Wrap every SMILES string in the sequence markers used for training.

    Each string is prefixed with ``^`` (start marker) and suffixed with ``$``
    (end marker).

    Args:
        smiles_list: Iterable of SMILES strings.

    Returns:
        A new list with one wrapped string per input, in the same order.
    """
    wrapped = []
    for smi in smiles_list:
        wrapped.append("^" + smi + "$")
    return wrapped

# Create character dictionaries including special tokens
def create_vocab(smiles_list):
    """Build the character-level vocabulary for a collection of strings.

    Every distinct character gets an index starting at 1, assigned in sorted
    order; index 0 is reserved for the ``'<PAD>'`` entry.

    Args:
        smiles_list: Iterable of strings (already wrapped with start/end
            markers if those should be part of the vocabulary).

    Returns:
        A tuple ``(char2idx, idx2char, vocab_size)`` where ``idx2char`` is the
        inverse of ``char2idx`` and ``vocab_size`` counts the padding entry.
    """
    alphabet = sorted(set(''.join(smiles_list)))
    char2idx = dict(zip(alphabet, range(1, len(alphabet) + 1)))
    char2idx['<PAD>'] = 0  # reserved padding slot

    idx2char = {}
    for symbol, index in char2idx.items():
        idx2char[index] = symbol

    return char2idx, idx2char, len(char2idx)

# Enhanced tokenization
def tokenize(smiles, char2idx):
    """Convert a string into a list of vocabulary indices, one per character.

    Characters missing from ``char2idx`` are mapped to 0, the same index the
    vocabulary uses for padding.
    """
    indices = []
    for symbol in smiles:
        indices.append(char2idx.get(symbol, 0))
    return indices

def detokenize(tokens, idx2char):
    """Turn a sequence of vocabulary indices back into a string.

    Index 0 (padding) is skipped, and indices absent from ``idx2char``
    contribute nothing to the result.
    """
    pieces = []
    for token in tokens:
        if token == 0:
            continue
        pieces.append(idx2char.get(token, ''))
    return ''.join(pieces)

# Improved dataset class
class SMILESDataset(Dataset):
    """Dataset of (input, target) index tensors for next-character prediction.

    For a string with token indices ``t[0..L-1]`` the input is ``t[:-1]`` and
    the target is ``t[1:]``. Strings longer than ``seq_len`` characters are cut
    to ``seq_len``; shorter ones are right-padded with 0 so that both tensors
    have length ``seq_len - 1``.
    """

    def __init__(self, smiles_list, char2idx, seq_len=120):
        self.smiles_list, self.char2idx, self.seq_len = smiles_list, char2idx, seq_len

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        # Slicing leaves strings that already fit untouched.
        text = self.smiles_list[idx][:self.seq_len]
        ids = tokenize(text, self.char2idx)

        # Shift by one position: the model sees ids[i] and must predict ids[i + 1].
        inputs, targets = ids[:-1], ids[1:]

        # Right-pad with index 0 up to seq_len - 1 entries (an empty filler
        # when the sequence is already full length).
        filler = [0] * (self.seq_len - 1 - len(inputs))
        x = torch.tensor(inputs + filler, dtype=torch.long)
        y = torch.tensor(targets + filler, dtype=torch.long)
        return x, y

# Enhanced RNN model
class RNNGenerator(nn.Module):
    """Character-level LSTM language model: embedding -> LSTM -> linear head.

    Args:
        vocab_size: Number of entries in the vocabulary (index 0 is treated as
            padding by the embedding layer).
        embedding_dim: Size of each character embedding.
        hidden_dim: Size of the LSTM hidden/cell state.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability applied to the embeddings, between LSTM
            layers, and to the LSTM output.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512, num_layers=3, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            # Inter-layer dropout only exists with 2+ layers; passing a
            # non-zero value for a single layer just triggers a warning.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x, hidden=None):
        """Run the model on a batch of index sequences.

        Args:
            x: Long tensor of token indices, batch dimension first.
            hidden: Optional ``(h, c)`` LSTM state to continue from. When
                omitted, a zero state is created via :meth:`init_hidden`.

        Returns:
            ``(logits, hidden)`` where ``logits`` has one score per vocabulary
            entry for every position and ``hidden`` is the updated LSTM state.
        """
        emb = self.dropout(self.embedding(x))

        if hidden is None:
            hidden = self.init_hidden(x.size(0), x.device)

        out, hidden = self.rnn(emb, hidden)
        logits = self.fc(self.dropout(out))
        return logits, hidden

    def init_hidden(self, batch_size, device):
        """Create a zero ``(h0, c0)`` state for ``batch_size`` sequences.

        The tensors are allocated directly on ``device`` and use the dtype of
        the model's weights, so the state stays compatible after
        ``model.double()`` / ``model.half()``.
        """
        dtype = self.embedding.weight.dtype
        shape = (self.num_layers, batch_size, self.hidden_dim)
        h0 = torch.zeros(shape, dtype=dtype, device=device)
        c0 = torch.zeros(shape, dtype=dtype, device=device)
        return (h0, c0)

# Sample a new molecule
def generate_molecule(model, char2idx, idx2char, device, max_len=100, temperature=0.8):
    """Sample one string from the model, character by character.

    Generation starts from the ``^`` start marker and stops when the ``$`` end
    marker is drawn or after ``max_len`` sampling steps. The padding index and
    the start marker are excluded from sampling: neither is ever a training
    target, and both would be stripped from the returned string anyway while
    still being fed back into the model.

    Note: the model is switched to eval mode and left there.

    Args:
        model: Callable ``model(tokens, hidden) -> (logits, hidden)``.
        char2idx: Character-to-index mapping; must contain ``'^'``.
        idx2char: Index-to-character mapping (inverse of ``char2idx``).
        device: Device on which the input tensors are created.
        max_len: Maximum number of characters to sample.
        temperature: Softmax temperature; must be positive. Lower values make
            sampling greedier.

    Returns:
        ``(smiles, is_valid)``: the sampled string without start/end markers,
        and the result of ``is_valid_smiles`` for it.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    start_token = char2idx['^']
    # Indices that must never be produced by the sampler.
    banned = sorted({start_token, char2idx.get('<PAD>', 0)})

    model.eval()
    with torch.no_grad():
        current = torch.tensor([[start_token]], dtype=torch.long, device=device)
        hidden = None
        result = [start_token]

        for _ in range(max_len):
            output, hidden = model(current, hidden)
            # Division creates a fresh tensor, so the in-place mask below
            # never touches the model's own output.
            logits = output[:, -1, :] / temperature
            logits[:, banned] = float('-inf')
            probs = torch.softmax(logits, dim=-1)

            # Sample from the probability distribution
            next_token = torch.multinomial(probs, 1).item()
            result.append(next_token)

            # Stop if end token is generated
            if idx2char[next_token] == '$':
                break

            current = torch.tensor([[next_token]], dtype=torch.long, device=device)

    generated = detokenize(result, idx2char)
    # Remove start/end tokens for validation
    clean_smiles = generated.replace('^', '').replace('$', '')
    return clean_smiles, is_valid_smiles(clean_smiles)

# Validate SMILES with RDKit
def is_valid_smiles(smiles):
    """Return True if ``smiles`` parses with RDKit into a non-empty molecule.

    Empty or whitespace-only input (and non-string input) is rejected up
    front: RDKit parses an empty string into a molecule object with zero
    atoms rather than returning None, which would otherwise make an empty
    generation count as "valid".
    """
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None and mol.GetNumAtoms() > 0

# Training with teacher forcing and validation
def train_model(model, dataloader, optimizer, criterion, device, epochs,
                char2idx, idx2char, save_path='smiles_rnn_model.pth'):
    """Train ``model`` with teacher forcing and checkpoint the best epoch.

    After every epoch the mean training loss is printed and, whenever it
    improves on the best value so far, the model's ``state_dict`` is written
    to ``save_path``. Every fifth epoch (and after the last one) ten molecules
    are sampled and their validity is printed.

    Args:
        model: Network returning ``(logits, hidden)`` for a batch of indices.
        dataloader: Iterable of ``(x, y)`` index-tensor batches.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Loss taking ``(N, vocab)`` logits and ``(N,)`` targets.
        device: Device the batches are moved to.
        epochs: Number of passes over ``dataloader``.
        char2idx: Character-to-index mapping (used for sampling).
        idx2char: Index-to-character mapping (used for sampling).
        save_path: Destination file for the best checkpoint.

    Raises:
        ValueError: If ``dataloader`` yields no batches.
    """
    best_loss = float('inf')

    for epoch in range(epochs):
        # Sampling at the end of an epoch leaves the model in eval mode,
        # so training mode is re-enabled at the start of every epoch.
        model.train()
        epoch_loss = 0.0
        n_batches = 0

        for x, y in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            logits, _ = model(x)
            # Flatten (batch, seq, vocab) -> (batch*seq, vocab). The class
            # dimension is read from the logits themselves so the loss cannot
            # silently disagree with the model's real output width.
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)  # Gradient clipping
            optimizer.step()

            epoch_loss += loss.item()
            n_batches += 1

        if n_batches == 0:
            raise ValueError("dataloader yielded no batches; cannot train on an empty dataset")

        avg_loss = epoch_loss / n_batches
        print(f"Epoch {epoch+1} loss: {avg_loss:.4f}")

        # Generate and validate some molecules
        if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
            valid_count = 0
            n_samples = 10
            print("\nGenerating sample molecules:")
            for _ in range(n_samples):
                mol, valid = generate_molecule(model, char2idx, idx2char, device)
                validity = "✓" if valid else "✗"
                print(f"{mol} {validity}")
                if valid:
                    valid_count += 1
            print(f"Validity: {valid_count}/{n_samples} ({valid_count/n_samples*100:.1f}%)")

        # Save best model (judged on training loss; no held-out set is used here)
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), save_path)
            print(f"Model saved to {save_path}")

# Main training pipeline
def run_training(dataframe, smiles_column='SMILES', batch_size=128, epochs=20,
                 seq_len=120, lr=0.001):
    """Build the vocabulary, dataset and model from a DataFrame, then train.

    Args:
        dataframe: DataFrame holding the training strings.
        smiles_column: Name of the column containing SMILES strings.
        batch_size: Mini-batch size for the DataLoader.
        epochs: Number of training epochs.
        seq_len: Maximum sequence length (including start/end markers) passed
            to ``SMILESDataset``.
        lr: Learning rate for the Adam optimizer.

    Returns:
        ``(model, char2idx, idx2char)``.

    Raises:
        ValueError: If the column contains no usable (non-missing) entries.
    """
    # Missing values would otherwise reach process_smiles() as float NaN and
    # break the string concatenation there.
    smiles_list = dataframe[smiles_column].dropna().astype(str).tolist()
    if not smiles_list:
        raise ValueError(f"Column '{smiles_column}' contains no SMILES strings to train on")

    # Process data
    processed_smiles = process_smiles(smiles_list)
    char2idx, idx2char, vocab_size = create_vocab(processed_smiles)

    # Create dataset and data loader
    dataset = SMILESDataset(processed_smiles, char2idx, seq_len=seq_len)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Setup model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RNNGenerator(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Index 0 is padding, so padded positions do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Train
    train_model(model, loader, optimizer, criterion, device, epochs=epochs,
                char2idx=char2idx, idx2char=idx2char)

    return model, char2idx, idx2char

# Generate a batch of molecules
def generate_molecules(model, char2idx, idx2char, device, n=25, temperature=0.8):
    """Sample until ``n`` unique valid molecules are collected.

    At most ``5 * n`` samples are drawn, so fewer than ``n`` molecules may be
    returned.

    Args:
        model, char2idx, idx2char, device: Forwarded to ``generate_molecule``.
        n: Number of unique valid molecules requested.
        temperature: Sampling temperature forwarded to ``generate_molecule``.

    Returns:
        ``(valid_mols, validity_rate)`` where ``valid_mols`` lists the unique
        valid strings in the order they were first sampled and
        ``validity_rate`` is the fraction of *all* samples that were valid
        (duplicates of an already-collected molecule still count as valid).
        The rate is 0 when no sample was drawn.
    """
    valid_mols = []
    seen = set()  # constant-time duplicate check alongside the ordered list
    attempts = 0
    valid_attempts = 0
    max_attempts = n * 5  # Try up to 5x the requested number

    while len(valid_mols) < n and attempts < max_attempts:
        smiles, valid = generate_molecule(model, char2idx, idx2char, device, temperature=temperature)
        attempts += 1
        if not valid:
            continue

        valid_attempts += 1
        if smiles not in seen:
            seen.add(smiles)
            valid_mols.append(smiles)

    validity_rate = valid_attempts / attempts if attempts > 0 else 0.0
    return valid_mols, validity_rate

In [6]:
# Main execution script
if __name__ == "__main__":
    # Load data
    df = pd.read_csv('moses/data/train.csv')
    print("Dataset information:")
    print(df.head())
    print(f"Dataset shape: {df.shape}")

    # Check if 'SMILES' column exists, adjust if needed
    smiles_column = 'SMILES'
    if smiles_column not in df.columns:
        # Fall back to the first column whose first non-missing value contains
        # characters typical of SMILES notation.
        smiles_column = None
        for col in df.columns:
            candidates = df[col].dropna()
            if candidates.empty:
                continue
            # str() keeps the check safe for numeric columns, whose values
            # cannot be iterated character by character.
            if any(c in '()[]=' for c in str(candidates.iloc[0])):
                smiles_column = col
                print(f"Using column '{smiles_column}' for SMILES data")
                break
        if smiles_column is None:
            raise ValueError("Could not find a column containing SMILES strings")

    # Train the model
    print(f"\nTraining model on {df.shape[0]} SMILES strings...")
    model, char2idx, idx2char = run_training(df, smiles_column=smiles_column)

    # Generate and evaluate new molecules on whatever device the trained
    # model actually lives on, instead of re-deriving it.
    device = next(model.parameters()).device
    print("\nGenerating new molecules...")
    molecules, validity_rate = generate_molecules(model, char2idx, idx2char, device, n=20)

    print(f"\nGenerated {len(molecules)} valid molecules with {validity_rate*100:.1f}% validity rate")
    print("\nSample valid molecules:")
    for i, mol in enumerate(molecules[:10], 1):
        print(f"{i}. {mol}")

    # Optional: Save to CSV
    if molecules:
        output_df = pd.DataFrame({'generated_smiles': molecules})
        output_path = 'generated_molecules.csv'
        output_df.to_csv(output_path, index=False)
        print(f"\nSaved generated molecules to {output_path}")

Dataset information:
                                   SMILES  SPLIT
0  CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1  train
1    CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1  train
2     Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO  train
3        Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C  train
4          CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O  train
Dataset shape: (1584663, 2)

Training model on 1584663 SMILES strings...


Epoch 1/20:  12%|█▏        | 1463/12381 [00:59<07:22, 24.65it/s]


KeyboardInterrupt: 