RNN model testing

In [1]:
!pip install rdkit-pypi -qqq
!git clone https://github.com/molecularsets/moses.git


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hCloning into 'moses'...
remote: Enumerating objects: 1957, done.[K
remote: Counting objects: 100% (4/4), done.[K
remote: Compressing objects: 100% (4/4), done.[K
remote: Total 1957 (delta 0), reused 2 (delta 0), pack-reused 1953 (from 1)[K
Receiving objects: 100% (1957/1957), 164.05 MiB | 40.32 MiB/s, done.
Resolving deltas: 100% (1068/1068), done.
Filtering content: 100% (68/68), 323.72 MiB | 34.47 MiB/s, done.


In [2]:
# prompt: open train.csv
# Load the MOSES training CSV from the repository cloned in the first cell.

import pandas as pd

# `df` is read again by the cell that builds `smiles_list`, so the name must stay.
df = pd.read_csv('moses/data/train.csv')
# Quick sanity check: first rows, then (rows, columns).
print(df.head())
print(df.shape)

                                   SMILES  SPLIT
0  CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1  train
1    CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1  train
2     Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO  train
3        Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C  train
4          CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O  train
(1584663, 2)


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Keep only the rows whose SPLIT column equals 'train' and extract the SMILES
# column as a plain Python list of strings.
smiles_list = df[df['SPLIT'] == 'train']['SMILES'].tolist()
print(len(smiles_list))

1584663


In [None]:
# Character-level vocabulary over every training SMILES string.
# Index 0 is reserved for padding, so real characters are numbered from 1.
all_chars = sorted(set(''.join(smiles_list)))
char2idx = dict(zip(all_chars, range(1, len(all_chars) + 1)))
idx2char = {index: char for char, index in char2idx.items()}
vocab_size = 1 + len(char2idx)  # the extra slot is the padding index

# Tokenization helper
def tokenize(smiles):
    """Map each character of ``smiles`` to its index in the global ``char2idx``.

    Raises:
        KeyError: If a character is not in the vocabulary.
    """
    indices = []
    for char in smiles:
        indices.append(char2idx[char])
    return indices

def detokenize(tokens):
    """Decode ``tokens`` with the global ``idx2char``, skipping padding (index 0)."""
    chars = (idx2char[token] for token in tokens if token != 0)
    return ''.join(chars)

# Dataset class
class SMILESDataset(Dataset):
    """Next-character prediction pairs built from tokenized SMILES strings.

    Strings whose length is ``seq_len`` or more are dropped at construction;
    the remaining ones are tokenized once, up front.
    """

    def __init__(self, smiles_list, seq_len=100):
        self.data = [tokenize(smiles) for smiles in smiles_list if len(smiles) < seq_len]
        self.seq_len = seq_len

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(x, y)`` where ``y`` is ``x`` shifted left by one token.

        Both tensors are right-padded with zeros up to ``seq_len - 1`` entries;
        sequences that already reach that length are returned unpadded.
        """
        tokens = self.data[idx]
        inputs = torch.tensor(tokens[:-1], dtype=torch.long)
        targets = torch.tensor(tokens[1:], dtype=torch.long)

        missing = (self.seq_len - 1) - len(inputs)
        if missing <= 0:
            return inputs, targets

        filler = torch.zeros(missing, dtype=torch.long)
        return torch.cat([inputs, filler]), torch.cat([targets, filler])

# RNN Model
class RNNGenerator(nn.Module):
    """Embedding -> stacked LSTM -> linear projection onto the vocabulary."""

    def __init__(self, vocab_size, embedding_dim=128, hidden_dim=256, num_layers=2):
        super().__init__()
        # Index 0 is declared as the padding index of the embedding table.
        self.embedding = nn.Embedding(
            num_embeddings=vocab_size,
            embedding_dim=embedding_dim,
            padding_idx=0,
        )
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token indices ``x``.

        ``hidden`` is handed to the LSTM unchanged; ``None`` lets the LSTM
        pick its own initial state.
        """
        sequence_out, next_hidden = self.rnn(self.embedding(x), hidden)
        return self.fc(sequence_out), next_hidden

# Hyperparameters
# SEQ_LEN is passed to SMILESDataset, which drops strings of that length or more.
SEQ_LEN = 100
BATCH_SIZE = 128
EPOCHS = 10
# Learning rate handed to Adam below.
LR = 0.001

dataset = SMILESDataset(smiles_list, seq_len=SEQ_LEN)
loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = RNNGenerator(vocab_size).to(device)
optimizer = optim.Adam(model.parameters(), lr=LR)
# ignore_index=0 keeps padded target positions out of the loss.
criterion = nn.CrossEntropyLoss(ignore_index=0)

# Training loop
for epoch in range(EPOCHS):
    model.train()
    # Running sum of the per-batch losses for this epoch.
    epoch_loss = 0
    for x, y in tqdm(loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        logits, _ = model(x)
        # Flatten so every sequence position is scored as one vocab_size-way prediction.
        loss = criterion(logits.view(-1, vocab_size), y.view(-1))
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
    # Mean of the batch losses (each batch weighs the same, whatever its size).
    print(f"Epoch {epoch+1} loss: {epoch_loss / len(loader):.4f}")

Epoch 1/10: 100%|██████████| 12381/12381 [02:47<00:00, 73.91it/s]


Epoch 1 loss: 0.6144


Epoch 2/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.30it/s]


Epoch 2 loss: 0.5418


Epoch 3/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.26it/s]


Epoch 3 loss: 0.5273


Epoch 4/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.33it/s]


Epoch 4 loss: 0.5199


Epoch 5/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.26it/s]


Epoch 5 loss: 0.5153


Epoch 6/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.32it/s]


Epoch 6 loss: 0.5121


Epoch 7/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.29it/s]


Epoch 7 loss: 0.5098


Epoch 8/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.36it/s]


Epoch 8 loss: 0.5081


Epoch 9/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.27it/s]


Epoch 9 loss: 0.5068


Epoch 10/10: 100%|██████████| 12381/12381 [02:46<00:00, 74.31it/s]

Epoch 10 loss: 0.5057





In [None]:
def sample(model, start_token='C', max_len=100, temperature=1.0):
    """Sample one SMILES string from ``model``, one character at a time.

    Relies on the notebook globals ``char2idx``, ``device`` and ``detokenize``.

    Args:
        model: Generator whose call returns ``(logits, hidden)``.
        start_token: Character that seeds generation; must be in ``char2idx``.
        max_len: Maximum number of characters sampled after the start token.
        temperature: Divisor applied to the logits before softmax; must be
            positive.

    Returns:
        The decoded string, start character included.

    Raises:
        ValueError: If ``temperature`` is not positive.
        KeyError: If ``start_token`` is not in the vocabulary.

    Note:
        Generation stops early only when the padding index 0 is sampled;
        otherwise it runs for the full ``max_len`` steps.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    model.eval()
    start_idx = char2idx[start_token]
    generated = [start_idx]
    current = torch.tensor([[start_idx]], dtype=torch.long, device=device)
    hidden = None

    # Inference only: without no_grad each step would extend the autograd
    # graph through the hidden state carried over from the previous step.
    with torch.no_grad():
        for _ in range(max_len):
            logits, hidden = model(current, hidden)
            probs = torch.softmax(logits[:, -1, :] / temperature, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1).item()
            if next_token == 0:
                break
            generated.append(next_token)
            current = torch.tensor([[next_token]], dtype=torch.long, device=device)
    return detokenize(generated)

# Example generation
# Draw ten samples using sample()'s default start token and temperature.
for _ in range(10):
    print(sample(model))

COC(=O)C1=C(C)NC(=O)NC1c1cc(C(F)(F)F)ccc1OC)c1nncn1CC)c1cccnc1C1CC1C)C(C)CC#N)C(C)C)C1CC1COCC1CC1)c1c
CCOc1ccccc1OC(=O)Cc1ccon1)c1cnn2cccnc12)c1ccco1)C1CC1C)C1CC1C)C1CC1CC1=O)CC1(C)CCOCC1)C1CC1OCCO1C)C(C
CCOc1ccccc1NS(=O)(=O)c1cc(C)sc1C(=O)OC)C(C)C1=O)c1ccccc1OCC(N)=O)c1cccnc1NC(C)=O)C(C)C)C(C)C)c1ccco1)
COc1ccc(OC)c(Cc2nc3cccc(C(=O)OC)c3[nH]2)c1OC(F)F)C(N)=O)c1ccccc1F)C1CC1CC1CC1CC1=O)CC1(C)Cl)c1nc[nH]n
CCn1cc(S(=O)(=O)N2CCCCC2)cc1C(=O)OCC#CCN1CCOCC1C1CC1)C(N)=O)c1ccccc1F)C(C)CC#N)C(C)(C)C)C(C)CC#N)C(C)
C(#CCC(=O)N1CCCC(C(F)(F)F)C1)c1ccccc1F)C1CCC1(=O)NC1CC1)C1CC1CC1=O)c1ccncc1F)C1CC1C1CC1(C)CC1F)C(=O)O
COc1cc(CNC(=O)Nc2cnn(CC3CCCO3)c2)ccn1C1CC1CC1CC1)C(N)=O)c1ccccc1F)C1CC1C1CC1)C(C)C)c1nscc1C#N)C(N)=O)
COc1ccc(NC(=O)NCc2cc(C3CC3)cn2C)cn1)c1ccccc1F)c1ccncc1F)C1CC1C)C1CC1(C)OC(C)(C)O1)C1CC1CC1C)C1(C)CC1(
CCCCc1ccc(C2CCCN2C(=O)C(C)NS(C)(=O)=O)cc1OCC(C)CC)c1COC(C)=O)c1ccccc1Cl)c1cccnc1OCCCC#N)C(C)C#N)C1CC1
COC(=O)CC1CCCCN1C(=O)Nc1cccnc1-n1cccn1)c1ccccc1F)C1CC1C1CC1)c1ccncc1Cl)c1ccncc1F)C

In [None]:
from rdkit import Chem

def is_valid_smiles(smiles):
    """Return True when RDKit parses ``smiles`` into a molecule with atoms.

    Args:
        smiles: Candidate SMILES text.

    Returns:
        ``False`` for non-string input, for empty or whitespace-only text,
        and for text RDKit cannot parse; ``True`` otherwise.
    """
    # RDKit turns the empty string into a molecule with zero atoms instead of
    # returning None, so blank input has to be rejected explicitly.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None and mol.GetNumAtoms() > 0

In [None]:
# Sample 100 strings from the trained model and keep those is_valid_smiles accepts.
generated = [sample(model) for _ in range(100)]
valid_smiles = [s for s in generated if is_valid_smiles(s)]

print(f"Generated: {len(generated)}")
print(f"Valid: {len(valid_smiles)}")
print("Some valid SMILES:")
# Show at most five of the accepted strings.
for s in valid_smiles[:5]:
    print(s)


Generated: 100
Valid: 0
Some valid SMILES:


[19:16:55] SMILES Parse Error: extra close parentheses while parsing: CCNC(=O)CSc1nnc(-c2cccnc2)n1C1CC1)c1cccc(C)c1F)NC1CC1)C1CC1C(C)C1CC1=O)CC1CC1O)C(O)C1CC1C)C1CC1C)C1CC
[19:16:55] SMILES Parse Error: Failed parsing SMILES 'CCNC(=O)CSc1nnc(-c2cccnc2)n1C1CC1)c1cccc(C)c1F)NC1CC1)C1CC1C(C)C1CC1=O)CC1CC1O)C(O)C1CC1C)C1CC1C)C1CC' for input: 'CCNC(=O)CSc1nnc(-c2cccnc2)n1C1CC1)c1cccc(C)c1F)NC1CC1)C1CC1C(C)C1CC1=O)CC1CC1O)C(O)C1CC1C)C1CC1C)C1CC'
[19:16:55] SMILES Parse Error: extra close parentheses while parsing: CCNC(=O)CN1CCN(C(=O)Nc2ccccc2F)CC1CC(C)O)C1CC1CCCO1)C1CCC1CCOCC1CC1)c1ccncc1C1CC1Cl)C(C)C#N)C1CC1C(=O
[19:16:55] SMILES Parse Error: Failed parsing SMILES 'CCNC(=O)CN1CCN(C(=O)Nc2ccccc2F)CC1CC(C)O)C1CC1CCCO1)C1CCC1CCOCC1CC1)c1ccncc1C1CC1Cl)C(C)C#N)C1CC1C(=O' for input: 'CCNC(=O)CN1CCN(C(=O)Nc2ccccc2F)CC1CC(C)O)C1CC1CCCO1)C1CCC1CCOCC1CC1)c1ccncc1C1CC1Cl)C(C)C#N)C1CC1C(=O'
[19:16:55] SMILES Parse Error: extra close parentheses while parsing: CC(=O)c1cnc2nc(-c3cccc4ccccc34)nn2c1C=COC(

In [None]:
# Persist the trained weights only (state_dict), not the full module object.
torch.save(model.state_dict(), "rnn_smiles_model.pt")

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import numpy as np
import pandas as pd
from rdkit import Chem, RDLogger
RDLogger.DisableLog('rdApp.*')  # Suppress RDKit warnings

# Function to add start and end tokens
def process_smiles(smiles_list):
    """Wrap every SMILES string in the start (``^``) and end (``$``) markers."""
    wrapped = []
    for smiles in smiles_list:
        wrapped.append("^" + smiles + "$")
    return wrapped

# Create character dictionaries including special tokens
def create_vocab(smiles_list):
    """Build character<->index lookups from the given strings.

    Returns:
        ``(char2idx, idx2char, vocab_size)``. Characters are numbered from 1
        in sorted order, index 0 belongs to the ``'<PAD>'`` entry, and
        ``vocab_size`` counts that entry as well.
    """
    alphabet = sorted(set(''.join(smiles_list)))
    char2idx = dict(zip(alphabet, range(1, len(alphabet) + 1)))
    char2idx['<PAD>'] = 0  # Padding token
    idx2char = {index: symbol for symbol, index in char2idx.items()}
    return char2idx, idx2char, len(char2idx)

# Enhanced tokenization
def tokenize(smiles, char2idx):
    """Convert ``smiles`` to a list of indices; unknown characters become 0."""
    lookup = char2idx.get
    return [lookup(char, 0) for char in smiles]

def detokenize(tokens, idx2char):
    """Decode ``tokens`` to text.

    Index 0 is skipped, and indices missing from ``idx2char`` contribute
    nothing to the result.
    """
    pieces = []
    for token in tokens:
        if token == 0:
            continue
        pieces.append(idx2char.get(token, ''))
    return ''.join(pieces)

# Improved dataset class
class SMILESDataset(Dataset):
    """SMILES strings tokenized on access for next-character prediction."""

    def __init__(self, smiles_list, char2idx, seq_len=120):
        self.smiles_list = smiles_list
        self.char2idx = char2idx
        self.seq_len = seq_len

    def __len__(self):
        return len(self.smiles_list)

    def __getitem__(self, idx):
        """Return ``(x, y)`` where ``y`` is ``x`` shifted left by one token.

        The string is cut to at most ``seq_len`` characters before
        tokenization, and both tensors are right-padded with zeros up to
        ``seq_len - 1`` entries.
        """
        # Slicing leaves strings that already fit untouched.
        smiles = self.smiles_list[idx][:self.seq_len]

        tokens = tokenize(smiles, self.char2idx)
        inputs = torch.tensor(tokens[:-1], dtype=torch.long)
        targets = torch.tensor(tokens[1:], dtype=torch.long)

        # Padding
        missing = (self.seq_len - 1) - len(inputs)
        if missing <= 0:
            return inputs, targets

        filler = torch.zeros(missing, dtype=torch.long)
        return torch.cat([inputs, filler]), torch.cat([targets, filler])

# Enhanced RNN model
class RNNGenerator(nn.Module):
    """Embedding -> dropout -> stacked LSTM -> dropout -> linear projection.

    Args:
        vocab_size: Number of token indices, padding index 0 included.
        embedding_dim: Width of the token embeddings.
        hidden_dim: Width of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        dropout: Probability used both between LSTM layers and by the
            dropout applied to the embeddings and LSTM outputs.
    """

    def __init__(self, vocab_size, embedding_dim=256, hidden_dim=512, num_layers=3, dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.rnn = nn.LSTM(
            embedding_dim,
            hidden_dim,
            num_layers,
            batch_first=True,
            dropout=dropout
        )
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

    def forward(self, x, hidden=None):
        """Return ``(logits, hidden)`` for a batch of token indices ``x``.

        When ``hidden`` is ``None`` a zero state from ``init_hidden`` is used,
        sized from the first dimension of ``x``.
        """
        emb = self.dropout(self.embedding(x))

        if hidden is None:
            hidden = self.init_hidden(x.size(0), x.device)

        out, hidden = self.rnn(emb, hidden)
        out = self.dropout(out)
        logits = self.fc(out)
        return logits, hidden

    def init_hidden(self, batch_size, device):
        """Return a zero ``(h0, c0)`` pair for ``batch_size`` sequences.

        The tensors are created directly on ``device`` and take the dtype of
        the model's weights, so a model converted with ``.double()`` or
        ``.half()`` receives a matching initial state.
        """
        shape = (self.num_layers, batch_size, self.hidden_dim)
        dtype = self.embedding.weight.dtype
        h0 = torch.zeros(shape, dtype=dtype, device=device)
        c0 = torch.zeros(shape, dtype=dtype, device=device)
        return (h0, c0)

# Sample a new molecule
def generate_molecule(model, char2idx, idx2char, device, max_len=100, temperature=0.8):
    """Sample one SMILES string from ``model`` and check it with RDKit.

    Args:
        model: Generator whose call returns ``(logits, hidden)``.
        char2idx: Character-to-index mapping; must contain ``'^'``.
        idx2char: Index-to-character mapping used for decoding.
        device: Device on which the input tensors are created.
        max_len: Maximum number of tokens sampled after the start token.
        temperature: Divisor applied to the logits before softmax; must be
            positive.

    Returns:
        ``(clean_smiles, is_valid)`` where ``clean_smiles`` has the start and
        end markers removed and ``is_valid`` is the result of
        ``is_valid_smiles``. If no end token is sampled within ``max_len``
        steps, the truncated string is still returned and validated.

    Raises:
        ValueError: If ``temperature`` is not positive.
        KeyError: If ``'^'`` is missing from ``char2idx``.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    # Start with the start token
    start_token = char2idx['^']
    end_token = char2idx.get('$')
    # Padding (index 0) and the start marker carry no SMILES content, so they
    # are excluded from sampling rather than being emitted and fed back in.
    blocked = sorted({0, start_token})

    model.eval()
    with torch.no_grad():
        current = torch.tensor([[start_token]], dtype=torch.long, device=device)
        hidden = None
        result = [start_token]

        for _ in range(max_len):
            output, hidden = model(current, hidden)
            output = output[:, -1, :] / temperature
            output[:, blocked] = float('-inf')
            probs = torch.softmax(output, dim=-1)

            # Sample from the probability distribution
            next_token = torch.multinomial(probs, 1).item()
            result.append(next_token)

            # Stop if end token is generated
            if next_token == end_token:
                break
            current = torch.tensor([[next_token]], dtype=torch.long, device=device)

    generated = detokenize(result, idx2char)
    # Remove start/end tokens for validation
    clean_smiles = generated.replace('^', '').replace('$', '')
    return clean_smiles, is_valid_smiles(clean_smiles)

# Validate SMILES with RDKit
def is_valid_smiles(smiles):
    """Return True when RDKit parses ``smiles`` into a molecule with atoms.

    Args:
        smiles: Candidate SMILES text.

    Returns:
        ``False`` for non-string input, for empty or whitespace-only text,
        and for text RDKit cannot parse; ``True`` otherwise.
    """
    # RDKit turns the empty string into a molecule with zero atoms instead of
    # returning None, so a generation that ends immediately must be rejected
    # explicitly.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None and mol.GetNumAtoms() > 0

# Training with teacher forcing and validation
def train_model(model, dataloader, optimizer, criterion, device, epochs,
                char2idx, idx2char, save_path='smiles_rnn_model.pth'):
    """Train ``model`` and checkpoint whenever the epoch loss improves.

    Args:
        model: Generator whose call returns ``(logits, hidden)``.
        dataloader: Yields ``(x, y)`` batches of token indices.
        optimizer: Optimizer already bound to ``model``'s parameters.
        criterion: Loss taking ``(N, classes)`` logits and ``(N,)`` targets.
        device: Device the batches are moved to.
        epochs: Number of passes over ``dataloader``.
        char2idx: Character-to-index mapping, forwarded to sampling.
        idx2char: Index-to-character mapping, forwarded to sampling.
        save_path: File that receives the best ``state_dict``.

    Raises:
        ValueError: If training is requested on a dataloader with no batches.

    Note:
        "Best" is judged on the average loss measured in training mode over
        the training batches; no held-out data is involved. The model object
        keeps the last epoch's weights, while the best ones are on disk.
    """
    num_batches = len(dataloader)
    if epochs > 0 and num_batches == 0:
        # Fail with a clear message instead of a ZeroDivisionError below.
        raise ValueError("dataloader is empty; nothing to train on")

    best_loss = float('inf')

    for epoch in range(epochs):
        model.train()
        epoch_loss = 0

        for x, y in tqdm(dataloader, desc=f"Epoch {epoch+1}/{epochs}"):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            logits, _ = model(x)
            # Read the class count from the logits themselves so the reshape
            # cannot disagree with the model's output width.
            loss = criterion(logits.reshape(-1, logits.size(-1)), y.reshape(-1))

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)  # Gradient clipping
            optimizer.step()

            epoch_loss += loss.item()

        avg_loss = epoch_loss / num_batches
        print(f"Epoch {epoch+1} loss: {avg_loss:.4f}")

        # Generate and validate some molecules
        if (epoch + 1) % 5 == 0 or epoch == epochs - 1:
            valid_count = 0
            n_samples = 10
            print("\nGenerating sample molecules:")
            for _ in range(n_samples):
                mol, valid = generate_molecule(model, char2idx, idx2char, device)
                validity = "✓" if valid else "✗"
                print(f"{mol} {validity}")
                if valid:
                    valid_count += 1
            print(f"Validity: {valid_count}/{n_samples} ({valid_count/n_samples*100:.1f}%)")

        # Save best model
        if avg_loss < best_loss:
            best_loss = avg_loss
            torch.save(model.state_dict(), save_path)
            print(f"Model saved to {save_path}")

# Main training pipeline
def run_training(dataframe, smiles_column='SMILES', batch_size=128, epochs=20,
                 seq_len=120, lr=0.001):
    """Build the vocabulary, dataset and model from a DataFrame, then train.

    Args:
        dataframe: Table holding the SMILES strings.
        smiles_column: Name of the column to read.
        batch_size: Batch size of the training DataLoader.
        epochs: Number of training epochs.
        seq_len: Maximum length, start and end markers included, passed to
            ``SMILESDataset``.
        lr: Learning rate for Adam.

    Returns:
        ``(model, char2idx, idx2char)``.

    Raises:
        KeyError: If ``smiles_column`` is not in ``dataframe``.
        ValueError: If the column holds no usable values.
    """
    # Get SMILES strings from DataFrame; missing cells would otherwise break
    # the string concatenation in process_smiles.
    smiles_list = dataframe[smiles_column].dropna().astype(str).tolist()
    if not smiles_list:
        raise ValueError(f"column '{smiles_column}' contains no SMILES strings")

    # Process data
    processed_smiles = process_smiles(smiles_list)
    char2idx, idx2char, vocab_size = create_vocab(processed_smiles)

    # Create dataset and data loader
    dataset = SMILESDataset(processed_smiles, char2idx, seq_len=seq_len)
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

    # Setup model
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = RNNGenerator(vocab_size).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    # Padded target positions (index 0) do not contribute to the loss.
    criterion = nn.CrossEntropyLoss(ignore_index=0)

    # Train
    train_model(model, loader, optimizer, criterion, device, epochs=epochs,
                char2idx=char2idx, idx2char=idx2char)

    return model, char2idx, idx2char

# Generate a batch of molecules
def generate_molecules(model, char2idx, idx2char, device, n=25, temperature=0.8):
    """Collect up to ``n`` distinct valid molecules sampled from ``model``.

    Sampling stops once ``n`` unique valid strings are found or after
    ``n * 5`` attempts, whichever comes first.

    Returns:
        ``(valid_mols, validity_rate)``. ``valid_mols`` lists the unique valid
        SMILES in the order they were first produced. ``validity_rate`` is the
        fraction of attempts that produced a valid string, duplicates
        included, or 0 when no attempt was made.
    """
    valid_mols = []
    seen = set()  # constant-time duplicate check alongside the ordered list
    valid_count = 0
    attempts = 0
    max_attempts = n * 5  # Try up to 5x the requested number

    while len(valid_mols) < n and attempts < max_attempts:
        smiles, valid = generate_molecule(model, char2idx, idx2char, device, temperature=temperature)
        attempts += 1
        if not valid:
            continue
        # Count every valid sample, so a repeated molecule does not lower the
        # reported validity.
        valid_count += 1
        if smiles not in seen:
            seen.add(smiles)
            valid_mols.append(smiles)

    validity_rate = valid_count / attempts if attempts > 0 else 0
    return valid_mols, validity_rate

In [7]:
# Main execution script
if __name__ == "__main__":
    # Load data
    df = pd.read_csv('moses/data/train.csv')
    print("Dataset information:")
    print(df.head())
    print(f"Dataset shape: {df.shape}")

    # Check if 'SMILES' column exists, adjust if needed
    smiles_column = 'SMILES'
    if smiles_column not in df.columns:
        # Try to find a column that might contain SMILES strings. Only string
        # cells are inspected: iterating over a numeric cell would raise.
        detected_column = None
        for col in df.columns:
            non_null = df[col].dropna()
            if non_null.empty:
                continue
            first_value = non_null.iloc[0]
            if isinstance(first_value, str) and any(c in '()[]=' for c in first_value):
                detected_column = col
                break
        if detected_column is None:
            # Stop here with a clear message rather than failing later with a
            # KeyError on the default column name.
            raise ValueError(
                f"No column resembling SMILES found; available columns: {list(df.columns)}"
            )
        smiles_column = detected_column
        print(f"Using column '{smiles_column}' for SMILES data")

    # Train the model
    print(f"\nTraining model on {df.shape[0]} SMILES strings...")
    model, char2idx, idx2char = run_training(df, smiles_column=smiles_column)

    # Generate and evaluate new molecules
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print("\nGenerating new molecules...")
    molecules, validity_rate = generate_molecules(model, char2idx, idx2char, device, n=20)

    print(f"\nGenerated {len(molecules)} valid molecules with {validity_rate*100:.1f}% validity rate")
    print("\nSample valid molecules:")
    for i, mol in enumerate(molecules[:10], 1):
        print(f"{i}. {mol}")

    # Optional: Save to CSV
    if molecules:
        output_df = pd.DataFrame({'generated_smiles': molecules})
        output_path = 'generated_molecules.csv'
        output_df.to_csv(output_path, index=False)
        print(f"\nSaved generated molecules to {output_path}")

Dataset information:
                                   SMILES  SPLIT
0  CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1  train
1    CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1  train
2     Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO  train
3        Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C  train
4          CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O  train
Dataset shape: (1584663, 2)

Training model on 1584663 SMILES strings...


Epoch 1/20: 100%|██████████| 12381/12381 [04:33<00:00, 45.23it/s]


Epoch 1 loss: 0.5848
Model saved to smiles_rnn_model.pth


Epoch 2/20: 100%|██████████| 12381/12381 [04:32<00:00, 45.45it/s]


Epoch 2 loss: 0.5235
Model saved to smiles_rnn_model.pth


Epoch 3/20: 100%|██████████| 12381/12381 [04:32<00:00, 45.41it/s]


Epoch 3 loss: 0.5146
Model saved to smiles_rnn_model.pth


Epoch 4/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.53it/s]


Epoch 4 loss: 0.5106
Model saved to smiles_rnn_model.pth


Epoch 5/20: 100%|██████████| 12381/12381 [04:32<00:00, 45.46it/s]


Epoch 5 loss: 0.5087

Generating sample molecules:
COCCCn1ccc(NC(=O)N2Cc3cncnc3C2)n1 ✓
Cc1ccc(NC(=O)NCc2cn[nH]c2)cc1Cl ✓
Cc1cc(NC(=O)C(C)Sc2ncnc3ccccc23)n(C)n1 ✓
Cc1cc(F)ccc1NC(=O)CNS(=O)(=O)c1ccccc1Cl ✓
CCOCC1CCCN(C(=O)c2ccc(NC(=O)C(C)C)cc2)C1 ✓
O=C(Nc1nccs1)c1ccc(OC2CCC2)cc1 ✓
CCc1ncc(CN(C)C(=O)c2cc(Cl)ccn2)s1 ✓
Cc1ccc(C)c(C(C)NC(=O)C2CCC(=O)N(C)C2)c1 ✓
Cc1cc(C)cc(CCNC(=O)c2ccc(Cn3cncn3)cc2)c1 ✓
O=C(Nc1cccc(Cl)c1F)c1ccc(N2CCOC2=O)cc1 ✓
Validity: 10/10 (100.0%)
Model saved to smiles_rnn_model.pth


Epoch 6/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.53it/s]


Epoch 6 loss: 0.5076
Model saved to smiles_rnn_model.pth


Epoch 7/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.65it/s]


Epoch 7 loss: 0.5072
Model saved to smiles_rnn_model.pth


Epoch 8/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.57it/s]


Epoch 8 loss: 0.5073


Epoch 9/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.62it/s]


Epoch 9 loss: 0.5075


Epoch 10/20: 100%|██████████| 12381/12381 [04:32<00:00, 45.48it/s]


Epoch 10 loss: 0.5079

Generating sample molecules:
COCCNC(=O)NC(C)c1ccc(Br)cc1 ✓
c1cc(CN2CCOC3C(OCc4ccncc4)CCC32)ccn1 ✓
Cc1cccc2sc(NC(=O)COCc3nc(C)no3)nc12 ✓
CCc1nc2n(n1)CCCC2NC(=O)Cc1cc(C)ccc1C ✓
CCC1CCCCN1C(=O)c1cccc(N)c1C ✓
O=C(CN1CCN(c2ccccc2F)CC1)NCc1ccccc1F ✓
CCOC(=O)C1(NC(=O)c2ccc(-n3ccnn3)cc2)CCCC1 ✓
CC(=O)NCCNC(=O)N(C)Cc1ccccc1 ✓
O=C(NCc1ccccc1)c1cc2ccccc2c(=O)[nH]1 ✓
O=C(NCc1nc2ncccn2n1)Nc1ccccc1N1CCCC1=O ✓
Validity: 10/10 (100.0%)


Epoch 11/20: 100%|██████████| 12381/12381 [04:30<00:00, 45.72it/s]


Epoch 11 loss: 0.5086


Epoch 12/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.64it/s]


Epoch 12 loss: 0.5093


Epoch 13/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.57it/s]


Epoch 13 loss: 0.5102


Epoch 14/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.63it/s]


Epoch 14 loss: 0.5113


Epoch 15/20: 100%|██████████| 12381/12381 [04:30<00:00, 45.69it/s]


Epoch 15 loss: 0.5123

Generating sample molecules:
COC(=O)c1cc(Cl)ccc1NC(=O)NCCC1=CCCCC1 ✓
Cc1nc(COc2ccc3c(c2)C(C)(C)C(=O)N3C)cs1 ✓
COC(=O)c1cccc(S(=O)(=O)NCc2ccc(F)cc2)c1 ✓
O=C(CC1=CCCCC1)N1CCc2nc(NC(=O)C3CC3)sc2C1 ✓
CCOc1ccc(C(=O)Nc2nnc(C3CCCCC3)o2)cc1 ✓
CCCCN(CCO)C(=O)NCC(C)Cn1nc(C)c(Cl)c1C ✓
Cc1ccc(C)c(-n2ccnc2SCC(=O)Nc2ccccc2C#N)c1 ✓
CC(=O)Nc1ccc2c(c1)N(CC(=O)c1ccccc1)C(=O)CO2 ✓
COc1ccc(C(=O)CCc2nc(C)cs2)cc1C ✓
CN(C)S(=O)(=O)c1ccc(C(=O)NCCCn2cccn2)o1 ✓
Validity: 10/10 (100.0%)


Epoch 16/20: 100%|██████████| 12381/12381 [04:32<00:00, 45.51it/s]


Epoch 16 loss: 0.5136


Epoch 17/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.58it/s]


Epoch 17 loss: 0.5150


Epoch 18/20: 100%|██████████| 12381/12381 [04:32<00:00, 45.51it/s]


Epoch 18 loss: 0.5165


Epoch 19/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.60it/s]


Epoch 19 loss: 0.5185


Epoch 20/20: 100%|██████████| 12381/12381 [04:31<00:00, 45.63it/s]


Epoch 20 loss: 0.5203

Generating sample molecules:
COC(=O)C(C)(NC(=O)NC(C)c1cc(C)sc1C)C1CC1 ✓
Cc1cccc(C(=O)N(C)C(C#N)c2ccccc2C)c1 ✓
COC(=O)c1ccc(F)c(NC(=O)COc2ccccc2)c1 ✓
COc1ccc(C)cc1NCC(=O)N1CCN(C(C)=O)CC1 ✓
Cc1ccc(CCC(=O)Nc2ccc(C(N)=O)nc2)c(C)c1 ✓
CNC(=O)C1CCCN(C(=O)CCc2c(C)nc3c(C)cccn23)C1 ✓
Cc1ccc(CC(=O)NCC(C)(O)c2ccccc2F)cn1 ✓
COCCN(C)C(=O)Nc1ccc(-n2ccnc2)cc1 ✓
Cc1cccc(Cl)c1NC(=O)NCCCn1cccn1 ✓
CC(C)(C)C(=O)N1CCC(C(=O)NCCCc2ccccc2)CC1 ✓
Validity: 10/10 (100.0%)

Generating new molecules...

Generated 20 valid molecules with 100.0% validity rate

Sample valid molecules:
1. CCCC1CN(C(=O)COc2ccc(-n3cnnn3)cc2)CCO1
2. O=C(NCc1ccccc1F)C12CCC(=O)N1CCCC2
3. Cn1ccnc1C(=O)NCc1ccc(Cn2cncn2)cc1
4. Cc1cc(Cl)ccc1OC(=O)c1ccc(S(=O)(=O)NC(C)C)cc1
5. COCCNC(=O)NCC1(c2ccc(Br)cc2)CC1
6. Cn1cc(CNC(=O)NCCCN2CCOCC2)cn1
7. O=C(NCCC(O)c1ccccc1)c1cccc(Cl)c1
8. CCOC(=O)c1[nH]c2ccc(F)cc2c1NC(=O)C(C)C
9. CC(C)c1nnsc1C(=O)Nc1cccc2c1CCCC2
10. Brc1ccc2nc(-c3ccccn3)cn2c1

Saved generated molecules to generated_m