In [None]:
# Colab-only step: prompt for local files to upload into the runtime and keep
# the result in `uploaded`. Presumably this is how the training file
# ('train.txt', read in a later cell) reaches the working directory — confirm
# when running outside Colab, where this import is unavailable.
from google.colab import files
uploaded = files.upload()


In [None]:
!pip install rdkit

In [None]:
from pathlib import Path
import random, numpy as np, torch
from rdkit import Chem, RDLogger

# Silence every RDKit log channel matching 'rdApp.*' so that parsing the many
# invalid SMILES produced during sampling does not flood the cell output.
RDLogger.DisableLog('rdApp.*')

# Seed every RNG the notebook relies on (Python, NumPy, PyTorch) so that weight
# initialisation, shuffling and sampling are repeatable after
# "Restart kernel -> Run all".
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

TRAIN_FILE = 'train.txt'

# One SMILES string per line; surrounding whitespace is removed and blank lines
# are skipped. The encoding is given explicitly so the result does not depend
# on the platform default.
with open(TRAIN_FILE, encoding='utf-8') as f:
    stripped_lines = (line.strip() for line in f)
    smiles_list = [smi for smi in stripped_lines if smi]

print(f'Training molecules: {len(smiles_list):,}')


In [None]:
class Vocabulary:
    """Character-level token table for SMILES strings.

    Every distinct character found in ``smiles`` becomes a token, next to three
    reserved markers: begin-of-sequence (``bos``), end-of-sequence (``eos``)
    and padding (``pad``). Token indices follow the sorted order of the tokens.

    Attributes:
        idx2tok: sorted list of tokens; position == token index.
        tok2idx: inverse mapping, token -> index.
    """

    def __init__(self, smiles, bos='$', eos='^', pad='&'):
        self.bos, self.eos, self.pad = bos, eos, pad
        alphabet = {bos, eos, pad}
        alphabet.update(ch for molecule in smiles for ch in molecule)
        self.idx2tok = sorted(alphabet)
        self.tok2idx = dict(zip(self.idx2tok, range(len(self.idx2tok))))

    def encode(self, s, max_len):
        """Return the token indices of ``bos + s + eos`` right-padded to ``max_len``.

        No truncation happens: when the framed string is already longer than
        ``max_len`` the result is simply longer than ``max_len``. A character
        missing from the vocabulary raises ``KeyError``.
        """
        framed = self.bos + s + self.eos
        filler = self.pad * (max_len - len(framed))
        return list(map(self.tok2idx.__getitem__, framed + filler))

    def decode(self, idxs):
        """Turn token indices back into text.

        Everything from the first EOS marker onwards is discarded, then any
        BOS and PAD markers left in the remainder are removed.
        """
        text = ''.join(self.idx2tok[i] for i in idxs)
        body = text.partition(self.eos)[0]       # keep only what precedes EOS
        for marker in (self.bos, self.pad):
            body = body.replace(marker, '')
        return body

# Character vocabulary built from every training molecule.
vocab = Vocabulary(smiles=smiles_list)

# Longest training molecule plus two slots for the BOS and EOS markers.
MAX_LEN = 2 + max(map(len, smiles_list))

print('Vocab size:', len(vocab.idx2tok))


In [None]:
from torch.utils.data import Dataset, DataLoader

class SmilesDataset(Dataset):
    """Map-style dataset yielding next-token prediction pairs.

    Each item is ``(input, target)``: both come from the same encoded sequence,
    the input dropping its last token and the target dropping its first, so
    ``target[t]`` is the token that follows ``input[t]``.

    Args:
        smiles: indexable collection of SMILES strings.
        vocab: object exposing ``encode(smiles, max_len)`` -> list of indices.
        max_len: length passed to ``vocab.encode`` for every molecule.
    """

    def __init__(self, smiles, vocab, max_len):
        self.smiles, self.vocab, self.max_len = smiles, vocab, max_len

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        encoded = self.vocab.encode(self.smiles[idx], self.max_len)
        tokens = torch.tensor(encoded)
        # Shift by one position: the model reads tokens[:-1], predicts tokens[1:].
        return tokens[:-1].long(), tokens[1:].long()

# Batch size shared by the DataLoaders in this notebook.
BATCH = 512

# Full dataset of (input, target) token-index pairs, every sequence encoded
# with the same MAX_LEN.
ds = SmilesDataset(smiles=smiles_list, vocab=vocab, max_len=MAX_LEN)

# Shuffled loader over the whole dataset, loading in the main process.
dl = DataLoader(
    dataset=ds,
    batch_size=BATCH,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)


In [None]:
import torch.nn as nn, torch.nn.functional as F

class SmilesLSTM(nn.Module):
    """Character-level LSTM language model over SMILES tokens.

    Token indices are embedded, passed through a stacked LSTM
    (``batch_first=True``) and every hidden state is projected to
    vocabulary-sized logits.

    Args:
        vocab_size: number of tokens (embedding rows and logit width).
        emb_dim: embedding dimension.
        hid_dim: LSTM hidden size.
        n_layers: number of stacked LSTM layers.
        dropout: dropout value handed to ``nn.LSTM``.
    """

    def __init__(self, vocab_size, emb_dim=128, hid_dim=512, n_layers=2, dropout=0.2):
        super().__init__()
        self.emb   = nn.Embedding(vocab_size, emb_dim)
        self.lstm  = nn.LSTM(emb_dim, hid_dim, n_layers,
                             batch_first=True, dropout=dropout)
        self.fc    = nn.Linear(hid_dim, vocab_size)

    def forward(self, x, h=None):
        """Return ``(logits, h)`` for a batch of token indices.

        Args:
            x: integer tensor of token indices, batch dimension first.
            h: optional LSTM state to continue from (``None`` starts fresh).

        Returns:
            ``logits`` with one vocabulary-sized vector per input position and
            the updated LSTM state ``h``.
        """
        out, h = self.lstm(self.emb(x), h)
        logits = self.fc(out)
        return logits, h

    @torch.no_grad()
    def sample(self, max_len, temperature=1.0, canonical=True):
        """Sample one SMILES string token by token.

        NOTE: relies on the module-level ``vocab`` (token tables and ``decode``)
        and, when ``canonical`` is true, on RDKit's ``Chem``.

        Args:
            max_len: at most ``max_len - 1`` tokens are drawn after BOS.
            temperature: logits are divided by this value before the softmax.
            canonical: when true, return the RDKit-canonical SMILES instead of
                the raw sampled string.

        Returns:
            The raw string when ``canonical`` is false. Otherwise the canonical
            SMILES, or ``None`` when RDKit cannot parse or convert the sample.
        """
        eos_idx = vocab.tok2idx[vocab.eos]     # loop-invariant, look up once
        bos = torch.tensor([[vocab.tok2idx[vocab.bos]]], device=self.fc.weight.device)
        out, h = self.forward(bos)
        gen    = []
        for _ in range(max_len-1):
            logits = out[:,-1,:]/temperature
            p      = F.softmax(logits, -1)
            idx    = torch.multinomial(p, 1)
            token  = idx.item()
            if token == eos_idx: break
            gen.append(token)
            # Feed only the new token; the LSTM state carries the history.
            out, h = self.forward(idx, h)
        smi = vocab.decode(gen)
        if not canonical: return smi
        try:                             # canonicalize
            mol = Chem.MolFromSmiles(smi)
            if mol is None:              # RDKit signals a parse failure with None
                return None
            return Chem.MolToSmiles(mol)
        except Exception:
            # Any RDKit failure means "not a usable molecule". Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # propagate, so long sampling loops stay interruptible.
            return None


In [None]:
import random, torch, numpy as np
from torch.utils.data import DataLoader, random_split

# Use the GPU when one is available, otherwise fall back to the CPU so the
# notebook still runs on a CPU-only runtime instead of failing at `.to(device)`.
device   = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# --------------- DATASET + VALIDATION SPLIT ---------------
VAL_FRAC = 0.05                      # fraction of molecules held out for validation
val_len  = int(len(ds) * VAL_FRAC)
train_len= len(ds) - val_len
# Fixed generator seed: the same molecules land in the validation set each run.
train_ds, val_ds = random_split(ds, [train_len, val_len],
                                generator=torch.Generator().manual_seed(0))

# Pinned host memory only helps host->GPU copies; skip it on CPU.
PIN_MEMORY = device.type == 'cuda'
train_dl = DataLoader(train_ds, batch_size=BATCH, shuffle=True,
                      num_workers=0, pin_memory=PIN_MEMORY)
val_dl   = DataLoader(val_ds,   batch_size=BATCH, shuffle=False,
                      num_workers=0, pin_memory=PIN_MEMORY)

# --------------- MODEL, OPTIM, SCHEDULER ---------------
model = SmilesLSTM(len(vocab.idx2tok),
                   emb_dim=128,
                   hid_dim=512,
                   n_layers=2,
                   dropout=0.3).to(device)
opt    = torch.optim.AdamW(model.parameters(), lr=1e-3)
# Padding positions are excluded from the loss.
crit   = torch.nn.CrossEntropyLoss(ignore_index=vocab.tok2idx[vocab.pad])

# Halve the learning rate after `patience` epochs without a lower metric
# (the training loop steps it with the validation loss).
sched = torch.optim.lr_scheduler.ReduceLROnPlateau(opt, mode='min', patience=2, factor=0.5)


# --------------- TRAIN LOOP with EARLY-STOP ---------------
EPOCHS     = 50
LOG_EVERY  = 100         # print a sampled molecule every LOG_EVERY batches
PATIENCE   = 5           # epochs without improvement before early stop
best_val   = float('inf')
stall      = 0

for ep in range(1, EPOCHS+1):
    # --------------- TRAIN ---------------
    model.train(); tot=0; n=0
    for i,(x,y) in enumerate(train_dl,1):
        x,y = x.to(device), y.to(device)
        opt.zero_grad()
        logits,_ = model(x)
        # Flatten batch and time so every position is one classification example.
        loss = crit(logits.view(-1, logits.size(-1)), y.view(-1))
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.)
        opt.step()
        tot += loss.item(); n += 1

        if i % LOG_EVERY == 0:
            model.eval()

            # Draw ONE molecule and canonicalize that same string, so the
            # raw / can / valid fields of the log line describe one sample.
            raw = model.sample(MAX_LEN, temperature=0.8, canonical=False)
            try:
                mol = Chem.MolFromSmiles(raw)
                can = Chem.MolToSmiles(mol) if mol is not None else None
            except Exception:
                can = None
            # Empty strings are not counted as valid, matching the generation
            # cell, which discards falsy samples.
            valid = bool(can)                 # True/False

            print(f'E{ep:02d} B{i:05d}/{len(train_dl)}  '
                  f'train-loss {loss.item():.3f}  '
                  f'raw: {raw}  '
                  f'can: {can}  '
                  f'valid: {valid}')

            model.train()


    # Mean of the per-batch losses for this epoch.
    train_loss = tot / n

    # --------------- VALIDATION ---------------
    model.eval(); vtot=0; vn=0
    with torch.no_grad():
        for x,y in val_dl:
            x,y = x.to(device), y.to(device)
            logits,_ = model(x)
            vloss = crit(logits.view(-1, logits.size(-1)), y.view(-1))
            vtot += vloss.item(); vn += 1
    val_loss = vtot / vn
    print(f'>> Epoch {ep:02d}  train {train_loss:.4f} | val {val_loss:.4f}')

    # --------------- SCHEDULER & EARLY-STOP ---------------
    sched.step(val_loss)

    # An epoch counts as an improvement only if it beats the best validation
    # loss by more than 1e-4, which filters out floating-point noise.
    if val_loss < best_val - 1e-4:
        best_val = val_loss
        stall = 0
        torch.save(model.state_dict(), 'lstm_best_colab.pth')
        print('   ✓ new best model saved')
    else:
        stall += 1
        print(f'   no improvement ({stall}/{PATIENCE})')
        if stall >= PATIENCE:
            print('*** Early stopping ***')
            break

In [None]:
# --------------- LOAD BEST ---------------
# Rebuild the architecture with the training hyper-parameters, then restore the
# best checkpoint written by the training loop.
model = SmilesLSTM(len(vocab.idx2tok),
                   emb_dim=128,
                   hid_dim=512,
                   n_layers=2,
                   dropout=0.3).to(device)
# map_location keeps the load working when the checkpoint was saved on a
# different device than the one in use now (e.g. trained on GPU, loaded on CPU).
model.load_state_dict(torch.load('lstm_best_colab.pth', map_location=device))
model.eval()

N_GEN = 10_000                 # number of unique valid molecules wanted
MAX_ATTEMPTS = 100 * N_GEN     # safety cap so a weak model cannot loop forever
counter = 0                    # total sampling attempts
gen   = []                     # accepted SMILES, in generation order
seen  = set()                  # same strings, for O(1) duplicate checks
while len(gen) < N_GEN and counter < MAX_ATTEMPTS:
    smi = model.sample(MAX_LEN, temperature=0.95)
    counter = counter + 1
    # `sample` returns None for unparsable strings; empty strings are skipped too.
    if smi and smi not in seen:
        seen.add(smi)
        gen.append(smi)
        if len(gen) % 10 == 0:
            print(f"Generated {len(gen)} valid unique SMILES after {counter} total attempts")

if len(gen) < N_GEN:
    print(f'WARNING: stopped after {counter} attempts with only {len(gen)} of {N_GEN} unique valid SMILES')

SUB_FILE = Path('submission_colab.txt')
SUB_FILE.write_text('\n'.join(gen))
print('Saved', SUB_FILE, 'with', len(gen), 'SMILES')
