In [1]:
#Trained with old fingerprint generator

from rdkit import Chem
from rdkit.Chem import AllChem
# One Morgan fingerprint generator per radius (0, 1, 2), each producing a
# 2048-bit fingerprint. Built once at module level and reused by every call.
GEN0, GEN1, GEN2 = (
    AllChem.GetMorganGenerator(radius=radius, fpSize=2048)
    for radius in (0, 1, 2)
)

def fp6144_from_smiles(smiles):
    """Build the concatenated Morgan fingerprint tensor for a SMILES string.

    The molecule is fingerprinted with GEN0, GEN1 and GEN2 (in that order) and
    the three results are joined end to end into one 1-D float32 tensor.

    Args:
        smiles: SMILES string to parse with RDKit.

    Returns:
        A 1-D ``torch.float32`` tensor holding the three fingerprints back to
        back, or ``None`` when RDKit cannot parse ``smiles``.
    """
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        # Unparseable SMILES: signal the failure to the caller instead of raising.
        return None

    # Order matters: radius 0 first, then 1, then 2.
    pieces = [
        torch.tensor(list(generator.GetFingerprint(molecule)), dtype=torch.float32)
        for generator in (GEN0, GEN1, GEN2)
    ]
    return torch.cat(pieces)

In [2]:
# Returns a 60 element array with predicted activity for each cell line.

def predict_activity(smiles: str, model, device='cpu'):
    """Predict the activity class of one molecule for every output line.

    The SMILES string is converted to a fingerprint, run through ``model`` as
    a batch of one, and the highest-scoring class is taken along the last
    (class) axis of the model output.

    Args:
        smiles: SMILES string of the molecule to score.
        model: Callable mapping a ``(1, n_features)`` tensor to logits with
            three axes, the last one indexing classes.
        device: Device the input tensor is moved to before the forward pass;
            it should match the device holding the model parameters.

    Returns:
        A 1-D NumPy array with one predicted class index per output line.

    Raises:
        ValueError: If ``smiles`` cannot be converted to a fingerprint.
    """
    fp = fp6144_from_smiles(smiles)
    if fp is None:
        raise ValueError("Invalid SMILES string")
    batch = fp.unsqueeze(0).to(device)

    # Inference must run in eval mode: with a batch of one, training-mode
    # BatchNorm1d cannot compute batch statistics and Dropout would make the
    # prediction random. Switch temporarily and restore the caller's mode.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            logits = model(batch)
    finally:
        if was_training:
            model.train()

    # argmax over the class axis, then drop the batch axis of size one.
    return logits.argmax(dim=2).cpu().numpy()[0]


In [3]:
import gzip, json, torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from tqdm.auto import tqdm
import numpy as np

class MultiLineMLP5(nn.Module):
    """Shared MLP trunk with one multi-class head per output line.

    The input is batch-normalised, passed through a stack of
    Linear -> BatchNorm1d -> ReLU -> Dropout blocks, and projected by a single
    linear layer to ``num_lines * num_classes`` logits, which ``forward``
    reshapes to ``(batch, num_lines, num_classes)``.

    Args:
        input_dim: Number of input features.
        hidden_dims: Width of each hidden block, in order. May be empty, in
            which case the classifier is applied directly to the normalised
            input.
        num_lines: Number of output lines (independent classification heads).
        num_classes: Number of classes predicted for each line.
        p_drop: Dropout probability used after every hidden block.
    """

    def __init__(self,
                 input_dim=6144,
                 hidden_dims=(1024, 1024, 512, 512, 256),
                 num_lines=60,
                 num_classes=6,
                 p_drop=0.3):
        super().__init__()
        # Kept so forward() can reshape to the configured head layout instead
        # of a hard-coded one. Plain ints, so the state_dict is unaffected.
        self.num_lines = num_lines
        self.num_classes = num_classes

        self.bn_in = nn.BatchNorm1d(input_dim)
        layers = []
        # list(...) accepts any sequence (list or tuple) for hidden_dims.
        dims = [input_dim] + list(hidden_dims)
        for d_in, d_out in zip(dims[:-1], dims[1:]):
            layers += [
                nn.Linear(d_in, d_out),
                nn.BatchNorm1d(d_out),
                nn.ReLU(),
                nn.Dropout(p_drop),
            ]
        self.shared = nn.Sequential(*layers)
        # dims[-1] equals hidden_dims[-1] when hidden layers exist and falls
        # back to input_dim when hidden_dims is empty.
        self.classifier = nn.Linear(dims[-1], num_lines * num_classes)

    def forward(self, x):
        """Return logits of shape ``(batch, num_lines, num_classes)``."""
        x = self.bn_in(x)
        x = self.shared(x)
        logits = self.classifier(x)
        return logits.view(-1, self.num_lines, self.num_classes)




  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source. Consider passing weights_only=True if the checkpoint holds
# nothing but tensors and plain Python containers.
ckpt = torch.load("best_resampled.pt", map_location=device)
state = ckpt["model_state_dict"]

# Build the network once, restore the trained weights, and switch to
# inference mode (disables dropout, uses BatchNorm running statistics).
model = MultiLineMLP5().to(device)
model.load_state_dict(state)
model.eval()

MultiLineMLP5(
  (bn_in): BatchNorm1d(6144, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (shared): Sequential(
    (0): Linear(in_features=6144, out_features=1024, bias=True)
    (1): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.3, inplace=False)
    (4): Linear(in_features=1024, out_features=1024, bias=True)
    (5): BatchNorm1d(1024, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.3, inplace=False)
    (8): Linear(in_features=1024, out_features=512, bias=True)
    (9): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU()
    (11): Dropout(p=0.3, inplace=False)
    (12): Linear(in_features=512, out_features=512, bias=True)
    (13): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (14): ReLU()
    (15): Dropout(p=0.3, inplace=False)
    (16): Linear(in_feature

In [66]:
# SMILES string of the molecule to score.
example = "COc1cc2O[C@]3([C@@H]([C@@H](CN(C)C)[C@@H](O)[C@@]3(O)c2c(OC)n1)c4ccccc4)c5ccc(cc5)C#N"
# Run the loaded model on this molecule; `model` and `device` must already be
# defined by the model-loading cell.
preds = predict_activity(example, model, device)
# Show the predicted class index for every output line.
print(preds)


[5 5 5 5 4 5 5 5 5 4 5 5 4 5 5 4 5 5 5 5 5 5 5 4 5 4 5 5 5 4 5 4 5 5 4 5 4
 4 5 5 5 5 5 5 5 4 4 5 4 5 5 5 5 5 5 4 5 4 5 4]


In [65]:
import gzip, json
from itertools import islice

# Zero-based line number of the record to inspect in the training file.
entry_index = 987

# Skip straight to the requested line; `None` means the file ended first.
with gzip.open("train.jsonl.gz", "rt") as f:
    line = next(islice(f, entry_index, None), None)

if line is None:
    # Fail loudly rather than leaving `entry` undefined or stale from an
    # earlier run of this cell.
    raise IndexError(f"train.jsonl.gz has no line with index {entry_index}")

entry = json.loads(line)

print("SMILES:", entry["smiles"])
print("Labels:", entry["label_vector"])


SMILES: COc1cc2O[C@]3([C@@H]([C@@H](CN(C)C)[C@@H](O)[C@@]3(O)c2c(OC)n1)c4ccccc4)c5ccc(cc5)C#N
Labels: [4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 5, 5, 4, 5, 4, 4, 4, 5, 5, 5, 5, 5, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 5, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 4, 4, 4, 4, 4]


In [40]:
import gzip, json
import numpy as np

# Number of leading records of the training file that are scanned.
MAX_RECORDS = 50

# Running "best so far" state; -1 / None mean nothing has been scored yet.
best_smiles = None
best_accuracy = -1
best_true = None
best_pred = None
best_name = None
best_index = -1  # line number of the best record in the file

with gzip.open("train.jsonl.gz", "rt") as f:
    for i, line in enumerate(f):
        if i >= MAX_RECORDS:
            break
        record = json.loads(line)
        smiles = record['smiles']
        name = record.get('NSC') or record.get('nsc') or "UNKNOWN"
        true = np.array(record['label_vector'])

        # Skip examples where first label is 0 or missing
        if true[0] in (-1, 0):
            continue

        try:
            pred = predict_activity(smiles, model, device)
        except ValueError:
            # predict_activity rejects SMILES it cannot fingerprint.
            continue

        # Score only the positions that carry a label (-1 marks missing).
        mask = true != -1
        acc = np.mean(pred[mask] == true[mask])

        if acc > best_accuracy:
            best_accuracy = acc
            best_smiles = smiles
            best_name = name
            best_true = true
            best_pred = pred
            best_index = i  # record the line number

if best_smiles is None:
    # Every scanned record was skipped; avoid reporting the -1 sentinel as a
    # "-100.00%" accuracy.
    print(f"No scorable SMILES found in the first {MAX_RECORDS} records")
else:
    print("✅ Best SMILES found with accuracy:", f"{best_accuracy:.2%}")
    print("Index:", best_index)
    print("Molecule Name / NSC:", best_name)
    print("SMILES:", best_smiles)








✅ Best SMILES found with accuracy: 98.31%
Index: 1
Molecule Name / NSC: UNKNOWN
SMILES: CCCCCCNC(=O)Oc1ccc(cc1)\C(=C(/C)\c2ccc(Cl)cc2)\c3ccc(OCCN(C)C)cc3


[19:26:58] Explicit valence for atom # 4 Cl, 2, is greater than permitted
