In [29]:
from rdkit import Chem
from rdkit.Chem import AllChem

# one generator per radius, 2 048 bits each
GEN0 = AllChem.GetMorganGenerator(radius=0, fpSize=2048)
GEN1 = AllChem.GetMorganGenerator(radius=1, fpSize=2048)
GEN2 = AllChem.GetMorganGenerator(radius=2, fpSize=2048)

def fp6144_from_smiles(smiles):
    """Return torch.float32 tensor (6144,) or None if SMILES fails."""
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None
    fp0 = torch.tensor(list(GEN0.GetFingerprint(mol)), dtype=torch.float32)
    fp1 = torch.tensor(list(GEN1.GetFingerprint(mol)), dtype=torch.float32)
    fp2 = torch.tensor(list(GEN2.GetFingerprint(mol)), dtype=torch.float32)
    return torch.cat([fp0, fp1, fp2])            # (6144,)


In [30]:
import torch
from torch.utils.data import Dataset
import os
import json
import numpy as np
from collections import Counter

class PotencyDataset(Dataset):
    def __init__(self, directory):
        self.records = []
        for fname in os.listdir(directory):
            if fname.endswith(".json"):
                with open(os.path.join(directory, fname)) as f:
                    rec = json.load(f)
                self.records.append((rec["SMILES"], rec["label"]))

    def __len__(self):
        return len(self.records)

    def __getitem__(self, idx):
        smiles, label = self.records[idx]
        fp = fp6144_from_smiles(smiles)
        while fp is None:                # rare bad SMILES → pick next
            idx = (idx + 1) % len(self.records)
            smiles, label = self.records[idx]
            fp = fp6144_from_smiles(smiles)
        return fp, torch.tensor(label, dtype=torch.long)


In [None]:
from collections import Counter

# ── build the Dataset objects ──────────────────────────────────────
train_dataset = PotencyDataset("train")
val_dataset   = PotencyDataset("val")

# ── quick sanity-check of label balance ────────────────────────────
def show_distribution(ds, name):
    labels = []
    for i in range(len(ds)):
        _, lbl = ds[i]
        labels.append(int(lbl))
    print(f"{name} label distribution →", Counter(labels))

show_distribution(train_dataset, "Train")
show_distribution(val_dataset,   "Val")

In [32]:
import torch.nn as nn

class BaselineMLP(nn.Module):
    def __init__(self, input_dim=6144, hidden_dim=256, num_classes=6):
        super(BaselineMLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)
        self.output = nn.Linear(hidden_dim, num_classes)  # 6 potency categories

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        return self.output(x)  # raw scores (logits), handled by loss function



In [35]:
import torch
from torch.utils.data import DataLoader
import torch.optim as optim
from rdkit import RDLogger

# Hide all RDKit warnings (keep “error” if you still want fatal messages)
RDLogger.DisableLog("rdApp.*")     # most common choice
# RDLogger.DisableLog("rdApp.*")         # silence everything


# Load datasets
train_dataset = PotencyDataset("train")
val_dataset = PotencyDataset("val")

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64)

# Initialize model, loss, optimizer
model = BaselineMLP()
criterion = nn.CrossEntropyLoss()  # for multi-class classification
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Training
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    all_preds, all_labels = [], []

    for batch in train_loader:
        inputs, labels = batch
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        all_preds.extend(outputs.argmax(dim=1).cpu().numpy())
        all_labels.extend(labels.cpu().numpy())

    train_acc = (torch.tensor(all_preds) == torch.tensor(all_labels)).float().mean().item()


    # Validation
    model.eval()
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in val_loader:
            inputs, labels = batch
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            all_preds.extend(outputs.argmax(dim=1).cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    val_acc = (torch.tensor(all_preds) == torch.tensor(all_labels)).float().mean().item()


    print(f"Epoch {epoch+1}/{num_epochs} | Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Val Acc: {val_acc:.4f}")


Epoch 1/10 | Train Loss: 789.2476 | Train Acc: 0.5396 | Val Acc: 0.5784
Epoch 2/10 | Train Loss: 619.0749 | Train Acc: 0.6490 | Val Acc: 0.5916
Epoch 3/10 | Train Loss: 488.4316 | Train Acc: 0.7270 | Val Acc: 0.6089
Epoch 4/10 | Train Loss: 365.3936 | Train Acc: 0.8018 | Val Acc: 0.6019
Epoch 5/10 | Train Loss: 268.6252 | Train Acc: 0.8579 | Val Acc: 0.6003
Epoch 6/10 | Train Loss: 203.0157 | Train Acc: 0.8939 | Val Acc: 0.6051
Epoch 7/10 | Train Loss: 159.9796 | Train Acc: 0.9160 | Val Acc: 0.6046
Epoch 8/10 | Train Loss: 129.2077 | Train Acc: 0.9340 | Val Acc: 0.5981
Epoch 9/10 | Train Loss: 109.2569 | Train Acc: 0.9445 | Val Acc: 0.5943
Epoch 10/10 | Train Loss: 95.6513 | Train Acc: 0.9513 | Val Acc: 0.6001


In [37]:
print(type(model))

<class '__main__.BaselineMLP'>


In [40]:
#TESTING MINGS SMILES String Example
#
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from rdkit.DataStructs import ConvertToNumpyArray
import torch
import torch.nn.functional as F

# Prepare SMILES and molecule
smiles = "CC(C)c1c(O)c(O)c(C=O)c2c1cc(C)c(c2O)-c(c3O)c(C)cc4c3c(C=O)c(O)c(O)c4C(C)C"
mol = Chem.MolFromSmiles(smiles)
mol = Chem.AddHs(mol)

# Generate 6144-bit Morgan fingerprint
fp = []
for radius in [0, 1, 2]:
    bitvec = AllChem.GetMorganFingerprintAsBitVect(mol, radius=radius, nBits=2048)
    arr = np.zeros((2048,), dtype=int)
    ConvertToNumpyArray(bitvec, arr)
    fp.append(arr)
fingerprint_6144 = np.concatenate(fp)

# Convert to tensor and move to model's device
x = torch.tensor(fingerprint_6144, dtype=torch.float32).unsqueeze(0)  # Add batch dim
device = next(model.parameters()).device
x = x.to(device)

# Predict
model.eval()
with torch.no_grad():
    logits = model(x)

# If your model returns multiple outputs (one per cell line):
if isinstance(logits, list) or isinstance(logits, tuple):
    probs = [F.softmax(head, dim=1) for head in logits]
    preds = [torch.argmax(p, dim=1).item() for p in probs]
else:
    probs = F.softmax(logits, dim=1)
    preds = torch.argmax(probs, dim=1).tolist()

print("Predicted potency classes:", preds)



Predicted potency classes: [1]
