In [None]:
!pip install rdkit

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m75.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.3


In [None]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.7.0-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.7/63.7 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.7.0-py3-none-any.whl (1.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m76.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.7.0


In [None]:
!pip install --quiet optuna


[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/404.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Next cell: the AttentiveFP → embeddings → SVM pipeline, with Optuna hyperparameter tuning integrated

In [None]:
# ============================================
# Trial-2: AttentiveFP → Embeddings → SVM+Optuna
# ============================================

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import random
import os

from torch_geometric.utils import from_smiles
from torch_geometric.data import DataLoader
from torch_geometric.nn import AttentiveFP

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    roc_auc_score, f1_score, cohen_kappa_score,
    precision_score, recall_score, accuracy_score,
    confusion_matrix
)
from sklearn.model_selection import train_test_split

import optuna

# ------------------------
# Reproducibility
# ------------------------
def seed_set(seed=42):
    """
    Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global RNG and PyTorch (CPU and
    all CUDA devices), and switches cuDNN to deterministic mode with
    autotuning disabled.

    Args:
        seed: integer seed applied to all generators.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; this is a no-op when
    # CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    # Trade cuDNN speed for run-to-run repeatability.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_set(42)

# ------------------------
# Load dataset
# ------------------------
# Adjust path as needed
df = pd.read_csv('/content/AID1239_data_for_classification_04Nov2024.csv')

# Optional: limit data for quick tests
# df = df.head(20000)

# ------------------------
# SMILES → Graph list
# ------------------------
# Each SMILES is paired with its label positionally (zip) instead of looking
# the label up with df[col][i]: that lookup is label-based, so it would break
# (or silently mis-pair rows) once df is filtered/sampled and its index is no
# longer 0..N-1.
graph_list = []
num_skipped = 0
for smile, outcome in zip(df['SMILES'], df['PUBCHEM_ACTIVITY_OUTCOME']):
    try:
        g = from_smiles(smile)
        # NOTE(review): from_smiles appears to fall back to an empty molecule
        # for unparsable SMILES instead of raising — confirm against the PyG
        # version in use. An atom-less graph carries no signal and can
        # misalign pooled outputs with labels, so drop it here.
        if g.num_nodes == 0:
            num_skipped += 1
            continue
        g.x = g.x.float()
        y_value = 1.0 if outcome == 'Active' else 0.0
        g.y = torch.tensor(y_value, dtype=torch.float).view(1, -1)  # shape [1,1]
        graph_list.append(g)
    except Exception:
        # Best-effort: skip entries that cannot be converted (e.g. missing SMILES)
        num_skipped += 1
        continue

print(f"Total valid molecules: {len(graph_list)}")
if num_skipped:
    print(f"Skipped molecules (unparsable or empty): {num_skipped}")

if not graph_list:
    raise ValueError("No valid molecules could be built from the 'SMILES' column.")

# Infer feature sizes from first graph
in_channels = graph_list[0].x.size(-1)
edge_dim = graph_list[0].edge_attr.size(-1)
print(f"in_channels={in_channels}, edge_dim={edge_dim}")

# ------------------------
# Train / Test split  (random; you can later replace with scaffold split)
# ------------------------
# torch_geometric.data.DataLoader is deprecated and warns on construction;
# rebind the name to its current home before building the loaders.
from torch_geometric.loader import DataLoader

# 80 / 20 split; the remainder goes to the test set so the sizes always add up.
train_size = int(0.8 * len(graph_list))
test_size = len(graph_list) - train_size

# A dedicated, seeded generator keeps the split identical across runs
# regardless of how much global RNG state was consumed before this point.
train_dataset, test_dataset = torch.utils.data.random_split(
    graph_list,
    [train_size, test_size],
    generator=torch.Generator().manual_seed(42)
)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# ------------------------
# Device
# ------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using device:", device)

# =========================================
# AttentiveFP Encoder + Linear Classifier
# =========================================
class AttentiveFPEncoderClassifier(nn.Module):
    """
    Graph classifier made of an AttentiveFP encoder and a linear head.

    The encoder is configured so that its output size equals
    `hidden_channels`; the head maps that output to a single binary logit.
    `forward(..., return_emb=True)` additionally returns the encoder output so
    it can be reused as a feature vector (e.g. for an SVM).
    """

    def __init__(self, in_channels, edge_dim,
                 hidden_channels=128, num_layers=4,
                 num_timesteps=2, dropout=0.2):
        super().__init__()
        # out_channels is tied to hidden_channels: the encoder output *is* the embedding.
        encoder_cfg = dict(
            in_channels=in_channels,
            hidden_channels=hidden_channels,
            out_channels=hidden_channels,
            edge_dim=edge_dim,
            num_layers=num_layers,
            num_timesteps=num_timesteps,
            dropout=dropout,
        )
        self.encoder = AttentiveFP(**encoder_cfg)
        # Single output unit → one raw logit per graph.
        self.classifier = nn.Linear(in_features=hidden_channels, out_features=1)

    def forward(self, x, edge_index, edge_attr, batch, return_emb: bool = False):
        """
        Encode the batched graphs and score them.

        Returns the logits alone, or the pair (logits, embedding) when
        `return_emb` is True.
        """
        graph_emb = self.encoder(x, edge_index, edge_attr, batch)
        scores = self.classifier(graph_emb)
        return (scores, graph_emb) if return_emb else scores

# ------------------------
# Instantiate model
# ------------------------
# Encoder hyperparameters (kept as module-level names for later reference).
hidden_channels, num_layers, num_timesteps, dropout = 128, 4, 2, 0.2

model = AttentiveFPEncoderClassifier(
    in_channels,
    edge_dim,
    hidden_channels=hidden_channels,
    num_layers=num_layers,
    num_timesteps=num_timesteps,
    dropout=dropout,
)
model = model.to(device)

# The loss takes raw logits (sigmoid is applied inside the loss).
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), weight_decay=1e-5, lr=1e-3)

# ======================================
# Step 1 — Train AttentiveFP classifier
# ======================================
def run_epoch(loader, train: bool = True):
    """
    Run one full pass over `loader` with the module-level `model`.

    Args:
        loader: iterable of batched graphs exposing `x`, `edge_index`,
            `edge_attr`, `batch`, `y` and `num_graphs`; `loader.dataset` must
            have a length (used to average the loss).
        train: if True the model is put in training mode and the module-level
            `optimizer` is stepped on every batch; if False the model is put
            in eval mode and gradient tracking is disabled.

    Returns:
        Tuple `(avg_loss, acc, roc, f1, kappa, precision, recall, cm)`.
        `roc` is NaN when ROC-AUC cannot be computed (roc_auc_score raised
        ValueError). Class predictions use a 0.5 probability threshold.

    Raises:
        ValueError: if `loader` yields no batches.
    """
    model.train(train)  # train(False) is equivalent to eval()

    total_loss = 0.0
    all_probs = []
    all_labels = []

    # Build the autograd graph only when parameters are actually updated;
    # evaluation then runs without storing activations for backward.
    with torch.set_grad_enabled(train):
        for data in loader:
            data = data.to(device)

            if train:
                optimizer.zero_grad()

            logits = model(data.x, data.edge_index, data.edge_attr, data.batch)  # [B,1]
            loss = criterion(logits, data.y)

            if train:
                loss.backward()
                optimizer.step()

            # Weight by batch size so the final average is per-graph, not per-batch
            total_loss += loss.item() * data.num_graphs

            probs = torch.sigmoid(logits).detach().cpu().view(-1).numpy()
            labels = data.y.detach().cpu().view(-1).numpy()

            all_probs.append(probs)
            all_labels.append(labels)

    if not all_probs:
        raise ValueError("run_epoch received a loader that yielded no batches")

    all_probs = np.concatenate(all_probs)
    all_labels = np.concatenate(all_labels)

    # Metrics
    try:
        roc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        roc = np.nan

    preds = (all_probs > 0.5).astype(int)
    acc = accuracy_score(all_labels, preds)
    # zero_division=0 keeps the same 0.0 result as the default when a metric
    # is undefined (e.g. no positive predictions) but without the warning.
    f1 = f1_score(all_labels, preds, zero_division=0)
    kappa = cohen_kappa_score(all_labels, preds)
    precision = precision_score(all_labels, preds, zero_division=0)
    recall = recall_score(all_labels, preds, zero_division=0)
    cm = confusion_matrix(all_labels, preds)

    avg_loss = total_loss / len(loader.dataset)

    return avg_loss, acc, roc, f1, kappa, precision, recall, cm

num_epochs = 30  # increase if needed
best_roc = -1.0

# NOTE: best_roc is the maximum over per-epoch evaluations on the test set, so
# it is an optimistic figure; no checkpoint is kept for that epoch.
for epoch in range(1, num_epochs + 1):
    (train_loss, train_acc, train_roc, train_f1,
     train_kappa, train_prec, train_rec, _) = run_epoch(train_loader, train=True)
    (test_loss, test_acc, test_roc, test_f1,
     test_kappa, test_prec, test_rec, test_cm) = run_epoch(test_loader, train=False)

    # A NaN ROC-AUC (metric undefined for this pass) never replaces the best score.
    if not np.isnan(test_roc):
        best_roc = max(best_roc, test_roc)

    epoch_summary = (
        f"Epoch {epoch:03d} | "
        f"Train Loss: {train_loss:.4f}, Train ROC-AUC: {train_roc:.4f} | "
        f"Test Loss: {test_loss:.4f}, Test ROC-AUC: {test_roc:.4f}, "
        f"F1: {test_f1:.4f}, Acc: {test_acc:.4f}, Prec: {test_prec:.4f}, Rec: {test_rec:.4f}"
    )
    print(epoch_summary)
    print("Test Confusion Matrix:\n", test_cm)

print("\nBest Test ROC-AUC (AttentiveFP-only):", best_roc)

# ==================================================
# Step 1b — Extract molecular embeddings from model
# ==================================================
def get_embeddings(loader):
    """
    Collect encoder embeddings and labels for every graph served by `loader`.

    Puts the module-level `model` in eval mode and runs it without gradient
    tracking, asking for the embedding alongside the logits.

    Returns:
        (X, y): `X` stacks the per-batch embeddings row-wise; `y` is the
        flattened stack of the per-batch labels, in the same order as `X`.
    """
    model.eval()
    emb_chunks, label_chunks = [], []

    with torch.no_grad():
        for mol_batch in loader:
            mol_batch = mol_batch.to(device)
            # Logits are not needed here; only the embedding is kept.
            _, graph_emb = model(
                mol_batch.x,
                mol_batch.edge_index,
                mol_batch.edge_attr,
                mol_batch.batch,
                return_emb=True,
            )
            emb_chunks.append(graph_emb.cpu().numpy())
            label_chunks.append(mol_batch.y.cpu().numpy())

    features = np.vstack(emb_chunks)
    targets = np.vstack(label_chunks).ravel()
    return features, targets

print("\nExtracting embeddings for train and test sets...")
X_train, y_train = get_embeddings(train_loader)
X_test, y_test = get_embeddings(test_loader)

print("Embedding shapes:", X_train.shape, X_test.shape)

# ======================================
# Standardise embeddings for the SVM
# ======================================
# Scaling statistics come from the training embeddings only and are then
# applied unchanged to the test embeddings.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# =====================================================
# Step 2 — Optuna tuning for SVM on top of embeddings
# =====================================================

# Stratified 80/20 hold-out carved from the (already scaled) training
# embeddings. NOTE: the scaler above was fit before this split, so the
# validation rows contributed to the scaling statistics.
X_train_svm, X_val_svm, y_train_svm, y_val_svm = train_test_split(
    X_train_scaled,
    y_train,
    stratify=y_train,
    test_size=0.2,
    random_state=42,
)

def svm_objective(trial):
    """
    Optuna objective: validation ROC-AUC of an SVM trained on the embeddings.

    Search space:
        C      — log-uniform in [1e-2, 1e3]
        kernel — "rbf" or "linear"
        gamma  — log-uniform in [1e-4, 1e1], sampled only for the RBF kernel
                 (the linear kernel ignores gamma, so sampling it there only
                 adds a meaningless dimension to the search).

    Fits on the module-level (X_train_svm, y_train_svm) and scores on
    (X_val_svm, y_val_svm).

    Returns:
        ROC-AUC on the validation split, or 0.5 if the metric cannot be
        computed (roc_auc_score raised ValueError).
    """
    C = trial.suggest_float("C", 1e-2, 1e3, log=True)
    kernel = trial.suggest_categorical("kernel", ["rbf", "linear"])
    if kernel == "rbf":
        gamma = trial.suggest_float("gamma", 1e-4, 1e1, log=True)
    else:
        gamma = "scale"

    # probability=True is deliberately not used here: it adds an internal
    # 5-fold cross-validated calibration to every fit, and ROC-AUC only needs
    # a ranking score, which decision_function provides directly.
    svm = SVC(
        kernel=kernel,
        C=C,
        gamma=gamma,
        class_weight='balanced'
    )

    svm.fit(X_train_svm, y_train_svm)

    val_scores = svm.decision_function(X_val_svm)

    try:
        val_roc = roc_auc_score(y_val_svm, val_scores)
    except ValueError:
        val_roc = 0.5  # fallback if something goes wrong

    return val_roc

print("\nRunning Optuna hyperparameter search for SVM...")

# Seed the sampler so the sequence of suggested hyperparameters is repeatable;
# the global seeds set earlier do not control Optuna's own sampler.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(svm_objective, n_trials=30)  # increase n_trials for deeper search

print("Best SVM params:", study.best_params)
print("Best validation ROC-AUC from Optuna:", study.best_value)

best_params = study.best_params

# Train final SVM on full training embeddings with best params
final_kernel = best_params["kernel"]
final_C = best_params["C"]
# gamma is only meaningful (and only looked up) for the RBF kernel
final_gamma = best_params["gamma"] if final_kernel == "rbf" else "scale"

best_svm = SVC(
    kernel=final_kernel,
    C=final_C,
    gamma=final_gamma,
    probability=True,
    class_weight='balanced',
    # probability=True shuffles data for its internal calibration; fix the
    # seed so the reported probabilities are repeatable.
    random_state=42
)

print("\nTraining final SVM with best hyperparameters on full training embeddings...")
best_svm.fit(X_train_scaled, y_train)

# Evaluate on held-out test set
y_proba = best_svm.predict_proba(X_test_scaled)[:, 1]
y_pred = best_svm.predict(X_test_scaled)

svm_roc = roc_auc_score(y_test, y_proba)
svm_f1 = f1_score(y_test, y_pred)
svm_acc = accuracy_score(y_test, y_pred)
svm_kappa = cohen_kappa_score(y_test, y_pred)
svm_prec = precision_score(y_test, y_pred)
svm_rec = recall_score(y_test, y_pred)
svm_cm = confusion_matrix(y_test, y_pred)

print("\n===== SVM on AttentiveFP Embeddings (Optuna-tuned) =====")
print(f"ROC-AUC   : {svm_roc:.4f}")
print(f"F1        : {svm_f1:.4f}")
print(f"Accuracy  : {svm_acc:.4f}")
print(f"Kappa     : {svm_kappa:.4f}")
print(f"Precision : {svm_prec:.4f}")
print(f"Recall    : {svm_rec:.4f}")
print("Confusion Matrix:\n", svm_cm)


Total valid molecules: 4000
in_channels=9, edge_dim=3
Using device: cuda


  train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
  test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


Epoch 001 | Train Loss: 0.6293, Train ROC-AUC: 0.7037 | Test Loss: 0.6428, Test ROC-AUC: 0.7877, F1: 0.7196, Acc: 0.6162, Prec: 0.5735, Rec: 0.9657
Test Confusion Matrix:
 [[ 99 293]
 [ 14 394]]
Epoch 002 | Train Loss: 0.5904, Train ROC-AUC: 0.7523 | Test Loss: 0.5561, Test ROC-AUC: 0.7948, F1: 0.7439, Acc: 0.7375, Prec: 0.7403, Rec: 0.7475
Test Confusion Matrix:
 [[285 107]
 [103 305]]
Epoch 003 | Train Loss: 0.5698, Train ROC-AUC: 0.7747 | Test Loss: 0.5487, Test ROC-AUC: 0.7999, F1: 0.7520, Acc: 0.7238, Prec: 0.6936, Rec: 0.8211
Test Confusion Matrix:
 [[244 148]
 [ 73 335]]
Epoch 004 | Train Loss: 0.5604, Train ROC-AUC: 0.7833 | Test Loss: 0.5654, Test ROC-AUC: 0.8018, F1: 0.6938, Acc: 0.7175, Prec: 0.7758, Rec: 0.6275
Test Confusion Matrix:
 [[318  74]
 [152 256]]
Epoch 005 | Train Loss: 0.5527, Train ROC-AUC: 0.7932 | Test Loss: 0.5349, Test ROC-AUC: 0.8143, F1: 0.7646, Acc: 0.7475, Prec: 0.7289, Rec: 0.8039
Test Confusion Matrix:
 [[270 122]
 [ 80 328]]
Epoch 006 | Train Loss: 0

[I 2025-12-05 17:17:09,042] A new study created in memory with name: no-name-c4dadd57-1aad-4145-816c-8d0897b07856


Embedding shapes: (3200, 128) (800, 128)

Running Optuna hyperparameter search for SVM...


[I 2025-12-05 17:17:10,813] Trial 0 finished with value: 0.8685788507363568 and parameters: {'C': 0.10019800923586661, 'gamma': 0.0021494938298300325, 'kernel': 'linear'}. Best is trial 0 with value: 0.8685788507363568.
[I 2025-12-05 17:17:12,522] Trial 1 finished with value: 0.8684030626196335 and parameters: {'C': 0.09180941338464514, 'gamma': 0.6012137094838448, 'kernel': 'linear'}. Best is trial 0 with value: 0.8685788507363568.
[I 2025-12-05 17:17:14,523] Trial 2 finished with value: 0.8534708387046369 and parameters: {'C': 0.6936851225123591, 'gamma': 0.004545197215670044, 'kernel': 'rbf'}. Best is trial 0 with value: 0.8685788507363568.
[I 2025-12-05 17:17:16,650] Trial 3 finished with value: 0.8714842376655338 and parameters: {'C': 0.4017557334496838, 'gamma': 0.00017491547783003838, 'kernel': 'rbf'}. Best is trial 3 with value: 0.8714842376655338.
[I 2025-12-05 17:40:36,671] Trial 4 finished with value: 0.867817102230556 and parameters: {'C': 589.8714421965096, 'gamma': 0.0001

Best SVM params: {'C': 1.143774217375179, 'gamma': 0.0004116943377543645, 'kernel': 'linear'}
Best validation ROC-AUC from Optuna: 0.8719383569670691

Training final SVM with best hyperparameters on full training embeddings...

===== SVM on AttentiveFP Embeddings (Optuna-tuned) =====
ROC-AUC   : 0.8444
F1        : 0.7760
Accuracy  : 0.7762
Kappa     : 0.5527
Precision : 0.7928
Recall    : 0.7598
Confusion Matrix:
 [[311  81]
 [ 98 310]]


In [None]:
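# Results from AttentiveFP alone (recorded for comparison)
#
# Train Accuracy : 0.86
# Test Accuracy  : 0.8125
# ROC-AUC        : 0.8134
# F1-Score       : 0.8108
# Cohen's Kappa  : 0.6256
# Precision      : 0.8851
# Recall         : 0.9167
#
# TODO(review): the F1-Score above does not match the listed Precision/Recall
# (their harmonic mean is about 0.90) — re-check which run these numbers came from.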

In [None]:
# ===== SVM on AttentiveFP Embeddings (Optuna-tuned) ===== AID 932
# ROC-AUC   : 0.8463
# F1        : 0.7928
# Accuracy  : 0.7688
# Kappa     : 0.5355
# Precision : 0.7299
# Recall    : 0.8676
# Confusion Matrix:
#  [[261 131]
#  [ 54 354]]