In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu
!pip install torch-scatter torch-sparse torch-geometric
!pip install rdkit-pypi pandas scikit-learn

Looking in indexes: https://download.pytorch.org/whl/cpu
INFO: pip is looking at multiple versions of torch to determine which version is compatible with other requirements. This could take a while.
Collecting torch
  Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl.metadata (26 kB)
Downloading https://download.pytorch.org/whl/cpu/torch-2.6.0%2Bcpu-cp311-cp311-linux_x86_64.whl (178.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.7/178.7 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch
  Attempting uninstall: torch
    Found existing installation: torch 2.6.0+cu124
    Uninstalling torch-2.6.0+cu124:
      Successfully uninstalled torch-2.6.0+cu124
Successfully installed torch-2.6.0+cpu
Collecting torch-scatter
  Downloading torch_scatter-2.1.2.tar.gz (108 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m108.0/108.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[

In [None]:
from rdkit.Chem import AllChem
from torch_geometric.data import Data
import torch
import os
!pip install numpy==1.23.5
import numpy as np



In [None]:
import pandas as pd
tox21_url = "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/tox21.csv.gz"
df = pd.read_csv(tox21_url)

df = df.dropna(subset=["smiles"]).reset_index(drop=True)

In [None]:
tox21_targets = [
    "NR-AR", "NR-AR-LBD", "NR-AhR", "NR-Aromatase",
    "NR-ER", "NR-ER-LBD", "NR-PPAR-gamma", "SR-ARE",
    "SR-ATAD5", "SR-HSE", "SR-MMP", "SR-p53"
]
from rdkit import Chem

In [None]:
def smiles_to_graph(smiles, label_row):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    mol = Chem.AddHs(mol)

    x = torch.tensor([atom.GetAtomicNum() for atom in mol.GetAtoms()], dtype=torch.float).unsqueeze(1)

    edge_index = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_index += [[i, j], [j, i]]

    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()

    y = torch.tensor([label_row[t] if not np.isnan(label_row[t]) else -1 for t in tox21_targets], dtype=torch.float)

    return Data(x=x, edge_index=edge_index, y=y)

In [None]:
graph_list = []
for i, row in df.iterrows():
    graph = smiles_to_graph(row["smiles"], row)
    if graph:
        graph_list.append(graph)

print(f"Loaded {len(graph_list)} molecular graphs.")



Loaded 7831 molecular graphs.


In [None]:
from torch_geometric.loader import DataLoader
from sklearn.metrics import roc_auc_score
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, global_mean_pool
import torch.nn as nn
import torch

In [None]:
graph_list = [g for g in graph_list if (g.y != -1).any()]

In [None]:
torch.manual_seed(42)
perm = torch.randperm(len(graph_list))
train_split = int(0.8 * len(graph_list))
val_split = int(0.9 * len(graph_list))

In [None]:
train_graphs = [graph_list[i] for i in perm[:train_split]]
val_graphs = [graph_list[i] for i in perm[train_split:val_split]]
test_graphs = [graph_list[i] for i in perm[val_split:]]

train_loader = DataLoader(train_graphs, batch_size=32, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=32)
test_loader = DataLoader(test_graphs, batch_size=32)

In [None]:
class ToxGNN(nn.Module):
    def __init__(self, num_tasks):
        super().__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 64)
        self.lin = nn.Linear(64, num_tasks)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        return self.lin(x)

In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ToxGNN(num_tasks=12).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = torch.nn.BCEWithLogitsLoss()

In [None]:
def train():
    model.train()
    total_loss = 0
    for data in train_loader:
        data = data.to(device)
        out = model(data)               # [batch_size, 12]
        target = data.y                 # [batch_size, 12]

        out = out.view(-1)              # Shape: [batch_size * 12]
        target = target.view(-1)        # Same shape

        mask = target != -1
        out_masked = out[mask]
        target_masked = target[mask]

        loss = criterion(out_masked, target_masked)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        total_loss += loss.item()

    return total_loss / len(train_loader)

In [None]:
from sklearn.metrics import roc_auc_score
import numpy as np

def evaluate(loader):
    model.eval()
    y_true = [[] for _ in range(12)]
    y_scores = [[] for _ in range(12)]

    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = torch.sigmoid(model(data))  # [batch_size, 12]
            target = data.y.view(-1, 12)       # Make sure it's [batch_size, 12]

            for task in range(12):
                mask = target[:, task] != -1
                if mask.sum() == 0:
                    continue
                y_true[task] += target[mask, task].cpu().numpy().tolist()
                y_scores[task] += out[mask, task].cpu().numpy().tolist()

    aucs = []
    for t in range(12):
        if len(set(y_true[t])) < 2:
            aucs.append(np.nan)
        else:
            aucs.append(roc_auc_score(y_true[t], y_scores[t]))
    return aucs


In [None]:
for epoch in range(1, 11):
    loss = train()
    val_aucs = evaluate(val_loader)
    avg_auc = np.nanmean(val_aucs)
    print(f"Epoch {epoch:02d} | Loss: {loss:.4f} | Avg Val ROC-AUC: {avg_auc:.4f}")

Epoch 01 | Loss: 0.3212 | Avg Val ROC-AUC: 0.4143
Epoch 02 | Loss: 0.2743 | Avg Val ROC-AUC: 0.4141
Epoch 03 | Loss: 0.2717 | Avg Val ROC-AUC: 0.4139
Epoch 04 | Loss: 0.2670 | Avg Val ROC-AUC: 0.4126
Epoch 05 | Loss: 0.2611 | Avg Val ROC-AUC: 0.4214
Epoch 06 | Loss: 0.2563 | Avg Val ROC-AUC: 0.5277
Epoch 07 | Loss: 0.2542 | Avg Val ROC-AUC: 0.5555
Epoch 08 | Loss: 0.2542 | Avg Val ROC-AUC: 0.5651
Epoch 09 | Loss: 0.2532 | Avg Val ROC-AUC: 0.5787
Epoch 10 | Loss: 0.2532 | Avg Val ROC-AUC: 0.5972


**Upgrading the GNN architecture to GINConv**

In [None]:
from torch_geometric.nn import GCNConv, GINConv, global_mean_pool
from torch_geometric.loader import DataLoader
from sklearn.metrics import roc_auc_score

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GINConv, global_mean_pool
from torch_geometric.loader import DataLoader
from sklearn.metrics import roc_auc_score
import numpy as np

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

class ToxGCN(nn.Module):
    def __init__(self, num_tasks):
        super().__init__()
        self.conv1 = GCNConv(1, 64)
        self.conv2 = GCNConv(64, 64)
        self.lin = nn.Linear(64, num_tasks)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        return self.lin(x)

class ToxGIN(nn.Module):
    def __init__(self, num_tasks):
        super().__init__()
        nn1 = nn.Sequential(nn.Linear(1, 64), nn.ReLU(), nn.Linear(64, 64))
        self.conv1 = GINConv(nn1)
        nn2 = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 64))
        self.conv2 = GINConv(nn2)
        self.lin = nn.Linear(64, num_tasks)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = F.relu(self.conv1(x, edge_index))
        x = F.relu(self.conv2(x, edge_index))
        x = global_mean_pool(x, batch)
        return self.lin(x)

def train(model, loader, optimizer, criterion):
    model.train()
    total_loss = 0
    for data in loader:
        data = data.to(device)
        optimizer.zero_grad()
        out = model(data)

        y = data.y.to(torch.float32)
        mask = y != -1

        y = y.view(out.shape)
        mask = mask.view(out.shape)

        loss = criterion(out[mask], y[mask])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(loader)

def evaluate(model, loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for data in loader:
            data = data.to(device)
            out = model(data)

            y = data.y.to(torch.float32)
            y = y.view(out.shape)

            mask = y != -1
            y_true.append(y[mask].cpu())
            y_pred.append(out[mask].cpu())

    y_true = torch.cat(y_true, dim=0).numpy()
    y_pred = torch.cat(y_pred, dim=0).numpy()

    if len(y_true.shape) == 1:
        y_true = y_true[:, np.newaxis]
        y_pred = y_pred[:, np.newaxis]

    aucs = []
    for i in range(y_true.shape[1]):
        try:
            auc = roc_auc_score(y_true[:, i], y_pred[:, i])
            aucs.append(auc)
        except ValueError:
            continue

    return sum(aucs) / len(aucs) if aucs else 0.0

def run_experiment(ModelClass, name):
    model = ModelClass(num_tasks=12).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.BCEWithLogitsLoss()

    for epoch in range(1, 21):
        loss = train(model, train_loader, optimizer, criterion)
        val_auc = evaluate(model, val_loader)
        print(f"[{name}] Epoch {epoch:02d} | Loss: {loss:.4f} | Val AUC: {val_auc:.4f}")

    test_auc = evaluate(model, test_loader)
    print(f"[{name}] Test ROC-AUC: {test_auc:.4f}\n")
    return model, test_auc

model_gcn, auc_gcn = run_experiment(ToxGCN, "GCN")
model_gin, auc_gin = run_experiment(ToxGIN, "GIN")

print(f"GCN Test ROC-AUC: {auc_gcn:.4f}")
print(f"GIN Test ROC-AUC: {auc_gin:.4f}")


[GCN] Epoch 01 | Loss: 0.3330 | Val AUC: 0.5944
[GCN] Epoch 02 | Loss: 0.2756 | Val AUC: 0.5929
[GCN] Epoch 03 | Loss: 0.2718 | Val AUC: 0.6137
[GCN] Epoch 04 | Loss: 0.2685 | Val AUC: 0.6204
[GCN] Epoch 05 | Loss: 0.2644 | Val AUC: 0.6340
[GCN] Epoch 06 | Loss: 0.2599 | Val AUC: 0.6455
[GCN] Epoch 07 | Loss: 0.2557 | Val AUC: 0.6676
[GCN] Epoch 08 | Loss: 0.2543 | Val AUC: 0.6820
[GCN] Epoch 09 | Loss: 0.2538 | Val AUC: 0.6778
[GCN] Epoch 10 | Loss: 0.2535 | Val AUC: 0.6811
[GCN] Epoch 11 | Loss: 0.2528 | Val AUC: 0.6855
[GCN] Epoch 12 | Loss: 0.2530 | Val AUC: 0.6839
[GCN] Epoch 13 | Loss: 0.2531 | Val AUC: 0.6840
[GCN] Epoch 14 | Loss: 0.2526 | Val AUC: 0.6863
[GCN] Epoch 15 | Loss: 0.2525 | Val AUC: 0.6902
[GCN] Epoch 16 | Loss: 0.2533 | Val AUC: 0.6924
[GCN] Epoch 17 | Loss: 0.2522 | Val AUC: 0.6918
[GCN] Epoch 18 | Loss: 0.2519 | Val AUC: 0.6894
[GCN] Epoch 19 | Loss: 0.2515 | Val AUC: 0.6880
[GCN] Epoch 20 | Loss: 0.2510 | Val AUC: 0.6930
[GCN] Test ROC-AUC: 0.7040

[GIN] Epoch 

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!ls /content/drive/MyDrive/ai_lab_testing

01_baseline_deepchem.ipynb  02_gnn_baseline.ipynb  03_gnn_advanced_features.ipynb


In [3]:
!git config --global user.name "MreegendraNarayan"
!git config --global user.email "mreegendra2003211@gmail.com"

In [8]:
from getpass import getpass
token = getpass('Enter your GitHub Personal Access Token: ')

Enter your GitHub Personal Access Token: ··········


In [9]:
!git clone https://{token}@github.com/MreegendraNarayan/Toxicity_prediction.git

Cloning into 'Toxicity_prediction'...
