In [1]:
# Install RDKit (Standard library for Cheminformatics)
!pip install rdkit

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem

print("Libraries installed and imported successfully.")

Collecting rdkit
  Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (4.2 kB)
Downloading rdkit-2025.9.3-cp312-cp312-manylinux_2_28_x86_64.whl (36.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m36.4/36.4 MB[0m [31m28.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rdkit
Successfully installed rdkit-2025.9.3
Libraries installed and imported successfully.


In [2]:
# Function to convert a chemical SMILES string into a Graph
def smiles_to_graph(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None: return None

    # 1. Node Features (Map Atom types to numbers: C=1, N=2, O=3, etc.)
    # This allows the AI to distinguish between Carbon and Oxygen
    atom_map = {'C': 1, 'N': 2, 'O': 3, 'S': 4, 'F': 5, 'Cl': 6}
    nodes = []
    for atom in mol.GetAtoms():
        symbol = atom.GetSymbol()
        nodes.append(atom_map.get(symbol, 0)) # 0 = Unknown atom (like P or Br)

    # 2. Edge Features (Adjacency Matrix)
    # This matrix tells the model which atoms are connected to which
    adj = Chem.GetAdjacencyMatrix(mol)

    # Convert to PyTorch tensors (formats the data for the neural network)
    return torch.tensor(nodes, dtype=torch.long), torch.tensor(adj, dtype=torch.float)

# Test the function with Aspirin to ensure it works
nodes, adj = smiles_to_graph("CC(=O)Oc1ccccc1C(=O)O")
print(f"Aspirin Graph Created: {len(nodes)} Atoms.")
print(f"Node Features: {nodes}")

Aspirin Graph Created: 13 Atoms.
Node Features: tensor([1, 1, 3, 3, 1, 1, 1, 1, 1, 1, 1, 3, 3])


In [3]:
# Define the Graph Neural Network Architecture
class DrugAffinityGNN(nn.Module):
    def __init__(self, vocab_size=10, embed_dim=16):
        super(DrugAffinityGNN, self).__init__()

        # Embedding Layer: Converts simple atom numbers (1, 2) into rich vectors
        self.embedding = nn.Embedding(vocab_size, embed_dim)

        # Graph Convolution Layers (The "GNN" part)
        # These layers aggregate information from neighboring atoms
        self.fc1 = nn.Linear(embed_dim, 32)
        self.fc2 = nn.Linear(32, 16)

        # Output Layer (Predicts the Binding Affinity Score)
        self.output = nn.Linear(16, 1)

    def forward(self, nodes, adj):
        # 1. Embed the atoms
        x = self.embedding(nodes) # [Num_Atoms, Embed_Dim]

        # 2. Message Passing (Graph Convolution)
        # Formula: New_State = Activation( Adjacency * Old_State * Weights )
        x = torch.matmul(adj, x)
        x = F.relu(self.fc1(x))

        x = torch.matmul(adj, x) # Second hop (looking at neighbors of neighbors)
        x = F.relu(self.fc2(x))

        # 3. Global Pooling (Readout)
        # Average all atom vectors to get a single vector representing the whole molecule
        x = torch.mean(x, dim=0)

        # 4. Predict Affinity
        return self.output(x)

model = DrugAffinityGNN()
print(model)

DrugAffinityGNN(
  (embedding): Embedding(10, 16)
  (fc1): Linear(in_features=16, out_features=32, bias=True)
  (fc2): Linear(in_features=32, out_features=16, bias=True)
  (output): Linear(in_features=16, out_features=1, bias=True)
)


In [4]:
# Mock Dataset: Drug SMILES and their Binding Affinity (pKd values)
# In a real project, you would load thousands of these from a CSV file (e.g., BindingDB)
dataset = [
    ("CC(=O)Oc1ccccc1C(=O)O", 6.5), # Aspirin
    ("CN1C=NC2=C1C(=O)N(C(=O)N2C)C", 5.2), # Caffeine
    ("CC(C)CC1=CC=C(C=C1)C(C)C(=O)O", 7.1), # Ibuprofen
    ("CCO", 1.2), # Ethanol (Weak binding)
    ("c1ccccc1", 2.0) # Benzene (Toxic/Weak)
]

# Optimization setup
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss() # Mean Squared Error (Standard for regression)

print("Starting GNN Training...")
model.train()

# Train for 50 epochs
for epoch in range(50):
    total_loss = 0
    for smiles, label in dataset:
        # Convert data
        nodes, adj = smiles_to_graph(smiles)
        target = torch.tensor([label], dtype=torch.float)

        # Forward pass
        optimizer.zero_grad()
        prediction = model(nodes, adj)

        # Backward pass (Learning)
        loss = criterion(prediction, target)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    if (epoch+1) % 10 == 0:
        print(f"Epoch {epoch+1}: Loss = {total_loss/len(dataset):.4f}")

print("Training Complete. Model successfully learned from molecular graphs.")

Starting GNN Training...
Epoch 10: Loss = 2.2723
Epoch 20: Loss = 1.7478
Epoch 30: Loss = 1.2089
Epoch 40: Loss = 1.1370
Epoch 50: Loss = 1.1219
Training Complete. Model successfully learned from molecular graphs.
