<a href="https://colab.research.google.com/github/Jahan08/Ambertools-CP2K-MM-QM-Biomolecular-Simulation/blob/main/GCN-AI-hERG-data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [48]:
! pip3 install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.4/29.4 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.5


In [50]:
! pip3 install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910476 sha256=0039a275bdcbeb7c26b3416b5de988a057254bd230f5a283eec9d4dac67d1d35
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geomet

In [51]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import MolFromSmiles
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import functional as F
from torch_geometric.nn import MessagePassing
from torch_geometric.utils import add_self_loops, degree
from torch_geometric.data import Data
from tqdm import tqdm

In [52]:
df_url = 'https://github.com/Jahan08/Amber-tutorial/raw/main/hERG_all.csv'
df = pd.read_csv(df_url)
df

Unnamed: 0,Name,assay_chembl_id,assay_description,Source,hERG_uM,Activity,Canonical_Smiles
0,CHEMBL12713,CHEMBL841079,Inhibition of hERG currents Kv11.1,J Med Chem,0.0140,Yes,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...
1,CHEMBL1108,CHEMBL691014,K+ channel blocking activity in human embryoni...,J Med Chem,0.0322,Yes,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...
2,CHEMBL6966,CHEMBL691014,K+ channel blocking activity in human embryoni...,J Med Chem,0.1430,Yes,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...
3,CHEMBL1107,CHEMBL877203,K+ channel blocking activity in Chinese hamste...,J Med Chem,0.1960,Yes,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...
4,CHEMBL998,CHEMBL691014,K+ channel blocking activity in human embryoni...,J Med Chem,0.1730,Yes,CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1
...,...,...,...,...,...,...,...
2319,CHEMBL5081517,CHEMBL5048865,Inhibition of hERG by patch clamp method,J Med Chem,44.0000,No,CCOP(=O)(Cn1ccc(NC(=O)c2cc(Oc3ccc(S(C)(=O)=O)c...
2320,CHEMBL5072442,CHEMBL5048865,Inhibition of hERG by patch clamp method,J Med Chem,33.0000,No,CCOP(=O)(Cn1ccc(NC(=O)c2cc(Oc3ccc(S(=O)(=O)N4C...
2321,CHEMBL5093378,CHEMBL5049389,Inhibition of hERG,J Med Chem,0.6700,Yes,Cc1nc(C)c([C@H](OC(C)(C)C)C(=O)O)c(N2CCC(C)(C)...
2322,CHEMBL5070887,CHEMBL5050750,Inhibition of human ERG,ACS Med Chem Lett,0.0140,Yes,Cc1cnc(Nc2ccnn2C)nc1-c1cc2n(c1)C(=O)N([C@H](CO...


In [53]:
df['Molecule'] = df['Canonical_Smiles'].apply(lambda x: MolFromSmiles(x))

# Calculate ECFP6 molecular descriptors
df['ECFP6'] = df['Molecule'].apply(lambda x: AllChem.GetMorganFingerprintAsBitVect(x, 3, nBits=2048))

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(df['ECFP6'], df['Activity'], test_size=0.2, random_state=42)

In [54]:
# Define a custom PyTorch dataset
class MoleculeDataset(Dataset):
    def __init__(self, descriptors, labels):
        self.descriptors = descriptors
        self.labels = labels

    def __len__(self):
        return len(self.descriptors)

    def __getitem__(self, idx):
        X = torch.tensor(self.descriptors[idx], dtype=torch.float)
        y = torch.tensor(self.labels[idx], dtype=torch.float)
        return X, y


In [55]:
# Instantiate the custom dataset and create dataloaders
train_dataset = MoleculeDataset(X_train, y_train)
test_dataset = MoleculeDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [56]:
# Define the GNN model
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(2048, 64)
        self.conv2 = GCNConv(64, 32)
        self.fc1 = nn.Linear(32, 16)
        self.fc2 = nn.Linear(16, 1)

    def forward(self, data):
        x, edge_index = data.x, data.edge_index

        # Add self-loops and calculate degrees
        edge_index, _ = add_self_loops(edge_index, num_nodes=x.size(0))
        row, col = edge_index
        deg = degree(col, x.size(0), dtype=x.dtype)
        deg_inv_sqrt = deg.pow(-0.5)
        norm = deg_inv_sqrt[row] * deg_inv_sqrt[col]

        # First GCN layer
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)

        # Second GCN layer
        x = F.relu(self.conv2(x, edge_index))
        x = F.dropout(x, training=self.training)

        # Average pooling
        x = torch.mean(x, dim=0)

        # Fully connected layers
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)

        return x

# Instantiate the GNN model
model = Net()

# Define the optimizer and loss function
optimizer = optim.Adam

NameError: ignored