In [2]:
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
Collecting scipy<1.9 (from deepchem)
  Downloading scipy-1.8.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (42.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.2/42.2 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rdkit (from deepchem)
  Downloading rdkit-2023.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m54.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scipy, rdkit, deepchem
  Attempting uninstall: scipy
    Found existing installation: scipy 1.10.1
    Uninstalling scipy-1.10.1:
      Successfully uninstalled sc

In [3]:
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.3.1.tar.gz (661 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m661.6/661.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (pyproject.toml) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.3.1-py3-none-any.whl size=910459 sha256=0ebbb663538ee4e1cdd374d4b126f767a39631888b09303b4aec14d01e6ad1db
  Stored in directory: /root/.cache/pip/wheels/ac/dc/30/e2874821ff308ee67dcd7a66dbde912411e19e35a1addda028
Successfully built torch_geometric
Installing collected packages: torch_geometric
Successfully installed torch_geomet

In [4]:
import pandas as pd
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np 
import os
from tqdm import tqdm
import deepchem as dc
from rdkit import Chem



In [5]:
class MyOwnDataset(Dataset):
    """On-disk PyTorch Geometric dataset built from ``HIV.csv``.

    Every CSV row (columns ``smiles`` and ``HIV_active``) becomes one
    ``Data`` object stored as ``data_<row index>.pt`` in ``processed_dir``.
    Each graph carries:

    * ``x``          -- per-atom feature matrix (float),
    * ``edge_index`` -- 2 x (2 * num_bonds) connectivity (long),
    * ``edge_attr``  -- per-directed-edge feature matrix (float),
    * ``y``          -- the label as a length-1 int64 tensor,
    * ``smiles``     -- the source SMILES string.
    """

    def __init__(self, root, transform=None, pre_transform=None, pre_filter=None):
        # Row-count cache. It has to exist before super().__init__() because
        # the base constructor may already query processed_file_names and
        # call process().
        self._num_molecules = None
        super().__init__(root, transform, pre_transform, pre_filter)

    @property
    def raw_file_names(self):
        return 'HIV.csv'

    @property
    def processed_file_names(self):
        # One file per CSV row. Listing the real file names lets the base
        # class skip process() when everything is already on disk.
        return [f'data_{i}.pt' for i in range(self._count_molecules())]

    def download(self):
        # Nothing is downloaded; the raw CSV is expected to be in `self.raw_dir`.
        pass

    def _count_molecules(self):
        """Return the number of rows in the raw CSV, reading it at most once."""
        if self._num_molecules is None:
            self._num_molecules = len(pd.read_csv(self.raw_paths[0]))
        return self._num_molecules

    def process(self):
        """Convert every CSV row into a ``Data`` object and save it to disk.

        Raises:
            ValueError: if RDKit cannot parse a SMILES string.
        """
        self.data = pd.read_csv(self.raw_paths[0])
        self._num_molecules = len(self.data)

        for index, mol_obj in tqdm(self.data.iterrows(), total=len(self.data)):
            smiles = mol_obj["smiles"]
            mol = Chem.MolFromSmiles(smiles)
            if mol is None:
                # MolFromSmiles returned None; fail with context instead of an
                # AttributeError further down.
                raise ValueError(f"Could not parse SMILES in row {index}: {smiles!r}")

            data = Data(
                x=self.get_node_features(mol),
                # Connectivity goes into edge_index, bond features into
                # edge_attr; both are built in the same bond order so their
                # columns/rows line up.
                edge_index=self.get_adj_matrix(mol),
                edge_attr=self.get_edge_features(mol),
                y=self.get_labels(mol_obj["HIV_active"]),
                smiles=smiles,
            )

            torch.save(data, os.path.join(self.processed_dir, f'data_{index}.pt'))

    def get_node_features(self, mol):
        """Return a float tensor with one row of 9 features per atom."""
        all_node_feat = []

        for atom in mol.GetAtoms():
            all_node_feat.append([
                atom.GetAtomicNum(),            # atomic number
                atom.GetDegree(),               # atom degree
                atom.GetFormalCharge(),         # formal charge
                int(atom.GetHybridization()),   # hybridization (enum as int)
                int(atom.GetIsAromatic()),      # aromaticity flag
                atom.GetTotalNumHs(),           # total number of hydrogens
                atom.GetNumRadicalElectrons(),  # radical electrons
                int(atom.IsInRing()),           # ring membership flag
                int(atom.GetChiralTag()),       # chirality (enum as int)
            ])

        # The explicit view keeps the result 2-D even when there are no atoms.
        return torch.tensor(all_node_feat, dtype=torch.float).view(len(all_node_feat), 9)

    def get_edge_features(self, mol):
        """Return a float tensor with one row of 2 features per directed edge.

        Each bond contributes two identical rows (one per direction), in the
        same order in which ``get_adj_matrix`` emits its index pairs.
        """
        all_edge_feats = []

        for bond in mol.GetBonds():
            edge_feats = [
                bond.GetBondTypeAsDouble(),  # bond type; not always integral
                float(bond.IsInRing()),      # ring membership flag
            ]
            all_edge_feats += [edge_feats, edge_feats]

        # Float dtype keeps fractional bond types intact (an integer dtype
        # would truncate them); the view keeps the result 2-D when bond-less.
        return torch.tensor(all_edge_feats, dtype=torch.float).view(len(all_edge_feats), 2)

    def get_adj_matrix(self, mol):
        """Return the connectivity as a long tensor of shape (2, 2 * num_bonds)."""
        edge_indices = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            # Store both directions of every bond.
            edge_indices += [[i, j], [j, i]]

        pairs = torch.tensor(edge_indices, dtype=torch.long)
        # Passing the row count explicitly (instead of -1) also works for
        # molecules without bonds, giving an empty (2, 0) tensor.
        return pairs.view(len(edge_indices), 2).t().contiguous()

    def get_labels(self, label):
        """Wrap a scalar label in a length-1 int64 tensor."""
        label = np.asarray([label])
        return torch.tensor(label, dtype=torch.int64)

    def len(self):
        # Number of graphs, i.e. CSV rows -- not the length of a file name.
        return self._count_molecules()

    def get(self, idx):
        # NOTE: torch.load unpickles the file; only point this dataset at
        # processed directories produced by process().
        data = torch.load(os.path.join(self.processed_dir, f'data_{idx}.pt'))
        return data

In [6]:
# Build the dataset rooted at "data/"; constructing it may run process()
# (see the "Processing..." output below).
dataset = MyOwnDataset(root="data/")

Processing...
41127it [01:40, 409.00it/s]
Done!


In [7]:
from torch_geometric.nn import GATConv, TopKPooling 
from torch.nn import Linear

In [8]:
from torch_geometric.utils import embedding

class ClassificationHIV(torch.nn.Module):
    """Layer container for a GAT-based HIV classifier.

    Registers three blocks (``layer1``..``layer3``), each holding a
    ``GATConv`` with 4 heads, a ``Linear`` that maps the concatenated heads
    back to the embedding size, and a ``TopKPooling``; plus two dense layers
    (``layer4``, ``layer5``) ending in ``num_classes`` outputs.

    No ``forward`` is defined, so the module cannot be called yet.

    Args:
        feature_size: number of input features per node.
    """

    def __init__(self, feature_size):
        super(ClassificationHIV, self).__init__()
        num_classes = 2
        embedding_size = 1024

        # NOTE(review): all three blocks take `feature_size` inputs and
        # layer4 expects embedding_size * 3 -- presumably three parallel
        # branches whose outputs are concatenated; confirm once forward()
        # is written.
        self.layer1 = self._make_block(feature_size, embedding_size)
        self.layer2 = self._make_block(feature_size, embedding_size)
        self.layer3 = self._make_block(feature_size, embedding_size)

        # Linear layers
        self.layer4 = Linear(embedding_size * 3, 1024)
        self.layer5 = Linear(1024, num_classes)

    @staticmethod
    def _make_block(in_channels, embedding_size, heads=4):
        """Build one GATConv -> Linear -> TopKPooling block."""
        # torch.nn.Sequential serves as a container here. Its own forward
        # passes a single value from module to module, while GATConv also
        # needs edge_index, so the sub-modules have to be called one by one
        # (block[0], block[1], block[2]).
        return torch.nn.Sequential(
            GATConv(in_channels, embedding_size, heads=heads, dropout=0.3),
            Linear(embedding_size * heads, embedding_size),
            TopKPooling(embedding_size, ratio=0.8),
        )








In [9]:
import torch
import torch.nn.functional as F 
from torch.nn import Linear, BatchNorm1d, ModuleList
from torch_geometric.nn import TransformerConv, TopKPooling 
from torch_geometric.nn import global_mean_pool as gap, global_max_pool as gmp
torch.manual_seed(42)

class GNN(torch.nn.Module):
    """Graph transformer network producing one output value per graph.

    An initial ``TransformerConv`` block is followed by ``model_layers``
    further blocks (conv -> linear head merge -> ReLU -> batch norm). After
    selected blocks the graph is coarsened with ``TopKPooling`` and a graph
    level readout (global max pool concatenated with global mean pool) is
    recorded. The readouts are summed and passed through three dense layers.

    Args:
        feature_size: number of input features per node.
        model_params: dict with the keys ``model_embedding_size``,
            ``model_attention_heads``, ``model_layers``,
            ``model_dropout_rate``, ``model_top_k_ratio``,
            ``model_top_k_every_n``, ``model_dense_neurons`` and
            ``model_edge_dim``.

    Raises:
        ValueError: if ``model_layers`` or ``model_top_k_every_n`` is < 1.
    """

    def __init__(self, feature_size, model_params):
        super(GNN, self).__init__()
        embedding_size = model_params["model_embedding_size"]
        n_heads = model_params["model_attention_heads"]
        self.n_layers = model_params["model_layers"]
        dropout_rate = model_params["model_dropout_rate"]
        top_k_ratio = model_params["model_top_k_ratio"]
        self.top_k_every_n = model_params["model_top_k_every_n"]
        dense_neurons = model_params["model_dense_neurons"]
        edge_dim = model_params["model_edge_dim"]

        # forward() needs at least one readout, and the pooling schedule
        # takes a modulo by top_k_every_n.
        if self.n_layers < 1:
            raise ValueError("model_layers must be >= 1")
        if self.top_k_every_n < 1:
            raise ValueError("model_top_k_every_n must be >= 1")

        self.conv_layers = ModuleList([])
        self.transf_layers = ModuleList([])
        self.pooling_layers = ModuleList([])
        self.bn_layers = ModuleList([])

        # Transformation layer
        self.conv1 = TransformerConv(feature_size,
                                     embedding_size,
                                     heads=n_heads,
                                     dropout=dropout_rate,
                                     edge_dim=edge_dim,
                                     beta=True)

        # Merges the concatenated attention heads back to embedding_size.
        self.transf1 = Linear(embedding_size * n_heads, embedding_size)
        self.bn1 = BatchNorm1d(embedding_size)

        # Other layers
        for i in range(self.n_layers):
            self.conv_layers.append(TransformerConv(embedding_size,
                                                    embedding_size,
                                                    heads=n_heads,
                                                    dropout=dropout_rate,
                                                    edge_dim=edge_dim,
                                                    beta=True))

            self.transf_layers.append(Linear(embedding_size * n_heads, embedding_size))
            self.bn_layers.append(BatchNorm1d(embedding_size))
            # Same predicate as in forward(), so there is exactly one pooling
            # layer for every pooling step.
            if self._pools_after(i):
                self.pooling_layers.append(TopKPooling(embedding_size, ratio=top_k_ratio))

        # Linear layers; the input is max-pool and mean-pool concatenated.
        self.linear1 = Linear(embedding_size * 2, dense_neurons)
        self.linear2 = Linear(dense_neurons, dense_neurons // 2)
        self.linear3 = Linear(dense_neurons // 2, 1)

    def _pools_after(self, layer_idx):
        """Return True if block ``layer_idx`` is followed by pooling + readout.

        That is the case for every ``top_k_every_n``-th block and always for
        the last block (index ``n_layers - 1``).
        """
        return layer_idx % self.top_k_every_n == 0 or layer_idx == self.n_layers - 1

    def forward(self, x, edge_attr, edge_index, batch_index):
        """Compute one output value per graph in the batch.

        Args:
            x: node feature matrix.
            edge_attr: edge feature matrix.
            edge_index: graph connectivity.
            batch_index: graph assignment of every node.

        Returns:
            The raw output of the final linear layer (one column).
        """
        # Initial transformation
        x = self.conv1(x, edge_index, edge_attr)
        x = torch.relu(self.transf1(x))
        x = self.bn1(x)

        # Holds the intermediate graph representations
        global_representation = []

        # Position in self.pooling_layers; advanced once per pooling step.
        pool_idx = 0
        for i in range(self.n_layers):
            x = self.conv_layers[i](x, edge_index, edge_attr)
            x = torch.relu(self.transf_layers[i](x))
            x = self.bn_layers[i](x)
            # Pool every top_k_every_n-th block and always after the last one
            if self._pools_after(i):
                x, edge_index, edge_attr, batch_index, _, _ = self.pooling_layers[pool_idx](
                    x, edge_index, edge_attr, batch_index
                )
                pool_idx += 1
                # Add current representation
                global_representation.append(
                    torch.cat([gmp(x, batch_index), gap(x, batch_index)], dim=1)
                )

        x = sum(global_representation)

        # Output block
        x = torch.relu(self.linear1(x))
        x = F.dropout(x, p=0.8, training=self.training)
        x = torch.relu(self.linear2(x))
        x = F.dropout(x, p=0.8, training=self.training)
        x = self.linear3(x)

        return x