### Classification GCN with DeepChem (HIV dataset from MolNet)

Click the badge below to open this notebook in Google Colab.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1r3QAoLsI-k6se1EubeepUs8p0Bqvapb_?usp=sharing)

The DeepChem installation and dataset setup below were taken from the official DeepChem tutorial notebook: [03_Modeling_Solubility.ipynb](https://github.com/deepchem/deepchem/blob/master/examples/tutorials/03_Modeling_Solubility.ipynb)

In [None]:
# Install DeepChem; --pre lets pip pick pre-release (dev) builds.
# The recorded output below shows deepchem-2.7.2.dev20230730200710 and rdkit-2023.3.2;
# pin those versions if the scores in this notebook must be reproduced exactly.
!pip install --pre deepchem

Collecting deepchem
  Downloading deepchem-2.7.2.dev20230730200710-py3-none-any.whl (827 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m827.4/827.4 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit (from deepchem)
  Downloading rdkit-2023.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.7/29.7 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.7.2.dev20230730200710 rdkit-2023.3.2


In [None]:
import deepchem as dc

# Fetch the MolNet HIV benchmark, featurized for graph convolutions.
# The loader's result is unpacked into task names, the dataset splits and the
# transformers that later cells pass to model.evaluate().
hiv_bundle = dc.molnet.load_hiv(featurizer='GraphConv')
tasks, datasets, transformers = hiv_bundle
train_dataset, valid_dataset, test_dataset = datasets

In [None]:
# Display the training split's summary repr (array shapes and task names).
train_dataset

<DiskDataset X.shape: (32901,), y.shape: (32901, 1), w.shape: (32901, 1), task_names: ['HIV_active']>

In [None]:
# One output per task; train DeepChem's stock graph-convolution classifier.
n_tasks = len(tasks)
model = dc.models.GraphConvModel(n_tasks=n_tasks, mode='classification')
# Kept as the cell's last expression so the value returned by fit() is displayed.
model.fit(train_dataset, nb_epoch=50)



0.2730265617370605

In [None]:
# ROC-AUC of the HIV model on each split, printed in train / valid / test order.
metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for label, split in (('Training set score:', train_dataset),
                     ('Validation score:', valid_dataset),
                     ('Test set score:', test_dataset)):
  print(label, model.evaluate(split, [metric], transformers))

Training set score: {'roc_auc_score': 0.9871351990434334}
Validation score: {'roc_auc_score': 0.7909356322261414}
Test set score: {'roc_auc_score': 0.7346443538886422}


## Custom Classification GCN model with DeepChem and Keras (hERG literature dataset)

In [None]:
import pandas as pd

In [None]:
# Read the hERG bioactivity table into pandas for a quick visual check.
# pandas abbreviates the display of a long frame on its own.
df = pd.read_csv("/content/hERG_bioactivity_pIC50.csv")
df

Unnamed: 0,assay_chembl_id,assay_description,canonical_smiles,Source,Name,hERG_uM,Activity,pIC50,hERG_Activity
0,CHEMBL841079,Inhibition of hERG currents Kv11.1,O=C1NCCN1CCN1CCC(c2cn(-c3ccc(F)cc3)c3ccc(Cl)cc...,J Med Chem,CHEMBL12713,0.0140,Yes,7.853872,1
1,CHEMBL691014,K+ channel blocking activity in human embryoni...,O=C(CCCN1CC=C(n2c(=O)[nH]c3ccccc32)CC1)c1ccc(F...,J Med Chem,CHEMBL1108,0.0322,Yes,7.492144,1
2,CHEMBL691014,K+ channel blocking activity in human embryoni...,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,J Med Chem,CHEMBL6966,0.1430,Yes,6.844664,1
3,CHEMBL877203,K+ channel blocking activity in Chinese hamste...,CCCCN(CCCC)CCC(O)c1cc2c(Cl)cc(Cl)cc2c2cc(C(F)(...,J Med Chem,CHEMBL1107,0.1960,Yes,6.707744,1
4,CHEMBL691014,K+ channel blocking activity in human embryoni...,CCOC(=O)N1CCC(=C2c3ccc(Cl)cc3CCc3cccnc32)CC1,J Med Chem,CHEMBL998,0.1730,Yes,6.761954,1
...,...,...,...,...,...,...,...,...,...
2963,CHEMBL5048865,Inhibition of hERG by patch clamp method,CCOP(=O)(Cn1ccc(NC(=O)c2cc(Oc3ccc(S(C)(=O)=O)c...,J Med Chem,CHEMBL5081517,44.0000,No,4.356547,0
2964,CHEMBL5048865,Inhibition of hERG by patch clamp method,CCOP(=O)(Cn1ccc(NC(=O)c2cc(Oc3ccc(S(=O)(=O)N4C...,J Med Chem,CHEMBL5072442,33.0000,No,4.481486,0
2965,CHEMBL5049389,Inhibition of hERG,Cc1nc(C)c([C@H](OC(C)(C)C)C(=O)O)c(N2CCC(C)(C)...,J Med Chem,CHEMBL5093378,0.6700,Yes,6.173925,1
2966,CHEMBL5050750,Inhibition of human ERG,Cc1cnc(Nc2ccnn2C)nc1-c1cc2n(c1)C(=O)N([C@H](CO...,ACS Med Chem Lett,CHEMBL5070887,0.0140,Yes,7.853872,1


In [None]:
# Path of the hERG CSV handed to the DeepChem loader below
# (a plain string literal; the path contains no backslashes, so no raw string is needed).
dataset_file = "/content/hERG_bioactivity_pIC50.csv"
dataset_file

'/content/hERG_bioactivity_pIC50.csv'

In [None]:
# Loading the data from the CSV file:
# - the label column is "hERG_Activity"
# - each molecule is read as a SMILES string from "canonical_smiles"
# `feature_field` is used instead of the deprecated `smiles_field` keyword.
loader = dc.data.CSVLoader(tasks=["hERG_Activity"],
                           feature_field="canonical_smiles",
                           featurizer=dc.feat.ConvMolFeaturizer())




In [None]:
# Featurizing the dataset with the loader's ConvMolFeaturizer.
# create_dataset() is the current API; featurize() is only a deprecated wrapper around it.
dataset = loader.create_dataset(dataset_file)



In [None]:
# Splitter splits the dataset
# In this case it is the equivalent of train_test_split from sklearn
splitter = dc.splits.ScaffoldSplitter()
# frac_test is 0.01 because we only use train and valid as an example.
# NOTE: test_dataset therefore holds only ~1% of the molecules, so any score
# computed on it is very noisy and should not be over-interpreted.
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset,
                                                  frac_train=0.7,
                                                  frac_valid=0.29,
                                                  frac_test=0.01)
# hERG_Activity is a 0/1 class label, so y must NOT be normalized: a
# NormalizationTransformer(transform_y=True) would shift and scale the labels
# away from {0, 1}, which breaks one-hot encoding for classification.
# The short names are kept as plain aliases of the untransformed splits.
train = train_dataset
valid = valid_dataset
test = test_dataset

In [None]:
from deepchem.models.layers import GraphConv, GraphPool, GraphGather
import tensorflow as tf
import tensorflow.keras.layers as layers

# Molecules per batch. Read as a global by both the GraphGather readout in
# MyGraphConvModel and by data_generator, so the two always agree.
batch_size = 100

class MyGraphConvModel(tf.keras.Model):
  """Two-block graph-convolution classifier built from DeepChem Keras layers.

  Layer order in `call`: (GraphConv -> BatchNorm -> GraphPool) twice, then
  Dense -> BatchNorm -> GraphGather readout, then a Dense layer reshaped to
  (n_tasks, 2) and passed through softmax.

  Relies on the notebook-level globals `batch_size` (GraphGather readout) and
  `n_tasks` (number of output tasks).
  """

  def __init__(self):
    super().__init__()
    # First convolution block.
    self.gc1 = GraphConv(128, activation_fn=tf.nn.tanh)
    self.batch_norm1 = layers.BatchNormalization()
    self.gp1 = GraphPool()

    # Second convolution block.
    self.gc2 = GraphConv(128, activation_fn=tf.nn.tanh)
    self.batch_norm2 = layers.BatchNormalization()
    self.gp2 = GraphPool()

    # Dense layer followed by the GraphGather readout.
    self.dense1 = layers.Dense(256, activation=tf.nn.tanh)
    self.batch_norm3 = layers.BatchNormalization()
    self.readout = GraphGather(batch_size=batch_size, activation_fn=tf.nn.tanh)

    # Two outputs (one per class) for every task.
    self.dense2 = layers.Dense(n_tasks*2)
    self.logits = layers.Reshape((n_tasks, 2))
    self.softmax = layers.Softmax()

  def call(self, inputs):
    """Run the forward pass.

    Args:
      inputs: list whose first element feeds the first GraphConv together with
        the rest; `inputs[1:]` is forwarded unchanged to every later graph
        layer. data_generator builds it as
        [atom_features, deg_slice, membership, *degree_adjacency_lists].

    Returns:
      The softmax of the dense output after Reshape((n_tasks, 2)).
    """
    graph_topology = inputs[1:]

    gc1_output = self.gc1(inputs)
    batch_norm1_output = self.batch_norm1(gc1_output)
    gp1_output = self.gp1([batch_norm1_output] + graph_topology)

    gc2_output = self.gc2([gp1_output] + graph_topology)
    # Use the second block's own normalization layer. Reusing batch_norm1 here
    # would share statistics between blocks and leave batch_norm2 unused.
    batch_norm2_output = self.batch_norm2(gc2_output)
    gp2_output = self.gp2([batch_norm2_output] + graph_topology)

    dense1_output = self.dense1(gp2_output)
    batch_norm3_output = self.batch_norm3(dense1_output)
    readout_output = self.readout([batch_norm3_output] + graph_topology)

    logits_output = self.logits(self.dense2(readout_output))
    return self.softmax(logits_output)

In [None]:
# Wrap the custom Keras network in a DeepChem KerasModel with a categorical
# cross-entropy loss. Note that this rebinds the global name `model`.
herg_network = MyGraphConvModel()
herg_loss = dc.models.losses.CategoricalCrossEntropy()
model = dc.models.KerasModel(herg_network, loss=herg_loss)

In [None]:
from deepchem.metrics import to_one_hot
from deepchem.feat.mol_graphs import ConvMol
import numpy as np

def data_generator(dataset, epochs=1):
  """Yield (inputs, labels, weights) batches in the layout MyGraphConvModel consumes.

  Uses the notebook-level globals `batch_size` and `n_tasks`.

  Args:
    dataset: object whose `iterbatches(batch_size, epochs, deterministic=...,
      pad_batches=...)` yields (X, y, w, ids) tuples.
    epochs: number of passes over `dataset`.

  Yields:
    A tuple (inputs, labels, weights) where
      inputs  = [atom_features, deg_slice, membership, *adjacency_lists[1:]],
      labels  = [one-hot y reshaped to (-1, n_tasks, 2)],
      weights = [w] exactly as produced by `iterbatches`.
  """
  batches = dataset.iterbatches(batch_size, epochs,
                                deterministic=False, pad_batches=True)
  for X_b, y_b, w_b, _ids_b in batches:
    multiConvMol = ConvMol.agglomerate_mols(X_b)
    inputs = [multiConvMol.get_atom_features(), multiConvMol.deg_slice, np.array(multiConvMol.membership)]
    # Fetch the adjacency lists once per batch; entry 0 is skipped on purpose.
    inputs.extend(multiConvMol.get_deg_adjacency_lists()[1:])
    labels = [to_one_hot(y_b.flatten(), 2).reshape(-1, n_tasks, 2)]
    weights = [w_b]
    yield (inputs, labels, weights)

In [None]:
# Train the custom model on batches from data_generator (50 passes over train_dataset).
# Kept as the cell's last expression so the value returned by fit_generator is displayed
# (presumably a training-loss figure - confirm against the DeepChem docs).
model.fit_generator(data_generator(train_dataset, epochs=50))

0.02855067014694214

In [None]:
# Score the custom hERG model with ROC-AUC on each split.
# The metric is created here and an empty transformer list is passed explicitly:
# no transformer was applied to these hERG splits, so this cell must not depend on
# the `metric` / `transformers` objects left over from the HIV section above.
herg_metric = dc.metrics.Metric(dc.metrics.roc_auc_score)
for label, split in (('Training set score:', train_dataset),
                     ('Validation set score:', valid_dataset),
                     ('Test set score:', test_dataset)):
  print(label, model.evaluate_generator(data_generator(split), [herg_metric], []))


Training set score: {'roc_auc_score': 0.8785116073148835}
Validation set score: {'roc_auc_score': 0.7048509190063641}
Test set score: {'roc_auc_score': 0.8786525974025974}
