Click the badge below to open this notebook in Google Colab.

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/drive/1r3QAoLsI-k6se1EubeepUs8p0Bqvapb_?usp=sharing)

The DeepChem and dataset setup below was taken from the official DeepChem tutorial: [Modeling Solubility](https://github.com/deepchem/deepchem/blob/master/examples/tutorials/03_Modeling_Solubility.ipynb)

In [1]:
# Installing conda
# Download DeepChem's colab_install.py script and save it locally as conda_installer.py
!curl -Lo conda_installer.py https://raw.githubusercontent.com/deepchem/deepchem/master/scripts/colab_install.py
# Import the script that was just downloaded and run its install() entry point
import conda_installer
conda_installer.install()
# List the conda environments to confirm that the installation is present
!/root/miniconda/bin/conda info -e

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0100  3457  100  3457    0     0  14888      0 --:--:-- --:--:-- --:--:-- 14900


add /root/miniconda/lib/python3.10/site-packages to PYTHONPATH
INFO:conda_installer:add /root/miniconda/lib/python3.10/site-packages to PYTHONPATH
python version: 3.10.6
INFO:conda_installer:python version: 3.10.6
fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
INFO:conda_installer:fetching installer from https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
done
INFO:conda_installer:done
installing miniconda to /root/miniconda
INFO:conda_installer:installing miniconda to /root/miniconda
done
INFO:conda_installer:done
installing openmm, pdbfixer
INFO:conda_installer:installing openmm, pdbfixer
added conda-forge to channels
INFO:conda_installer:added conda-forge to channels
done
INFO:conda_installer:done
conda packages installation finished!
INFO:conda_installer:conda packages installation finished!


Error while loading conda entry point: conda-libmamba-solver (libarchive.so.19: cannot open shared object file: No such file or directory)
# conda environments:
#
base                     /root/miniconda



In [8]:
# Installing Deepchem
!pip3 install --pre deepchem




In [10]:
!pip3 install pyopenssl

Collecting pyopenssl
  Downloading pyOpenSSL-23.2.0-py3-none-any.whl (59 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.0/59.0 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cryptography!=40.0.0,!=40.0.1,<42,>=38.0.0 (from pyopenssl)
  Downloading cryptography-41.0.2-cp37-abi3-manylinux_2_28_x86_64.whl (4.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.3/4.3 MB[0m [31m47.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cryptography, pyopenssl
  Attempting uninstall: cryptography
    Found existing installation: cryptography 3.4.8
    Uninstalling cryptography-3.4.8:
      Successfully uninstalled cryptography-3.4.8
Successfully installed cryptography-41.0.2 pyopenssl-23.2.0


In [1]:
import deepchem



In [13]:
import pandas as pd


In [64]:
# NOTE: despite its name, `df` holds the path of the CSV file (a plain
# string), not a pandas DataFrame. The bare name below displays it.
df = "/content/hERG_bioactivity_pIC50.csv"
df

'/content/hERG_bioactivity_pIC50.csv'

In [65]:
# CSV loader for the bioactivity file:
#   - the single prediction task is the "pIC50" column,
#   - the "canonical_smiles" column is the input handed to the featurizer,
#   - ConvMolFeaturizer produces the features used by the graph-conv model.
# `feature_field` is used because `smiles_field` is a deprecated keyword
# of CSVLoader.
loader = deepchem.data.CSVLoader(
    tasks=["pIC50"],
    feature_field="canonical_smiles",
    featurizer=deepchem.feat.ConvMolFeaturizer(),
)



In [66]:
# Load the CSV file located at `df` and featurize it with the loader.
# create_dataset() is the current DataLoader API; featurize() is only a
# deprecated wrapper around it.
dataset = loader.create_dataset(df)



## Introducing Random Splitter

In [67]:
# Fixed seed so the random split, and every score derived from it,
# can be reproduced on a fresh run.
SPLIT_SEED = 42
# RandomSplitter assigns samples to the splits at random; in this case it
# is an equivalent of train_test_split from sklearn.
splitter = deepchem.splits.RandomSplitter()
# frac_test is only 0.01 because this example mainly relies on the train
# and valid splits.
train, valid, test = splitter.train_valid_test_split(dataset,
                                                      frac_train=0.7,
                                                      frac_valid=0.29,
                                                      frac_test=0.01,
                                                      seed=SPLIT_SEED)
# The normalizer is built from the training split only and then applied
# to the y values of all three splits.
normalizer = deepchem.trans.NormalizationTransformer(transform_y=True,
                                                     dataset=train,
                                                     move_mean=True)
train = normalizer.transform(train)
valid = normalizer.transform(valid)
test = normalizer.transform(test)

In [68]:
# Report how many samples ended up in the train and valid splits,
# then show the summary repr of the small test split.
n_train = len(train.ids)
n_valid = len(valid.ids)
print(f"Size of the training data: {n_train}")
print(f"Size of the validation data: {n_valid}")
print(test)

Size of the training data: 2077
Size of the validation data: 861
<DiskDataset X.shape: (30,), y.shape: (30, 1), w.shape: (30, 1), ids: ['Fc1ccc(N(CCNCCCc2ccccc2)c2ccc(F)cc2)cc1'
 'CC(C)(C)c1ccc(C(O)CCCN2CCC(C(O)(c3ccccc3)c3ccccc3)CC2)cc1'
 'COc1ccc(CN2CCC(NC(=O)c3cc(=O)c4ccc(F)cc4o3)CC2)cc1F' ...
 'C[C@H](c1ccnc(Cl)c1)N1[C@@H]2CC[C@H]1C[C@H](Oc1cccc(C(N)=O)c1)C2'
 'COc1cc2ccc(C#N)cc2cc1[C@@H](c1cccc(F)c1)[C@@](O)(CCN(C)C)c1cccc2ccoc12'
 'CC[C@@H]1Sc2ccccc2O[C@@H]1c1ccc(OCCCN2CCCC2)cc1'], task_names: ['pIC50']>


In [48]:
# GraphConvModel: a graph neural network following
# Duvenaud, David K., et al. "Convolutional networks on graphs for
# learning molecular fingerprints."
from deepchem.models import GraphConvModel

# One regression task (pIC50), trained in batches of 50.
graph_conv = GraphConvModel(n_tasks=1, mode="regression", batch_size=50)

# Evaluation metric: Pearson R^2. Closer to 1 is better.
metric = deepchem.metrics.Metric(deepchem.metrics.pearson_r2_score)

In [54]:
# Train the model on the training split.
# The fit() call stays the last expression of the cell, so the value it
# returns is displayed as the cell output.
N_EPOCHS = 40
graph_conv.fit(train, nb_epoch=N_EPOCHS)

0.06582967827959758

In [55]:
# Score the model on all three splits; passing the normalizer reverses
# the y transformation before the metric is computed.
train_scores, valid_scores, test_scores = (
    graph_conv.evaluate(split, [metric], [normalizer])
    for split in (train, valid, test)
)
print(f"Train Scores: {train_scores}")
print(f"Validation Scores: {valid_scores}")
print(f"Test Scores: {test_scores}")

Train Scores: {'pearson_r2_score': 0.949459289695439}
Validation Scores: {'pearson_r2_score': 0.656088669990449}
Test Scores: {'pearson_r2_score': 0.8579418751698057}


## Introducing Scaffold splitter

In [59]:
    # NOTE(review): this cell body is indented by four spaces; consider
    # dedenting it so it also parses outside IPython.
    # ScaffoldSplitter builds the splits from molecular scaffolds instead
    # of sampling molecules at random.
    splitter = deepchem.splits.ScaffoldSplitter()
    # frac_test is only 0.01 because this example mainly relies on the
    # train and valid splits.
    split_fractions = dict(frac_train=0.7, frac_valid=0.29, frac_test=0.01)
    train, valid, test = splitter.train_valid_test_split(dataset,
                                                         **split_fractions)
    # The normalizer is built from the training split only and then
    # applied to the y values of all three splits.
    normalizer = deepchem.trans.NormalizationTransformer(
        transform_y=True, dataset=train, move_mean=True)
    train, valid, test = (
        normalizer.transform(split) for split in (train, valid, test)
    )

In [60]:
# Report how many samples ended up in the train and valid splits,
# then show the summary repr of the small test split.
n_train = len(train.ids)
n_valid = len(valid.ids)
print(f"Size of the training data: {n_train}")
print(f"Size of the validation data: {n_valid}")
print(test)

Size of the training data: 2077
Size of the validation data: 861
<DiskDataset X.shape: (30,), y.shape: (30, 1), w.shape: (30, 1), ids: ['CN(C(=O)CC[C@@H](C1CCCCC1)N1Cc2cc(Oc3ccccc3)ccc2N=C1N)C1CCCCC1'
 'N[C@H]1CN(c2ccn3cnnc3n2)CC[C@@H]1c1cc(F)c(F)cc1F.O=C(O)C(F)(F)F'
 'N[C@H]1CN(c2ccc3nc(=O)ccn3n2)CC[C@@H]1c1cc(F)c(F)cc1F.O=C(O)C(F)(F)F'
 ... 'CS(=O)(=O)Nc1ccc2c(c1)C(=O)CC1(CCN(CCc3ccc4nonc4c3)CC1)O2'
 'CN(C)CCCn1nc(C2=C(c3cn(-c4ccc5ccccc5c4)c4ccccc34)C(=O)NC2=O)c2ccccc21'
 'CN(C)CCN1C(=O)C(NC(=O)CCc2ccc(Cl)cc2Cl)N=C(c2ccccc2)c2ccccc21'], task_names: ['pIC50']>


In [61]:
# GraphConvModel: a graph neural network following
# Duvenaud, David K., et al. "Convolutional networks on graphs for
# learning molecular fingerprints."
from deepchem.models import GraphConvModel

# A fresh model instance: one regression task (pIC50), batches of 50.
graph_conv = GraphConvModel(n_tasks=1, mode="regression", batch_size=50)

# Evaluation metric: Pearson R^2. Closer to 1 is better.
metric = deepchem.metrics.Metric(deepchem.metrics.pearson_r2_score)


In [62]:
# Train the model on the training split.
# The fit() call stays the last expression of the cell, so the value it
# returns is displayed as the cell output.
N_EPOCHS = 40
graph_conv.fit(train, nb_epoch=N_EPOCHS)

0.10503836870193481

In [63]:
# Score the model on all three splits; passing the normalizer reverses
# the y transformation before the metric is computed.
train_scores, valid_scores, test_scores = (
    graph_conv.evaluate(split, [metric], [normalizer])
    for split in (train, valid, test)
)
print(f"Train Scores: {train_scores}")
print(f"Validation Scores: {valid_scores}")
print(f"Test Scores: {test_scores}")

Train Scores: {'pearson_r2_score': 0.9107839407669827}
Validation Scores: {'pearson_r2_score': 0.3206801291348099}
Test Scores: {'pearson_r2_score': 0.1780441434805936}
