In [1]:
from rdkit import Chem
import numpy as np
import pandas as pd
from IPython.display import display
import tqdm
import torch
import deepchem as dc

2022-10-27 15:23:50.678863: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-27 15:23:50.818617: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-27 15:23:50.818636: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-10-27 15:23:50.840029: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2022-10-27 15:23:51.681698: W tensorflow/stream_executor/platform/de

# Train a embedding model on ChemBL

In [2]:
%%time
# Load raw dataset
chembl_tasks, datasets, transformers = dc.molnet.load_chembl(shard_size=2000, featurizer="raw", set="5thresh", splitter="random")
train_dataset, valid_dataset, test_dataset = datasets

CPU times: user 8.16 ms, sys: 9 µs, total: 8.17 ms
Wall time: 7.07 ms


In [3]:
%%time
# Featurize the input
f = dc.feat.MolGraphConvFeaturizer(use_edges=True, use_partial_charge=True)
y = train_dataset.y

CPU times: user 37.6 ms, sys: 41 ms, total: 78.5 ms
Wall time: 77.7 ms


In [4]:
%%time

# Train the MPNN model
model = dc.models.torch_models.MPNNModel(len(chembl_tasks), number_atom_features=31, number_bond_features=11)

print(model.model)

model.fit(dc.data.NumpyDataset(f.featurize(train_dataset.X), y)) # On original dataset

MPNN(
  (model): MPNNPredictor(
    (gnn): MPNNGNN(
      (project_node_feats): Sequential(
        (0): Linear(in_features=31, out_features=64, bias=True)
        (1): ReLU()
      )
      (gnn_layer): NNConv(
        (edge_func): Sequential(
          (0): Linear(in_features=11, out_features=128, bias=True)
          (1): ReLU()
          (2): Linear(in_features=128, out_features=4096, bias=True)
        )
      )
      (gru): GRU(64, 64)
    )
    (readout): Set2Set(
      n_iters=6
      (lstm): LSTM(128, 64, num_layers=3)
    )
    (predict): Sequential(
      (0): Linear(in_features=128, out_features=64, bias=True)
      (1): ReLU()
      (2): Linear(in_features=64, out_features=691, bias=True)
    )
  )
)




CPU times: user 2h 14min 47s, sys: 20min 27s, total: 2h 35min 14s
Wall time: 29min 15s


0.8006814956665039

In [7]:
# Lets see some scores (rmse)
avg_rms = dc.metrics.Metric(dc.metrics.rms_score, np.mean)
model.evaluate(dc.data.NumpyDataset(f.featurize(train_dataset.X), y), [avg_rms], transformers), model.evaluate(dc.data.NumpyDataset(f.featurize(valid_dataset.X), valid_dataset.y), [avg_rms], transformers), model.evaluate(dc.data.NumpyDataset(f.featurize(test_dataset.X), test_dataset.y), [avg_rms], transformers)

({'mean-rms_score': 0.29822712602210644},
 {'mean-rms_score': 0.2937193354348491},
 {'mean-rms_score': 0.2922992680009213})

# Output of 1 sample for qualitative comparision
list(zip(train_dataset.y[:1].flatten(), model.predict(dc.data.NumpyDataset(x[:1], y[:1])).flatten()))

%%time

# Train the MPNN model
model = dc.models.torch_models.MPNNModel(len(chembl_tasks), number_atom_features=31, number_bond_features=11)

print(model.model)

# There are single atom molecules in the actions - For these, featurizer does not work - so adding H's so each atom has at least some neighbors
X = []
for mol in tqdm.tqdm(train_dataset.X):
    X.append(Chem.AddHs(mol))
X = np.array(X)
x = f.featurize(X)

# train
model.fit(dc.data.NumpyDataset(x, y)) 

# Lets see some scores (rmse)
avg_rms = dc.metrics.Metric(dc.metrics.rms_score, np.mean)
model.evaluate(dc.data.NumpyDataset(x, y), [avg_rms], transformers), \
    model.evaluate(dc.data.NumpyDataset(f.featurize(np.vectorize(Chem.AddHs)(valid_dataset.X)), valid_dataset.y), [avg_rms], transformers), \
    model.evaluate(dc.data.NumpyDataset(f.featurize(np.vectorize(Chem.AddHs)(test_dataset.X)), test_dataset.y), [avg_rms], transformers)

In [9]:
# Output of 1 sample for qualitative comparision
list(zip(train_dataset.y[:1].flatten(), model.predict(dc.data.NumpyDataset(f.featurize(train_dataset.X[:1]), y[:1])).flatten()))

[(-0.007236693326090918, 0.046405684),
 (-0.04238371855537214, 0.03333094),
 (0.0, 0.0042083673),
 (0.0, 0.0054020616),
 (-0.027899843079487056, -0.1623196),
 (-0.025025194419930924, 0.087228),
 (-0.01908282090230921, -0.12722534),
 (-0.02155178296682597, 0.17189683),
 (0.0, 0.00073351525),
 (-0.022514736706494138, 0.0722889),
 (-0.023869481191052172, -0.18381195),
 (0.0, 0.0032516294),
 (0.0, 0.00078660273),
 (-0.017423504840988296, -0.01792328),
 (-0.00723669332609096, -0.0012517273),
 (-0.025991172597553413, 0.1282811),
 (0.0, 0.0006487537),
 (0.0, 0.005099844),
 (0.0, 0.00057454384),
 (-0.010232547811979919, 0.049526636),
 (-0.01601282273079471, -0.025450686),
 (-0.010137734473236647, 0.33755314),
 (0.0, -0.003937754),
 (-0.06908539966106848, 0.17600611),
 (-0.0846528929503079, 0.16919325),
 (-0.014448120136706365, 0.5802632),
 (-0.014447497539225454, -0.061601836),
 (-0.007236693326090872, -0.11938442),
 (-0.07556022955680562, -0.14868255),
 (-0.07963641257843566, 0.052914195),
 (

# Dump the model(s)

In [10]:
from torch import nn
import dgl

class MPNNMolEmbedder(nn.Module):
    """MPNN embedder."""
    def __init__(self, gnn, readout):
        super(MPNNMolEmbedder, self).__init__()

        self.gnn = gnn
        self.readout = readout

    def _prepare_batch(self, g):
        dgl_graphs = [graph.to_dgl_graph() for graph in g]
        inputs = dgl.batch(dgl_graphs).to("cpu")
        return inputs
        
    def forward(self, g):
        """Graph-level regression/soft classification.

        Parameters
        ----------
        g : GraphData
            GraphData for a batch of graphs.

        Returns
        -------
        graph embeddings
        """
        dgl_g = self._prepare_batch(g)
        node_feats = self.gnn(dgl_g, dgl_g.ndata["x"], dgl_g.edata["edge_attr"])
        graph_feats = self.readout(dgl_g, node_feats)
        return graph_feats

class MPNNAtomEmbedder(nn.Module):
    """MPNN embedder."""
    def __init__(self, gnn):
        super(MPNNAtomEmbedder, self).__init__()
        self.gnn = gnn

    def _prepare_batch(self, g):
        dgl_graphs = [graph.to_dgl_graph() for graph in g]
        inputs = dgl.batch(dgl_graphs).to("cpu")
        return inputs
        
    def forward(self, g, idx):
        """Graph-level regression/soft classification.

        Parameters
        ----------
        g : GraphData
            GraphData for a batch of graphs.

        Returns
        -------
        graph embeddings
        """
        dgl_g = self._prepare_batch(g)
        node_feats = self.gnn(dgl_g, dgl_g.ndata["x"], dgl_g.edata["edge_attr"])
        return node_feats[idx]

In [11]:
mol_embedder = MPNNMolEmbedder(*list(model.model.model.children())[:2])
atom_embedder = MPNNAtomEmbedder(*list(model.model.model.children())[:1])

In [12]:
mol_embedder

MPNNMolEmbedder(
  (gnn): MPNNGNN(
    (project_node_feats): Sequential(
      (0): Linear(in_features=31, out_features=64, bias=True)
      (1): ReLU()
    )
    (gnn_layer): NNConv(
      (edge_func): Sequential(
        (0): Linear(in_features=11, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=4096, bias=True)
      )
    )
    (gru): GRU(64, 64)
  )
  (readout): Set2Set(
    n_iters=6
    (lstm): LSTM(128, 64, num_layers=3)
  )
)

In [13]:
atom_embedder

MPNNAtomEmbedder(
  (gnn): MPNNGNN(
    (project_node_feats): Sequential(
      (0): Linear(in_features=31, out_features=64, bias=True)
      (1): ReLU()
    )
    (gnn_layer): NNConv(
      (edge_func): Sequential(
        (0): Linear(in_features=11, out_features=128, bias=True)
        (1): ReLU()
        (2): Linear(in_features=128, out_features=4096, bias=True)
      )
    )
    (gru): GRU(64, 64)
  )
)

In [18]:
atom_embedder([f.featurize(train_dataset.X[4])], 1).shape

AttributeError: 'numpy.ndarray' object has no attribute 'to_dgl_graph'

In [19]:
torch.save(mol_embedder, "models/MPNNMolEmbedder.pt")
torch.save(atom_embedder, "models/MPNNAtomEmbedder.pt")

# Load the model and test on a new molecule

In [20]:
import deepchem as dc
import torch

# Featurizer
f = dc.feat.MolGraphConvFeaturizer(use_edges=True, use_partial_charge=True)

# Model
mol_em_model = torch.load("models/MPNNMolEmbedder.pt")
atom_em_model = torch.load("models/MPNNAtomEmbedder.pt")

def mol_to_embedding(mol):
    features = f.featurize([mol])[0]
    return mol_em_model([features])[0]

def atom_to_embedding(mol, idx):
    features = f.featurize([mol])[0]
    return atom_em_model([features], idx)

mol_to_embedding(Chem.MolFromSmiles("CCCCCC")), atom_to_embedding(Chem.MolFromSmiles("CCCCCC"), 5)

(tensor([ 0.2059, -0.0849, -0.1006, -0.0044, -0.1132,  0.0636, -0.0562, -0.0880,
          0.0564, -0.1513,  0.6869, -0.0258,  0.1665, -0.0613, -0.0625,  0.0389,
          0.1549,  0.2592, -0.0081, -0.1675,  0.0320,  0.2885, -0.4807,  0.2448,
          0.1672, -0.1394, -0.2062, -0.1849,  0.3588, -0.0253,  0.2217,  0.0475,
         -0.0602, -0.0502,  0.1159,  0.5457,  0.1212,  0.0366, -0.1384, -0.2877,
          0.0331,  0.2224,  0.1219,  0.1996, -0.2527, -0.3921,  0.1214,  0.0367,
         -0.2402, -0.4625,  0.2983,  0.2036,  0.0237,  0.0604, -0.2049,  0.7599,
         -0.1182, -0.0570, -0.5632,  0.3133,  0.1270,  0.0536,  0.1903,  0.4422,
         -0.9176, -0.3247, -0.4847, -0.9910, -0.0241,  0.1806,  0.0437, -0.9601,
          0.2674,  0.9859,  0.9639, -0.4737, -0.0225, -0.9927, -0.2939, -0.5575,
         -0.7356,  0.0216,  0.8634,  0.2863, -0.9869, -0.4371,  0.9459, -0.7089,
         -0.5666,  0.9137, -0.1022, -0.3242, -0.9025,  0.6342, -0.4937,  0.6653,
         -0.6451, -0.9403, -