## Deepchem practical tutorial

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import deepchem as dc
dc.__version__

'2.5.0'

In [3]:
import tensorflow as tf
tf.__version__

'2.6.0'

In [4]:
from tensorflow import split

In [5]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ""

In [6]:
import numpy as np
dataset = dc.data.NumpyDataset(np.ones((2,2)))
for x, y, w, id in dataset.itersamples():
    print(x.tolist(), y.tolist(), w.tolist(), id)


[1.0, 1.0] [0.0] [0.0] 0
[1.0, 1.0] [0.0] [0.0] 1


In [7]:
x = np.random.random((6, 10))
y = np.random.random((6, 1))
dataset = dc.data.NumpyDataset(x,y)
dataset.X, dataset.y

(array([[0.98061288, 0.51613009, 0.64889066, 0.20825363, 0.32777987,
         0.61200768, 0.26747545, 0.8585619 , 0.06894629, 0.91007398],
        [0.50289433, 0.7670384 , 0.62159224, 0.81093639, 0.79424752,
         0.46826205, 0.69337994, 0.50866602, 0.97735135, 0.69990091],
        [0.6140219 , 0.37558648, 0.51959826, 0.44720134, 0.08134524,
         0.21439663, 0.08637916, 0.89782651, 0.42676526, 0.87740057],
        [0.10296489, 0.14044087, 0.91670406, 0.38471363, 0.05389896,
         0.50735557, 0.41653522, 0.64925473, 0.05373749, 0.420119  ],
        [0.07105486, 0.67035387, 0.80185969, 0.44948472, 0.67762173,
         0.95557751, 0.46046668, 0.36845699, 0.26362487, 0.54592717],
        [0.89588203, 0.46052511, 0.01911249, 0.25006961, 0.6460712 ,
         0.0469692 , 0.40661947, 0.61423298, 0.19020875, 0.86404968]]),
 array([[0.91793046],
        [0.65706537],
        [0.6120886 ],
        [0.14507993],
        [0.97935215],
        [0.65168843]]))

In [8]:
splitter = dc.splits.RandomSplitter()
train_dataset, test_dataset = splitter.train_test_split(dataset)
train_dataset, test_dataset

(<NumpyDataset X.shape: (4, 10), y.shape: (4, 1), w.shape: (4, 1), ids: [5 2 0 3], task_names: [0]>,
 <NumpyDataset X.shape: (2, 10), y.shape: (2, 1), w.shape: (2, 1), ids: [4 1], task_names: [0]>)

In [9]:
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)
train_dataset, valid_dataset, test_dataset

(<NumpyDataset X.shape: (4, 10), y.shape: (4, 1), w.shape: (4, 1), ids: [3 2 4 0], task_names: [0]>,
 <NumpyDataset X.shape: (1, 10), y.shape: (1, 1), w.shape: (1, 1), ids: [5], task_names: [0]>,
 <NumpyDataset X.shape: (1, 10), y.shape: (1, 1), w.shape: (1, 1), ids: [1], task_names: [0]>)

For predicting toxicity, we will use the Tox21 toxicity dataset from MoleculeNet and we will use DeepChem to load the required dataset.

In [10]:
tox21_tasks, tox21_datasets, transformers = dc.molnet.load_tox21()

In [11]:
train_dataset, valid_dataset, test_dataset = tox21_datasets

In [12]:
train_dataset.ids

array(['CC(O)(P(=O)(O)O)P(=O)(O)O',
       'CC(C)(C)OOC(C)(C)CCC(C)(C)OOC(C)(C)C',
       'OC[C@H](O)[C@@H](O)[C@H](O)CO', ...,
       'O=C1OC(OC(=O)c2cccnc2Nc2cccc(C(F)(F)F)c2)c2ccccc21',
       'CC(=O)C1(C)CC2=C(CCCC2(C)C)CC1C',
       'CC(C)CCC[C@@H](C)[C@H]1CC(=O)C2=C3CC[C@H]4C[C@@H](O)CC[C@]4(C)[C@H]3CC[C@@]21C'],
      dtype=object)

In [13]:
print(tox21_tasks)
print(len(tox21_tasks))

['NR-AR', 'NR-AR-LBD', 'NR-AhR', 'NR-Aromatase', 'NR-ER', 'NR-ER-LBD', 'NR-PPAR-gamma', 'SR-ARE', 'SR-ATAD5', 'SR-HSE', 'SR-MMP', 'SR-p53']
12


In [14]:
print(tox21_datasets)

(<DiskDataset X.shape: (6264, 1024), y.shape: (6264, 12), w.shape: (6264, 12), task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>, <DiskDataset X.shape: (783, 1024), y.shape: (783, 12), w.shape: (783, 12), ids: ['N#C[C@@H]1CC(F)(F)CN1C(=O)CNC1CC2CCC(C1)N2c1ncccn1'
 'CN(C)C(=O)NC1(c2ccccc2)CCN(CCC[C@@]2(c3ccc(Cl)c(Cl)c3)CCCN(C(=O)c3ccccc3)C2)CC1'
 'CSc1nnc(C(C)(C)C)c(=O)n1N' ...
 'O=C(O[C@H]1CN2CCC1CC2)N1CCc2ccccc2[C@@H]1c1ccccc1'
 'C#C[C@]1(O)CC[C@H]2[C@@H]3CCC4=CC(=O)CC[C@@H]4[C@H]3C(=C)C[C@@]21CC'
 'NC(=O)C(c1ccccc1)(c1ccccc1)[C@@H]1CCN(CCc2ccc3c(c2)CCO3)C1'], task_names: ['NR-AR' 'NR-AR-LBD' 'NR-AhR' ... 'SR-HSE' 'SR-MMP' 'SR-p53']>, <DiskDataset X.shape: (784, 1024), y.shape: (784, 12), w.shape: (784, 12), ids: ['CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O.c1ccc(CNCCNCc2ccccc2)cc1'
 'CC(C)(c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1)c1ccc(Oc2ccc3c(c2)C(=O)OC3=O)cc1'
 'Cc1cc(C(C)(C)C)c(O)c(C)c1Cn1c(=O)n

In [16]:
transformers

[<deepchem.trans.transformers.BalancingTransformer at 0x7f290547aa10>]

In [17]:
print(train_dataset.X.shape, valid_dataset.X.shape)
print(np.shape(train_dataset.y), np.shape(valid_dataset.y))

(6264, 1024) (783, 1024)
(6264, 12) (783, 12)


In [18]:
print(train_dataset.w.shape,
      np.count_nonzero(train_dataset.w),
      np.count_nonzero(train_dataset.w == 0))

(6264, 12) 63647 11521


Теперь попробуем создать и обучить модель

In [19]:
model = dc.models.MultitaskClassifier(n_tasks=12, n_features=1024, layer_sizes=[1000]) #задаем алгоритм
model.fit(train_dataset, nb_epoch=10) #обучаем модель
#определяем метрики, по которым будем оценивать качество модели
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
train_scores = model.evaluate(train_dataset, [metric], transformers)
test_scores = model.evaluate(test_dataset, [metric], transformers)

2021-10-27 22:48:01.010526: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2021-10-27 22:48:01.010626: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (mariia-G3-3590): /proc/driver/nvidia/version does not exist
2021-10-27 22:48:01.011948: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2021-10-27 22:48:01.603726: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


In [20]:
print(f'AUC-ROC for train dataset: {train_scores}')
print(f'AUC-ROC for test dataset: {test_scores}')

AUC-ROC for train dataset: {'mean-roc_auc_score': 0.9589079279769371}
AUC-ROC for test dataset: {'mean-roc_auc_score': 0.6835421842194602}


Тут видно, что модель переобучаеся и отрабатывает не очень хорошо на тестовой выборке. 

In [21]:
tasks, datasets, transformers = dc.molnet.load_delaney(featurizer='GraphConv')

In [22]:
train_dataset, valid_dataset, test_dataset = datasets

In [23]:
model = dc.models.GraphConvModel(n_tasks=1, mode='regression', dropout=0.2)
model.fit(train_dataset, nb_epoch=100)

0.10784532546997071

In [24]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print(model.evaluate(train_dataset, [metric], transformers))
print(model.evaluate(test_dataset, [metric], transformers))

{'pearson_r2_score': 0.9205461853034781}
{'pearson_r2_score': 0.6820242335644608}


In [25]:
smiles = ['COC(C)(C)CCCC(C)CC=CC(C)=CC(=O)OC(C)C',
'CCOC(=O)CC',
'CSc1nc(NC(C)C)nc(NC(C)C)n1',
'CC(C#C)N(C)C(=O)Nc1ccc(Cl)cc1',
'Cc1cc2ccccc2cc1C']

In [26]:
from rdkit import Chem
mols = [Chem.MolFromSmiles(s) for s in smiles]
featurizer = dc.feat.ConvMolFeaturizer()
x = featurizer.featurize(mols)
predicted_solubility = model.predict_on_batch(x)
predicted_solubility

array([[-0.550318  ],
       [ 1.4608353 ],
       [ 0.44491947],
       [-0.5343154 ],
       [-1.209742  ]], dtype=float32)