# Example Application of Rational Neural Graph Fingerprints to Predict Water Solubility of Molecules 
# Delaney Data Set

In [5]:
# import packages

import sys
import numpy as np
import pandas as pd
import tensorflow as tf

from rdkit import Chem
from tensorflow.keras.layers import Input, Dense, Add
from tensorflow.keras import models

from rational_neural_graph_fingerprints.auxiliary_functions_graph_tensorisation import tensorise_smiles
from rational_neural_graph_fingerprints.tf_keras_layers_rational_neural_graph_convolutions import RationalNeuralFingerprintOutput, DeepRationalNeuralFingerprintHidden

In [7]:
# load and prepare data

filepath = 'data_delaney.csv'
delaney_df = pd.read_csv(filepath, delimiter = ',')

smiles = delaney_df.values[:,9]
labels = np.array(delaney_df.values[:,1], dtype = np.float32)
labels = np.reshape(labels, (len(labels),1))

print("Smiles = ", smiles.shape)
print("Labels = ", labels.shape, type(labels[0][0]))
display(delaney_df.head())

#print(labels)
print(np.mean(labels[:]))
print(np.std(labels[:]))
#rint(labels)

Smiles =  (1128,)
Labels =  (1128, 1) <class 'numpy.float32'>


Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles
0,Amigdalin,-0.974,1,457.432,7,3,7,202.32,-0.77,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...
1,Fenfuram,-2.885,1,201.225,1,2,2,42.24,-3.3,Cc1occc1C(=O)Nc2ccccc2
2,citral,-2.579,1,152.237,0,0,4,17.07,-2.06,CC(C)=CCCC(C)=CC(=O)
3,Picene,-6.618,2,278.354,0,5,0,0.0,-7.87,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43
4,Thiophene,-2.232,2,84.143,0,1,0,0.0,-1.33,c1ccsc1


-2.9881923
1.6824735


In [9]:
# tensorise smiles strings of molecules

(X_atoms, X_bonds, X_edges, X_atoms_existence) = tensorise_smiles(smiles, max_degree = None)
print('Atoms:', X_atoms.shape, type(X_atoms[0][0][0]))
print('Bonds:', X_bonds.shape, type(X_bonds[0][0][0][0]))
print('Edges:', X_edges.shape, type(X_edges[0][0][0]))
print('Atoms_Existence:', X_atoms_existence.shape, type(X_atoms_existence[0][0]))

Atoms: (1128, 55, 62) <class 'numpy.float32'>
Bonds: (1128, 55, 4, 6) <class 'numpy.float32'>
Edges: (1128, 55, 4) <class 'numpy.float32'>
Atoms_Existence: (1128, 55) <class 'numpy.float32'>


In [10]:
# load dimensional sizes from data shape

num_molecules = X_atoms.shape[0]
max_atoms = X_atoms.shape[1]
max_degree = X_bonds.shape[2]
num_atom_features = X_atoms.shape[-1]
num_bond_features = X_bonds.shape[-1]

print(" Number of Molecules = ", num_molecules,"\n", 
      "Maximum Number of Atoms in a Molecule = ",  max_atoms,"\n",  
      "Maximum Atom Degree = ", max_degree,"\n",  
      "Number of Atom Features = ", num_atom_features,"\n",  
      "Number of Bond Features = ", num_bond_features)

 Number of Molecules =  1128 
 Maximum Number of Atoms in a Molecule =  55 
 Maximum Atom Degree =  4 
 Number of Atom Features =  62 
 Number of Bond Features =  6


In [17]:
# set hyperparameters

conv_width = 62 #  output dimension of neural networks associated with hidden graph convolutional layers

output_hidden_length = 68 # dimension of the hidden layer of neural networks associated with output graph convolutional layers
output_fp_length = 68 # output dimension of neural networks associated with output graph convolutional layers

In [18]:
# define the input layers

atoms_0 = Input(name = 'atom_inputs', shape = (max_atoms, num_atom_features))
bonds = Input(name = 'bond_inputs', shape = (max_atoms, max_degree, num_bond_features))
edges = Input(name = 'edge_inputs', shape = (max_atoms, max_degree))
atoms_existence = Input(name = 'atoms_existence_inputs', shape=(max_atoms,))

In [19]:
# define the convoluted atom feature layers

atoms_1 = DeepRationalNeuralFingerprintHidden(conv_width)([atoms_0, bonds, edges, atoms_existence])
atoms_2 = DeepRationalNeuralFingerprintHidden(conv_width)([atoms_1, bonds, edges, atoms_existence])

In [20]:
# define the output layers for each convoluted atom featuer layer (layerwise neural fingerprints)

fp_out_0 = RationalNeuralFingerprintOutput(output_hidden_length, output_fp_length)([atoms_0, bonds, edges, atoms_existence])
fp_out_1 = RationalNeuralFingerprintOutput(output_hidden_length, output_fp_length)([atoms_1, bonds, edges, atoms_existence])
fp_out_2 = RationalNeuralFingerprintOutput(output_hidden_length, output_fp_length)([atoms_2, bonds, edges, atoms_existence])

In [21]:
# sum outputs to obtain fingerprint

final_fp = Add()([fp_out_0, fp_out_1, fp_out_2])

In [22]:
# define neural machinery on top of neural fingerprints

intermediate_prediction = Dense(30, activation = tf.keras.activations.relu, use_bias = True, name = 'intermediate_prediction')(final_fp)
main_prediction = Dense(1, activation = 'linear', use_bias = True, name = 'main_prediction')(intermediate_prediction)

In [23]:
# build and compile model for regression.

model = models.Model(inputs = [atoms_0, bonds, edges, atoms_existence], outputs = [main_prediction])
model.compile(optimizer = tf.keras.optimizers.Adam(learning_rate=0.001), loss = 'mse')

In [24]:
# show summary

model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
atom_inputs (InputLayer)        [(None, 55, 62)]     0                                            
__________________________________________________________________________________________________
bond_inputs (InputLayer)        [(None, 55, 4, 6)]   0                                            
__________________________________________________________________________________________________
edge_inputs (InputLayer)        [(None, 55, 4)]      0                                            
__________________________________________________________________________________________________
atoms_existence_inputs (InputLa [(None, 55)]         0                                            
______________________________________________________________________________________________

In [25]:
# train the model

model.fit([X_atoms, X_bonds, X_edges, X_atoms_existence], labels, epochs = 10, batch_size = 2**6, validation_split = 0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f77480482d0>