In [1]:
import numpy as np
import pandas as pd

import tensorflow as tf
import nfp

from tqdm import tqdm
tqdm.pandas()

# Seed the NumPy and TensorFlow global random generators up front. The data
# split further down already uses a fixed random_state; without these seeds the
# weight initialisation and dataset shuffling still differ from run to run.
SEED = 1
np.random.seed(SEED)
tf.random.set_seed(SEED)

# Record the library versions this notebook was executed with
print(f"tensorflow {tf.__version__}")
print(f"nfp {nfp.__version__}")

2022-02-01 14:48:44.517500: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /nopt/slurm/current/lib:
2022-02-01 14:48:44.517539: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


tensorflow 2.4.1
nfp 0.3.8


In [2]:
# Load the input data, here YSI (10.1016/j.combustflame.2017.12.005).
# The first CSV column is a saved row index (it otherwise shows up as a data
# column named "Unnamed: 0"), so read it back as the index.
ysi = pd.read_csv('../data/ysi_xtb.csv', index_col=0)
ysi.head()

Unnamed: 0,Species,CAS,Ref,Type,YSI,YSI_err,SMILES,xtbjson
0,"1-ethynyl-2,5-dimethylbenzene",74331-70-7,1.0,aromatic,512.7,19.6,C#Cc1cc(C)ccc1C,519175_3342b2_bd_coord.json
1,2-methylindene,2177-47-1,1.0,aromatic,500.1,19.1,CC1=Cc2ccccc2C1,437628_fd0a51_bd_coord.json
2,azulene,275-51-4,2.0,aromatic,492.3,19.0,c1ccc2cccc-2cc1,435907_1c36e8_bd_coord.json
3,1-ethynyl-2-methylbenzene,766-47-2,1.0,aromatic,485.0,18.5,C#Cc1ccccc1C,22307_0_bd_coord.json
4,"1-ethenyl-2,5-dimethylbenzene",2039-89-6,1.0,aromatic,469.3,17.8,C=Cc1cc(C)ccc1C,509114_ad5711_bd_coord.json


In [3]:
# Shuffle the (SMILES, xtbjson) rows once with a fixed seed, then carve the
# shuffled frame into validation (first 50 rows), test (next 50 rows) and
# training (everything that is left) sets
shuffled = ysi[['SMILES', 'xtbjson']].sample(frac=1., random_state=1)
valid = shuffled.iloc[:50]
test = shuffled.iloc[50:100]
train = shuffled.iloc[100:]
len(train), len(valid), len(test)

(134, 50, 50)

In [6]:
# Define how to featurize the input molecules
from nfp.preprocessing.xtb_preprocessor import xTBSmilesPreprocessor
from nfp.preprocessing.features import get_ring_size


def atom_featurizer(atom):
    """Return a string representing the atom type.

    The string is the ``str()`` of a 5-tuple: element symbol, aromatic flag,
    the result of ``get_ring_size(atom, max_size=6)``, degree, and the total
    hydrogen count with neighbors included.
    """
    symbol = atom.GetSymbol()
    aromatic = atom.GetIsAromatic()
    ring_size = get_ring_size(atom, max_size=6)
    degree = atom.GetDegree()
    hydrogens = atom.GetTotalNumHs(includeNeighbors=True)

    descriptor = (symbol, aromatic, ring_size, degree, hydrogens)
    return str(descriptor)


def bond_featurizer(bond, flipped=False):
    """Return a string representing the bond type.

    ``flipped`` selects which 'direction' the bond edge points: when true the
    end atom's symbol is written first, otherwise the begin atom's symbol.
    The result is "<a>-<b> <bond type>", followed by " R<ring size>" when the
    bond is in a ring.
    """
    if flipped:
        first, second = bond.GetEndAtom(), bond.GetBeginAtom()
    else:
        first, second = bond.GetBeginAtom(), bond.GetEndAtom()
    atoms = f"{first.GetSymbol()}-{second.GetSymbol()}"

    btype = str(bond.GetBondType())

    ring = ''
    if bond.IsInRing():
        ring = f"R{get_ring_size(bond, max_size=6)}"

    # strip() drops the trailing space left behind by an empty ring label
    return f"{atoms} {btype} {ring}".strip()


# Preprocessor that classifies atoms and bonds with the two featurizers above
preprocessor = xTBSmilesPreprocessor(
    atom_features=atom_featurizer,
    bond_features=bond_featurizer,
)

In [7]:
# Initially, the preprocessor has no data on atom types, so we have to loop over the
# training set once to pre-allocate these mappings
print("before pre-allocating")
print(preprocessor.atom_tokenizer._data)

# Only the side effect on the preprocessor matters here, so the returned
# feature dictionaries are discarded. Distinct loop-variable names keep this
# cell from overwriting the `smiles` / `jsonfile` example defined further down.
for train_smiles, train_json in zip(train.SMILES, train.xtbjson):
    preprocessor(train_smiles, jsonfile='../data/json/' + train_json, train=True)

print()
print("after pre-allocating")
print(preprocessor.atom_tokenizer._data)

before pre-allocating
{'unk': 1}


2022-02-01 14:48:58.336917: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-02-01 14:48:58.337520: W tensorflow/stream_executor/platform/default/dso_loader.cc:60] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /nopt/slurm/current/lib:
2022-02-01 14:48:58.337550: W tensorflow/stream_executor/cuda/cuda_driver.cc:326] failed call to cuInit: UNKNOWN ERROR (303)
2022-02-01 14:48:58.337590: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (el1): /proc/driver/nvidia/version does not exist
2022-02-01 14:48:58.338118: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F FMA
To enable them in other 


after pre-allocating
{'unk': 1, "('C', False, 0, 3, 2)": 2, "('C', False, 0, 3, 0)": 3, "('C', False, 0, 4, 3)": 4, "('C', False, 0, 4, 2)": 5, "('C', False, 0, 4, 0)": 6, "('H', False, 0, 1, 0)": 7, "('O', False, 0, 2, 1)": 8, "('O', False, 0, 2, 0)": 9, "('O', False, 0, 1, 0)": 10, "('C', False, 0, 3, 1)": 11, "('C', False, 0, 4, 1)": 12, "('C', True, 'max', 3, 0)": 13, "('C', True, 'max', 3, 1)": 14, "('C', False, 'max', 3, 0)": 15, "('C', False, 'max', 4, 2)": 16, "('C', False, 'max', 4, 1)": 17, "('C', False, 5, 3, 0)": 18, "('C', False, 5, 3, 1)": 19, "('C', False, 5, 4, 2)": 20, "('C', True, 5, 3, 0)": 21, "('C', True, 5, 3, 1)": 22, "('O', True, 5, 2, 0)": 23, "('C', False, 'max', 3, 1)": 24, "('C', False, 0, 2, 0)": 25, "('C', False, 5, 4, 1)": 26, "('C', False, 0, 2, 1)": 27, "('O', False, 5, 2, 0)": 28}


In [8]:
# Main input types for a SMILES-based prediction
smiles = 'C=C(C)CC(C)(C)C'
jsonfile = '../data/json/4921_0_bd_coord.json'

# Atom types, as integer classes.
# train=False is the flag the validation/test pipelines in this notebook use;
# an inspection call should not run the preprocessor in training mode
# (presumably the mode in which new token classes get registered -- confirm
# against the nfp preprocessor documentation).
preprocessor(smiles, jsonfile=jsonfile, train=False)['atom']

array([2, 3, 4, 5, 6, 4, 4, 4, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7], dtype=int32)

In [9]:
# Bond types, as integer classes.
# train=False: inspecting one molecule should not run the preprocessor in
# training mode (presumably what lets it register new classes -- confirm).
preprocessor(smiles, jsonfile=jsonfile, train=False)['bond']

array([2, 3, 3, 2, 2, 2, 2, 3, 3, 3, 2, 2, 3, 3, 2, 2, 2, 2, 2, 3, 3, 3,
       2, 3, 3, 3, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3], dtype=int32)

In [10]:
# A connectivity array, where row i indicates bond i connects atom j to atom k.
# train=False: inspecting one molecule should not run the preprocessor in
# training mode (presumably what lets it register new classes -- confirm).
preprocessor(smiles, jsonfile=jsonfile, train=False)['connectivity']

array([[ 0,  1],
       [ 0,  8],
       [ 0,  9],
       [ 1,  0],
       [ 1,  2],
       [ 1,  3],
       [ 2,  1],
       [ 2, 10],
       [ 2, 11],
       [ 2, 12],
       [ 3,  1],
       [ 3,  4],
       [ 3, 13],
       [ 3, 14],
       [ 4,  3],
       [ 4,  5],
       [ 4,  6],
       [ 4,  7],
       [ 5,  4],
       [ 5, 15],
       [ 5, 16],
       [ 5, 17],
       [ 6,  4],
       [ 6, 18],
       [ 6, 19],
       [ 6, 20],
       [ 7,  4],
       [ 7, 21],
       [ 7, 22],
       [ 7, 23],
       [ 8,  0],
       [ 9,  0],
       [10,  2],
       [11,  2],
       [12,  2],
       [13,  3],
       [14,  3],
       [15,  5],
       [16,  5],
       [17,  5],
       [18,  6],
       [19,  6],
       [20,  6],
       [21,  7],
       [22,  7],
       [23,  7]], dtype=int32)

In [11]:
# The 'atom_xtb' entry of the preprocessor output for the example molecule
# (presumably one row of xTB-derived features per atom -- confirm).
# train=False: inspecting one molecule should not run the preprocessor in
# training mode.
preprocessor(smiles, jsonfile=jsonfile, train=False)['atom_xtb']

array([[-1.1438e-01, -2.1724e-01, -1.1700e-01, -7.1000e-02, -9.4000e-02,
         1.0410e+00,  3.0730e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  3.0767e+01,  9.0910e+00],
       [ 2.3920e-02, -3.7200e-03, -1.7400e-01, -5.7000e-02, -1.1600e-01,
         1.0910e+00,  2.8850e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  2.7551e+01,  8.6080e+00],
       [-9.1840e-02, -2.3884e-01,  4.7000e-02,  2.7000e-02,  3.7000e-02,
         1.0080e+00,  3.0840e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  2.2643e+01,  6.7840e+00],
       [-6.6130e-02, -1.6850e-01,  2.6000e-02, -0.0000e+00,  1.3000e-02,
         1.0370e+00,  3.0290e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.0000e+00,  2.1883e+01,  6.6580e+00],
       [ 4.2450e-02,  5.1000e-03, -1.2000e-02, -2.6000e-02, -1.9000e-02,
         1.0810e+00,  2.8770e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
         0.0000e+00,  0.00

In [12]:
# The 'bond_xtb' entry of the preprocessor output for the example molecule
# (presumably one row of xTB-derived features per bond -- confirm).
# train=False: inspecting one molecule should not run the preprocessor in
# training mode.
preprocessor(smiles, jsonfile=jsonfile, train=False)['bond_xtb']

array([[1.9268622 , 1.3242947 ],
       [0.9755401 , 1.0764849 ],
       [0.976035  , 1.0775048 ],
       [1.9268622 , 1.3242947 ],
       [1.0250528 , 1.4998841 ],
       [1.0109937 , 1.5060523 ],
       [1.0250528 , 1.4998841 ],
       [0.96929526, 1.0907905 ],
       [0.9851119 , 1.0871836 ],
       [0.9720007 , 1.0873462 ],
       [1.0109937 , 1.5060523 ],
       [0.9652852 , 1.5472038 ],
       [0.9695576 , 1.0925564 ],
       [0.9729345 , 1.0923398 ],
       [0.9652852 , 1.5472038 ],
       [0.9969557 , 1.5319772 ],
       [1.0006814 , 1.5286028 ],
       [0.9998898 , 1.5291462 ],
       [0.9969557 , 1.5319772 ],
       [0.98522764, 1.0879004 ],
       [0.9861606 , 1.0879096 ],
       [0.98609173, 1.0880784 ],
       [1.0006814 , 1.5286028 ],
       [0.98516583, 1.0881023 ],
       [0.9858648 , 1.0878273 ],
       [0.982679  , 1.0868458 ],
       [0.9998898 , 1.5291462 ],
       [0.98415387, 1.0871557 ],
       [0.9851873 , 1.0883454 ],
       [0.98518836, 1.0881969 ],
       [0.

In [13]:
# The 'mol_xtb' entry of the preprocessor output for the example molecule
# (presumably molecule-level xTB quantities -- confirm).
# train=False: inspecting one molecule should not run the preprocessor in
# training mode.
preprocessor(smiles, jsonfile=jsonfile, train=False)['mol_xtb']

array([-25.27425892, -25.68649081, -10.3364    ,  -5.1588    ])

In [14]:
# Construct the tf.data pipelines. Most of this is telling tensorflow which
# data types and shapes to expect so it can pre-allocate the necessary arrays.
# Each pipeline calls the input constructor per molecule, batches several
# molecules together, and pads them so that every molecule in a batch has the
# same number of atoms (padding uses zeros, which is why the atom and bond
# types above start with 1 as the unknown class).

def _labelled_dataset(split, training, shuffle):
    """Build a cached, padded, prefetched dataset of (inputs, YSI) pairs.

    split    -- frame whose SMILES column selects the rows of `ysi` to use
    training -- forwarded to the preprocessor as its `train` flag
    shuffle  -- when true, shuffle (buffer of 200) after caching
    """
    def generate():
        rows = ysi[ysi.SMILES.isin(split.SMILES)]
        for _, row in rows.iterrows():
            features = preprocessor(row.SMILES, '../data/json/'+row.xtbjson, train=training)
            yield features, row.YSI

    signature = (preprocessor.output_signature, tf.TensorSpec((), dtype=tf.float32))
    dataset = tf.data.Dataset.from_generator(generate, output_signature=signature).cache()
    if shuffle:
        dataset = dataset.shuffle(buffer_size=200)
    return dataset.padded_batch(batch_size=64).prefetch(tf.data.experimental.AUTOTUNE)


train_dataset = _labelled_dataset(train, training=True, shuffle=True)

valid_dataset = _labelled_dataset(valid, training=False, shuffle=False)

In [15]:
# Draw the first batch from the training pipeline as NumPy values and unpack
# the (features, label) pair that each batched train_dataset element holds
inputs, outputs = next(train_dataset.as_numpy_iterator())

2022-02-01 14:49:06.063810: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:116] None of the MLIR optimization passes are enabled (registered 2)
2022-02-01 14:49:06.067276: I tensorflow/core/platform/profile_utils/cpu_utils.cc:112] CPU Frequency: 2300000000 Hz


In [16]:
## Define the keras model
from tensorflow.keras import layers

# Input layers
atom = layers.Input(shape=[None], dtype=tf.int64, name='atom')
bond = layers.Input(shape=[None], dtype=tf.int64, name='bond')
connectivity = layers.Input(shape=[None, 2], dtype=tf.int64, name='connectivity')
# NOTE(review): atom_xtb and bond_xtb are declared as model inputs, but no
# layer below consumes them, so as written the xTB features do not reach the
# prediction. Confirm whether they were meant to be merged into the states.
atom_xtb = layers.Input(shape=[None,None], dtype=tf.float64, name="atom_xtb")
bond_xtb = layers.Input(shape=[None,None], dtype=tf.float64, name="bond_xtb")

num_features = 8  # Controls the size of the model
num_heads = 1  # Attention heads in each GlobalUpdate layer
num_messages = 3  # Number of message-passing iterations

# num_heads * units has to be the same dimension as the atom / bond dimension,
# so derive units from num_features instead of repeating the literal value
global_units = num_features // num_heads

# Convert from a single integer defining the atom state to a vector
# of weights associated with that class
atom_state = layers.Embedding(preprocessor.atom_classes, num_features,
                              name='atom_embedding', mask_zero=True)(atom)

# Ditto with the bond state
bond_state = layers.Embedding(preprocessor.bond_classes, num_features,
                              name='bond_embedding', mask_zero=True)(bond)

# Here we use our first nfp layer. This is an attention layer that looks at
# the atom and bond states and reduces them to a single, graph-level vector.
global_state = nfp.GlobalUpdate(units=global_units, num_heads=num_heads)(
    [atom_state, bond_state, connectivity])

for _ in range(num_messages):  # Do the message passing
    # Each update is added back onto the state it refines (residual connection)
    new_bond_state = nfp.EdgeUpdate()([atom_state, bond_state, connectivity, global_state])
    bond_state = layers.Add()([bond_state, new_bond_state])

    new_atom_state = nfp.NodeUpdate()([atom_state, bond_state, connectivity, global_state])
    atom_state = layers.Add()([atom_state, new_atom_state])

    new_global_state = nfp.GlobalUpdate(units=global_units, num_heads=num_heads)(
        [atom_state, bond_state, connectivity, global_state])
    global_state = layers.Add()([global_state, new_global_state])


# Since the final prediction is a single, molecule-level property (YSI), we
# reduce the last global state to a single prediction.
ysi_prediction = layers.Dense(1)(global_state)

# Construct the tf.keras model
model = tf.keras.Model([atom, bond, connectivity, atom_xtb, bond_xtb], [ysi_prediction])

[TensorShape([None, None, 8]), TensorShape([None, None, 8]), TensorShape([None, None, 2]), TensorShape([None, 8])]
[TensorShape([None, None, 8]), TensorShape([None, None, 8]), TensorShape([None, None, 2]), TensorShape([None, 8])]
[TensorShape([None, None, 8]), TensorShape([None, None, 8]), TensorShape([None, None, 2]), TensorShape([None, 8])]


In [17]:
# Train against mean absolute error with Adam at a learning rate of 1E-3
optimizer = tf.keras.optimizers.Adam(learning_rate=1E-3)
model.compile(optimizer=optimizer, loss='mae')

# Fit the model. The first epoch is slower than the rest because the
# preprocessed molecule inputs are cached while it runs
model.fit(
    train_dataset,
    validation_data=valid_dataset,
    epochs=100,
)

Epoch 1/100




Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 7

<tensorflow.python.keras.callbacks.History at 0x7fbd2fd90850>

In [18]:
# Build the test dataset from model inputs only, i.e. without assuming the
# YSI values are known

def _test_inputs():
    """Yield the preprocessed inputs for each row of the test split, in order."""
    for _, row in test.iterrows():
        yield preprocessor(row.SMILES, '../data/json/'+row.xtbjson, train=False)


test_dataset = (
    tf.data.Dataset.from_generator(
        _test_inputs,
        output_signature=preprocessor.output_signature)
    .padded_batch(batch_size=64)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [19]:
# Predict on the test set, look the database YSI values up by SMILES in the
# same order as the test split, and report the mean absolute error
test_predictions = model.predict(test_dataset)
ysi_by_smiles = ysi.set_index('SMILES').YSI
test_db_values = ysi_by_smiles.reindex(test.SMILES).values

absolute_errors = np.abs(test_db_values - test_predictions.flatten())
absolute_errors.mean()

16.83903667831421