In [1]:
from itertools import chain
import dill
    
import pandas as pd
import numpy as np
import tensorflow as tf
import nfp

from alfabet.fragment import fragment_iterator

In [4]:
from alfabet.preprocess_inputs import atom_featurizer, bond_featurizer

In [2]:
# Restore the dill-serialized preprocessor that ships next to the model file.
# NOTE(review): dill.load executes code embedded in the file -- only use it
# on model files you trust.
with open('alfabet/model_files/preprocessor.p', 'rb') as f:
    preprocessor = dill.load(f)

# Load the trained Keras model, passing nfp's custom objects to the loader.
model = tf.keras.models.load_model(
    'alfabet/model_files/best_model.hdf5',
    custom_objects=nfp.custom_objects,
)

In [3]:
# Export the dill-loaded preprocessor through its `to_json` method; the same
# path is read back with `from_json` further down in this notebook.
preprocessor.to_json('alfabet/model_files/preprocessor.json')

In [5]:
# Featurize 'CCO' with the dill-loaded preprocessor. The printed dict is the
# reference to compare against after the JSON round trip below.
# NOTE(review): train=False presumably stops new classes being added -- confirm against nfp.
preprocessor.construct_feature_matrices('CCO', train=False)

{'n_atom': 9,
 'n_bond': 8,
 'bond_indices': array([0, 2, 3, 4, 0, 1, 5, 6, 1, 7, 2, 3, 4, 5, 6, 7]),
 'atom': array([ 2, 18, 19,  8,  8,  8,  8,  8,  8]),
 'bond': array([ 2,  3,  3,  3,  2, 20,  3,  3, 21, 22, 10, 10, 10, 10, 10, 24]),
 'connectivity': array([[0, 1],
        [0, 3],
        [0, 4],
        [0, 5],
        [1, 0],
        [1, 2],
        [1, 6],
        [1, 7],
        [2, 1],
        [2, 8],
        [3, 0],
        [4, 0],
        [5, 0],
        [6, 1],
        [7, 1],
        [8, 2]])}

In [6]:
# Drop the dill-loaded instance so the cells below can only use the one
# rebuilt from JSON.
del preprocessor

In [8]:
# Build a fresh preprocessor from the alfabet featurizers, then restore its
# saved state from the JSON file written earlier in this notebook.
preprocessor = nfp.SmilesPreprocessor(
    atom_features=atom_featurizer,
    bond_features=bond_featurizer,
)
preprocessor.from_json('alfabet/model_files/preprocessor.json')

In [9]:
# Same call as before the JSON round trip; the printed dict should match the
# earlier output exactly.
preprocessor.construct_feature_matrices('CCO', train=False)

{'n_atom': 9,
 'n_bond': 8,
 'bond_indices': array([0, 2, 3, 4, 0, 1, 5, 6, 1, 7, 2, 3, 4, 5, 6, 7]),
 'atom': array([ 2, 18, 19,  8,  8,  8,  8,  8,  8]),
 'bond': array([ 2,  3,  3,  3,  2, 20,  3,  3, 21, 22, 10, 10, 10, 10, 10, 24]),
 'connectivity': array([[0, 1],
        [0, 3],
        [0, 4],
        [0, 5],
        [1, 0],
        [1, 2],
        [1, 6],
        [1, 7],
        [2, 1],
        [2, 8],
        [3, 0],
        [4, 0],
        [5, 0],
        [6, 1],
        [7, 1],
        [8, 2]])}

In [None]:
# SMILES strings used to exercise fragmentation and prediction below.
smiles_iterator = [
    'CC',
    'NCCO',
    'CF',
    'CO',
    'CN',
    'CCl',
    'c1ccccc1',
    'B',
]

In [None]:
# Run fragment_iterator over every SMILES and collect everything it yields
# into a single table (presumably one record per fragmented bond -- confirm
# against alfabet.fragment).
frag_df = pd.DataFrame(
    chain.from_iterable(map(fragment_iterator, smiles_iterator)))

In [None]:
# Preview the first rows of the fragment table.
frag_df.head()

In [None]:
from tqdm import tqdm

In [None]:
def check_valid(iinput):
    """Report whether a featurized molecule is fully covered by the preprocessor.

    Class index 1 is the marker this check uses for an atom or bond that is
    missing from the model's preprocessor dictionary (presumably nfp's
    "unknown" class -- confirm against the tokenizer). The molecule is valid
    only when no atom and no bond carries that marker.

    Args:
        iinput: mapping with 'atom' and 'bond' entries holding integer class
            indices. Anything `np.asarray` can convert is accepted, so both
            the plain arrays returned by the preprocessor and the batched
            tensors coming out of the tf.data pipeline work.

    Returns:
        bool: True when nothing is missing, False when at least one atom or
        bond has class index 1.
    """
    # Only the presence of a missing entry matters for the result, so test the
    # class arrays directly instead of collecting the offending indices.
    has_missing_bond = bool((np.asarray(iinput['bond']) == 1).any())
    has_missing_atom = bool((np.asarray(iinput['atom']) == 1).any())

    return not (has_missing_bond or has_missing_atom)

In [None]:
def prediction_generator(smiles_iterator):
    """Yield one DataFrame of model predictions per input molecule.

    Relies on the notebook-level `preprocessor`, `model` and `tqdm` names.

    Args:
        smiles_iterator: iterable of SMILES strings. It is materialised into a
            list first, so one-shot iterators and generators are safe to pass.

    Yields:
        pandas.DataFrame with columns 'bond_index' (row position in the model
        output), 'BDE' (predicted value), 'molecule' (the input SMILES) and
        'is_valid' (result of `check_valid` for that molecule's inputs).
    """
    # The inputs are walked twice: once by the dataset's generator and once by
    # zip() below. Sharing a one-shot iterator between the two would pair
    # molecules with the wrong predictions, so take a snapshot up front.
    smiles_list = list(smiles_iterator)

    dataset = tf.data.Dataset.from_generator(
        lambda: (preprocessor.construct_feature_matrices(item, train=False)
                 for item in smiles_list),
        output_types=preprocessor.output_types,
        output_shapes=preprocessor.output_shapes).batch(1)

    # total= lets tqdm show real progress; zip() has no length of its own.
    for molecule, inputs in tqdm(zip(smiles_list, dataset),
                                 total=len(smiles_list)):
        out = model.predict_on_batch(inputs)

        # Batch size is 1, so element 0 is the only molecule. Keep the first
        # n_bond rows of the output, using a plain int as the slice bound.
        n_bond = int(inputs['n_bond'][0])
        bde_df = pd.DataFrame(np.asarray(out)[0, :n_bond, 0], columns=['BDE'])
        bde_df['molecule'] = molecule
        bde_df = bde_df.rename_axis('bond_index').reset_index()

        bde_df['is_valid'] = check_valid(inputs)

        yield bde_df

In [None]:
# Run the model over every SMILES and stack the per-molecule frames.
# NOTE(review): each frame arrives with its own 0..n-1 index, so the
# concatenated index repeats; pass ignore_index=True if a unique index is needed.
df = pd.concat(prediction_generator(smiles_iterator))

In [None]:
# Left-join the predictions onto the fragment table by molecule and bond
# index. The result is only displayed, not stored in a variable.
frag_df.merge(df, on=['molecule', 'bond_index'], how='left')

In [None]:
# Display the concatenated prediction table.
df

In [None]:
# The original cell held only the undefined name `fr` (NameError on a clean
# run); completed to a preview of the fragment table.
frag_df.head()

In [None]:
# (rows, columns) of the prediction table.
df.shape

In [None]:
# The original line `df.indexex.` was a SyntaxError (trailing dot, misspelt
# attribute); show the frame's index instead.
df.index

In [None]:
# NOTE(review): repeats an earlier display of `df`; this cell can be deleted.
df

In [None]:
# NOTE(review): `x` is not defined in any cell visible in this notebook, so
# this raises NameError on a clean run -- delete this scratch cell.
x

In [None]:
# NOTE(review): `smiles` only exists as the loop variable of a generator
# expression earlier on and is not a notebook-level name, so this raises
# NameError on a clean run -- delete this scratch cell.
smiles

In [None]:
# NOTE(review): `x` is not defined in any cell visible in this notebook, so
# this raises NameError on a clean run -- delete this scratch cell.
x

In [None]:
# The original `from itertools import zi` raised ImportError (unfinished
# completion); `zip_longest` is the only itertools name with that prefix.
# NOTE(review): nothing visible uses it, so this cell can probably be deleted.
from itertools import zip_longest

In [None]:
import pandas as pd

In [None]:
# NOTE(review): `out` is only assigned inside prediction_generator, so at
# notebook level this raises NameError on a clean run -- delete this scratch cell.
out.numpy()