In [8]:
from alfabet import model
from alfabet.model import model, preprocessor

In [9]:
import numpy as np
import pandas as pd
from alfabet.fragment import fragment_iterator
from alfabet.preprocessor_utils import ConcatGraphSequence

In [10]:
# Example molecules, given as SMILES strings, that are fed to the preprocessor.
smiles_list = ("CC", "CCC", "CF")

# Batch size handed to ConcatGraphSequence when predicting.
batch_size = 128

In [48]:
def check_valid(iinput):
    """Report whether a preprocessed molecule contains only known classes.

    Any entry of ``iinput['bond']`` or ``iinput['atom']`` equal to 1 marks
    the corresponding bond / atom as missing from the preprocessor's
    dictionary (presumably 1 is the "unknown" class index -- confirm against
    the preprocessor).

    Args:
        iinput: mapping with the array entries 'bond', 'bond_indices' and
            'atom', plus the atom count 'n_atom'.

    Returns:
        bool: True when no bond and no atom is flagged, False otherwise.
    """
    # Distinct bond indices whose class equals 1.
    flagged_bond_rows = iinput['bond_indices'][iinput['bond'] == 1]
    flagged_bonds = set(flagged_bond_rows)

    # Positions of the atoms whose class equals 1.
    atom_positions = np.arange(iinput['n_atom'])
    flagged_atoms = atom_positions[iinput['atom'] == 1]

    if len(flagged_bonds) > 0:
        return False
    return flagged_atoms.size == 0

def inputs_to_dataframe(smiles, inputs):
    """Build a long-format table with one row per bond entry of each molecule.

    Args:
        smiles: sequence of molecule identifiers (SMILES strings), aligned
            one-to-one with ``inputs``.
        inputs: sequence of preprocessed molecules; each must provide
            'n_bond' (the number of bond entries) and 'bond_indices'
            (an array with that many entries).

    Returns:
        pandas.DataFrame with the columns 'molecule' and 'bond_index'
        (int64). When ``inputs`` is empty, an empty frame with the same
        columns is returned.
    """
    inputs = list(inputs)
    if not inputs:
        # np.concatenate rejects an empty list, so build the empty result
        # explicitly instead of raising.
        return pd.DataFrame({
            'molecule': pd.Series(dtype=object),
            'bond_index': pd.Series(dtype='int64'),
        })

    bonds_per_molecule = np.array([iinput['n_bond'] for iinput in inputs])
    molecule = np.repeat(np.array(smiles), bonds_per_molecule)
    bond_index = np.concatenate([iinput['bond_indices'] for iinput in inputs])

    # Build each column on its own so the integer indices are never coerced
    # to strings (which stacking them next to the SMILES array would do).
    return pd.DataFrame({
        'molecule': molecule,
        'bond_index': bond_index.astype('int64'),
    })

# Convert every SMILES string into the preprocessor's graph inputs
inputs = preprocessor.predict(smiles_list)

# Batch the graph inputs (shuffle=False is passed explicitly) and run the
# neural network over them
graph_batches = ConcatGraphSequence(inputs, batch_size=batch_size, shuffle=False)
pred = model.predict_generator(graph_batches, verbose=0)

100%|██████████| 3/3 [00:00<00:00, 2607.86it/s]


In [55]:
    # Attach the predictions to the per-bond table, then collapse rows that
    # share a (molecule, bond_index) pair to their mean predicted BDE
    bde_df = (
        inputs_to_dataframe(smiles_list, inputs)
        .assign(bde_pred=pred)
        .groupby(['molecule', 'bond_index'])['bde_pred']
        .mean()
        .reset_index()
    )

    # Check mols for preprocessor class presence in training data
    valid_mols = pd.Series(
        list(map(check_valid, inputs)),
        dtype=bool, index=smiles_list, name='is_valid')
    bde_df = bde_df.merge(
        valid_mols, how='left', left_on='molecule', right_index=True)

    # Separately fragment the molecules to find their valid bonds
    def smiles_fragment_iterator():
        """Yield every row produced by fragment_iterator for each SMILES."""
        for smiles in smiles_list:
            yield from fragment_iterator(smiles)

    frag_df = pd.DataFrame(smiles_fragment_iterator())

    # Left merge: every fragment row is kept, with NaN where no prediction
    # exists for that (molecule, bond_index) pair
    pred_df = frag_df.merge(bde_df, how='left', on=['molecule', 'bond_index'])

In [56]:
# Display the fragment table merged with the predicted BDEs and validity flags
pred_df

Unnamed: 0,molecule,bond_index,bond_type,fragment1,fragment2,delta_assigned_stereo,delta_unassigned_stereo,bde_pred,is_valid
0,CC,0,C-C,[CH3],[CH3],0,0,90.278282,True
1,CC,1,C-H,[H],[CH2]C,0,0,99.346191,True
2,CC,2,C-H,[H],[CH2]C,0,0,99.346191,True
3,CC,3,C-H,[H],[CH2]C,0,0,99.346191,True
4,CC,4,C-H,[H],[CH2]C,0,0,99.346191,True
5,CC,5,C-H,[H],[CH2]C,0,0,99.346191,True
6,CC,6,C-H,[H],[CH2]C,0,0,99.346191,True
7,CCC,0,C-C,[CH2]C,[CH3],0,0,89.084686,True
8,CCC,1,C-C,[CH2]C,[CH3],0,0,89.084686,True
9,CCC,2,C-H,[H],[CH2]CC,0,0,100.11734,True
