In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import numpy as np

import rdkit
import rdkit.Chem.AllChem
from rdkit import Chem

from molecule_builder import build_molecules

In [4]:
# `build_molecules` takes a starting molecule and returns an iterator that
# enumerates the action space available from it.

start = Chem.MolFromSmiles('C1CCCCC1')
sorted(Chem.MolToSmiles(candidate) for candidate in build_molecules(start))

['C1CC2=C(C1)C2',
 'C1CC2=C1CC2',
 'C1CC2CC2C1',
 'C1CC2CCC12',
 'C=C1CCCCC1',
 'CC1CCCCC1',
 'N=C1CCCCC1',
 'NC1CCCCC1',
 'O=C1CCCCC1',
 'OC1CCCCC1']

In [5]:
def build_nested(mol, num):
    """Simulate tree search / RL by recursively calling `build_molecules`
    on its own output a desired number of times.

    Note, this is NOT the most efficient way to iterate over sets of
    molecules at higher depths (for that, we should de-duplicate each
    iteration). This is fast for a depth-first search, though.

    Parameters
    ----------
    mol
        Starting molecule; passed straight to `build_molecules`.
    num : int
        Number of nested `build_molecules` steps to apply. Must be >= 1.

    Yields
    ------
    The molecules produced by the `num`-th nested `build_molecules` call,
    in depth-first order. Duplicates are not removed.

    Raises
    ------
    ValueError
        If `num` is less than 1. Because this is a generator, the error
        surfaces when iteration starts rather than at call time.
    """
    # Without this guard a depth below 1 never reaches the `num == 1` base
    # case, so the recursion only ends when `build_molecules` runs out of
    # candidates or Python hits its recursion limit.
    if num < 1:
        raise ValueError("num must be a positive integer, got {!r}".format(num))

    # A distinct loop name keeps the `mol` argument from being shadowed.
    for child in build_molecules(mol):
        if num == 1:
            yield child
        else:
            yield from build_nested(child, num=num - 1)

In [6]:
# Generate a random molecule at the desired depth: take the first result of
# 50 nested `build_molecules` steps and show it as a SMILES string.
# NOTE(review): the randomness presumably comes from shuffling inside
# `molecule_builder` (the profile output in this notebook shows calls to
# `random.shuffle`) -- confirm, and seed `random` if reproducibility matters.
Chem.MolToSmiles(next(build_nested(start, num=50)))

'CN1N2C3=NN(O)N4OC56ON=NC78C9(OOC(=NC#N)C#CON9c9nn4o9)C49OC4%10OC4=NC7(C35N6N1C(=O)N29)C4%108'

In [7]:
# Time a single depth-50 descent plus the SMILES conversion.
%timeit Chem.MolToSmiles(next(build_nested(start, num=50)))

72.9 ms ± 6.64 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Profile a single depth-50 descent plus the SMILES conversion to see which
# functions dominate the runtime.
%prun Chem.MolToSmiles(next(build_nested(start, num=50)))

 

         63206 function calls (63157 primitive calls) in 0.070 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     4909    0.018    0.000    0.030    0.000 molecule_builder.py:50(get_valid_bonds)
      100    0.012    0.000    0.070    0.001 molecule_builder.py:23(build_molecules)
     9004    0.010    0.000    0.010    0.000 molecule_builder.py:18(get_free_valence)
     5267    0.009    0.000    0.018    0.000 molecule_builder.py:12(shuffle)
      308    0.005    0.000    0.007    0.000 molecule_builder.py:41(get_valid_partners)
     5267    0.005    0.000    0.010    0.000 random.py:264(shuffle)
     6049    0.003    0.000    0.004    0.000 random.py:224(_randbelow)
     9818    0.002    0.000    0.002    0.000 {built-in method builtins.min}
      308    0.001    0.000    0.001    0.000 {built-in method numpy.arange}
       50    0.001    0.000    0.001    0.000 molecule_builder.py:61(add_bond)
     9378    0.001    0.00

In [9]:
# The number of distinct molecules (unique SMILES) grows quickly with depth;
# counted here for depths 1 through 4.
[
    len(set(map(Chem.MolToSmiles, build_nested(start, num=depth))))
    for depth in range(1, 5)
]

[10, 182, 2709, 37565]

In [10]:
def get_fingerprint(mol, radius=2, fp_length=128):
    """Return the Morgan fingerprint of `mol` as a numpy array.

    Intended for development work where a fixed-length vector is easier to
    handle than a molecule object.

    `radius` is forwarded as the Morgan radius; `fp_length` is both the bit
    count requested from RDKit and the length of the array allocated here.
    """
    bit_vect = rdkit.Chem.AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, fp_length
    )

    # ConvertToNumpyArray fills a caller-supplied array, so allocate it first.
    out = np.zeros(shape=(fp_length,))
    rdkit.DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out


# Fingerprint the first molecule reached after 12 nested build steps.
mol = next(build_nested(start, num=12))
get_fingerprint(mol)

array([1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 1., 1., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 1., 0., 0., 0., 0., 1.,
       1., 0., 0., 0., 1., 0., 0., 1., 0., 1., 0., 0., 0., 0., 0., 1., 0.,
       0., 1., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0., 1., 1., 0., 0., 0., 0.])