In [1]:
%load_ext autoreload

In [2]:
%autoreload 2

In [3]:
import numpy as np

import rdkit
import rdkit.Chem.AllChem
from rdkit import Chem

from molecule_builder import build_molecules

In [4]:
# `build_molecules` takes a starting molecule and returns an iterator that
# enumerates the action space from it. Starting from the SMILES 'C', list
# the SMILES string of every molecule it produces.

start = Chem.MolFromSmiles('C')
list(map(Chem.MolToSmiles, build_molecules(start)))

['C=O', 'CO', 'C=N', 'C#N', 'CN', 'C=C', 'C#C', 'CC']

In [5]:
def build_nested(mol, num):
    """ Lazily yield the molecules produced by `num` nested calls to
    `build_molecules`, starting from `mol`.

    Simulates tree search / RL by recursively calling `build_molecules` on
    its own output a desired number of times. Note, this is NOT the most
    efficient way to iterate over sets of molecules at higher depths:
    results are not de-duplicated between iterations. It is fast for a
    depth-first search, though, e.g. pulling one deep molecule with `next()`.

    Parameters
    ----------
    mol : starting molecule, passed unchanged to `build_molecules`.
    num : number of nested `build_molecules` calls; must be >= 1.

    Yields
    ------
    Each item produced by the `num`-th (innermost) `build_molecules` call,
    in depth-first order.

    Raises
    ------
    ValueError
        If `num` is smaller than 1. Because this is a generator, the error
        surfaces when iteration starts rather than at call time.
    """

    # Without this guard a depth below 1 never reaches the `num == 1` base
    # case and the recursion keeps descending instead of stopping.
    if num < 1:
        raise ValueError('num must be >= 1, got {!r}'.format(num))

    if num == 1:
        # Base case: the final level is handed to the caller directly.
        yield from build_molecules(mol)
        return

    # Use a distinct loop name so the `mol` argument is not shadowed.
    for child in build_molecules(mol):
        yield from build_nested(child, num=num - 1)

In [6]:
# Generate a random molecule with desired depth: take only the first result
# of a 50-level nested build and display its SMILES string.
Chem.MolToSmiles(next(build_nested(start, num=50)))

'NOC=C1N=C(N=O)C(=O)N(OOOC2(OC(=O)OOC#CN(ON=O)OOOOOC(=O)N=NOC(N=O)=NN=O)OC(=O)N2N=O)O1'

In [7]:
# Time a single depth-50 draw, including the conversion to SMILES.
%timeit Chem.MolToSmiles(next(build_nested(start, num=50)))

74 ms ± 3.55 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [8]:
# Profile the same depth-50 draw to see which functions dominate the runtime.
%prun Chem.MolToSmiles(next(build_nested(start, num=50)))

 

         50221 function calls (50172 primitive calls) in 0.074 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
       50    0.017    0.000    0.017    0.000 EnumerateStereoisomers.py:79(_getFlippers)
     3823    0.012    0.000    0.021    0.000 molecule_builder.py:50(get_valid_bonds)
     6998    0.008    0.000    0.008    0.000 molecule_builder.py:18(get_free_valence)
      100    0.007    0.000    0.024    0.000 EnumerateStereoisomers.py:164(EnumerateStereoisomers)
     4176    0.007    0.000    0.015    0.000 molecule_builder.py:12(shuffle)
      100    0.007    0.000    0.073    0.001 molecule_builder.py:23(build_molecules)
      253    0.004    0.000    0.006    0.000 molecule_builder.py:41(get_valid_partners)
     4176    0.004    0.000    0.008    0.000 random.py:264(shuffle)
     4790    0.002    0.000    0.003    0.000 random.py:224(_randbelow)
     7646    0.001    0.000    0.001    0.000 {built-in method builtin

In [10]:
# The tree size increases fast: count the distinct SMILES strings reached
# at each depth from 1 through 4.
[
    len(set(map(Chem.MolToSmiles, build_nested(start, num=depth))))
    for depth in range(1, 5)
]

[8, 38, 234, 1383]

In [None]:
def get_fingerprint(mol, radius=2, fp_length=128):
    """ Compute a Morgan bit-vector fingerprint for `mol` and return it as a
    numpy array. Intended as a simple fixed-length molecule representation
    to work with during development.

    Parameters
    ----------
    mol : molecule handed to RDKit's `GetMorganFingerprintAsBitVect`.
    radius : Morgan fingerprint radius forwarded to RDKit (default 2).
    fp_length : number of bits requested from RDKit, and the length of the
        zero-filled array the bits are copied into (default 128).

    Returns
    -------
    The numpy array that RDKit's `ConvertToNumpyArray` filled in.
    """

    bit_vect = rdkit.Chem.AllChem.GetMorganFingerprintAsBitVect(
        mol, radius, nBits=fp_length)

    # Allocate the destination first; RDKit writes the bits into it in place.
    out = np.zeros(fp_length)
    rdkit.DataStructs.ConvertToNumpyArray(bit_vect, out)
    return out


# Take the first molecule of a 12-level nested build and display its
# fingerprint with the default radius and length.
mol = next(build_nested(start, num=12))
get_fingerprint(mol)