In [9]:
from copy import deepcopy
import time
from chemper.chemper_utils import get_full_path, create_tuples_for_clusters
from chemper.mol_toolkits.mol_toolkit import mols_from_mol2, Mol, HAS_OE, HAS_RDK
from chemper.smirksify import SMIRKSifier, print_smirks
import pickle as p

In [2]:
def parse_smarts_file(file_path):
    """
    Read an ordered list of SMIRKS patterns from a whitespace-delimited file.

    Each non-blank line holds either ``smirks`` or ``smirks label``. The first
    non-blank line decides which of the two layouts is assumed for the whole
    file. Blank lines are ignored.

    Parameters
    ----------
    file_path: str
        relative path in chemper/data or absolute path

    Returns
    -------
    smirks_list: list of tuples (label, smirks)
        This is the ordered list of SMIRKS from the smarts file.
        If labels are provided in the file they are used, otherwise the
        label is the position of the pattern (as a string, counting from 0)
        among the non-blank lines. An empty list is returned when the file
        contains no patterns.

    Raises
    ------
    ValueError
        If the first line carries a label but some other line does not have
        exactly two whitespace-separated fields.
    """
    fn = get_full_path(file_path)
    # The context manager closes the handle even if reading fails part-way.
    with open(fn) as f:
        rows = [line.split() for line in f]

    # Whitespace-only lines split into empty lists; drop them so they cannot
    # trigger an IndexError below.
    rows = [fields for fields in rows if fields]
    if not rows:
        return []

    # Labeled layout: "<smirks> <label>" on every line.
    if len(rows[0]) > 1:
        return [(label, smirks) for smirks, label in rows]

    # Unlabeled layout: use the running index as the label.
    return [(str(i), fields[0]) for i, fields in enumerate(rows)]

In [3]:
def make_smarts_file(smirks_list, output_file):
    """
    Write a list of labeled SMIRKS to a file, one ``smirks label`` per line.

    Any existing file at ``output_file`` is overwritten.

    Parameters
    ----------
    smirks_list: list of tuples
        smirks tuples have the form (label, smirks)
    output_file: relative or absolute path
    """
    # The context manager closes (and flushes) the file even if an entry in
    # smirks_list is malformed and writing fails part-way through.
    with open(output_file, 'w') as f:
        for label, smirks in smirks_list:
            # Note the order on disk is smirks first, then label.
            f.write("%s %s\n" % (smirks, label))

In [4]:
def run_fragement(frag, mols, steps=1000):
    """
    Type molecules with a reference SMIRKS file, then build and reduce
    SMIRKS for the resulting clusters, printing timings along the way.

    Parameters
    ----------
    frag: str
        angle, bond, nonbond, proper_torsion
        (used to pick the file 'smarts_files/<frag>_smirks.smarts')
    mols: list of molecules
        the input list is deep-copied; the copies are what get typed
    steps: number of reducing steps

    Returns
    -------
    new_smirks_list: list of tuples or None
        Result of the SMIRKSifier's ``reduce(steps)`` call, or None when the
        SMIRKSifier's ``checks`` attribute is falsy after construction
    smirksifier: SMIRKSifier
        The SMIRKSifier built for this fragment type
    """
    print("Searching %s" % frag)

    # reference SMIRKS used to sort the molecules into clusters
    reference_smirks = parse_smarts_file('smarts_files/%s_smirks.smarts' % frag)

    # work on private copies so the caller's molecules are left alone
    working_mols = deepcopy(mols)

    # stage 1: type the molecules with the reference SMIRKS
    print("clustering molecules:")
    started = time.time()
    clusters = create_tuples_for_clusters(reference_smirks, working_mols)
    minutes = (time.time() - started) / 60.
    print("Took %.3f minutes to type %i mols\n" % (minutes, len(mols)))

    # stage 2: build the SMIRKSifier for those clusters
    print("Creating initial SMIRKS...")
    started = time.time()
    smirksifier = SMIRKSifier(working_mols, clusters, verbose=False, strict_smirks=False)
    minutes = (time.time() - started) / 60.
    print("generating smirksifier took %.3f minutes\n" % minutes)

    # nothing to reduce if the initial SMIRKS did not pass the checks
    if not smirksifier.checks:
        return None, smirksifier

    print_smirks(smirksifier.current_smirks)

    # stage 3: reduce the generated SMIRKS
    print("Reducing SMIRKS...")
    started = time.time()
    reduced_smirks = smirksifier.reduce(steps)
    minutes = (time.time() - started) / 60.
    print("reducing SMIRKS took %.3f minutes\n" % minutes)

    return reduced_smirks, smirksifier

In [10]:
# 1. load molecules
# NOTE(review): mol_file2 and mol_file3 are absolute paths on one developer's
# machine; this cell will not run elsewhere until they are made configurable.
mol_file1 = 'MiniDrugBank_tripos.mol2'
mol_file2 = '/Users/bannanc/Desktop/baby_minidrugbank.mol2'
mol_file3 = "/Users/bannanc/gitHub/openforcefield/openforcefield/data/molecules/AlkEthOH_test_filt1_tripos.mol2"

# (input mol2 file, short name used in the output pickle file name)
mol_opts = [
    (mol_file1, 'minidrugbank'), (mol_file2, 'baby_mini_drugbank'), (mol_file3, 'alkethoh')
]

# Output files are tagged with the toolkit flag reported by chemper.
if HAS_OE:
    prefix = 'oe'
else:
    prefix = 'rdk'

# These do not change between molecule sets, so define them once.
fragments = ['nonbond', 'bond', 'angle', 'proper_torsion']
n_steps = 1000

for mol_file, mol_name in mol_opts:

    mols = list()
    for m in mols_from_mol2(mol_file):
        # The original cell also built an unused `Mol(m)` wrapper here; only
        # `m` itself was ever modified and stored, so the wrapper is dropped.
        m.set_aromaticity_mdl()
        mols.append(m)

    # Loop over all fragment types
    smirksifiers = dict()
    smirks_lists = dict()

    for fragment in fragments:
        # frag_smirks_list is None when the SMIRKSifier checks fail
        frag_smirks_list, frag_smirksifier = run_fragement(fragment, mols, steps=n_steps)
        smirksifiers[fragment] = frag_smirksifier
        smirks_lists[fragment] = frag_smirks_list

    # Save results for this molecule set; the context manager makes sure the
    # pickle file is flushed and closed before moving on.
    with open('%s_%s.p' % (prefix, mol_name), 'wb') as pickle_file:
        p.dump((smirks_lists, smirksifiers), pickle_file)

Searching nonbond
clustering molecules:
Took 0.122 minutes to type 363 mols

Creating initial SMIRKS...
                      SMIRKSifier was not able to create SMIRKS for the provided
                      clusters with 5 layers. Try increasing the number of layers
                      or changing your clusters
                      
generating smirksifier took 7.281 minutes

Searching bond
clustering molecules:
Took 0.282 minutes to type 363 mols

Creating initial SMIRKS...
                      SMIRKSifier was not able to create SMIRKS for the provided
                      clusters with 5 layers. Try increasing the number of layers
                      or changing your clusters
                      
generating smirksifier took 12.971 minutes

Searching angle
clustering molecules:
Took 0.244 minutes to type 363 mols

Creating initial SMIRKS...


KeyboardInterrupt: 

# Reload molecule sets:

In [None]:
# Load previously pickled objects from the current working directory.
# pickle.load can execute arbitrary code, so only load files you created.
# NOTE(review): these file names do not match the '<prefix>_<name>.p' files
# written by the run cell above -- confirm which files are intended.
with open('baby_minidrugbank.p', 'rb') as pickle_file:
    baby_mini_drugbank = p.load(pickle_file)
with open('AlkEthOH.p', 'rb') as pickle_file:
    alkethoh = p.load(pickle_file)
with open('minidrugbank.p', 'rb') as pickle_file:
    minidrugbank = p.load(pickle_file)