In [1]:
from copy import deepcopy
import time
from chemper.chemper_utils import get_full_path, create_tuples_for_clusters, get_typed_molecules
from chemper.mol_toolkits.mol_toolkit import mols_from_mol2, Mol, HAS_OE, HAS_RDK
from chemper.smirksify import SMIRKSifier, print_smirks
import pickle as p

In [2]:
def parse_smarts_file(file_path):
    """

    Parameters
    ----------
    file_path: str
        relative path in chemper/data or absolute path

    Returns
    -------
    smirks_list: list of tuples (label, smirks)
        This is the ordered list of SMIRKS from the smarts file
        if a label is provided in the file then it is assigned, otherwise
        the indices from the file is used
    """
    fn = get_full_path(file_path)
    f = open(fn)
    lines = f.readlines()
    f.close()

    lines = [l.split() for l in lines]
    if len(lines[0]) > 1:
        return [(label, smirks) for smirks, label in lines]

    return [(str(i), l[0]) for i, l in enumerate(lines)]

In [3]:
def type_molecules(current_mols):
    #clusters = create_tuples_for_clusters(smirks_list, current_mols)
    smirnoff_clusters = dict()
    smirnoff_dict = dict()
    for frag in ['nonbond', 'bond', 'angle', 'proper_torsion']:
        smirks_list = parse_smarts_file('smarts_files/%s_smirks.smarts' % frag)
        smirnoff_clusters[frag] = create_tuples_for_clusters(smirks_list, current_mols)
        smirnoff_dict[frag] = get_typed_molecules(smirks_list, current_mols)
    return smirnoff_clusters, smirnoff_dict

In [4]:
def make_smarts_file(smirks_list, output_file):
    """
    Parameters
    ----------
    smirks_list: list of tuples
        smirks tuples have the form (label, smirks
    output_file: relative or absolute path
    """
    f = open(output_file, 'w')
    for label, smirks in smirks_list:
        f.write("%s %s\n" % (smirks, label))
    f.close()

In [5]:
def run_fragement(frag, mols, steps=1000):
    """

    Parameters
    ----------
    frag: str
        angle, bond, nonbond, proper_torsion
    mols: list of molecules
    steps: number of reducing steps

    Returns
    -------
    new_smirks_list: list of tuples
        Final list of tuples
    """
    print("Searching %s" % frag)

    # open and parse SMIRKS file
    smirks_list = parse_smarts_file('smarts_files/%s_smirks.smarts' % frag)

    # copy the molecules:
    current_mols = deepcopy(mols)

    # type molecules with these smirks:
    print("clustering molecules:")
    init = time.time()
    clusters = create_tuples_for_clusters(smirks_list, current_mols)
    end = time.time()
    print("Took %.3f minutes to type %i mols\n" % ((end-init)/60., len(mols)))

    # Try to make SMIRKS for these clusters:
    print("Creating initial SMIRKS...")
    init = time.time()
    create_frag = SMIRKSifier(current_mols, clusters, verbose=False, strict_smirks=False)
    end = time.time()
    print("generating smirksifier took %.3f minutes\n" % ((end-init)/60.))
    
    if not create_frag.checks:
        return None, create_frag

    print_smirks(create_frag.current_smirks)

    print("Reducing SMIRKS...")
    init = time.time()
    new_smirks_list = create_frag.reduce(steps)
    end = time.time()
    print("reducing SMIRKS took %.3f minutes\n" % ((end-init)/60.))

    return new_smirks_list, create_frag

# Try generating smirks for all the molecules

In [6]:
# 1. load molecules
mol_file1 = 'MiniDrugBank_tripos.mol2'
mol_file2 = '/Users/bannanc/Desktop/baby_minidrugbank.mol2'
mol_file3 = "/Users/bannanc/gitHub/openforcefield/openforcefield/data/molecules/AlkEthOH_test_filt1_tripos.mol2"

mol_opts = [
    (mol_file1, 'minidrugbank'), 
    #(mol_file2, 'baby_mini_drugbank'), 
    #(mol_file3, 'alkethoh')
]

if HAS_OE:
    prefix = 'oe'
else:
    prefix = 'rdk'
    
for mol_file, mol_name in mol_opts:
    
    mols = list()
    for m in mols_from_mol2(mol_file):
        new_m = Mol(m)
        m.set_aromaticity_mdl()
        mols.append(m)

    # Loop over all fragment types
    fragments = ['nonbond', 'bond', 'angle', 'proper_torsion']
    n_steps = 1000
    smirksifiers = dict()
    smirks_lists = dict()

    for fragment in fragments:
        frag_smirks_list, frag_smirksifier = run_fragement(fragment, mols, steps=n_steps)
        smirksifiers[fragment] = frag_smirksifier
        smirks_lists[fragment] = frag_smirks_list
    
    p.dump((smirks_lists, smirksifiers), open('%s_%s.p' % (prefix, mol_name), 'wb'))

Searching nonbond
clustering molecules:
Took 0.123 minutes to type 363 mols

Creating initial SMIRKS...
                      SMIRKSifier was not able to create SMIRKS for the provided
                      clusters with 5 layers. Try increasing the number of layers
                      or changing your clusters
                      
generating smirksifier took 7.125 minutes

Searching bond
clustering molecules:
Took 0.292 minutes to type 363 mols

Creating initial SMIRKS...
                      SMIRKSifier was not able to create SMIRKS for the provided
                      clusters with 5 layers. Try increasing the number of layers
                      or changing your clusters
                      
generating smirksifier took 13.266 minutes

Searching angle
clustering molecules:
Took 0.236 minutes to type 363 mols

Creating initial SMIRKS...
                      SMIRKSifier was not able to create SMIRKS for the provided
                      clusters with 5 layers. Try increas

# Reload molecule sets:

In [54]:
mols = mols_from_mol2(mol_file1)
smirnoff_clusters, smirnoff_dicts = type_molecules(mols)

In [8]:
if HAS_OE:
    prefix = 'oe'
else:
    prefix = 'rdk'
    
#baby_mini_drugbank = p.load(open('%s_baby_minidrugbank.p' % prefix, 'rb'))
#alkethoh = p.load(open('%s_AlkEthOH.p' % prefix, 'rb'))
minidrugbank = p.load(open('%s_minidrugbank.p' % prefix, 'rb'))

In [18]:
nonbond.intermediate_smirks

{0: [('zz_1', '[#1AH0X1x0!r+0:1]'),
  ('zz_2', '[#1AH0X1x0!r+0:1]'),
  ('zz_3', '[#1AH0X1x0!r+0:1]'),
  ('zz_4', '[#1AH0X1x0!r+0:1]'),
  ('zz_5', '[#1AH0X1x0!r+0:1]'),
  ('zz_6', '[#1AH0X1x0!r+0:1]'),
  ('zz_7', '[#1AH0X1x0!r+0:1]'),
  ('zz_8', '[#1AH0X1x0!r+0:1]'),
  ('zz_9', '[#1AH0X1x0!r+0:1]'),
  ('zz_10', '[#1AH0X1x0!r+0:1]'),
  ('zz_11', '[#1AH0X1x0!r+0:1]'),
  ('zz_12', '[#1AH0X1x0!r+0:1]'),
  ('zz_13',
   '[#6!rAH0x0,#6!rAH1x0,#6!rAH2x0,#6AH0r14x2,#6AH0r16x2,#6AH0r18x2,#6AH0r24x2,#6AH0r25x2,#6AH0r29x2,#6AH0r4x2,#6AH0r5x2,#6AH0r5x3,#6AH0r6x2,#6AH0r6x3,#6AH0r7x2,#6AH1r24x2,#6AH1r5x2,#6AH1r6x2,#6AH1r7x2,#6H0ar5x3,#6H0ar6x2,#6H0ar6x3,#6H1ar6x2;+0;X3:1]'),
  ('zz_14', '[#6H0,#6H1;!r;+0;A;X2;x0:1]'),
  ('zz_15',
   '[#6!rH0x0,#6!rH1x0,#6!rH2x0,#6!rH3x0,#6H0r16x2,#6H0r3x2,#6H0r3x3,#6H0r3x4,#6H0r4x2,#6H0r5x2,#6H0r5x3,#6H0r5x4,#6H0r6x2,#6H0r6x3,#6H0r6x4,#6H1r14x2,#6H1r16x2,#6H1r18x2,#6H1r24x2,#6H1r25x2,#6H1r29x2,#6H1r3x2,#6H1r3x3,#6H1r4x2,#6H1r4x3,#6H1r5x2,#6H1r5x3,#6H1r6x2,#6H1r6x3,#6H

In [10]:
nonbond = minidrugbank[1]['nonbond']

In [27]:
#current_clusters = {
#    'nonbond': {i:list() for i in range(6)},
#    'bond': {i:list() for i in range(6)},
#    'angle': {i:list() for i in range(6)},
#    'torsions': {i:list() for i in range(6)}
#}

for frag, smirksifier in minidrugbank[1].items():
    print(frag)
    if frag != 'proper_torsion':
        temp_frag = frag
        continue
    else:
        temp_frag = 'torsions'
    for layers, smirks_list in smirksifier.intermediate_smirks.items():
        print('\t',layers)
        current_clusters[temp_frag][layers] = create_tuples_for_clusters(smirks_list, mols)

bond
nonbond
angle
proper_torsion
	 0
	 1
	 2
	 3
	 4
	 5


In [30]:
p.dump(current_clusters, open('current_clusters_minidrugbank.p', 'wb'))

In [26]:
current_clusters.keys()

dict_keys(['bond', 'nonbond', 'angle', 'torsions'])

# Use clusters by layer to identify atoms in different clusters

In [6]:
mols = mols_from_mol2('MiniDrugBank_tripos.mol2')
smirnoff_clusters, smirnoff_dicts = type_molecules(mols)
current_clusters = p.load(open('current_clusters_minidrugbank.p', 'rb'))

In [7]:
minidrugbank = p.load(open('oe_minidrugbank.p','rb'))

In [8]:
from chemper.chemper_utils import match_reference, check_smirks_to_reference
from chemper.chemper_utils import score_match_reference
import copy

In [9]:
smirks_list = minidrugbank[1]['bond'].intermediate_smirks[0]
current_assignments = get_typed_molecules(smirks_list, mols)
ref_assignments = smirnoff_dicts['bond']
match_reference(current_assignments, ref_assignments)

(set(), False)

In [10]:
current2 = dict()
for mol_idx, mol_dict in current_assignments.items():
    if mol_idx not in current2:
        current2[mol_idx] = dict()
    for indices, lab in mol_dict.items():
        current2[mol_idx][indices] = lab+'2'

In [11]:
match_reference(current_assignments, current2)

({('zz_0', 'zz_02'),
  ('zz_1', 'zz_12'),
  ('zz_10', 'zz_102'),
  ('zz_11', 'zz_112'),
  ('zz_12', 'zz_122'),
  ('zz_13', 'zz_132'),
  ('zz_14', 'zz_142'),
  ('zz_15', 'zz_152'),
  ('zz_16', 'zz_162'),
  ('zz_17', 'zz_172'),
  ('zz_18', 'zz_182'),
  ('zz_19', 'zz_192'),
  ('zz_2', 'zz_22'),
  ('zz_20', 'zz_202'),
  ('zz_22', 'zz_222'),
  ('zz_23', 'zz_232'),
  ('zz_24', 'zz_242'),
  ('zz_25', 'zz_252'),
  ('zz_26', 'zz_262'),
  ('zz_27', 'zz_272'),
  ('zz_28', 'zz_282'),
  ('zz_3', 'zz_32'),
  ('zz_31', 'zz_312'),
  ('zz_32', 'zz_322'),
  ('zz_34', 'zz_342'),
  ('zz_35', 'zz_352'),
  ('zz_36', 'zz_362'),
  ('zz_37', 'zz_372'),
  ('zz_38', 'zz_382'),
  ('zz_4', 'zz_42'),
  ('zz_41', 'zz_412'),
  ('zz_42', 'zz_422'),
  ('zz_47', 'zz_472'),
  ('zz_48', 'zz_482'),
  ('zz_5', 'zz_52'),
  ('zz_50', 'zz_502'),
  ('zz_52', 'zz_522'),
  ('zz_53', 'zz_532'),
  ('zz_54', 'zz_542'),
  ('zz_55', 'zz_552'),
  ('zz_58', 'zz_582'),
  ('zz_6', 'zz_62'),
  ('zz_61', 'zz_612'),
  ('zz_62', 'zz_622'),
  

In [12]:
score_match_reference(current_assignments, current2)

KeyError: 'zz_4'

In [None]:
for thing in zip(smirnoff_clusters['bond'], current_clusters['bond'][0]):
    s = thing[0][1]
    mine = thing[0][1]
    for idx, mol_list in enumerate(s):
        print(set(mol_list) - set(mine[idx]))
        print(set(mol_list) - set(mine[idx]))
    #print(set(s) - set(mine))

In [48]:
new

[set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),
 set(),


In [33]:
smirnoff_clusters['bond']

[('0',
  [[(6, 9), (8, 9), (10, 11), (6, 11), (5, 10), (5, 7), (7, 8)],
   [],
   [(8, 9), (7, 8)],
   [(12, 24), (10, 12), (7, 9), (12, 23)],
   [(4, 7),
    (7, 12),
    (6, 14),
    (7, 15),
    (14, 18),
    (10, 12),
    (4, 13),
    (5, 6),
    (9, 11),
    (8, 11),
    (8, 21),
    (4, 16),
    (3, 9),
    (13, 19),
    (19, 20),
    (3, 5),
    (5, 10),
    (6, 8),
    (15, 20),
    (3, 4)],
   [(16, 17),
    (10, 11),
    (5, 6),
    (12, 13),
    (8, 9),
    (7, 11),
    (18, 19),
    (6, 7),
    (14, 15),
    (15, 16),
    (9, 10),
    (33, 34),
    (5, 10),
    (13, 14),
    (17, 18),
    (21, 22),
    (13, 18),
    (7, 8)],
   [(5, 6), (6, 7), (4, 5), (3, 4), (2, 3)],
   [(5, 9),
    (5, 7),
    (10, 11),
    (1, 2),
    (6, 11),
    (9, 10),
    (2, 3),
    (6, 8),
    (7, 8)],
   [(25, 26), (24, 25)],
   [(12, 17), (4, 5), (12, 18), (5, 15), (13, 18), (4, 15)],
   [(7, 8)],
   [(23, 35), (35, 36), (23, 36)],
   [(0, 1), (0, 8)],
   [(9, 26),
    (19, 20),
    (16, 17),
 

In [34]:
current_clusters['bond'][0]

[('zz_0',
  [[(6, 9), (8, 9), (10, 11), (6, 11), (5, 10), (5, 7), (7, 8)],
   [],
   [(8, 9), (7, 8)],
   [(12, 24), (10, 12), (7, 9), (12, 23)],
   [(4, 7),
    (7, 12),
    (6, 14),
    (7, 15),
    (14, 18),
    (10, 12),
    (4, 13),
    (5, 6),
    (9, 11),
    (8, 11),
    (8, 21),
    (4, 16),
    (3, 9),
    (13, 19),
    (19, 20),
    (3, 5),
    (5, 10),
    (6, 8),
    (15, 20),
    (3, 4)],
   [(16, 17),
    (10, 11),
    (5, 6),
    (8, 9),
    (7, 11),
    (18, 19),
    (6, 7),
    (12, 13),
    (14, 15),
    (15, 16),
    (9, 10),
    (33, 34),
    (5, 10),
    (13, 14),
    (17, 18),
    (21, 22),
    (13, 18),
    (7, 8)],
   [(5, 6), (6, 7), (4, 5), (2, 3), (3, 4)],
   [(5, 9),
    (5, 7),
    (10, 11),
    (1, 2),
    (6, 11),
    (9, 10),
    (2, 3),
    (6, 8),
    (7, 8)],
   [(25, 26), (24, 25)],
   [(12, 17), (4, 5), (12, 18), (5, 15), (13, 18), (4, 15)],
   [(7, 8)],
   [(23, 35), (35, 36), (23, 36)],
   [(0, 1), (0, 8)],
   [(9, 26),
    (19, 20),
    (16, 17)