# Applying SMIRKSifier to AlkEthOH

1. Type molecules with reference SMIRKS
2. Split into training and test set, checking that all parameters are represented in each
3. Make SMIRKS for training set
4. Check SMIRKS on test set


In [14]:
# import chemper utils and SMIRKSifier

from chemper import chemper_utils as cutils
from chemper.mol_toolkits import mol_toolkit
from chemper.smirksify import SMIRKSifier, Reducer
from numpy import random
import copy

In [2]:
def parse_smarts_file(file_path, lab=''):
    """
    Read a SMARTS file and label each of its lines.

    Parameters
    ----------
    file_path : str
        path passed to ``cutils.get_data_path`` to locate the file
    lab : str
        prefix used for every generated label

    Returns
    -------
    type_list : list of tuples
        one ``(label, text)`` pair per line of the file, where label is
        ``lab`` followed by the zero-based line index and text is the
        line with surrounding whitespace stripped
    """
    full_file_path = cutils.get_data_path(file_path)
    # the context manager closes the file even if reading fails part way
    with open(full_file_path) as f:
        return [('%s%i' % (lab, idx), l.strip()) for idx, l in enumerate(f)]

In [3]:
def write_smarts_files(type_list, file_path):
    """
    Write labeled SMIRKS patterns to a file, one per line.

    Parameters
    ----------
    type_list : list of tuples
        ``(label, smirks)`` pairs
    file_path : str
        file to create; an existing file is overwritten

    Each output line has the pattern first and the label second
    (``'<smirks> <label>'``).
    """
    # open for writing: the default read mode makes every write() fail
    with open(file_path, 'w') as f:
        for lab, smirks in type_list:
            f.write('%s %s\n' % (smirks, lab))

## 1. Load and Type Molecules

In [4]:
# Read the training and the test molecules from their mol2 files
_read_mol2 = mol_toolkit.mols_from_mol2
training_mols = _read_mol2('AlkEthOH_filtered_tripos.mol2')
test_mols = _read_mol2('AlkEthOH_test_set.mol2')


In [5]:
# (parameter type label, reference SMARTS file) pairs, in processing order
smarts_files = list({
    'angle': 'smarts_files/angle_smirks.smarts',
    'bond': 'smarts_files/bond_smirks.smarts',
    'proper': 'smarts_files/proper_torsion_smirks.smarts',
    'nonbond': 'smarts_files/nonbond_smirks.smarts',
}.items())

## 3. Make SMIRKS for training set

In [6]:
# when not in travis tests increase the number of iterations
# NOTE: the clustering cell further down assigns `its` again, so the value
# set here is not the one stored with the results.
its = 800 # 1000

In [17]:
# Get clusters for each parameter type
def _reduce_and_check(ifier, max_its, train_ref, test_ref):
    """
    Run one Reducer pass and check the result against both references.

    Parameters
    ----------
    ifier : SMIRKSifier
        supplies the starting SMIRKS through ``ifier.current_smirks``
    max_its : int
        iteration limit handed to ``Reducer.run``
    train_ref, test_ref : reference typing for the training / test molecules

    Returns
    -------
    smirks, train_ok, test_ok :
        the reduced SMIRKS list and the results of
        ``cutils.check_smirks_to_reference`` on the training and test sets
    """
    red = Reducer(ifier.current_smirks, training_mols, verbose=False)
    smirks = red.run(max_its=max_its)
    # each check gets its own deep copy of the reference typing
    # (presumably because the check may modify its argument -- TODO confirm)
    train_ok = cutils.check_smirks_to_reference(smirks, copy.deepcopy(train_ref), training_mols)
    test_ok = cutils.check_smirks_to_reference(smirks, copy.deepcopy(test_ref), test_mols)
    return smirks, train_ok, test_ok


# NOTE(review): this replaces any `its` value assigned in an earlier cell;
# confirm which iteration count is intended.
its = 1000

training_dict = dict()
for label, smarts_file in smarts_files:
    # only the nonbonded parameters are processed here; skip the other
    # types before their SMARTS files are read
    if label != 'nonbond':
        continue

    # NOTE(review): the label prefix is the first character of the file
    # path, which is 's' for every entry -- confirm `label[0]` was not meant.
    type_list = parse_smarts_file(smarts_file, smarts_file[0])
    training_dict[label] = dict()

    train_ref = cutils.get_typed_molecules(type_list, training_mols)

    # store only SMIRKS relevant:
    # keep the reference patterns whose label appears in the training typing
    pids = {p for a in train_ref.values() for p in a.values()}
    type_list = [t for t in type_list if t[0] in pids]

    test_ref = cutils.get_typed_molecules(type_list, test_mols)

    clusters = cutils.create_tuples_for_clusters(type_list, training_mols)

    ifier = SMIRKSifier(training_mols, clusters, verbose=False)
    final_smirks, train_check, test_check = _reduce_and_check(ifier, its, train_ref, test_ref)

    # repeat the reduction while the test set check fails, up to the
    # `runs <= 10` limit
    runs = 0
    while not test_check and runs <= 10:
        runs += 1
        final_smirks, train_check, test_check = _reduce_and_check(ifier, its, train_ref, test_ref)
        print(its, label, train_check, test_check)

    training_dict[label]['training_check'] = train_check
    training_dict[label]['test_check'] = test_check
    training_dict[label]['input_smirks'] = type_list
    training_dict[label]['output_smirks'] = final_smirks
    training_dict[label]['training_clusters'] = clusters
    training_dict[label]['its'] = its

    print(label, train_check, test_check)

1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
1000 nonbond True False
nonbond True False


In [None]:
# Show the generated SMIRKS, a divider, then the reference SMIRKS
divider = '-'*60
for name, pattern in final_smirks:
    print(name, pattern)
print(divider)
for name, pattern in type_list:
    print(name, pattern)

In [None]:
# Display the built-in help text for the reference check used above
help(cutils.check_smirks_to_reference)

## 4. Check SMIRKS on test set

In [None]:
# Report the stored training/test check results for each parameter type.
# (The loop previously had no body, which is a syntax error.)
for label, d in training_dict.items():
    print(label, d['training_check'], d['test_check'])


In [None]:
# Display the built-in help text for cutils.check_smirks_agree
help(cutils.check_smirks_agree)