In [1]:
from autotemplate.extract_utils import extract_from_rxn_smiles, canon_remap
from autotemplate.run_utils import rdchiralRunText_modified, RemoveReagent, clearIsotope
from autotemplate.graph_utils import mapping_for_gold_multiple_smiles, find_unique_templates_dict, countBCN
import pickle
from tqdm import tqdm
from collections import Counter

def remap_one_template(template, target_smiles):
    try:
        outcomes = rdchiralRunText_modified(template, target_smiles)
        return outcomes
    except:
        return

def check_reconstruct(template, rxn_smiles, retro = True):
    """Check whether the template obtained from the function "extract_from_rxn_smiles" is True for the original rxn_smiles. """
    reactants, products = rxn_smiles.split('>>')
    gold_reac = set(canon_remap(reactants).split('.'))
    
    if retro:
        reac_list = remap_one_template(template, products)
        if not reac_list: return False
        for reac in reac_list:
            reac = canon_remap(reac)
            if reac:
                check_reac = set(reac.split('.'))
            else:
                continue
            if gold_reac.issubset(check_reac):
                return True
        return False            
    else:
        pass

temp_data = "data/DielsAlder/processed_data.pkl"
with open(temp_data, "rb") as f:
    records = pickle.load(f)
print(records[0])

{'rxn_smiles': '[CH:13]1=[CH:14][CH:15]=[CH:16][CH2:17]1.[O:1]=[C:2]1[C:3]([I:4])=[CH:5][C:6]2([O:7][CH2:8][CH2:9][O:10]2)[CH:11]=[CH:12]1>>[O:1]=[C:2]1[C:3]([I:4])=[CH:5][C:6]2([O:7][CH2:8][CH2:9][O:10]2)[C@H:11]2[C@@H:12]1[C@H:13]1[CH:14]=[CH:15][C@@H:16]2[CH2:17]1', 'template': '[#6:3]1-[#6:4]=[#6:5]-[#6:6]-[#6:1]-[#6:2]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6]', 'reaxys id': '28100547', 'final rxn_smiles': '[CH:13]1=[CH:14][CH:15]=[CH:16][CH2:17]1.[O:1]=[C:2]1[C:3]([I:4])=[CH:5][C:6]2([O:7][CH2:8][CH2:9][O:10]2)[CH:11]=[CH:12]1>>[O:1]=[C:2]1[C:3]([I:4])=[CH:5][C:6]2([O:7][CH2:8][CH2:9][O:10]2)[C@H:11]2[C@@H:12]1[C@H:13]1[CH:14]=[CH:15][C@@H:16]2[CH2:17]1'}


In [2]:
f = open("data/DielsAlder/MappingResult_DielsAlder.txt", 'r')
input_file = f.readlines()[12000:16000]
f.close()

""" Remove reagent. 
Remove 
(1) same molecule without shared atom-mapping appear in both reactant and product sites. 
(2) unmapped molecules in both reactant and product sites. """
print('Removing reagent...')
input_file = [(RemoveReagent(clearIsotope(line.split('\t')[0])), line.split('\t')[1].strip('\n')) for line in tqdm(input_file)]

print('Extracting templates...')
templates = []
data = []
for i in tqdm(range(len(input_file))):
    rxn_smiles, reaxys_id = input_file[i]
    template = extract_from_rxn_smiles(rxn_smiles, radius = 0)

    # TODO: avoid using the wrong atom-mapped reaction template
    if check_reconstruct(template, rxn_smiles, retro = True):
        templates.append(template)
    #     data.append({'rxn_smiles': rxn_smiles, 'template': template, 'reaxys id': reaxys_id})
    # else:
    data.append({'rxn_smiles': rxn_smiles, 'template': None, 'reaxys id': reaxys_id})

Removing reagent...


  0%|          | 0/4000 [00:00<?, ?it/s]

100%|██████████| 4000/4000 [00:13<00:00, 305.28it/s]


Extracting templates...


100%|██████████| 4000/4000 [02:39<00:00, 25.08it/s]


In [3]:
templates = Counter(templates)
for key, value in sorted(templates.items(), key = lambda templates : templates[1], reverse=True):
    print(key, value)

[#6:1]1-[#6:2]-[#6:3]-[#6:4]=[#6:5]-[#6:6]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 89
[#6:3]1-[#6:4]=[#6:5]-[#6:6]-[#6:1]-[#6:2]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 80
[#6:1]1-[#6:3]-[#6:4]=[#6:5]-[#6:6]-[#6:2]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 72
[#6:3]1-[#6:1]-[#6:2]-[#6:6]-[#6:5]=[#6:4]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 64
[#6:5]1=[#6:4]-[#6:3]-[#6:1]-[#6:2]-[#6:6]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 54
[#6:4]1=[#6:5]-[#6:6]-[#6:2]-[#6:1]-[#6:3]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 46
[#6:1]-[#6:3]-[#6:4]-[#6:6]-[#6:5]-[#6:2]>>[#6:1]-[#6:2].[#6:3]=[#6:4].[#6:5]=[#6:6] 46
[#6:3]1-[#6:4]=[#6:5]-[#6:6]-[#6:1]=[#6:2]-1>>[#6:1]#[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] 34
[#6:1]-[#6:5]-[#6:6]-[#6:3]-[#6:4]-[#6:2]>>[#6:1]-[#6:2].[#6:3]=[#6:4].[#6:5]=[#6:6] 25
[#6:1]-[#6:4]-[#6:5]-[#6:2]-[#7:3]>>[#6:1].[#6:2]=[#7:3].[#6:4]=[#6:5] 23
[#6:3]1-[#6:4]=[#6:5]-[#6:6]-[#6:1]-[#6:2]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]/[#6:5]=[#6:6] 20
[#6:1]:[#6:4]:[#6:3]:[

In [4]:
processed_templates, changed_records = find_unique_templates_dict(templates)

Init


In [6]:
changed_records
for prev, current in changed_records.items():
    if countBCN(prev) != countBCN(current):
        print(countBCN(prev), countBCN(current))
        print(prev, current)

8 6
[#6:5]-[#6:6]1-[#6:3]-[#6:1]-[#6:2]-[#6:4]=[#6:7]-1>>[#6:1]=[#6:2].[#6:3]=[#6:4]-[#6:5]=[#6:6]-[#6:7] [#6:1]-[#6:2]1-[#6:6]-[#6:7]-[#6:5]-[#6:4]=[#6:3]-1>>[#6:1]-[#6:2]=[#6:3]-[#6:4]=[#6:5].[#6:6]=[#6:7]
5 4
[#6:2]-[#6:4]-[#6:5]-[#6:3]-[#6:1]>>[#6:1].[#6:2]=[#6:3].[#6:4]=[#6:5] [#6:3]-[#6:2]-[#6:5]-[#6:4]-[#6:1]>>[#6:1].[#6:2]=[#6:3].[#6:4]=[#6:5]
11 8
[#6:9]-[#6:10]1:[#6:11](-[#6:12]):[#6:4](-[#6:5]):[#6:2](-[#6:3]):[#6:1](-[#6:6]):[#6:7]:1-[#6:8]>>O=[#6:1]1-[#6:2](-[#6:3])=[#6:4](-[#6:5])-C(-[#6:6])=[#6:7]-1-[#6:8].[#6:9]-[#6:10]#[#6:11]-[#6:12] [#6:9]-[#6:10]1:[#6:1](-[#6:2]):[#6:3](-[#6:4]):[#6:5](-[#6:6]):[#6:7](-[#6:8]):[#6:11]:1-[#6:12]>>O=C1-[#6:1](-[#6:2])=[#6:3](-[#6:4])-[#6:5](-[#6:6])=[#6:7]-1-[#6:8].[#6:9]-[#6:10]#[#6:11]-[#6:12]
7 6
[#16:1]1-[#6:3]-[#6:4]=[#6:6]-[#6:5]-[#7:2]-1>>[#16:1]=[#7:2].[#6:3]=[#6:4]-[#6:5]=[#6:6] [#16:1]1-[#6:6]-[#6:5]=[#6:4]-[#6:3]-[#7:2]-1>>[#16:1]=[#7:2].[#6:3]=[#6:4]-[#6:5]=[#6:6]
7 6
[#6:7]-[#6:5]1=[#6:4]-[#6:1]-[#6:2]-[#6:3]-[#6:6]-1>>[#

In [7]:
countBCN("[#16:1]1-[#6:3]-[#6:4]=[#6:6]-[#6:5]-[#7:2]-1>>[#16:1]=[#7:2].[#6:3]=[#6:4]-[#6:5]=[#6:6]")
countBCN("[#16:1]1-[#6:6]-[#6:5]=[#6:4]-[#6:3]-[#7:2]-1>>[#16:1]=[#7:2].[#6:3]=[#6:4]-[#6:5]=[#6:6]")

7

In [8]:
rxn = "Br[c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1.[O:1]=[CH:2][c:9]1[cH:10][cH:11][cH:12][c:13]([Br:14])[cH:15]1>>[OH:1][CH:2]([c:3]1[cH:4][cH:5][cH:6][cH:7][cH:8]1)[c:9]1[cH:10][cH:11][cH:12][c:13]([Br:14])[cH:15]1"
template = extract_from_rxn_smiles(rxn)
print(template)

[#6:1]-[#6:2]-[#8:3]>>Br-[#6:1].[#6:2]=[#8:3]
