In [22]:
from tqdm import tqdm
import pandas as pd
import numpy as np
import rdchiral
from tqdm import tqdm
from common import *
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import rdChemReactions
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem.Draw import IPythonConsole
from IPython.display import SVG, display
from rxnmapper import RXNMapper
from rdkit.Chem import AllChem
from rdchiral.main import rdchiralRun, rdchiralRunText, rdchiralReaction, rdchiralReactants
from rdchiral.template_extractor import mols_from_smiles_list, replace_deuterated, clear_mapnum, \
    get_tagged_atoms_from_mols, \
    get_tagged_atoms_from_mol, atoms_are_different, find_map_num, get_tetrahedral_atoms, set_isotope_to_equal_mapnum, \
    get_frag_around_tetrahedral_center, check_tetrahedral_centers_equivalent, clear_isotope, get_changed_atoms, \
    get_special_groups, expand_atoms_to_use, expand_atoms_to_use_atom, convert_atom_to_wildcard, reassign_atom_mapping, \
    get_strict_smarts_for_atom, expand_changed_atom_tags, get_fragments_for_changed_atoms, canonicalize_transform, \
    canonicalize_template, bond_to_label

### input and output

In [23]:
# Input files
# Reaction table with SMILES columns; read by reaction_reverse().
mnxreac_smile_file_path = '../../Data/database/MNXreaction_smile.csv'

# mnxreac_smile_file_path = '../Data-new/metnetx/MNXreaction_smile.csv'
# EC prediction file handed to filter_retrorule() as its second argument.
DeepEC_file_path = '../../Data/Sce_DeepECv2.txt'

# Output files (each pipeline stage below writes one of these CSVs)

# Written by reaction_reverse(): every reaction plus its '_reverse' twin.
mnxreac_smile_file_reverse_path = '../../Data/database/MNXreaction_smile_reverse.csv'
# Written by reverse_ATP_ADP_parallel().
mnxreac_smile_file_reverse_ATP_ADP_path = '../../Data/database/MNXreaction_smile_reverse_ATP_ADP.csv'
# Written by parallel_MNXreaction_smiles_atom_mapping().
mnxreac_smile_atom_mapping_file_path = '../../Data/rules/MNXreaction_smiles_atommap.csv'
# Rules filtered only on the number of product components (no EC filter).
mnxreac_smile_atom_mapping_rules_file_filter_add_no_ec_path = '../../Data/rules/MNXreaction_smiles_atommap_rules_filter_add_no_ec.csv'
# Written by filter_retrorule().
mnxreac_smile_atom_mapping_rules_file_filter_path = '../../Data/rules/MNXreaction_smiles_atommap_rules_filter.csv'
# Written by extract_MNXreaction_rules(): all extracted rules.
mnxreac_smile_atom_mapping_rules_file_all_path = '../../Data/rules/MNXreaction_smiles_atommap_rules_all.csv'

In [24]:
def MNXreaction_smiles_atom_mapping(df, rxn_mapper=None):
    """Atom-map the reaction SMILES of every row in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``MNX_ID`` and ``equ_smiles``. The frame is
        not modified; a copy is returned.
    rxn_mapper : object, optional
        Object exposing ``get_attention_guided_atom_maps(list_of_smiles)``.
        When omitted a fresh ``RXNMapper()`` is created, as before.

    Returns
    -------
    (pandas.DataFrame, list)
        A copy of ``df`` restricted to the successfully mapped rows, with the
        extra columns ``RxnMapped`` and ``confidence``, and the list of
        ``MNX_ID`` values for which the mapper raised an exception.
    """
    if rxn_mapper is None:
        rxn_mapper = RXNMapper()

    # These reactions are skipped outright: never mapped and never reported as
    # failed. NOTE(review): presumably they are known to break the mapper - confirm.
    skipped_ids = {'MNXR171790','MNXR171819','MNXR172553','MNXR171008','MNXR186735','MNXR194090'}

    mapped_rxns = []
    confidences = []
    mapping_failed_list = []
    for mnx_id, rxnsmiles in zip(df['MNX_ID'], df['equ_smiles']):
        mapped_rxn, confidence = None, float('nan')
        if mnx_id not in skipped_ids:
            try:
                # Run the model once and read both fields from the same result.
                result = rxn_mapper.get_attention_guided_atom_maps([rxnsmiles])[0]
                mapped_rxn, confidence = result['mapped_rxn'], result['confidence']
            except Exception:
                mapping_failed_list.append(mnx_id)
        mapped_rxns.append(mapped_rxn)
        confidences.append(confidence)

    # Assigning whole columns guarantees 'RxnMapped' exists even when nothing
    # could be mapped, so the dropna below can never raise KeyError.
    mapped_df = df.copy()
    mapped_df['RxnMapped'] = mapped_rxns
    mapped_df['confidence'] = confidences
    mapped_df = mapped_df.dropna(subset=['RxnMapped'])
    return mapped_df, mapping_failed_list
def parallel_MNXreaction_smiles_atom_mapping(mnxreac_smile_file_path,mnxreac_smile_atom_mapping_file_path,n_workers=4):
    """Atom-map a reaction table in parallel and write the result to CSV.

    Parameters
    ----------
    mnxreac_smile_file_path : str
        CSV with at least the columns ``MNX_ID`` and ``equ_smiles``.
    mnxreac_smile_atom_mapping_file_path : str
        Destination CSV for the mapped reactions.
    n_workers : int, optional
        Number of worker processes and of table chunks (default 4, the value
        that used to be hard-coded).
    """
    mnxreac_smile = pd.read_csv(mnxreac_smile_file_path)
    # Reactions without any SMILES on either side cannot be mapped.
    mnxreac_smile = mnxreac_smile[mnxreac_smile['equ_smiles']!='>>']

    # Split by row position rather than calling np.array_split on the frame
    # itself (deprecated for DataFrames in recent pandas). Empty chunks, which
    # appear when there are fewer rows than workers, are dropped.
    positions = np.array_split(np.arange(len(mnxreac_smile)), n_workers)
    chunks = [mnxreac_smile.iloc[pos] for pos in positions if len(pos) > 0]

    results = []
    if chunks:
        # Create a Pool object with a progress bar (one tick per finished chunk)
        with mp.Pool(n_workers) as pool:
            results = list(tqdm(pool.imap(MNXreaction_smiles_atom_mapping, chunks), total=len(chunks)))

    # Concatenate the results into a single DataFrame
    if results:
        new_mnxreac_smile = pd.concat([result[0] for result in results])
    else:
        # Nothing to map: still emit a table with the expected columns.
        new_mnxreac_smile = mnxreac_smile.assign(RxnMapped=None, confidence=np.nan)
    mapping_failed_list = [mnx_id for result in results for mnx_id in result[1]]

    print('done')
    print('mapping failed reactions',mapping_failed_list)

    new_mnxreac_smile.to_csv(mnxreac_smile_atom_mapping_file_path, index=None)
# def MNXreaction_smiles_atom_mapping(mnxreac_smile_file_path,mnxreac_smile_atom_mapping_file_path):
#     mnxreac_smile = pd.read_csv(mnxreac_smile_file_path)
#     mnxreac_smile = mnxreac_smile[mnxreac_smile['equ_smiles']!='>>']
#     rxn_mapper = RXNMapper()
#     mapping_failed_list = []
#     # mnxreac_smile = mnxreac_smile.loc[20000:,:]
#     for index,row in tqdm(mnxreac_smile.iterrows(),total=len(mnxreac_smile)):
#         error = ['MNXR171790','MNXR171819','MNXR172553','MNXR171008','MNXR186735','MNXR194090']
#         if row['MNX_ID'] in error:
#             pass
#         else:
#             try:    
#                 rxnsmiles = row['equ_smiles']
#                 mnxreac_smile.loc[index,'RxnMapped'] = rxn_mapper.get_attention_guided_atom_maps([rxnsmiles])[0]['mapped_rxn']  # atom mapping
#                 mnxreac_smile.loc[index,'confidence'] = rxn_mapper.get_attention_guided_atom_maps([rxnsmiles])[0]['confidence']
#             except:
#                 mapping_failed_list.append(row['MNX_ID'])
#                 # print('failed',row['MNX_ID'])#row['equ_smiles'
#     print('done')
#     print('mapping failed reactions',mapping_failed_list)
#     mnxreac_smile.dropna(subset=['RxnMapped'],inplace=True)
#     mnxreac_smile.to_csv(mnxreac_smile_atom_mapping_file_path, index=None)

def extract_from_reaction(reaction):
    """Extract a reaction template from one atom-mapped reaction.

    Parameters
    ----------
    reaction : dict
        Needs the keys ``'reactants'`` and ``'products'`` (atom-mapped SMILES,
        molecules separated by ``'.'``) and ``'_id'``.

    Returns
    -------
    dict or None
        On success, a template dict with the keys ``productSMARTs``,
        ``reactantSMARTs``, ``products``, ``reactants``, ``reaction_smarts``,
        ``retro_reaction_smarts``, ``intra_only``, ``dimer_only``,
        ``reaction_id``, ``necessary_reagent`` and ``radius``.
        On most failures, ``{'reaction_id': reaction['_id']}`` only.
        ``None`` when a product has more than
        ``MAXIMUM_NUMBER_UNMAPPED_PRODUCT_ATOMS`` unmapped atoms or when
        ``get_changed_atoms`` reports an error. Both failure shapes are kept
        as they were, so callers must check for the template keys.
    """
    reactants = mols_from_smiles_list(replace_deuterated(reaction['reactants']).split('.'))
    products = mols_from_smiles_list(replace_deuterated(reaction['products']).split('.'))

    # if rdkit cant understand molecule, return
    if None in reactants: return {'reaction_id': reaction['_id']}
    if None in products: return {'reaction_id': reaction['_id']}

    # try to sanitize molecules
    try:
        reactants = [AllChem.RemoveHs(mol) for mol in reactants]  # *might* not be safe
        products = [AllChem.RemoveHs(mol) for mol in products]  # *might* not be safe
        for mol in reactants + products:
            Chem.SanitizeMol(mol)  # redundant w/ RemoveHs
        for mol in reactants + products:
            mol.UpdatePropertyCache()
    except Exception as e:
        # can't sanitize -> skip
        print(e)
        print('Could not load SMILES or sanitize')
        print('ID: {}'.format(reaction['_id']))
        return {'reaction_id': reaction['_id']}

    are_unmapped_product_atoms = False
    extra_reactant_fragment = ''
    for product in products:
        prod_atoms = product.GetAtoms()
        if sum([a.HasProp('molAtomMapNumber') for a in prod_atoms]) < len(prod_atoms):
            if VERBOSE: print('Not all product atoms have atom mapping')
            if VERBOSE: print('ID: {}'.format(reaction['_id']))
            are_unmapped_product_atoms = True

    if are_unmapped_product_atoms:  # add fragment to template
        for product in products:
            prod_atoms = product.GetAtoms()
            # Get unmapped atoms
            unmapped_ids = [
                a.GetIdx() for a in prod_atoms if not a.HasProp('molAtomMapNumber')
            ]
            if len(unmapped_ids) > MAXIMUM_NUMBER_UNMAPPED_PRODUCT_ATOMS:
                # Skip this example - too many unmapped product atoms!
                return
            # Define new atom symbols for fragment with atom maps, generalizing fully
            atom_symbols = ['[{}]'.format(a.GetSymbol()) for a in prod_atoms]
            # And bond symbols...
            bond_symbols = ['~' for b in product.GetBonds()]
            if unmapped_ids:
                extra_reactant_fragment += AllChem.MolFragmentToSmiles(
                    product, unmapped_ids,
                    allHsExplicit=False, isomericSmiles=USE_STEREOCHEMISTRY,
                    atomSymbols=atom_symbols, bondSymbols=bond_symbols
                ) + '.'
        if extra_reactant_fragment:
            # Strip the trailing '.' left by the accumulation above.
            extra_reactant_fragment = extra_reactant_fragment[:-1]
            if VERBOSE: print('    extra reactant fragment: {}'.format(extra_reactant_fragment))

        # Consolidate repeated fragments (stoichometry)
        extra_reactant_fragment = '.'.join(sorted(list(set(extra_reactant_fragment.split('.')))))

    # Defensive re-check after sanitisation.
    if None in reactants + products:
        print('Could not parse all molecules in reaction, skipping')
        print('ID: {}'.format(reaction['_id']))
        return {'reaction_id': reaction['_id']}

    # Calculate changed atoms
    changed_atoms, changed_atom_tags, err = get_changed_atoms(reactants, products)
    if err:
        if VERBOSE:
            print('Could not get changed atoms')
            print('ID: {}'.format(reaction['_id']))
        return
    if not changed_atom_tags:
        if VERBOSE:
            print('No atoms changed?')
            print('ID: {}'.format(reaction['_id']))
        # print('Reaction SMILES: {}'.format(example_doc['RXN_SMILES']))
        return {'reaction_id': reaction['_id']}

    try:
        # Get fragments for reactants
        reactant_fragments, intra_only, dimer_only = get_fragments_for_changed_atoms(reactants, changed_atom_tags,
                                                                                     radius=1, expansion=[],
                                                                                     category='reactants')
        # Get fragments for products
        # (WITHOUT matching groups but WITH the addition of reactant fragments)
        product_fragments, _, _ = get_fragments_for_changed_atoms(products, changed_atom_tags,
                                                                  radius=0,
                                                                  expansion=expand_changed_atom_tags(changed_atom_tags,
                                                                                                     reactant_fragments),
                                                                  category='products')
        # Grow the reactant radius while either side has more fragments than
        # molecules; give up growing once the radius exceeds 10 and use the
        # fragments from the last attempt.
        r = 1  # radius
        while len(product_fragments.split('.')) > len(products) or len(reactant_fragments.split('.')) > len(reactants):
            r += 1
            reactant_fragments, intra_only, dimer_only = get_fragments_for_changed_atoms(reactants, changed_atom_tags,
                                                                                         radius=r, expansion=[],
                                                                                         category='reactants')
            product_fragments, _, _ = get_fragments_for_changed_atoms(products, changed_atom_tags,
                                                                      radius=0,
                                                                      expansion=expand_changed_atom_tags(
                                                                          changed_atom_tags,
                                                                          reactant_fragments),
                                                                      category='products')
            if r > 10:
                break

    except ValueError as e:
        if VERBOSE:
            print(e)
            print(reaction['_id'])
        return {'reaction_id': reaction['_id']}

    # Put together and canonicalize (as best as possible)
    rxn_string = '{}>>{}'.format(reactant_fragments, product_fragments)
    rxn_canonical = canonicalize_transform(rxn_string)
    # Change from inter-molecular to intra-molecular
    rxn_canonical_split = rxn_canonical.split('>>')
    rxn_canonical = rxn_canonical_split[0][1:-1].replace(').(', '.') + \
                    '>>' + rxn_canonical_split[1][1:-1].replace(').(', '.')

    reactants_string = rxn_canonical.split('>>')[0]
    products_string = rxn_canonical.split('>>')[1]

    # Products-to-reactants direction; this is what gets validated and is
    # returned as 'retro_reaction_smarts'.
    retro_canonical = products_string + '>>' + reactants_string

    # Load into RDKit
    rxn = AllChem.ReactionFromSmarts(retro_canonical)
    if rxn.Validate()[1] != 0:
        print('Could not validate reaction successfully')
        print('ID: {}'.format(reaction['_id']))
        print('retro_canonical: {}'.format(retro_canonical))
        # The former interactive pause used the Python-2-only raw_input(),
        # which raises NameError on Python 3 and would stall a batch run.
        return {'reaction_id': reaction['_id']}

    template = {
        'productSMARTs': products_string,
        'reactantSMARTs': reactants_string,
        'products': reaction['products'],
        'reactants': reaction['reactants'],
        'reaction_smarts': rxn_canonical,
        'retro_reaction_smarts':retro_canonical,
        'intra_only': intra_only,
        'dimer_only': dimer_only,
        'reaction_id': reaction['_id'],
        'necessary_reagent': extra_reactant_fragment,
        'radius': r,
    }

    return template

def extract_MNXreaction_rules(mnxreac_smile_atom_mapping_file_path,set_confidence_score,mnxreac_smile_atom_mapping_rules_file_path):
    """Extract a retro-rule for every confidently mapped reaction and save them.

    Parameters
    ----------
    mnxreac_smile_atom_mapping_file_path : str
        CSV with the columns ``MNX_ID``, ``RxnMapped`` and ``confidence``.
    set_confidence_score : float
        Only reactions whose ``confidence`` is strictly greater than this
        value are processed; the others are silently left out of the output.
    mnxreac_smile_atom_mapping_rules_file_path : str
        Destination CSV. It contains only the rows for which a rule was
        extracted, with the added columns ``ReactantsSMARTs``,
        ``ProductSMARTs``, ``RetroRules`` and ``Radius``.
    """
    mnxreac_smile_atom_mapping = pd.read_csv(mnxreac_smile_atom_mapping_file_path)

    # Create the result columns up front so the dropna below works even when
    # no rule at all could be extracted. Radius starts as NaN so the column is
    # numeric, as it was when created by row-wise assignment.
    for column in ('ReactantsSMARTs', 'ProductSMARTs', 'RetroRules'):
        if column not in mnxreac_smile_atom_mapping.columns:
            mnxreac_smile_atom_mapping[column] = None
    if 'Radius' not in mnxreac_smile_atom_mapping.columns:
        mnxreac_smile_atom_mapping['Radius'] = np.nan

    failed_extract = []
    for index,row in tqdm(mnxreac_smile_atom_mapping.iterrows(), total=len(mnxreac_smile_atom_mapping)):
        rxn_id = row['MNX_ID']
        # A NaN confidence compares False and is therefore skipped as well.
        if not row['confidence'] > set_confidence_score:
            continue
        try:
            atommap = row['RxnMapped']
            reaction = {
                'reactants': atommap.split('>')[0],
                'products': atommap.split('>')[-1],
                '_id': rxn_id,
            }
            template = extract_from_reaction(reaction)
        except Exception:
            failed_extract.append(rxn_id)
            continue

        # extract_from_reaction signals failure with None or an id-only dict.
        if not template or 'retro_reaction_smarts' not in template:
            failed_extract.append(rxn_id)
            continue

        mnxreac_smile_atom_mapping.loc[index,'ReactantsSMARTs'] = template['reactantSMARTs']
        mnxreac_smile_atom_mapping.loc[index,'ProductSMARTs'] = template['productSMARTs']
        mnxreac_smile_atom_mapping.loc[index,'RetroRules'] = template['retro_reaction_smarts']
        mnxreac_smile_atom_mapping.loc[index,'Radius'] = template['radius']

    print('done')
    print('failed extract:',failed_extract)

    mnxreac_smile_atom_mapping = (
        mnxreac_smile_atom_mapping
        .dropna(subset=['RetroRules'])
        .reset_index(drop=True)
    )

    mnxreac_smile_atom_mapping.to_csv(mnxreac_smile_atom_mapping_rules_file_path,index=None)

In [25]:
def _swap_sides(text, separator):
    """Return ``text`` with the parts before and after ``separator`` exchanged.

    Missing values (None/NaN) are returned unchanged. Only the first two
    parts are used, exactly as the previous inline expressions did.
    """
    if pd.isnull(text):
        return text
    parts = text.split(separator)
    return parts[1] + separator + parts[0]


def process_data(df):
    """Duplicate every named reaction with its reversed counterpart.

    Rows whose ``equ_name`` is missing are dropped. Every other row is kept
    and followed directly by a copy in which ``MNX_ID`` gets the suffix
    ``'_reverse'``, ``substrate_smiles`` and ``product_smiles`` are exchanged,
    and the two sides of ``equation``, ``equ_name``, ``equ_smiles``,
    ``deprecated_equ`` and ``deprecated_equ_smiles`` are swapped.

    Parameters
    ----------
    df : pandas.DataFrame
        Reaction table containing the columns named above.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns and a fresh 0..n-1 index.
    """
    # Collect plain records and build the frame once; appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = []
    for record in df.to_dict('records'):
        if pd.isnull(record['equ_name']):
            continue
        reversed_record = dict(record)
        reversed_record['MNX_ID'] = record['MNX_ID'] + '_reverse'
        reversed_record['substrate_smiles'] = record['product_smiles']
        reversed_record['product_smiles'] = record['substrate_smiles']
        for column in ('equation', 'equ_name', 'deprecated_equ'):
            reversed_record[column] = _swap_sides(record[column], ' <=> ')
        for column in ('equ_smiles', 'deprecated_equ_smiles'):
            reversed_record[column] = _swap_sides(record[column], '>>')

        # Original row first, reversed row right after it.
        records.append(record)
        records.append(reversed_record)
    return pd.DataFrame(records, columns=df.columns)

def reaction_reverse(mnxreac_smile_file_path,mnxreac_smile_file_reverse_path,n_workers=50):
    """Add a reversed copy of every reaction and write the table to CSV.

    Parameters
    ----------
    mnxreac_smile_file_path : str
        Input reaction CSV.
    mnxreac_smile_file_reverse_path : str
        Destination CSV holding each named reaction followed by its
        ``'_reverse'`` twin (see ``process_data``).
    n_workers : int, optional
        Number of worker processes and of table chunks (default 50, the value
        that used to be hard-coded).
    """
    mnxreac_smile = pd.read_csv(mnxreac_smile_file_path)

    # Split by row position rather than calling np.array_split on the frame
    # itself (deprecated for DataFrames in recent pandas); skip empty chunks
    # but always keep at least one so the concat below has something to join.
    positions = np.array_split(np.arange(len(mnxreac_smile)), n_workers)
    chunks = [mnxreac_smile.iloc[pos] for pos in positions if len(pos) > 0] or [mnxreac_smile]

    # The context manager shuts the pool down once map() has returned, so
    # worker processes do not linger in the kernel.
    with mp.Pool(n_workers) as pool:
        results = pool.map(process_data, chunks)

    # Concatenate the results into a single DataFrame
    new_mnxreac_smile = pd.concat(results)

    new_mnxreac_smile.to_csv(mnxreac_smile_file_reverse_path, index=None)
    print(new_mnxreac_smile.shape)

In [26]:
def reverse_ATP_ADP(row,ATP,ADP):
    """Move an ATP/ADP pair from the main equation into the deprecated one.

    ``row`` is a mapping-like record (it is modified in place) with the keys
    ``equ_name``, ``equ_smiles``, ``deprecated_equ`` and
    ``deprecated_equ_smiles``. ``ATP`` and ``ADP`` are the SMILES strings used
    to recognise the two cofactors inside ``equ_smiles``.

    When both the names 'ATP' and 'ADP' occur as participants of ``equ_name``,
    the cofactors are removed from the equation (names and SMILES) and
    appended to the matching side of the deprecated equation. If the ATP
    SMILES is among the left-hand molecules, ATP is treated as the left-hand
    cofactor and ADP as the right-hand one; otherwise the roles are exchanged.
    Rows without the pair are left untouched.

    Returns the tuple ``(equ_name, equ_smiles, deprecated_equ,
    deprecated_equ_smiles)`` read back from ``row``.
    """
    def _tack_on(joined, extra, sep):
        # An empty side becomes just the new term; otherwise the term is
        # appended as one more `sep`-separated element.
        if joined == '':
            return extra
        pieces = joined.split(sep)
        pieces.append(extra)
        return sep.join(pieces)

    old_lhs_names, old_rhs_names = row['deprecated_equ'].split(' <=> ')
    participants = row['equ_name'].replace(' <=> ',' + ').split(' + ')
    lhs_names, rhs_names = row['equ_name'].split(' <=> ')

    if 'ATP' in participants and 'ADP' in participants:
        old_lhs_smiles, old_rhs_smiles = row['deprecated_equ_smiles'].split('>>')
        lhs_smiles, rhs_smiles = row['equ_smiles'].split('>>')
        lhs_smiles = lhs_smiles.split('.')
        rhs_smiles = rhs_smiles.split('.')
        lhs_names = lhs_names.split(' + ')
        rhs_names = rhs_names.split(' + ')

        # Decide which cofactor sits on which side of the equation.
        if ATP in lhs_smiles:
            lhs_cofactor, lhs_label = ATP, 'ATP'
            rhs_cofactor, rhs_label = ADP, 'ADP'
        else:
            lhs_cofactor, lhs_label = ADP, 'ADP'
            rhs_cofactor, rhs_label = ATP, 'ATP'

        # Strip the pair from the main equation ...
        lhs_smiles = [smi for smi in lhs_smiles if smi != lhs_cofactor]
        rhs_smiles = [smi for smi in rhs_smiles if smi != rhs_cofactor]
        lhs_names = [name for name in lhs_names if name != lhs_label]
        rhs_names = [name for name in rhs_names if name != rhs_label]

        # ... and record it on the deprecated equation instead.
        old_lhs_smiles = _tack_on(old_lhs_smiles, lhs_cofactor, '.')
        old_rhs_smiles = _tack_on(old_rhs_smiles, rhs_cofactor, '.')
        old_lhs_names = _tack_on(old_lhs_names, lhs_label, ' + ')
        old_rhs_names = _tack_on(old_rhs_names, rhs_label, ' + ')

        row['deprecated_equ_smiles'] = old_lhs_smiles + '>>' + old_rhs_smiles
        row['equ_smiles'] = '.'.join(lhs_smiles) + '>>' + '.'.join(rhs_smiles)
        row['deprecated_equ'] = old_lhs_names + ' <=> ' + old_rhs_names
        row['equ_name'] = ' + '.join(lhs_names) + ' <=> ' + ' + '.join(rhs_names)

    return row['equ_name'],row['equ_smiles'],row['deprecated_equ'],row['deprecated_equ_smiles']

In [27]:
def reverse_ATP_ADP_parallel(mnxreac_smile_file_reverse_path,mnxreac_smile_file_reverse_ATP_ADP_path,n_workers=30):
    """Apply ``reverse_ATP_ADP`` to every reaction in parallel and save the table.

    Parameters
    ----------
    mnxreac_smile_file_reverse_path : str
        Input reaction CSV.
    mnxreac_smile_file_reverse_ATP_ADP_path : str
        Destination CSV; identical to the input except for the rewritten
        columns ``equ_name``, ``equ_smiles``, ``deprecated_equ`` and
        ``deprecated_equ_smiles``.
    n_workers : int, optional
        Number of worker processes (default 30, the value that used to be
        hard-coded).
    """
    mnxreac_smile = pd.read_csv(mnxreac_smile_file_reverse_path)
    # SMILES used to recognise the two cofactors inside equ_smiles.
    ATP = 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])OP(=O)([O-])[O-])[C@@H](O)[C@H]1O'
    ADP = 'Nc1ncnc2c1ncn2[C@@H]1O[C@H](COP(=O)([O-])OP(=O)([O-])[O-])[C@@H](O)[C@H]1O'
    reverse_ATP_ADP_partial = partial(reverse_ATP_ADP,ATP=ATP,ADP=ADP)

    rows = [row for index,row in mnxreac_smile.iterrows()]
    with multiprocessing.Pool(n_workers) as pool:
        # imap keeps the input order, so results line up with the table rows.
        result = list(tqdm(pool.imap(reverse_ATP_ADP_partial, rows, chunksize=50),total=len(rows)))

    # With an empty table zip(*result) yields nothing to unpack, so the
    # column update is skipped and the (empty) table is written as is.
    if result:
        equ_name, equ_smiles, deprecated_equ, deprecated_equ_smiles = zip(*result)
        mnxreac_smile['equ_name'] = list(equ_name)
        mnxreac_smile['equ_smiles'] = list(equ_smiles)
        mnxreac_smile['deprecated_equ'] = list(deprecated_equ)
        mnxreac_smile['deprecated_equ_smiles'] = list(deprecated_equ_smiles)

    mnxreac_smile.to_csv(mnxreac_smile_file_reverse_ATP_ADP_path,index=None)

In [28]:
def filter_retrorule(mnxreac_smile_atom_mapping_rules_file_path,DeepProZyme_path,mnxreac_smile_atom_mapping_rules_file_filter_path):
    """Keep only the rules backed by a predicted EC number and save them.

    A rule is kept when all of the following hold:

    * ``classifs`` is present and at least one of its ``';'``-separated
      entries with three or more dot-separated fields, cut down to its first
      three fields, is a key of the EC-to-gene dictionary built from
      ``DeepProZyme_path``;
    * ``RetroRules`` is not missing;
    * ``product_smiles`` has fewer than 4 ``'.'``-separated components.

    The frame shape is printed after each stage. The written CSV also
    contains the helper column ``num_substrings``.

    Parameters
    ----------
    mnxreac_smile_atom_mapping_rules_file_path : str
        CSV of extracted rules.
    DeepProZyme_path : str
        Prediction file passed to ``get_gene2ec_dict_DeepProZyme``.
    mnxreac_smile_atom_mapping_rules_file_filter_path : str
        Destination CSV.
    """
    retrorule = pd.read_csv(mnxreac_smile_atom_mapping_rules_file_path,index_col=None)
    print(retrorule.shape)
    DeepProZyme_gene2ec_dict = get_gene2ec_dict_DeepProZyme(DeepProZyme_path)
    DeepProZyme_ec2gene_dict = get_ec2gene_dict_DeepProZyme(DeepProZyme_gene2ec_dict)
    # A set gives constant-time membership tests for the per-row check below.
    predicted_ecs = set(DeepProZyme_ec2gene_dict.keys())
    print(retrorule.shape)
    retrorule = retrorule[~retrorule['classifs'].isna()]
    print(retrorule.shape)

    def _has_predicted_ec(classifs):
        # Compare on the first three EC fields; shorter entries are ignored.
        return any(
            '.'.join(ec.split('.')[0:3]) in predicted_ecs
            for ec in classifs.split(';')
            if len(ec.split('.')) > 2
        )

    # One boolean mask instead of dropping rows one at a time inside a loop,
    # which copied the whole frame for every rejected rule.
    need_rules = retrorule['classifs'].apply(_has_predicted_ec).astype(bool)
    retrorule = retrorule[need_rules]
    print(retrorule.shape)
    retrorule = retrorule.dropna(subset=['RetroRules'])
    print(retrorule.shape)
    retrorule = retrorule.reset_index(drop=True)
    print(retrorule.shape)
    retrorule['num_substrings'] = retrorule['product_smiles'].apply(lambda x: len(x.split('.')))
    retrorule = retrorule[retrorule['num_substrings'] < 4]
    print(retrorule.shape)
    retrorule.to_csv(mnxreac_smile_atom_mapping_rules_file_filter_path,index=None)

### reaction reverse

In [29]:
# Stage 1: write every named reaction together with its '_reverse' twin; the printed tuple is the shape of the written table.
reaction_reverse(mnxreac_smile_file_path,mnxreac_smile_file_reverse_path)


(45588, 12)


### drop ATP-ADP pairs from reactions

In [30]:
# Stage 2: move ATP/ADP pairs from the main equation columns into the deprecated_* columns.
reverse_ATP_ADP_parallel(mnxreac_smile_file_reverse_path,mnxreac_smile_file_reverse_ATP_ADP_path)

100%|██████████| 45588/45588 [00:01<00:00, 36734.59it/s]


### atom mapping

In [31]:
# Stage 3: atom-map the reaction SMILES in parallel; IDs whose mapping raised are printed afterwards.
parallel_MNXreaction_smiles_atom_mapping(mnxreac_smile_file_reverse_ATP_ADP_path,mnxreac_smile_atom_mapping_file_path)

  return self.fget.__get__(instance, owner)()
  return self.fget.__get__(instance, owner)()
  return self.fget.__get__(instance, owner)()
  return self.fget.__get__(instance, owner)()
Token indices sequence length is longer than the specified maximum sequence length for this model (811 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (611 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (615 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1632 > 512). Running this sequence through the model will result in indexing errors
100%|██████████| 4/4 [06:46<00:00, 101.74s/it]


done
mapping failed reactions ['MNXR100481', 'MNXR100481_reverse', 'MNXR100602', 'MNXR100602_reverse', 'MNXR100606', 'MNXR100606_reverse', 'MNXR100915', 'MNXR100915_reverse', 'MNXR100923', 'MNXR100923_reverse', 'MNXR101078', 'MNXR101078_reverse', 'MNXR101899', 'MNXR101899_reverse', 'MNXR102371', 'MNXR102371_reverse', 'MNXR102499', 'MNXR102499_reverse', 'MNXR102500', 'MNXR102500_reverse', 'MNXR103114', 'MNXR103114_reverse', 'MNXR103129', 'MNXR103129_reverse', 'MNXR103131', 'MNXR103131_reverse', 'MNXR104025', 'MNXR104025_reverse', 'MNXR105476', 'MNXR105476_reverse', 'MNXR106751', 'MNXR106751_reverse', 'MNXR107087', 'MNXR107087_reverse', 'MNXR107088', 'MNXR107088_reverse', 'MNXR107512', 'MNXR107512_reverse', 'MNXR107550', 'MNXR107550_reverse', 'MNXR109067', 'MNXR109067_reverse', 'MNXR109489', 'MNXR109489_reverse', 'MNXR109724', 'MNXR109724_reverse', 'MNXR109725', 'MNXR109725_reverse', 'MNXR110253', 'MNXR110253_reverse', 'MNXR110258', 'MNXR110258_reverse', 'MNXR110263', 'MNXR110263_reverse

### extract MNXreaction rules

In [32]:
# Stage 4: extract rules only for reactions whose mapping confidence is strictly above this threshold.
set_confidence_score = 0.3
extract_MNXreaction_rules(mnxreac_smile_atom_mapping_file_path,set_confidence_score,mnxreac_smile_atom_mapping_rules_file_all_path)

  5%|▍         | 2172/44195 [00:26<07:18, 95.91it/s] 

100%|██████████| 44195/44195 [09:26<00:00, 77.95it/s] 


done
failed extract: ['MNXR100288', 'MNXR100288_reverse', 'MNXR101281', 'MNXR101281_reverse', 'MNXR101325', 'MNXR101325_reverse', 'MNXR101342', 'MNXR101342_reverse', 'MNXR101657', 'MNXR101657_reverse', 'MNXR103423', 'MNXR103423_reverse', 'MNXR103425', 'MNXR103425_reverse', 'MNXR103434', 'MNXR103434_reverse', 'MNXR103571', 'MNXR103571_reverse', 'MNXR103621', 'MNXR103621_reverse', 'MNXR103622', 'MNXR103633', 'MNXR103713', 'MNXR103713_reverse', 'MNXR103915', 'MNXR103915_reverse', 'MNXR104002', 'MNXR104002_reverse', 'MNXR104003', 'MNXR104003_reverse', 'MNXR104004', 'MNXR104004_reverse', 'MNXR104083', 'MNXR104083_reverse', 'MNXR104348', 'MNXR104348_reverse', 'MNXR104953', 'MNXR104953_reverse', 'MNXR105224', 'MNXR105224_reverse', 'MNXR106586', 'MNXR106586_reverse', 'MNXR106589', 'MNXR106589_reverse', 'MNXR106628', 'MNXR106628_reverse', 'MNXR106636', 'MNXR106636_reverse', 'MNXR106643', 'MNXR106643_reverse', 'MNXR106740', 'MNXR106740_reverse', 'MNXR106886', 'MNXR106886_reverse', 'MNXR107011', 

### filter rules

In [33]:
# Stage 5: keep rules backed by a predicted EC number; the printed tuples are the table shape after each filtering step.
filter_retrorule(mnxreac_smile_atom_mapping_rules_file_all_path,DeepEC_file_path,mnxreac_smile_atom_mapping_rules_file_filter_path)

(29336, 18)
(29336, 18)
(22062, 18)
(20316, 18)
(20316, 18)
(20316, 18)
(20210, 19)


In [34]:
# Reload the full rule table written by extract_MNXreaction_rules (no EC filtering applied).
retrorule = pd.read_csv(mnxreac_smile_atom_mapping_rules_file_all_path)
retrorule.shape

(29336, 18)

In [35]:
# Same product-count filter as in filter_retrorule, but without the EC filter:
# keep rules whose product_smiles has fewer than 4 '.'-separated components.
retrorule['num_substrings'] = retrorule['product_smiles'].apply(lambda x: len(x.split('.')))
retrorule_ = retrorule[retrorule['num_substrings'] < 4]
print(retrorule_.shape)
# The helper column num_substrings is written out together with the rules.
retrorule_.to_csv(mnxreac_smile_atom_mapping_rules_file_filter_add_no_ec_path,index=None)

(29164, 19)
