**Metabolome-Microbiome-Network**
## Parsing KEGG Reactions

In [14]:
import os
import pandas as pd
import requests
import re

In [7]:
# Download the tab-separated list of all KEGG reactions (one "<id>\t<name>" line each).
url_kegg_get_all_reactions = 'http://rest.kegg.jp/list/reaction'
# A timeout keeps the cell from hanging forever if the server stops responding.
response = requests.get(url_kegg_get_all_reactions, timeout=60)
# Fail loudly on an HTTP error so an error page is never parsed as a reaction list.
response.raise_for_status()
text = response.text.strip()

In [9]:
# Preview only the first lines; the full response has one line per reaction
# and would flood the cell output.
print('\n'.join(text.split('\n')[:20]))

R00001	polyphosphate polyphosphohydrolase
R00002	Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing)
R00004	diphosphate phosphohydrolase
R00005	urea-1-carboxylate amidohydrolase
R00006	pyruvate:pyruvate acetaldehydetransferase (decarboxylating)
R00008	4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lyase (pyruvate-forming)
R00009	hydrogen-peroxide:hydrogen-peroxide oxidoreductase
R00010	alpha,alpha-trehalose glucohydrolase
R00011	Mn(II):hydrogen-peroxide oxidoreductase
R00012	GTP:GTP guanylyltransferase
R00013	glyoxylate carboxy-lyase (dimerizing
R00014	pyruvate:thiamin diphosphate acetaldehydetransferase (decarboxylating)
R00015	sucrose:sucrose 1'-beta-D-fructosyltransferase
R00017	ferrocytochrome-c:hydrogen-peroxide oxidoreductase
R00018	putrescine:putrescine 4-aminobutyltransferase (ammonia-forming)
R00019	hydrogen:ferredoxin oxidoreductase
R00021	L-glutamate:ferredoxin oxidoreductase (transaminating)
R00022	chitobiose N-acetylglucosaminohydrolase
R00023	hydroxylamine:NAD+ oxi

In [10]:
# Turn the raw listing into [reaction_id, reaction_name] pairs:
# one row per line, fields separated by a tab.
all_reactions = [line.split('\t') for line in text.split('\n')]
all_reactions

[['R00001', 'polyphosphate polyphosphohydrolase'],
 ['R00002', 'Reduced ferredoxin:dinitrogen oxidoreductase (ATP-hydrolysing)'],
 ['R00004', 'diphosphate phosphohydrolase'],
 ['R00005', 'urea-1-carboxylate amidohydrolase'],
 ['R00006', 'pyruvate:pyruvate acetaldehydetransferase (decarboxylating)'],
 ['R00008',
  '4-hydroxy-4-methyl-2-oxoglutarate pyruvate-lyase (pyruvate-forming)'],
 ['R00009', 'hydrogen-peroxide:hydrogen-peroxide oxidoreductase'],
 ['R00010', 'alpha,alpha-trehalose glucohydrolase'],
 ['R00011', 'Mn(II):hydrogen-peroxide oxidoreductase'],
 ['R00012', 'GTP:GTP guanylyltransferase'],
 ['R00013', 'glyoxylate carboxy-lyase (dimerizing'],
 ['R00014',
  'pyruvate:thiamin diphosphate acetaldehydetransferase (decarboxylating)'],
 ['R00015', "sucrose:sucrose 1'-beta-D-fructosyltransferase"],
 ['R00017', 'ferrocytochrome-c:hydrogen-peroxide oxidoreductase'],
 ['R00018', 'putrescine:putrescine 4-aminobutyltransferase (ammonia-forming)'],
 ['R00019', 'hydrogen:ferredoxin oxidored

In [11]:
# Number of reactions listed by KEGG (one row per line of the response).
len(all_reactions)

11911

In [36]:
# Regular expressions for pulling fields out of a KEGG reaction flat-file entry.
# The lookbehinds hard-code the keyword padded with spaces to a 12-character
# column, so they only match when the entry uses exactly that layout.

# Reaction ID after the 'ENTRY' keyword. The lookahead requires the ENTRY line
# to end in 'Reaction' and the very next line to start with 'NAME', so entries
# whose ENTRY line is not directly followed by a NAME line do not match.
entry_re = re.compile(r'(?<=ENTRY       )R[0-9]{5}(?=\s*Reaction\nNAME)')

# Text following 'NAME' up to the end of that line, without a trailing ';'
# when the line ends with one.
name_re = re.compile(r'(?<=NAME        ).+(?=;\n)|'
                     r'(?<=NAME        ).+(?=\n)')

# Same shape as name_re, for a 'CLASS' line.
# NOTE(review): not referenced by the cells visible in this notebook.
class_re = re.compile(r'(?<=CLASS       ).+(?=;\n)|'
                      r'(?<=CLASS       ).+(?=\n)')

# Reaction IDs in an 'ALL_REAC' listing (first ID after the keyword, then IDs
# preceded by a space and followed by ' R', ';', a continuation line, or the
# 'SUBSTRATE' keyword).
# NOTE(review): not referenced by the cells visible in this notebook.
reaction_re = re.compile(r'(?<=ALL_REAC    )R[0-9]{5}(?= )|'
                         r'(?<=ALL_REAC    )R[0-9]{5}(?=;)|'
                         r'(?<= )R[0-9]{5}(?= R)|'
                         r'(?<= )R[0-9]{5}(?=;)|'
                         r'(?<= )R[0-9]{5}(?= \n            R)|'
                         r'(?<= )R[0-9]{5}(?=\nSUBSTRATE)')

# '<name> [CPD:Cnnnnn]' items on a 'SUBSTRATE' line or on a 12-space-indented
# continuation line.
# NOTE(review): the first two alternatives consume a ';' after ']' and the first
# then requires a second ';' to follow — confirm this is intended.
# NOTE(review): not referenced by the cells visible in this notebook.
substrates_re = re.compile(r'(?<=SUBSTRATE   ).+ \[CPD:C[0-9]{5}\];(?=;)|'
                           r'(?<=SUBSTRATE   ).+ \[CPD:C[0-9]{5}\];(?=\n)|'
                           r'(?<=            ).+ \[CPD:C[0-9]{5}\](?=;)|'
                           r'(?<=            ).+ \[CPD:C[0-9]{5}\](?=\n)')

# Same shape as substrates_re, for a 'PRODUCT' line; the two continuation-line
# alternatives are identical to those of substrates_re.
# NOTE(review): not referenced by the cells visible in this notebook.
products_re = re.compile(r'(?<=PRODUCT     ).+ \[CPD:C[0-9]{5}\];(?=;)|'
                         r'(?<=PRODUCT     ).+ \[CPD:C[0-9]{5}\];(?=\n)|'
                         r'(?<=            ).+ \[CPD:C[0-9]{5}\](?=;)|'
                         r'(?<=            ).+ \[CPD:C[0-9]{5}\](?=\n)')

# EC numbers written as four dot-separated integers: the first one after the
# 'ENZYME' keyword, or any one preceded by a space and followed by a space or a
# newline. The last two alternatives are not tied to the ENZYME line, so they
# can match such a number anywhere in the searched text.
enzymes_re = re.compile(r'(?<=ENZYME      )[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?= )|'
                        r'(?<= )[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?=\n)|'
                        r'(?<= )[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?= )')

# 'rnNNNNN  <description>' after the 'PATHWAY' keyword or on a 12-space-indented
# continuation line; the match runs to the end of the line.
pathway_re = re.compile(r'(?<=PATHWAY     )rn[0-9]{5}  .+(?=\n)|'
                        r'(?<=            )rn[0-9]{5}  .+(?=\n)')

# Any compound ID: 'C' followed by five digits.
compound_re = re.compile(r'C[0-9]{5}')

In [52]:
def _get_kegg_section(entry, keyword):
    """Return one section of a KEGG flat-file entry, or '' if it is absent.

    A section is the line that starts with ``keyword`` plus every directly
    following line that starts with whitespace (continuation lines). The
    returned text includes the keyword and has no trailing newline.
    """
    match = re.search(rf'^{re.escape(keyword)}\b.*(?:\n[ \t]+.*)*', entry, re.MULTILINE)
    return match.group(0) if match else ''


def get_reaction_details(reaction, timeout=30):
    """Fetch one KEGG reaction entry and parse it into table rows.

    Parameters
    ----------
    reaction : str
        KEGG reaction ID, e.g. ``'R00001'``.
    timeout : float, optional
        Seconds to wait for the KEGG server before ``requests`` raises a
        timeout error (default 30).

    Returns
    -------
    tuple
        ``(reaction_info, substrates, products, enzymes, pathways)`` where
        ``reaction_info`` is ``[reaction_id, name]`` and the other four are
        lists of ``(value, reaction_id)`` tuples. All five are ``None`` when
        the request is not successful or the response does not contain an
        ENTRY line that ``entry_re`` recognises.

    Notes
    -----
    Substrates and products are the compound IDs left and right of the first
    ``<=>`` in the EQUATION section. An entry without an EQUATION section or
    without ``<=>`` yields empty lists instead of raising ``IndexError``.
    """
    no_result = (None, None, None, None, None)

    url_kegg_reaction_get = f'http://rest.kegg.jp/get/rn:{reaction}'
    response = requests.get(url_kegg_reaction_get, timeout=timeout)
    if not response.ok:
        # Best-effort crawl: an HTTP error is reported as "nothing parsed".
        return no_result
    entry = response.text

    # ENTRY
    entry_ids = entry_re.findall(entry)
    if not entry_ids:
        return no_result
    entry_id = entry_ids[0]

    # NAME
    names = name_re.findall(entry)
    name = names[0] if names else None
    reaction_info = [entry_id, name]

    # EQUATION: take exactly the EQUATION section (keyword line plus indented
    # continuation lines) rather than guessing which keyword follows it.
    equation_section = _get_kegg_section(entry, 'EQUATION')
    equation = equation_section[len('EQUATION'):].strip()
    sides = equation.split('<=>')
    substrates_temp = sides[0].strip()
    products_temp = sides[1].strip() if len(sides) > 1 else ''

    substrates = [(s, entry_id) for s in compound_re.findall(substrates_temp)]
    products = [(p, entry_id) for p in compound_re.findall(products_temp)]

    # ENZYME: search only the ENZYME section so EC-like numbers elsewhere in
    # the entry are not picked up. The trailing newline lets the pattern's
    # end-of-line lookahead match the last number of the section.
    enzyme_section = _get_kegg_section(entry, 'ENZYME') + '\n'
    enzymes = [(e, entry_id) for e in enzymes_re.findall(enzyme_section)]

    # PATHWAY
    pathways = [(p, entry_id) for p in pathway_re.findall(entry)]

    return reaction_info, substrates, products, enzymes, pathways

In [40]:
# Spot-check the parser on a single reaction (row 1500 of the listing; column 0 is the ID).
reaction_info, substrates, products, enzymes, pathways = get_reaction_details(all_reactions[1500][0])

In [53]:
# Accumulators for the five output tables; element 0 of each list is its header row.
parsed_reaction_entries = [['reaction_id', 'reaction_name']]
parsed_reaction_enzymes = [['enzyme_id', 'reaction_id']]
parsed_reaction_substrates = [['substrate', 'reaction_id']]
parsed_reaction_products = [['product', 'reaction_id']]
parsed_reaction_pathways = [['pathway', 'reaction_id']]

# One HTTP request per listed reaction; results that are None or empty are skipped.
for reaction in all_reactions:
    reaction_info, substrates, products, enzymes, pathways = get_reaction_details(reaction[0])

    if reaction_info:
        parsed_reaction_entries.append(reaction_info)

    for table, new_rows in (
        (parsed_reaction_substrates, substrates),
        (parsed_reaction_products, products),
        (parsed_reaction_enzymes, enzymes),
        (parsed_reaction_pathways, pathways),
    ):
        if new_rows:
            table.extend(new_rows)

In [54]:
# Header row (element 0) supplies the column names; the remaining rows are the data.
parsed_reaction_entries_df = pd.DataFrame(
    data=parsed_reaction_entries[1:],
    columns=parsed_reaction_entries[0],
)
parsed_reaction_entries_df

Unnamed: 0,reaction_id,reaction_name
0,R00001,polyphosphate polyphosphohydrolase
1,R00002,Reduced ferredoxin:dinitrogen oxidoreductase (...
2,R00004,diphosphate phosphohydrolase
3,R00005,urea-1-carboxylate amidohydrolase
4,R00006,pyruvate:pyruvate acetaldehydetransferase (dec...
...,...,...
8949,R13133,N-(sulfonatooxy)prop-2-enimidothioate sulfate-...
8950,R13134,N-(sulfonatooxy)prop-2-enimidothioate sulfate-...
8951,R13136,propanoate:NAD+ oxidoreductase
8952,R13137,succinate:ferricytochrome-c oxidoreductase


In [55]:
# Header row (element 0) supplies the column names; the remaining rows are the data.
parsed_reaction_substrates_df = pd.DataFrame(
    data=parsed_reaction_substrates[1:],
    columns=parsed_reaction_substrates[0],
)
parsed_reaction_substrates_df

Unnamed: 0,substrate,reaction_id
0,C00404,R00001
1,C00001,R00001
2,C00002,R00002
3,C00001,R00002
4,C00138,R00002
...,...,...
19680,C00042,R13137
19681,C00125,R13137
19682,C06366,R13141
19683,C00007,R13141


In [56]:
# Header row (element 0) supplies the column names; the remaining rows are the data.
parsed_reaction_products_df = pd.DataFrame(
    data=parsed_reaction_products[1:],
    columns=parsed_reaction_products[0],
)
parsed_reaction_products_df

Unnamed: 0,product,reaction_id
0,C02174,R00001
1,C05359,R00002
2,C00009,R00002
3,C00008,R00002
4,C00139,R00002
...,...,...
20781,C00122,R13137
20782,C00126,R13137
20783,C22648,R13141
20784,C00027,R13141


In [57]:
# Header row (element 0) supplies the column names; the remaining rows are the data.
parsed_reaction_enzymes_df = pd.DataFrame(
    data=parsed_reaction_enzymes[1:],
    columns=parsed_reaction_enzymes[0],
)
parsed_reaction_enzymes_df

Unnamed: 0,enzyme_id,reaction_id
0,3.6.1.10,R00001
1,1.18.6.1,R00002
2,3.6.1.1,R00004
3,6.3.4.6,R00005
4,3.5.1.54,R00005
...,...,...
10041,4.8.1.8,R13133
10042,4.8.1.8,R13134
10043,1.3.1.125,R13136
10044,1.3.2.4,R13137


In [58]:
# Header row (element 0) supplies the column names; the remaining rows are the data.
parsed_reaction_pathways_df = pd.DataFrame(
    data=parsed_reaction_pathways[1:],
    columns=parsed_reaction_pathways[0],
)
parsed_reaction_pathways_df

Unnamed: 0,pathway,reaction_id
0,rn00220 Arginine biosynthesis,R00005
1,rn00791 Atrazine degradation,R00005
2,rn01100 Metabolic pathways,R00005
3,rn01120 Microbial metabolism in diverse envir...,R00005
4,rn00362 Benzoate degradation,R00008
...,...,...
14236,rn00620 Pyruvate metabolism,R13137
14237,rn00650 Butanoate metabolism,R13137
14238,rn00720 Carbon fixation pathways in prokaryotes,R13137
14239,rn01100 Metabolic pathways,R13137


In [59]:
# Output folder for the parsed tables (placeholder path — point it at a real directory).
kegg_folder = '/path/to/kegg/data/folder/'

# Create the folder up front; DataFrame.to_csv does not create missing directories.
os.makedirs(kegg_folder, exist_ok=True)

# Write each table as CSV without the positional index column.
for file_name, frame in (
    ("kegg_reactions.csv", parsed_reaction_entries_df),
    ("kegg_reaction_enzymes.csv", parsed_reaction_enzymes_df),
    ("kegg_reaction_substrates.csv", parsed_reaction_substrates_df),
    ("kegg_reaction_products.csv", parsed_reaction_products_df),
    ("kegg_reaction_pathways.csv", parsed_reaction_pathways_df),
):
    frame.to_csv(os.path.join(kegg_folder, file_name), index=False)