**MeNu GUIDE**
## Extracting KEGG Compounds

In [1]:
import pandas as pd
import requests
import re
import os

## Get all compound IDs

In [None]:
kegg_folder = '/path/to/kegg/data/folder/'

In [2]:
url_kegg_compound_database_info = 'https://rest.kegg.jp/info/compound'
response = requests.request("GET", url_kegg_compound_database_info)
text = response.text

In [3]:
print(text)

compound         KEGG Compound Database
cpd              Release 105.0+/01-28, Jan 23
                 Kanehisa Laboratories
                 19,054 entries

linked db        pathway
                 brite
                 module
                 genome
                 glycan
                 reaction
                 enzyme
                 network
                 drug
                 pubchem
                 chebi



In [4]:
url_kegg_compounds_list = 'https://rest.kegg.jp/list/compound'
response = requests.request("GET", url_kegg_compounds_list)
text = response.text
print(text)

cpd:C00001	H2O; Water
cpd:C00002	ATP; Adenosine 5'-triphosphate
cpd:C00003	NAD+; NAD; Nicotinamide adenine dinucleotide; DPN; Diphosphopyridine nucleotide; Nadide; beta-NAD+
cpd:C00004	NADH; DPNH; Reduced nicotinamide adenine dinucleotide
cpd:C00005	NADPH; TPNH; Reduced nicotinamide adenine dinucleotide phosphate
cpd:C00006	NADP+; NADP; Nicotinamide adenine dinucleotide phosphate; beta-Nicotinamide adenine dinucleotide phosphate; TPN; Triphosphopyridine nucleotide; beta-NADP+
cpd:C00007	Oxygen; O2
cpd:C00008	ADP; Adenosine 5'-diphosphate
cpd:C00009	Orthophosphate; Phosphate; Phosphoric acid; Orthophosphoric acid
cpd:C00010	CoA; Coenzyme A; CoA-SH
cpd:C00011	CO2; Carbon dioxide
cpd:C00012	Peptide
cpd:C00013	Diphosphate; Diphosphoric acid; Pyrophosphate; Pyrophosphoric acid; PPi
cpd:C00014	Ammonia; NH3
cpd:C00015	UDP; Uridine 5'-diphosphate
cpd:C00016	FAD; Flavin adenine dinucleotide
cpd:C00017	Protein
cpd:C00018	Pyridoxal phosphate; Pyridoxal 5'-phosphate; Pyridoxal 5-phosphate; PLP
cpd

In [5]:
print(len(text.split('\n')))

19055


In [6]:
# Collect the KEGG identifier (the field before the tab) of every listing line.
# Rows without a tab -- the empty string left after the final newline -- are skipped.
identifiers = [
    kegg_id
    for kegg_id, _compound_names in (
        row.split('\t') for row in text.split('\n') if '\t' in row
    )
]

In [7]:
# 19054 identifiers from 19055 split lines is expected: the text ends with a newline, so the last split element is an empty string
len(identifiers)

19054

In [8]:
identifiers

['cpd:C00001',
 'cpd:C00002',
 'cpd:C00003',
 'cpd:C00004',
 'cpd:C00005',
 'cpd:C00006',
 'cpd:C00007',
 'cpd:C00008',
 'cpd:C00009',
 'cpd:C00010',
 'cpd:C00011',
 'cpd:C00012',
 'cpd:C00013',
 'cpd:C00014',
 'cpd:C00015',
 'cpd:C00016',
 'cpd:C00017',
 'cpd:C00018',
 'cpd:C00019',
 'cpd:C00020',
 'cpd:C00021',
 'cpd:C00022',
 'cpd:C00023',
 'cpd:C00024',
 'cpd:C00025',
 'cpd:C00026',
 'cpd:C00027',
 'cpd:C00028',
 'cpd:C00029',
 'cpd:C00030',
 'cpd:C00031',
 'cpd:C00032',
 'cpd:C00033',
 'cpd:C00034',
 'cpd:C00035',
 'cpd:C00036',
 'cpd:C00037',
 'cpd:C00038',
 'cpd:C00039',
 'cpd:C00040',
 'cpd:C00041',
 'cpd:C00042',
 'cpd:C00043',
 'cpd:C00044',
 'cpd:C00045',
 'cpd:C00046',
 'cpd:C00047',
 'cpd:C00048',
 'cpd:C00049',
 'cpd:C00050',
 'cpd:C00051',
 'cpd:C00052',
 'cpd:C00053',
 'cpd:C00054',
 'cpd:C00055',
 'cpd:C00058',
 'cpd:C00059',
 'cpd:C00060',
 'cpd:C00061',
 'cpd:C00062',
 'cpd:C00063',
 'cpd:C00064',
 'cpd:C00065',
 'cpd:C00066',
 'cpd:C00067',
 'cpd:C00068',
 'cpd:C000

## Extract Metabolite Information

In [9]:
def get_metabolite_from_kegg_via_id(metabolite_id: str, timeout: float = 30.0) -> str:
    """Download the KEGG flat-file record for a single entry.

    Parameters
    ----------
    metabolite_id : str
        Identifier passed to the KEGG REST ``get`` operation, e.g. ``'cpd:C00001'``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30). Without a
        timeout a stalled connection would block the download loop forever.

    Returns
    -------
    str
        Body of the HTTP response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, instead of handing an
        error page to the parser as if it were a record.
    requests.Timeout
        If the server does not answer within ``timeout`` seconds.
    """
    # Same scheme and host as the info/list requests made earlier in the notebook.
    url_kegg_get = f'https://rest.kegg.jp/get/{metabolite_id}'
    response = requests.get(url_kegg_get, timeout=timeout)
    response.raise_for_status()
    return response.text

In [66]:
example_output = get_metabolite_from_kegg_via_id(identifiers[121])
print(example_output)

ENTRY       C00125                      Compound
NAME        Ferricytochrome c;
            Cytochrome c3+
FORMULA     C42H44FeN8O8S2R4
COMMENT     Heme type: Heme C [CPD:C15817]
REACTION    R00017 R00029 R00081 R00196 R00197 R00294 R00364 R00528 
            R00640 R00783 R00865 R01106 R01685 R01892 R02560 R02804 
            R03136 R04927 R05062 R05198 R05285 R05318 R05712 R06981 
            R07174 R07679 R09499 R09787 R09799 R09938 R10103 R10151 
            R10164 R10165 R10913 R10914 R11343 R11971 R12144 R12163 
            R12164 R12631 R13010
MODULE      M00151  Cytochrome bc1 complex respiratory unit
            M00152  Cytochrome bc1 complex
ENZYME      1.1.2.2         1.1.2.3         1.1.2.4         1.1.2.6         
            1.1.2.8         1.1.2.9         1.1.2.11        1.1.2.-         
            1.3.2.3         1.4.2.1         1.4.2.3         1.4.2.-         
            1.7.2.1         1.7.2.2         1.7.2.3         1.7.2.4         
            1.7.2.5         1.7.

### All possible keywords for compounds
* ENTRY
* NAME
* FORMULA
* EXACT_MASS
* MOL_WEIGHT
* (gif image)
* SEQUENCE
* REMARK
* COMMENT
* REACTION
* PATHWAY
* MODULE
* ENZYME
* BRITE
* REFERENCE
* DBLINKS
* ATOM
* BOND
* BRACKET

In [11]:
# Regular expressions used to pull the individual fields out of a KEGG compound
# flat file. Every pattern anchors on a 12-character prefix: either the field
# keyword padded with spaces, or 12 spaces for a continuation line.
all_keywords = ["ENTRY", "NAME", "FORMULA", "EXACT_MASS", "MOL_WEIGHT", "SEQUENCE", "REMARK", "COMMENT", "REACTION", "PATHWAY", "MODULE", "ENZYME", "BRITE", "REFERENCE", "DBLINKS", "ATOM", "BOND", "BRACKET"]

# Compound accession; only matches when the ENTRY line is directly followed by NAME.
entry_re = re.compile(r'(?<=ENTRY       )C[0-9]{5}(?=                      Compound\nNAME)')

# One match per name; the ';' separating names is excluded from the match.
name_re = re.compile(r'(?<=NAME        ).+(?=;\n)|'
                     r'(?<=NAME        ).+(?=\n)|'
                     r'(?<=;\n            ).+(?=;\n)|'
                     r'(?<=;\n            ).+(?=\nFORMULA)')

formula_re = re.compile(r'(?<=FORMULA     )[A-Za-z0-9\)\(]+(?=\n)')
exact_mass_re = re.compile(r'(?<=EXACT_MASS  )[0-9]+\.[0-9]+(?=\n)')
molecular_weight_re = re.compile(r'(?<=MOL_WEIGHT  )[0-9]+\.[0-9]+(?=\n)')

# Reaction ids (R followed by five digits), one match per id.
reaction_re = re.compile(r'(?<=REACTION    )R[0-9]{5}(?= )|'
                         r'(?<= )R[0-9]{5}(?= R)|'
                         r'(?<= )R[0-9]{5}(?= \n            R)|'
                         r'(?<= )R[0-9]{5}(?=\nPATHWAY)')

# Each match is '<map id>  <pathway name>' (id and name separated by two spaces).
pathway_re = re.compile(r'(?<=PATHWAY     )map[0-9]{5}  .+(?=\n)|'
                        r'(?<=            )map[0-9]{5}  .+(?=\n)')

# Each match is '<module id>  <module name>' (id and name separated by two spaces).
module_re = re.compile(r'(?<=MODULE      )M[0-9]{5}  .+(?=\n)|'
                       r'(?<=            )M[0-9]{5}  .+(?=\n)')

# Fully specified EC numbers only; partial classes such as '1.1.2.-' do not match.
enzymes_re = re.compile(r'(?<=ENZYME      )[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?= )|'
                        r'(?<= )[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?=\n)|'
                        r'(?<= )[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+(?= )')

# NOTE: the lookaheads below expect DBLINKS to follow the BRITE section; a record
# in which REFERENCE follows BRITE instead is not handled by these patterns.
brite_re = re.compile(r'(?<=BRITE       ).+\[BR:br[0-9]{5}\][\s\S]+?(?=\n            .+ \[BR:br[0-9]{5}\]\n)|'
                      r'(?<=BRITE       ).+\[BR:br[0-9]{5}\][\S\s]+?(?=\nDBLINKS)|'
                      r'(?<=            ).+\[BR:br[0-9]{5}\][\S\s]+?(?=\n            .+\[BR:br[0-9]{5}\]\n)|'
                      r'(?<=            ).+\[BR:br[0-9]{5}\][\S\s]+?(?=\nDBLINKS)')

# Captures (key, value) from 'key: value'. The 12-space alternative also matches
# continuation lines of other sections that happen to contain ': ', so callers
# have to filter the keys they are interested in.
dblink_re = re.compile(r'(?<=DBLINKS     |            )(.+): (.+)(?=\n)')


def _first_match_or_nan(pattern, text, convert=None):
    """Return the first match of ``pattern`` in ``text`` (optionally converted), or NaN."""
    found = pattern.findall(text)
    if not found:
        return float('NaN')
    return convert(found[0]) if convert is not None else found[0]


def _all_matches_or_nan(pattern, text):
    """Return the list of all matches of ``pattern`` in ``text``, or NaN if there are none."""
    found = pattern.findall(text)
    return found if found else float('NaN')


def parse_metabolite_text_kegg(metabolite_text):
    """Parse one KEGG compound flat-file record into a dictionary.

    Parameters
    ----------
    metabolite_text : str
        Text of a single compound record as returned by the KEGG ``get`` operation.

    Returns
    -------
    dict
        Keys: ``entry_id`` (str), ``names`` (list of str, possibly empty),
        ``formula`` (str), ``extact_mass`` and ``molecular_weight`` (float),
        ``reactions``, ``pathways``, ``modules``, ``enzymes``, ``brite``
        (lists of str) and ``db_links`` (list of ``(key, value)`` tuples).
        Every field except ``entry_id`` and ``names`` is ``float('NaN')`` when
        the record does not contain it. The key ``extact_mass`` keeps its
        original spelling because the resulting DataFrame column uses it.

    Raises
    ------
    ValueError
        If no compound ENTRY line directly followed by a NAME line is found
        (for example an empty or non-compound response).

    Side effects
    ------------
    Prints the entry id, and prints and adds every previously unseen DBLINKS key
    to the module-level set ``dblink_keys``, which must exist before the first
    record containing such keys is parsed.
    """
    # The previous validation,
    #     ("NAME" or "FORMULA" or "PATHWAY" or "DBLINKS") in not_matching_words
    # only ever tested "NAME" (``or`` returns its first truthy operand) and could
    # never fire, because ``entry_re`` already requires a NAME line. A malformed
    # record therefore surfaced as a bare IndexError; report it explicitly.
    entry_match = entry_re.search(metabolite_text)
    if entry_match is None:
        raise ValueError(
            "Text is not a KEGG compound record (no ENTRY line followed by NAME): "
            f"{metabolite_text[:200]!r}"
        )
    entry_id = entry_match.group(0)
    print(entry_id)

    names = name_re.findall(metabolite_text)

    formula = _first_match_or_nan(formula_re, metabolite_text)
    exact_mass = _first_match_or_nan(exact_mass_re, metabolite_text, float)
    molecular_weight = _first_match_or_nan(molecular_weight_re, metabolite_text, float)

    reactions = _all_matches_or_nan(reaction_re, metabolite_text)
    pathways = _all_matches_or_nan(pathway_re, metabolite_text)
    modules = _all_matches_or_nan(module_re, metabolite_text)
    enzymes = _all_matches_or_nan(enzymes_re, metabolite_text)
    brite = _all_matches_or_nan(brite_re, metabolite_text)

    dblinks = dblink_re.findall(metabolite_text)
    if dblinks:
        # Record every key seen so far, so the set of linked databases can be inspected later.
        for dblink in dblinks:
            if dblink[0] not in dblink_keys:
                print(dblink[0])
                dblink_keys.add(dblink[0])
    else:
        dblinks = float('NaN')

    return {"entry_id": entry_id, "names": names, "formula": formula, "extact_mass": exact_mass, "molecular_weight": molecular_weight, "reactions": reactions, "pathways": pathways, "modules": modules, "enzymes": enzymes, "brite": brite, "db_links": dblinks}

In [12]:
# Save all possible external databases that could be linked
dblink_keys = set()

In [13]:
kegg_entries = []

In [14]:
# Download and parse every compound record (one HTTP request per identifier).
# The list is re-created here so that re-running this cell does not append a
# second copy of every entry; entries parsed before a failure stay available.
kegg_entries = []
for compound_identifier in identifiers:
    kegg_entries.append(
        parse_metabolite_text_kegg(get_metabolite_from_kegg_via_id(compound_identifier))
    )

C00001
CAS
PubChem
ChEBI
PDB-CCD
3DMET
NIKKAJI
C00002
KNApSAcK
C00003
C00004
C00005
C00006
C00007
C00008
C00009
C00010
C00011
C00012
C00013
C00014
C00015
C00016
C00017
C00018
Drug group
C00019
C00020
C00021
C00022
LIPIDMAPS
C00023
C00024
C00025
C00026
C00027
C00028
C00029
C00030
C00031
C00032
C00033
C00034
C00035
C00036
C00037
C00038
C00039
C00040
C00041
C00042
C00043
C00044
C00045
C00046
C00047
C00048
C00049
C00050
C00051
C00052
C00053
C00054
C00055
C00058
C00059
C00060
C00061
C00062
C00063
C00064
 Genetic, Enzyme, or Protein Disorder
C00065
C00066
C00067
C00068
C00069
C00070
C00071
C00072
C00073
C00074
C00075
C00076
C00077
C00078
C00079
C00080
C00081
C00082
C00083
C00084
C00085
C00086
C00087
C00088
C00089
C00090
C00091
C00092
C00093
C00094
C00095
C00096
C00097
C00098
C00099
C00100
C00101
C00102
C00103
C00104
C00105
C00106
C00107
C00108
C00109
C00110
C00111
C00112
C00113
C00114
C00116
C00117
C00118
C00119
C00120
C00121
C00122
C00123
C00124
C00125
C00126
C00127
C00128
C00129
C00130
C00

In [15]:
kegg_entries[-1]

{'entry_id': 'C22611',
 'names': ['N-Acetyl-D-glutamate', 'N-Acetyl-D-glutamic acid'],
 'formula': 'C7H11NO5',
 'extact_mass': 189.0637,
 'molecular_weight': 189.1659,
 'reactions': ['R13096'],
 'pathways': nan,
 'modules': nan,
 'enzymes': nan,
 'brite': nan,
 'db_links': nan}

In [16]:
len(kegg_entries)

19054

In [17]:
df_entries = pd.DataFrame(kegg_entries)
df_entries

Unnamed: 0,entry_id,names,formula,extact_mass,molecular_weight,reactions,pathways,modules,enzymes,brite,db_links
0,C00001,"[H2O, Water]",H2O,18.0106,18.0153,"[R00001, R00002, R00004, R00005, R00009, R0001...","[map00190 Oxidative phosphorylation, map00195...",[M00416 Cytochrome aa3-600 menaquinol oxidase],"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",[Therapeutic category of drugs in Japan [BR:br...,"[(CAS, 7732-18-5), (PubChem, 3303), (ChEBI, 15..."
1,C00002,"[ATP, Adenosine 5'-triphosphate]",C10H16N5O13P3,506.9957,507.1810,"[R00002, R00076, R00085, R00086, R00087, R0008...","[map00190 Oxidative phosphorylation, map00195...","[M00049 Adenine ribonucleotide biosynthesis, ...","[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",[Compounds with biological roles [BR:br08001]\...,"[(CAS, 56-65-5), (PubChem, 3304), (ChEBI, 1542..."
2,C00003,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...",C21H28N7O14P2,664.1169,664.4330,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map00730...","[M00115 NAD biosynthesis, aspartate => quinol...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",[Compounds with biological roles [BR:br08001]\...,"[(CAS, 53-84-9), (PubChem, 3305), (ChEBI, 1584..."
3,C00004,"[NADH, DPNH, Reduced nicotinamide adenine dinu...",C21H29N7O14P2,665.1248,665.4410,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map01100...","[M00142 NADH:ubiquinone oxidoreductase, mitoc...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",,"[(CAS, 58-68-4), (PubChem, 3306), (ChEBI, 1690..."
4,C00005,"[NADPH, TPNH, Reduced nicotinamide adenine din...",C21H30N7O17P3,745.0911,745.4209,"[R00105, R00106, R00108, R00109, R00111, R0011...","[map00195 Photosynthesis, map00480 Glutathio...",,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",,"[(CAS, 2646-71-1), (PubChem, 3307), (ChEBI, 16..."
...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,[Isourolithin A],C13H8O4,228.0423,228.2002,,,,,,
19050,C22608,"[Avicularin, Quercetin 3-O-alpha-L-arabinofura...",C20H18O11,434.0849,434.3503,,,,,,"[(CAS, 572-30-5)]"
19051,C22609,[Dammarane],C30H54,414.4226,414.7498,,,,,,
19052,C22610,[(-)-3-exo-Hydroxycamphor],C10H16O2,168.1150,168.2328,[R13092],,,"[1.1.1.327, 1.14.15.1]",,


## Clean up Dataframe

In [4]:
df_entries.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19051 entries, 0 to 19050
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   entry_id          19051 non-null  object 
 1   names             19051 non-null  object 
 2   formula           17986 non-null  object 
 3   extact_mass       16688 non-null  float64
 4   molecular_weight  16683 non-null  float64
 5   reactions         7034 non-null   object 
 6   pathways          6369 non-null   object 
 7   modules           1709 non-null   object 
 8   enzymes           7585 non-null   object 
 9   brite             8553 non-null   object 
 10  db_links          18708 non-null  object 
dtypes: float64(2), object(9)
memory usage: 1.6+ MB


### Pathways

In [22]:
# Long-format records (one per compound/pathway pair) filled by the row function below.
compound_pathway_list = []

def extract_compound_pathway_relationships(row):
    """Append one record per pathway of ``row`` to ``compound_pathway_list``.

    Parameters
    ----------
    row : object with ``entry_id`` and ``pathways`` attributes
        ``pathways`` is either a list of ``'<pathway id>  <pathway name>'``
        strings or a float NaN placeholder meaning "no pathways".

    Returns
    -------
    None
        Results are collected in the module-level ``compound_pathway_list``.
    """
    entry_id = row.entry_id
    pathways = row.pathways

    # isinstance also recognises float subclasses such as numpy.float64.
    if isinstance(pathways, float):
        return

    for pathway in pathways:
        # Split on the first double space only, so a name that itself contains
        # two consecutive spaces no longer breaks the two-value unpacking.
        pathway_key, pathway_value = pathway.split('  ', 1)
        compound_pathway_list.append({'compound_id': entry_id, 'pathway_id': pathway_key, 'pathway_name': pathway_value})

df_entries.apply(extract_compound_pathway_relationships, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
19049    None
19050    None
19051    None
19052    None
19053    None
Length: 19054, dtype: object

In [23]:
compound_pathway_list

[{'compound_id': 'C00001',
  'pathway_id': 'map00190',
  'pathway_name': 'Oxidative phosphorylation'},
 {'compound_id': 'C00001',
  'pathway_id': 'map00195',
  'pathway_name': 'Photosynthesis'},
 {'compound_id': 'C00001',
  'pathway_id': 'map00710',
  'pathway_name': 'Carbon fixation in photosynthetic organisms'},
 {'compound_id': 'C00001',
  'pathway_id': 'map01100',
  'pathway_name': 'Metabolic pathways'},
 {'compound_id': 'C00001',
  'pathway_id': 'map01120',
  'pathway_name': 'Microbial metabolism in diverse environments'},
 {'compound_id': 'C00001',
  'pathway_id': 'map04918',
  'pathway_name': 'Thyroid hormone synthesis'},
 {'compound_id': 'C00001',
  'pathway_id': 'map04924',
  'pathway_name': 'Renin secretion'},
 {'compound_id': 'C00001',
  'pathway_id': 'map04962',
  'pathway_name': 'Vasopressin-regulated water reabsorption'},
 {'compound_id': 'C00001',
  'pathway_id': 'map04964',
  'pathway_name': 'Proximal tubule bicarbonate reclamation'},
 {'compound_id': 'C00001',
  'pathw

In [24]:
compound_pathway_df = pd.DataFrame(compound_pathway_list)

In [25]:
compound_pathway_df

Unnamed: 0,compound_id,pathway_id,pathway_name
0,C00001,map00190,Oxidative phosphorylation
1,C00001,map00195,Photosynthesis
2,C00001,map00710,Carbon fixation in photosynthetic organisms
3,C00001,map01100,Metabolic pathways
4,C00001,map01120,Microbial metabolism in diverse environments
...,...,...,...
18551,C22570,map00946,Degradation of flavonoids
18552,C22571,map00946,Degradation of flavonoids
18553,C22572,map00946,Degradation of flavonoids
18554,C22575,map00541,O-Antigen nucleotide sugar biosynthesis


In [26]:
compound_pathway_df.compound_id.nunique()

6369

In [27]:
compound_pathway_df.to_csv(os.path.join(kegg_folder, "kegg_compounds_pathways.csv"), index=False)

### Enzymes

In [29]:
# Collects one {'compound_id', 'enzyme_ec_number'} record per compound/enzyme pair.
compound_enzyme_list = []

def extract_compound_enzyme_relationships(row):
    """Add a record to ``compound_enzyme_list`` for every EC number of ``row``.

    ``row.enzymes`` is either an iterable of EC number strings or a plain float
    (the NaN placeholder), in which case nothing is added. Returns ``None``.
    """
    compound_id = row.entry_id
    ec_numbers = row.enzymes

    # A plain float means the compound has no enzyme annotation.
    if type(ec_numbers) == float:
        return

    compound_enzyme_list.extend(
        {'compound_id': compound_id, 'enzyme_ec_number': ec_number}
        for ec_number in ec_numbers
    )

df_entries.apply(extract_compound_enzyme_relationships, axis=1)

0        None
1        None
2        None
3        None
4        None
         ... 
19049    None
19050    None
19051    None
19052    None
19053    None
Length: 19054, dtype: object

In [30]:
compound_enzyme_df = pd.DataFrame(compound_enzyme_list)
compound_enzyme_df

Unnamed: 0,compound_id,enzyme_ec_number
0,C00001,1.1.1.1
1,C00001,1.1.1.22
2,C00001,1.1.1.23
3,C00001,1.1.1.115
4,C00001,1.1.1.132
...,...,...
32328,C22573,3.5.4.37
32329,C22574,3.5.4.37
32330,C22577,1.3.1.77
32331,C22610,1.1.1.327


In [31]:
compound_enzyme_df.to_csv(os.path.join(kegg_folder, "kegg_compounds_enzymes.csv"), index=False)

### DBLinks

In [32]:
df_entries

Unnamed: 0,entry_id,names,formula,extact_mass,molecular_weight,reactions,pathways,modules,enzymes,brite,db_links
0,C00001,"[H2O, Water]",H2O,18.0106,18.0153,"[R00001, R00002, R00004, R00005, R00009, R0001...","[map00190 Oxidative phosphorylation, map00195...",[M00416 Cytochrome aa3-600 menaquinol oxidase],"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",[Therapeutic category of drugs in Japan [BR:br...,"[(CAS, 7732-18-5), (PubChem, 3303), (ChEBI, 15..."
1,C00002,"[ATP, Adenosine 5'-triphosphate]",C10H16N5O13P3,506.9957,507.1810,"[R00002, R00076, R00085, R00086, R00087, R0008...","[map00190 Oxidative phosphorylation, map00195...","[M00049 Adenine ribonucleotide biosynthesis, ...","[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",[Compounds with biological roles [BR:br08001]\...,"[(CAS, 56-65-5), (PubChem, 3304), (ChEBI, 1542..."
2,C00003,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...",C21H28N7O14P2,664.1169,664.4330,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map00730...","[M00115 NAD biosynthesis, aspartate => quinol...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",[Compounds with biological roles [BR:br08001]\...,"[(CAS, 53-84-9), (PubChem, 3305), (ChEBI, 1584..."
3,C00004,"[NADH, DPNH, Reduced nicotinamide adenine dinu...",C21H29N7O14P2,665.1248,665.4410,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map01100...","[M00142 NADH:ubiquinone oxidoreductase, mitoc...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",,"[(CAS, 58-68-4), (PubChem, 3306), (ChEBI, 1690..."
4,C00005,"[NADPH, TPNH, Reduced nicotinamide adenine din...",C21H30N7O17P3,745.0911,745.4209,"[R00105, R00106, R00108, R00109, R00111, R0011...","[map00195 Photosynthesis, map00480 Glutathio...",,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",,"[(CAS, 2646-71-1), (PubChem, 3307), (ChEBI, 16..."
...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,[Isourolithin A],C13H8O4,228.0423,228.2002,,,,,,
19050,C22608,"[Avicularin, Quercetin 3-O-alpha-L-arabinofura...",C20H18O11,434.0849,434.3503,,,,,,"[(CAS, 572-30-5)]"
19051,C22609,[Dammarane],C30H54,414.4226,414.7498,,,,,,
19052,C22610,[(-)-3-exo-Hydroxycamphor],C10H16O2,168.1150,168.2328,[R13092],,,"[1.1.1.327, 1.14.15.1]",,


In [36]:
dblink_keys

{'  Cell adhesion molecules',
 ' Genetic, Enzyme, or Protein Disorder',
 '(Cyclization',
 '(Disulfide bridge',
 '3DMET',
 '7 D-Ala  8 Leu  9 D-Asp  10 Arg  (Cyclization',
 '7 D-MeAsp  8 Arg  (Cyclization',
 '7 fOHOrn  8 Thr  9 Thr  (Cyclization',
 '8 D-Phe  9 His  10 D-Asp  11 Asn  (Cyclization',
 '8 D-Ser  9 Ile  10 Ile  11 Asp  (Cyclization',
 '8 Gln  9 D-Tyr  10 Ile  (Cyclization',
 '8 Tyr  9 Pro  (Cyclization',
 '9 Asp  10 Gly  11 D-Asn  12 Glu  13 Ile  (Cyclization',
 '9 D-Asn  10 Glu  11 Trp  (Cyclization',
 '9 Leu  (Cyclization',
 '9 Leu  10 Ala  (Cyclization',
 '9 Thr  (Cyclization',
 'ATC code',
 'CAS',
 'ChEBI',
 'Compound group such as',
 'Cys Arg Cys Tyr Gln (Disulfide bridge',
 'Cys Cys Leu Arg Lys (Disulfide bridge',
 'Cys Ser Leu Ser Val Ala Gln Glu (Disulfide bridge',
 'Cys Ser Lys Ser Glu Ile Ser Ser Leu Cys (Disulfide bridge',
 'Cys Thr Lys Arg Ser Leu Ala Arg Phe Cys (Disulfide bridge',
 'Drug group',
 'Gln Asp Val Ile Trp (Disulfide bridge',
 'Gln His-NH2 (Disulfide

In [37]:
dblink_keys_useful = {'PubChem', 'PDB-CCD', 'NIKKAJI', 'LIPIDMAPS', 'KNApSAcK', 'ChEBI', 'CAS', '3DMET', 'Drug group'}
dblink_keys_not_useful = dblink_keys.difference(dblink_keys_useful)
dblink_keys_not_useful

{'  Cell adhesion molecules',
 ' Genetic, Enzyme, or Protein Disorder',
 '(Cyclization',
 '(Disulfide bridge',
 '7 D-Ala  8 Leu  9 D-Asp  10 Arg  (Cyclization',
 '7 D-MeAsp  8 Arg  (Cyclization',
 '7 fOHOrn  8 Thr  9 Thr  (Cyclization',
 '8 D-Phe  9 His  10 D-Asp  11 Asn  (Cyclization',
 '8 D-Ser  9 Ile  10 Ile  11 Asp  (Cyclization',
 '8 Gln  9 D-Tyr  10 Ile  (Cyclization',
 '8 Tyr  9 Pro  (Cyclization',
 '9 Asp  10 Gly  11 D-Asn  12 Glu  13 Ile  (Cyclization',
 '9 D-Asn  10 Glu  11 Trp  (Cyclization',
 '9 Leu  (Cyclization',
 '9 Leu  10 Ala  (Cyclization',
 '9 Thr  (Cyclization',
 'ATC code',
 'Compound group such as',
 'Cys Arg Cys Tyr Gln (Disulfide bridge',
 'Cys Cys Leu Arg Lys (Disulfide bridge',
 'Cys Ser Leu Ser Val Ala Gln Glu (Disulfide bridge',
 'Cys Ser Lys Ser Glu Ile Ser Ser Leu Cys (Disulfide bridge',
 'Cys Thr Lys Arg Ser Leu Ala Arg Phe Cys (Disulfide bridge',
 'Gln Asp Val Ile Trp (Disulfide bridge',
 'Gln His-NH2 (Disulfide bridge',
 'Gln Leu Glu Asn Tyr Cys Asn (Di

In [38]:
df_entries.db_links

0        [(CAS, 7732-18-5), (PubChem, 3303), (ChEBI, 15...
1        [(CAS, 56-65-5), (PubChem, 3304), (ChEBI, 1542...
2        [(CAS, 53-84-9), (PubChem, 3305), (ChEBI, 1584...
3        [(CAS, 58-68-4), (PubChem, 3306), (ChEBI, 1690...
4        [(CAS, 2646-71-1), (PubChem, 3307), (ChEBI, 16...
                               ...                        
19049                                                  NaN
19050                                    [(CAS, 572-30-5)]
19051                                                  NaN
19052                                                  NaN
19053                                                  NaN
Name: db_links, Length: 19054, dtype: object

In [39]:
def extract_dblinks(row):
    """Turn the ``db_links`` pairs of ``row`` into one value per external database.

    Parameters
    ----------
    row : object with a ``db_links`` attribute
        Either an iterable of ``(key, value)`` pairs or a float NaN placeholder.

    Returns
    -------
    pandas.Series
        Always exactly nine entries, in the fixed order PubChem, PDB-CCD,
        NIKKAJI, LIPIDMAPS, KNApSAcK, ChEBI, CAS, 3DMET, Drug group. Databases
        without a link are NaN. If a key occurs more than once, the last value
        wins. Pairs with any other key are ignored.
    """
    # The keys of this dict are the single source of truth for which databases
    # are kept. Filtering against them (rather than a separately maintained
    # global set) guarantees the Series always has these nine entries, which the
    # calling cell relies on when assigning the result to nine columns.
    dblinks_parsed = dict.fromkeys(
        ('PubChem', 'PDB-CCD', 'NIKKAJI', 'LIPIDMAPS', 'KNApSAcK', 'ChEBI', 'CAS', '3DMET', 'Drug group'),
        float('NaN'),
    )

    dblinks = row.db_links

    # isinstance also recognises float subclasses such as numpy.float64.
    if not isinstance(dblinks, float):
        for dblink in dblinks:
            db_key = dblink[0]
            db_value = dblink[1]

            if db_key in dblinks_parsed:
                dblinks_parsed[db_key] = db_value

    return pd.Series(dblinks_parsed)

df_entries_dblink_parsed = df_entries.copy()
df_entries_dblink_parsed[['PubChem_subtance_id', 'PDB-CCD', 'NIKKAJI', 'LIPIDMAPS', 'KNApSAcK', 'ChEBI', 'CAS', '3DMET', 'Drug group']] = df_entries_dblink_parsed.apply(extract_dblinks, axis=1)
df_entries_dblink_parsed = df_entries_dblink_parsed.drop(columns=['db_links'])
df_entries_dblink_parsed

Unnamed: 0,entry_id,names,formula,extact_mass,molecular_weight,reactions,pathways,modules,enzymes,brite,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group
0,C00001,"[H2O, Water]",H2O,18.0106,18.0153,"[R00001, R00002, R00004, R00005, R00009, R0001...","[map00190 Oxidative phosphorylation, map00195...",[M00416 Cytochrome aa3-600 menaquinol oxidase],"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",[Therapeutic category of drugs in Japan [BR:br...,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,
1,C00002,"[ATP, Adenosine 5'-triphosphate]",C10H16N5O13P3,506.9957,507.1810,"[R00002, R00076, R00085, R00086, R00087, R0008...","[map00190 Oxidative phosphorylation, map00195...","[M00049 Adenine ribonucleotide biosynthesis, ...","[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",[Compounds with biological roles [BR:br08001]\...,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,
2,C00003,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...",C21H28N7O14P2,664.1169,664.4330,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map00730...","[M00115 NAD biosynthesis, aspartate => quinol...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",[Compounds with biological roles [BR:br08001]\...,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,
3,C00004,"[NADH, DPNH, Reduced nicotinamide adenine dinu...",C21H29N7O14P2,665.1248,665.4410,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map01100...","[M00142 NADH:ubiquinone oxidoreductase, mitoc...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,
4,C00005,"[NADPH, TPNH, Reduced nicotinamide adenine din...",C21H30N7O17P3,745.0911,745.4209,"[R00105, R00106, R00108, R00109, R00111, R0011...","[map00195 Photosynthesis, map00480 Glutathio...",,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,[Isourolithin A],C13H8O4,228.0423,228.2002,,,,,,,,,,,,,,
19050,C22608,"[Avicularin, Quercetin 3-O-alpha-L-arabinofura...",C20H18O11,434.0849,434.3503,,,,,,,,,,,,572-30-5,,
19051,C22609,[Dammarane],C30H54,414.4226,414.7498,,,,,,,,,,,,,,
19052,C22610,[(-)-3-exo-Hydroxycamphor],C10H16O2,168.1150,168.2328,[R13092],,,"[1.1.1.327, 1.14.15.1]",,,,,,,,,,


### Names

In [40]:
# Collects one {'compound_id', 'name'} record for every name of every compound.
compound_name_list = []

def extract_compound_names(row):
    """Record all names of ``row`` in ``compound_name_list`` and return the first one.

    ``row.names`` is expected to be a sequence of name strings; the first
    element is returned as the compound's primary name.
    """
    compound_id = row.entry_id
    all_names = row.names

    compound_name_list.extend(
        {'compound_id': compound_id, 'name': alias} for alias in all_names
    )

    return all_names[0]

df_entries_dblink_parsed['name'] = df_entries_dblink_parsed.apply(extract_compound_names, axis=1)

In [41]:
df_entries_dblink_parsed

Unnamed: 0,entry_id,names,formula,extact_mass,molecular_weight,reactions,pathways,modules,enzymes,brite,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group,name
0,C00001,"[H2O, Water]",H2O,18.0106,18.0153,"[R00001, R00002, R00004, R00005, R00009, R0001...","[map00190 Oxidative phosphorylation, map00195...",[M00416 Cytochrome aa3-600 menaquinol oxidase],"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",[Therapeutic category of drugs in Japan [BR:br...,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,,H2O
1,C00002,"[ATP, Adenosine 5'-triphosphate]",C10H16N5O13P3,506.9957,507.1810,"[R00002, R00076, R00085, R00086, R00087, R0008...","[map00190 Oxidative phosphorylation, map00195...","[M00049 Adenine ribonucleotide biosynthesis, ...","[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",[Compounds with biological roles [BR:br08001]\...,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,,ATP
2,C00003,"[NAD+, NAD, Nicotinamide adenine dinucleotide,...",C21H28N7O14P2,664.1169,664.4330,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map00730...","[M00115 NAD biosynthesis, aspartate => quinol...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",[Compounds with biological roles [BR:br08001]\...,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,,NAD+
3,C00004,"[NADH, DPNH, Reduced nicotinamide adenine dinu...",C21H29N7O14P2,665.1248,665.4410,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map01100...","[M00142 NADH:ubiquinone oxidoreductase, mitoc...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,,NADH
4,C00005,"[NADPH, TPNH, Reduced nicotinamide adenine din...",C21H30N7O17P3,745.0911,745.4209,"[R00105, R00106, R00108, R00109, R00111, R0011...","[map00195 Photosynthesis, map00480 Glutathio...",,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,,NADPH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,[Isourolithin A],C13H8O4,228.0423,228.2002,,,,,,,,,,,,,,,Isourolithin A
19050,C22608,"[Avicularin, Quercetin 3-O-alpha-L-arabinofura...",C20H18O11,434.0849,434.3503,,,,,,,,,,,,572-30-5,,,Avicularin
19051,C22609,[Dammarane],C30H54,414.4226,414.7498,,,,,,,,,,,,,,,Dammarane
19052,C22610,[(-)-3-exo-Hydroxycamphor],C10H16O2,168.1150,168.2328,[R13092],,,"[1.1.1.327, 1.14.15.1]",,,,,,,,,,,(-)-3-exo-Hydroxycamphor


In [42]:
compound_names_df = pd.DataFrame(compound_name_list)
compound_names_df

Unnamed: 0,compound_id,name
0,C00001,H2O
1,C00001,Water
2,C00002,ATP
3,C00002,Adenosine 5'-triphosphate
4,C00003,NAD+
...,...,...
31991,C22608,Quercetin 3-O-alpha-L-arabinofuranoside
31992,C22609,Dammarane
31993,C22610,(-)-3-exo-Hydroxycamphor
31994,C22611,N-Acetyl-D-glutamate


In [43]:
compound_names_df.to_csv(os.path.join(kegg_folder, "kegg_compounds_names.csv"), index=False)

In [44]:
df_entries_dblink_parsed = df_entries_dblink_parsed.drop(columns=['names'])
df_entries_dblink_parsed

Unnamed: 0,entry_id,formula,extact_mass,molecular_weight,reactions,pathways,modules,enzymes,brite,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group,name
0,C00001,H2O,18.0106,18.0153,"[R00001, R00002, R00004, R00005, R00009, R0001...","[map00190 Oxidative phosphorylation, map00195...",[M00416 Cytochrome aa3-600 menaquinol oxidase],"[1.1.1.1, 1.1.1.22, 1.1.1.23, 1.1.1.115, 1.1.1...",[Therapeutic category of drugs in Japan [BR:br...,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,,H2O
1,C00002,C10H16N5O13P3,506.9957,507.1810,"[R00002, R00076, R00085, R00086, R00087, R0008...","[map00190 Oxidative phosphorylation, map00195...","[M00049 Adenine ribonucleotide biosynthesis, ...","[1.1.98.6, 1.2.1.30, 1.2.1.95, 1.2.1.101, 1.3....",[Compounds with biological roles [BR:br08001]\...,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,,ATP
2,C00003,C21H28N7O14P2,664.1169,664.4330,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map00730...","[M00115 NAD biosynthesis, aspartate => quinol...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",[Compounds with biological roles [BR:br08001]\...,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,,NAD+
3,C00004,C21H29N7O14P2,665.1248,665.4410,"[R00023, R00090, R00091, R00092, R00093, R0009...","[map00190 Oxidative phosphorylation, map01100...","[M00142 NADH:ubiquinone oxidoreductase, mitoc...","[1.1.1.1, 1.1.1.3, 1.1.1.4, 1.1.1.6, 1.1.1.7, ...",,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,,NADH
4,C00005,C21H30N7O17P3,745.0911,745.4209,"[R00105, R00106, R00108, R00109, R00111, R0011...","[map00195 Photosynthesis, map00480 Glutathio...",,"[1.1.1.1, 1.1.1.2, 1.1.1.3, 1.1.1.10, 1.1.1.19...",,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,,NADPH
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,C13H8O4,228.0423,228.2002,,,,,,,,,,,,,,,Isourolithin A
19050,C22608,C20H18O11,434.0849,434.3503,,,,,,,,,,,,572-30-5,,,Avicularin
19051,C22609,C30H54,414.4226,414.7498,,,,,,,,,,,,,,,Dammarane
19052,C22610,C10H16O2,168.1150,168.2328,[R13092],,,"[1.1.1.327, 1.14.15.1]",,,,,,,,,,,(-)-3-exo-Hydroxycamphor


### Extract Reaction Relationship

In [45]:
# Collects one {'compound_id', 'reaction'} record per compound/reaction pair.
compound_reactions_list = []

def extract_compound_reactions(row):
    """Add a record to ``compound_reactions_list`` for every reaction id of ``row``.

    ``row.reactions`` is either an iterable of reaction ids or a plain float
    (the NaN placeholder), in which case nothing is added. Returns ``None``.
    """
    compound_id = row.entry_id
    reaction_ids = row.reactions

    # A plain float means the compound takes part in no listed reaction.
    if type(reaction_ids) == float:
        return

    compound_reactions_list.extend(
        {'compound_id': compound_id, 'reaction': reaction_id}
        for reaction_id in reaction_ids
    )

# Visit every compound row; the callback appends to compound_reactions_list
# as a side effect, so the value returned by apply() is not used.
df_entries_dblink_parsed.apply(func=extract_compound_reactions, axis="columns")

# Turn the collected records into a long-format compound -> reaction table.
compound_reactions_df = pd.DataFrame(data=compound_reactions_list)
compound_reactions_df

Unnamed: 0,compound_id,reaction
0,C00001,R00001
1,C00001,R00002
2,C00001,R00004
3,C00001,R00005
4,C00001,R00009
...,...,...
46536,C22575,R13087
46537,C22576,R13088
46538,C22577,R07351
46539,C22610,R13092


In [46]:
# Write the compound -> reaction table into the KEGG data folder, without the index column.
compound_reactions_df.to_csv(
    path_or_buf=os.path.join(kegg_folder, "kegg_compounds_reactions.csv"),
    index=False,
)

### Extract Module Relationship

In [47]:
# Accumulator filled by extract_compound_modules(); one dict per (compound, module) pair.
compound_modules_list = []

def extract_compound_modules(row):
    """Append one ``{'compound_id', 'module_id', 'module_name'}`` record per module of ``row``.

    Parameters
    ----------
    row : object exposing ``entry_id`` and ``modules`` attributes
        ``modules`` is either an iterable of strings of the form
        ``"<module id><whitespace><module name>"`` or a missing-value marker
        (``None`` / float NaN).

    Returns
    -------
    None
        Records are appended to the module-level ``compound_modules_list``.
    """
    entry_id = row.entry_id
    modules = row.modules

    # Missing values show up as None or as a float NaN (plain or NumPy scalar).
    if modules is None or isinstance(modules, float):
        return

    for module in modules:
        # Split only once, at the first run of whitespace. An unbounded
        # split('  ') fails to unpack when the name contains a double space
        # or when the separator is not exactly two spaces.
        parts = module.strip().split(None, 1)
        if not parts:
            continue  # blank entry: nothing to record
        module_key = parts[0]
        module_value = parts[1] if len(parts) > 1 else ''
        compound_modules_list.append(
            {'compound_id': entry_id, 'module_id': module_key, 'module_name': module_value}
        )

# Visit every compound row; the callback appends to compound_modules_list
# as a side effect, so the value returned by apply() is not used.
df_entries_dblink_parsed.apply(func=extract_compound_modules, axis="columns")

# Turn the collected records into a long-format compound -> module table.
compound_modules_df = pd.DataFrame(data=compound_modules_list)
compound_modules_df

Unnamed: 0,compound_id,module_id,module_name
0,C00001,M00416,Cytochrome aa3-600 menaquinol oxidase
1,C00002,M00049,"Adenine ribonucleotide biosynthesis, IMP => AD..."
2,C00002,M00889,"Puromycin biosynthesis, ATP => puromycin"
3,C00003,M00115,"NAD biosynthesis, aspartate => quinolinate => NAD"
4,C00003,M00142,"NADH:ubiquinone oxidoreductase, mitochondria"
...,...,...,...
2873,C22458,M00950,"Biotin biosynthesis, BioU pathway, pimeloyl-AC..."
2874,C22556,M00966,"Equol biosynthesis, daidzein => equol"
2875,C22557,M00966,"Equol biosynthesis, daidzein => equol"
2876,C22558,M00966,"Equol biosynthesis, daidzein => equol"


In [48]:
# Write the compound -> module table into the KEGG data folder, without the index column.
compound_modules_df.to_csv(
    path_or_buf=os.path.join(kegg_folder, "kegg_compounds_modules.csv"),
    index=False,
)

### Extract Brite Relationship

In [49]:
# Accumulator filled by extract_compound_brite(); one dict per (compound, brite entry) pair.
compound_brite_list = []

def extract_compound_brite(row):
    """Append one ``{'compound_id', 'brite'}`` record per BRITE entry of ``row``.

    Parameters
    ----------
    row : object exposing ``entry_id`` and ``brite`` attributes
        ``brite`` is either an iterable of BRITE hierarchy strings or a
        missing-value marker (``None`` / float NaN).

    Returns
    -------
    None
        Records are appended to the module-level ``compound_brite_list``.
    """
    entry_id = row.entry_id
    brite = row.brite

    # Missing values show up as None or as a float NaN. isinstance() also
    # covers NumPy float scalars, which a `type(x) != float` comparison would
    # let through and then fail on when iterating.
    if brite is None or isinstance(brite, float):
        return

    compound_brite_list.extend(
        {'compound_id': entry_id, 'brite': item} for item in brite
    )

# Visit every compound row; the callback appends to compound_brite_list
# as a side effect, so the value returned by apply() is not used.
df_entries_dblink_parsed.apply(func=extract_compound_brite, axis="columns")

# Turn the collected records into a long-format compound -> BRITE table.
compound_brite_df = pd.DataFrame(data=compound_brite_list)
compound_brite_df

Unnamed: 0,compound_id,brite
0,C00001,Therapeutic category of drugs in Japan [BR:br0...
1,C00001,Risk category of Japanese OTC drugs [BR:br0831...
2,C00001,Drugs listed in the Japanese Pharmacopoeia [BR...
3,C00002,Compounds with biological roles [BR:br08001]\n...
4,C00002,Glycosides [BR:br08021]\n N-glycos...
...,...,...
15157,C22382,Anatomical Therapeutic Chemical (ATC) classifi...
15158,C22446,Lipids [BR:br08002]\n ST Sterol l...
15159,C22503,Lipids [BR:br08002]\n FA Fatty ac...
15160,C22548,Compounds with biological roles [BR:br08001]\n...


In [50]:
# Write the compound -> BRITE table into the KEGG data folder, without the index column.
compound_brite_df.to_csv(
    path_or_buf=os.path.join(kegg_folder, "kegg_compounds_brite.csv"),
    index=False,
)

### Save cleaned compounds dataframe

In [51]:
# Keep only the identifier and property columns. The list-valued relationship
# columns (reactions, modules, brite, ...) were exported to their own CSV
# files above. 'extact_mass' is the column's current, misspelled name.
df_entries_dblink_parsed = df_entries_dblink_parsed[[
    'entry_id',
    'name',
    'formula',
    'extact_mass',
    'molecular_weight',
    'PubChem_subtance_id',
    'PDB-CCD',
    'NIKKAJI',
    'LIPIDMAPS',
    'KNApSAcK',
    'ChEBI',
    'CAS',
    '3DMET',
    'Drug group',
]]
df_entries_dblink_parsed

Unnamed: 0,entry_id,name,formula,extact_mass,molecular_weight,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group
0,C00001,H2O,H2O,18.0106,18.0153,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,
1,C00002,ATP,C10H16N5O13P3,506.9957,507.1810,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,
2,C00003,NAD+,C21H28N7O14P2,664.1169,664.4330,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,
3,C00004,NADH,C21H29N7O14P2,665.1248,665.4410,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,
4,C00005,NADPH,C21H30N7O17P3,745.0911,745.4209,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,Isourolithin A,C13H8O4,228.0423,228.2002,,,,,,,,,
19050,C22608,Avicularin,C20H18O11,434.0849,434.3503,,,,,,,572-30-5,,
19051,C22609,Dammarane,C30H54,414.4226,414.7498,,,,,,,,,
19052,C22610,(-)-3-exo-Hydroxycamphor,C10H16O2,168.1150,168.2328,,,,,,,,,


In [52]:
# Correct the misspelled 'extact_mass' column header.
df_entries_dblink_parsed = df_entries_dblink_parsed.rename(
    columns={'extact_mass': 'exact_mass'},
)

In [53]:
# Display the table to confirm the column is now called 'exact_mass'.
df_entries_dblink_parsed

Unnamed: 0,entry_id,name,formula,exact_mass,molecular_weight,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group
0,C00001,H2O,H2O,18.0106,18.0153,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,
1,C00002,ATP,C10H16N5O13P3,506.9957,507.1810,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,
2,C00003,NAD+,C21H28N7O14P2,664.1169,664.4330,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,
3,C00004,NADH,C21H29N7O14P2,665.1248,665.4410,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,
4,C00005,NADPH,C21H30N7O17P3,745.0911,745.4209,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,Isourolithin A,C13H8O4,228.0423,228.2002,,,,,,,,,
19050,C22608,Avicularin,C20H18O11,434.0849,434.3503,,,,,,,572-30-5,,
19051,C22609,Dammarane,C30H54,414.4226,414.7498,,,,,,,,,
19052,C22610,(-)-3-exo-Hydroxycamphor,C10H16O2,168.1150,168.2328,,,,,,,,,


In [54]:
# Write the cleaned compound table into the KEGG data folder, without the index column.
df_entries_dblink_parsed.to_csv(
    path_or_buf=os.path.join(kegg_folder, "kegg_compounds_processed.csv"),
    index=False,
)

## Get PubChem Entries and SMILES for KEGG compounds

In [2]:
# Get Pubchem substance ID for every compound
# Accumulator filled by get_pubchem_id(); one dict per KEGG -> PubChem mapping.
kegg_pubchem_compound_list = []

def get_pubchem_id(kegg_id):
    """Ask the KEGG ``conv`` endpoint for the PubChem ID(s) mapped to ``kegg_id``.

    Every ``<kegg id>\\t<pubchem id>`` line in the response is printed and
    appended to the module-level ``kegg_pubchem_compound_list`` as
    ``{'kegg_id': ..., 'pubchem_id': ...}`` with the ``cpd:`` / ``pubchem:``
    prefixes removed. If the response contains no such line, ``kegg_id`` and
    the raw response text are printed instead.

    Parameters
    ----------
    kegg_id : str
        KEGG compound identifier without the ``cpd:`` prefix (it is inserted
        into the request URL after ``cpd:``).

    Returns
    -------
    None
    """
    url_kegg_get_pubchem = f'https://rest.kegg.jp/conv/pubchem/cpd:{kegg_id}'
    response = requests.request("GET", url_kegg_get_pubchem)
    response = response.text

    found_mapping = False
    # Parse line by line so that an empty body or several mapping lines do not
    # break a single two-value unpack.
    for line in response.splitlines():
        fields = line.split('\t')
        if len(fields) != 2:
            continue
        kegg_query_id, pubchem_id = (field.strip() for field in fields)

        # Remove the database prefixes explicitly. str.lstrip('cpd:') strips
        # any leading run of the characters c, p, d and ':', not the prefix.
        if kegg_query_id.startswith('cpd:'):
            kegg_query_id = kegg_query_id[len('cpd:'):]
        if pubchem_id.startswith('pubchem:'):
            pubchem_id = pubchem_id[len('pubchem:'):]

        print(f"{kegg_query_id}: {pubchem_id}")
        kegg_pubchem_compound_list.append({'kegg_id': kegg_query_id, 'pubchem_id': pubchem_id})
        found_mapping = True

    if not found_mapping:
        # Nothing parseable came back: show which ID failed and what was returned.
        print(kegg_id)
        print(response)

In [55]:
# Get compound ID for every substance ID
def get_cid_from_sid_pubchem(sid):
    """Resolve a PubChem substance ID (SID) to a PubChem compound ID (CID).

    Parameters
    ----------
    sid : str or number, or a missing value
        Substance ID inserted verbatim into the PUG REST URL.

    Returns
    -------
    str or float
        The first CID listed for the substance, as a string; float NaN when
        ``sid`` is missing or the response carries no CID.
    """
    # Rows without a substance ID would otherwise cost one request each for
    # ".../substance/sid/nan", which cannot yield a CID.
    if pd.isna(sid):
        return float('NaN')

    input_specification = f"substance/sid/{sid}"
    operation_specification = "cids"
    output_specification = "JSON"
    operation_options = ""
    url_pubchem_get_cid = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/{input_specification}/{operation_specification}/{output_specification}{operation_options}'
    response = requests.request("GET", url_pubchem_get_cid)
    response = response.json()

    # Walk the payload defensively: a fault response has no 'InformationList',
    # and an Information entry may come without a 'CID' list.
    information = response.get('InformationList', {}).get('Information', [])
    cids = information[0].get('CID', []) if information else []
    if cids:
        return str(cids[0])
    return float('NaN')

In [56]:
# With the help of the pubchem compound ID we can now extract further information such as the SMILES and InChI
def get_info_pubchem_cid(row):
    """Fetch title, SMILES, InChI and InChIKey from PubChem for one compound row.

    Parameters
    ----------
    row : object exposing a ``pubchem_compound_id`` attribute
        The CID to look up, or a missing value.

    Returns
    -------
    pandas.Series
        Always four elements, in the order ``[title, smiles, inchi, inchi_key]``.
        Properties absent from the response are float NaN. All four are NaN
        when the CID is missing or PubChem returns no ``PropertyTable``; in
        the latter case the response is printed.
    """
    missing = float("NaN")

    if pd.isna(row.pubchem_compound_id):
        return pd.Series([missing] * 4)

    input_specification = f"compound/cid/{row.pubchem_compound_id}"
    operation_specification = "property/CanonicalSMILES,InChI,InChIKey,Title"
    output_specification = "JSON"
    operation_options = ""
    url_pubchem_get_smiles = f'https://pubchem.ncbi.nlm.nih.gov/rest/pug/{input_specification}/{operation_specification}/{output_specification}{operation_options}'
    response = requests.request("GET", url_pubchem_get_smiles)
    response = response.json()

    if 'PropertyTable' not in response:
        # Show the unexpected payload, but still return a four-element row.
        # Returning None here would give DataFrame.apply(axis=1) rows of mixed
        # shape and break assignment of the result to four columns.
        print(response)
        return pd.Series([missing] * 4)

    properties = response['PropertyTable']['Properties'][0]

    # NOTE(review): newer PUG REST responses may label the requested
    # CanonicalSMILES property 'ConnectivitySMILES' - confirm against the
    # current PubChem documentation. Accepting both keys is harmless.
    smiles = properties.get('CanonicalSMILES', properties.get('ConnectivitySMILES', missing))

    return pd.Series([
        properties.get('Title', missing),
        smiles,
        properties.get('InChI', missing),
        properties.get('InChIKey', missing),
    ])

In [57]:
# Display the compound table before the PubChem columns are added.
df_entries_dblink_parsed

Unnamed: 0,entry_id,name,formula,exact_mass,molecular_weight,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group
0,C00001,H2O,H2O,18.0106,18.0153,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,
1,C00002,ATP,C10H16N5O13P3,506.9957,507.1810,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,
2,C00003,NAD+,C21H28N7O14P2,664.1169,664.4330,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,
3,C00004,NADH,C21H29N7O14P2,665.1248,665.4410,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,
4,C00005,NADPH,C21H30N7O17P3,745.0911,745.4209,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,Isourolithin A,C13H8O4,228.0423,228.2002,,,,,,,,,
19050,C22608,Avicularin,C20H18O11,434.0849,434.3503,,,,,,,572-30-5,,
19051,C22609,Dammarane,C30H54,414.4226,414.7498,,,,,,,,,
19052,C22610,(-)-3-exo-Hydroxycamphor,C10H16O2,168.1150,168.2328,,,,,,,,,


In [58]:
# Resolve each PubChem substance ID to a compound ID, one lookup call per row.
df_entries_dblink_parsed['pubchem_compound_id'] = (
    df_entries_dblink_parsed['PubChem_subtance_id'].apply(get_cid_from_sid_pubchem)
)

In [59]:
# Display the table with the new 'pubchem_compound_id' column.
df_entries_dblink_parsed

Unnamed: 0,entry_id,name,formula,exact_mass,molecular_weight,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group,pubchem_compound_id
0,C00001,H2O,H2O,18.0106,18.0153,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,,962
1,C00002,ATP,C10H16N5O13P3,506.9957,507.1810,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,,5957
2,C00003,NAD+,C21H28N7O14P2,664.1169,664.4330,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,,5893
3,C00004,NADH,C21H29N7O14P2,665.1248,665.4410,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,,439153
4,C00005,NADPH,C21H30N7O17P3,745.0911,745.4209,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,,5884
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,Isourolithin A,C13H8O4,228.0423,228.2002,,,,,,,,,,
19050,C22608,Avicularin,C20H18O11,434.0849,434.3503,,,,,,,572-30-5,,,
19051,C22609,Dammarane,C30H54,414.4226,414.7498,,,,,,,,,,
19052,C22610,(-)-3-exo-Hydroxycamphor,C10H16O2,168.1150,168.2328,,,,,,,,,,


In [60]:
# Look up title, SMILES, InChI and InChIKey per row and spread the returned
# four-element Series across four new columns.
df_entries_dblink_parsed[['pubchem_name', 'smiles', 'inchi', 'inchi_key']] = (
    df_entries_dblink_parsed.apply(func=get_info_pubchem_cid, axis="columns")
)

In [61]:
# Display the table after the PubChem property columns have been assigned.
df_entries_dblink_parsed

Unnamed: 0,entry_id,name,formula,exact_mass,molecular_weight,PubChem_subtance_id,PDB-CCD,NIKKAJI,LIPIDMAPS,KNApSAcK,ChEBI,CAS,3DMET,Drug group,pubchem_compound_id,smiles,inchi,inchi_key
0,C00001,Water,H2O,18.0106,18.0153,3303,HOH O,J43.587B,,,15377,7732-18-5,B01124,,962,O,InChI=1S/H2O/h1H2,XLYOFNOQVPJJNP-UHFFFAOYSA-N
1,C00002,Adenosine-5'-triphosphate,C10H16N5O13P3,506.9957,507.1810,3304,ATP,J10.680A,,C00001491,15422,56-65-5,B01125,,5957,C1=NC(=C2C(=N1)N(C=N2)C3C(C(C(O3)COP(=O)(O)OP(...,InChI=1S/C10H16N5O13P3/c11-8-5-9(13-2-12-8)15(...,ZKHQWZAMYRWXGA-KQYNXXCUSA-N
2,C00003,Nicotinamide-Adenine-Dinucleotide,C21H28N7O14P2,664.1169,664.4330,3305,NAD NAJ,J136.554A,,C00007256,15846,53-84-9,B01126,,5893,C1=CC(=C[N+](=C1)C2C(C(C(O2)COP(=O)(O)OP(=O)(O...,InChI=1S/C21H27N7O14P2/c22-17-12-19(25-7-24-17...,BAWFJGJZGIEFAR-NNYOXOHSSA-O
3,C00004,"1,4-Dihydronicotinamide adenine dinucleotide",C21H29N7O14P2,665.1248,665.4410,3306,NAI,J213.546I,,C00019343,16908,58-68-4,B01127,,439153,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,InChI=1S/C21H29N7O14P2/c22-17-12-19(25-7-24-17...,BOPGDPNILDQYTO-NNYOXOHSSA-N
4,C00005,NADPH,C21H30N7O17P3,745.0911,745.4209,3307,NDP,J208.978E,,,16474,2646-71-1,B01128,,5884,C1C=CN(C=C1C(=O)N)C2C(C(C(O2)COP(=O)(O)OP(=O)(...,InChI=1S/C21H30N7O17P3/c22-17-12-19(25-7-24-17...,ACFIXJIJDZMPPO-NNYOXOHSSA-N
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19049,C22607,,C13H8O4,228.0423,228.2002,,,,,,,,,,,,,
19050,C22608,,C20H18O11,434.0849,434.3503,,,,,,,572-30-5,,,,,,
19051,C22609,,C30H54,414.4226,414.7498,,,,,,,,,,,,,
19052,C22610,,C10H16O2,168.1150,168.2328,,,,,,,,,,,,,


In [62]:
# Write the PubChem-enriched table, without the index column. The target is
# the same "kegg_compounds_processed.csv" used by the earlier save of the
# cleaned table, so this version replaces that file.
df_entries_dblink_parsed.to_csv(
    path_or_buf=os.path.join(kegg_folder, "kegg_compounds_processed.csv"),
    index=False,
)