In [6]:
%reset -f

# Implementing KEGG API

1. REST: https://www.kegg.jp/kegg/rest/
2. API: https://www.kegg.jp/kegg/rest/keggapi.html
3. DB entry: https://www.kegg.jp/kegg/docs/dbentry.html
4. Weblinks: https://www.kegg.jp/kegg/docs/weblink.html


## Libraries Installed

1. conda install -c conda-forge notebook
2. conda install -c anaconda urllib3 
3. conda install pandas


### Info:

    KEGG:
    kegg             Kyoto Encyclopedia of Genes and Genomes
    kegg             Release 96.0+/12-14, Dec 20
                     Kanehisa Laboratories
                     pathway     758,081 entries
                     brite       264,435 entries
                     module          512 entries
                     orthology    23,916 entries
                     genome        7,256 entries
                     genes     33,082,417 entries
                     compound     18,749 entries
                     glycan       11,042 entries
                     reaction     11,467 entries
                     rclass        3,168 entries
                     enzyme        7,787 entries
                     network       1,442 entries
                     variant         441 entries
                     disease       2,456 entries
                     drug         11,370 entries
                     dgroup        2,298 entries
                     environ         864 entries
    -----------------------------------------------------------------------------

    pathway          KEGG Pathway Database
    path             Release 96.0+/12-15, Dec 20
                     Kanehisa Laboratories
                     758,081 entries

    linked db        module
                     ko
                     genome
                     <org>
                     compound
                     glycan
                     reaction
                     rclass
                     enzyme
                     network
                     disease
                     drug
                     pubmed
                     
    -----------------------------------------------------------------------------                     
                     
    compound         KEGG Compound Database
    cpd              Release 96.0+/12-15, Dec 20
                     Kanehisa Laboratories
                     18,749 entries

    linked db        pathway
                     brite
                     module
                     genome
                     glycan
                     reaction
                     enzyme
                     network
                     disease
                     drug
                     environ
                     pubchem
                     chebi
                     
    ----------------------------------------------------------------------------- 

    reaction         KEGG Reaction Database
    rn               Release 96.0+/12-15, Dec 20
                     Kanehisa Laboratories
                     11,467 entries

    linked db        pathway
                     brite
                     module
                     ko
                     compound
                     glycan
                     rclass
                     enzyme


In [7]:
import urllib3
import io
import pandas as pd

In [8]:
def parseResponse(cols, *args):
    keggUrl = 'http://rest.kegg.jp/'
    
    for arg in args:
        keggUrl += arg+'/'

    df = pd.DataFrame(columns=cols)
    http = urllib3.PoolManager()
    pathwayResponse = http.request('GET', keggUrl, preload_content=False)
    pathwayResponse.auto_close=False

    for line in io.TextIOWrapper(pathwayResponse):
        df = df.append(pd.Series(line.strip('\n').split('\t'),index = df.columns), ignore_index=True)
    return df


## LIST

In [9]:
# Pathway
cols = ['pathwayId','pathwayDesc']
pathwayDF = parseResponse(cols, 'list','pathway')
print('Pathways Count: '+str(len(pathwayDF)))
pathwayDF.to_csv('data/pathway.tsv', sep='\t', index=False)
pathwayDF.head()

Pathways Count: 542


Unnamed: 0,pathwayId,pathwayDesc
0,path:map00010,Glycolysis / Gluconeogenesis
1,path:map00020,Citrate cycle (TCA cycle)
2,path:map00030,Pentose phosphate pathway
3,path:map00040,Pentose and glucuronate interconversions
4,path:map00051,Fructose and mannose metabolism


In [10]:
# Compound
cols = ['compoundId','compoundDesc']
compoundDF = parseResponse(cols, 'list','cpd')
print('Compounds Count: '+str(len(compoundDF)))
compoundDF.to_csv('data/compound.tsv', sep='\t', index=False)
compoundDF.head()

Compounds Count: 18756


Unnamed: 0,compoundId,compoundDesc
0,cpd:C00001,H2O; Water
1,cpd:C00002,ATP; Adenosine 5'-triphosphate
2,cpd:C00003,NAD+; NAD; Nicotinamide adenine dinucleotide; ...
3,cpd:C00004,NADH; DPNH; Reduced nicotinamide adenine dinuc...
4,cpd:C00005,NADPH; TPNH; Reduced nicotinamide adenine dinu...


In [11]:
# Reaction
cols = ['reactionId','reactionDesc']
reactionDF = parseResponse(cols, 'list','rn')
print('Reactions Count:' + str(len(reactionDF)))
reactionDF.to_csv('data/reaction.tsv', sep='\t', index=False)
reactionDF.head()

Reactions Count:11474


Unnamed: 0,reactionId,reactionDesc
0,rn:R00001,polyphosphate polyphosphohydrolase; Polyphosph...
1,rn:R00002,Reduced ferredoxin:dinitrogen oxidoreductase (...
2,rn:R00004,diphosphate phosphohydrolase; pyrophosphate ph...
3,rn:R00005,urea-1-carboxylate amidohydrolase; Urea-1-carb...
4,rn:R00006,pyruvate:pyruvate acetaldehydetransferase (dec...


## LINK

In [12]:
# Reaction -> Pathway

cols = ['reactionId', 'pathwayId']
reactionPathwayLinkDF = parseResponse(cols, 'link','pathway','rn')
print('Reaction2Pathway Links:' + str(len(reactionPathwayLinkDF)))
reactionPathwayLinkDF.to_csv('data/reactionPathwayLink.tsv', sep='\t', index=False)
reactionPathwayLinkDF.head()

Reaction2Pathway Links:34908


Unnamed: 0,reactionId,pathwayId
0,rn:R00014,path:map00010
1,rn:R00014,path:rn00010
2,rn:R00199,path:map00010
3,rn:R00199,path:rn00010
4,rn:R00200,path:map00010


In [13]:
# Compound <-> Reaction

cols = ['compoundId', 'reactionId']
compoundReactionLinkDF = parseResponse(cols, 'link','rn', 'cpd')
print('Compound2Reaction Links:' + str(len(compoundReactionLinkDF)))
compoundReactionLinkDF.to_csv('data/compoundReactionLink.tsv', sep='\t', index=False)
compoundReactionLinkDF.head()

Compound2Reaction Links:48416


Unnamed: 0,compoundId,reactionId
0,cpd:C00001,rn:R00001
1,cpd:C00001,rn:R00002
2,cpd:C00001,rn:R00004
3,cpd:C00001,rn:R00005
4,cpd:C00001,rn:R00009


In [14]:
# Compound <-> Pathway

cols = ['compoundId', 'pathwayId']
compoundPathwayLinkDF = parseResponse(cols, 'link','pathway', 'cpd')
print('Compound2Pathway Links:' + str(len(compoundPathwayLinkDF)))
compoundPathwayLinkDF.to_csv('data/compoundPathwayLink.tsv', sep='\t', index=False)
compoundPathwayLinkDF.head()

Compound2Pathway Links:17372


Unnamed: 0,compoundId,pathwayId
0,cpd:C00022,path:map00010
1,cpd:C00024,path:map00010
2,cpd:C00031,path:map00010
3,cpd:C00033,path:map00010
4,cpd:C00036,path:map00010


In [15]:
# Reaction <-> Pathway
cols = ['reactionId', 'pathwayId']
reactionPathwayLinkDF = parseResponse(cols, 'link','pathway', 'rn')
print('Reaction2Pathway Links:' + str(len(reactionPathwayLinkDF)))
reactionPathwayLinkDF.to_csv('data/reactionPathwayLink.tsv', sep='\t', index=False)
reactionPathwayLinkDF.head()

Reaction2Pathway Links:34908


Unnamed: 0,reactionId,pathwayId
0,rn:R00014,path:map00010
1,rn:R00014,path:rn00010
2,rn:R00199,path:map00010
3,rn:R00199,path:rn00010
4,rn:R00200,path:map00010


## Reaction <=> Compound  <=> Pathway

In [16]:
rnCpdPath = compoundPathwayLinkDF.merge(compoundReactionLinkDF, on='compoundId', sort=True)
print('rn <-> cpd <-> path # of rows: '+ str(len(rnCpdPath)))
rnCpdPath.head()

rn <-> cpd <-> path # of rows: 436086


Unnamed: 0,compoundId,pathwayId,reactionId
0,cpd:C00001,path:map00190,rn:R00001
1,cpd:C00001,path:map00190,rn:R00002
2,cpd:C00001,path:map00190,rn:R00004
3,cpd:C00001,path:map00190,rn:R00005
4,cpd:C00001,path:map00190,rn:R00009


In [17]:
rnCpdPath[rnCpdPath['compoundId'] == 'cpd:C00031'] #glucose

Unnamed: 0,compoundId,pathwayId,reactionId
253151,cpd:C00031,path:map00010,rn:R00010
253152,cpd:C00031,path:map00010,rn:R00015
253153,cpd:C00031,path:map00010,rn:R00028
253154,cpd:C00031,path:map00010,rn:R00049
253155,cpd:C00031,path:map00010,rn:R00063
...,...,...,...
257021,cpd:C00031,path:map05230,rn:R11791
257022,cpd:C00031,path:map05230,rn:R11942
257023,cpd:C00031,path:map05230,rn:R12208
257024,cpd:C00031,path:map05230,rn:R12209


## GET

This gets back given DB entries. For e.g.:<br/>
    - For Reaction: http://rest.kegg.jp/get/rn:R00001 <br/>
    - For Coumpounds: http://rest.kegg.jp/get/cpd:C00001 <br/>
    - For Pathways: http://rest.kegg.jp/get/path:map00010


### Reaction

In [20]:
def hasDigit(string):
    for s in string:
        if s.isdigit():
            return True
    return False

In [32]:
def makeDataFrame(df, idColName): 
    #count = 0
    listOfDict = []
    keggUrl = 'http://rest.kegg.jp/'

    for row in df.iterrows():
        dictionary = {}
        lastKey = ''
        rn = row[1][idColName]
        newKeggUrl = keggUrl + 'get/' + rn

        http = urllib3.PoolManager()
        pathwayResponse = http.request('GET', newKeggUrl, preload_content=False)
        pathwayResponse.auto_close=False

        #count += 1

        for line in io.TextIOWrapper(pathwayResponse):
            lineElements = line.split('  ')
            listofElements = [x.strip() for x in lineElements if x]

            if listofElements[0].isupper() and not hasDigit(listofElements[0]):
                lastKey = listofElements[0]
                if lastKey == 'ENZYME':
                    dictionary[lastKey] = ' | '.join(listofElements[1:])
                else:
                    dictionary[lastKey] = ' '.join(listofElements[1:])
            else:
                if lastKey == 'COMMENT':
                    dictionary[lastKey] += (' '+' '.join(listofElements))
                else:
                    dictionary[lastKey] += (' | '+'-'.join(listofElements))

            dictionary[lastKey] = dictionary[lastKey].replace(' | ///', '')
        listOfDict.append(dictionary)
        #if count == 10:
            #break;
    return pd.DataFrame(listOfDict)


In [152]:
%%time

keggReactionsDF = makeDataFrame(reactionDF, 'reactionId')

CPU times: user 23.8 s, sys: 5.24 s, total: 29 s
Wall time: 1h 13min 7s


In [153]:
keggReactionsDF.head()

Unnamed: 0,ENTRY,NAME,DEFINITION,EQUATION,ENZYME,DBLINKS,COMMENT,RCLASS,PATHWAY,ORTHOLOGY,MODULE,REMARK,REFERENCE,AUTHORS,TITLE,JOURNAL,BANYULS
0,R00001 Reaction,polyphosphate polyphosphohydrolase,Polyphosphate + n H2O <=> (n+1) Oligophosphate,C00404 + n C00001 <=> (n+1) C02174,3.6.1.10,RHEA: 22455,,,,,,,,,,,
1,R00002 Reaction,Reduced ferredoxin:dinitrogen oxidoreductase (...,16 ATP + 16 H2O + 8 Reduced ferredoxin <=> 8 e...,16 C00002 + 16 C00001 + 8 C00138 <=> 8 C05359 ...,1.18.6.1,,"a part of multi-step reaction (see R05185, R00...",RC00002 C00002_C00008,,,,,,,,,
2,R00004 Reaction,diphosphate phosphohydrolase; | pyrophosphate ...,Diphosphate + H2O <=> 2 Orthophosphate,C00013 + C00001 <=> 2 C00009,3.6.1.1,RHEA: 24579,,,,,,,,,,,
3,R00005 Reaction,urea-1-carboxylate amidohydrolase,Urea-1-carboxylate + H2O <=> 2 CO2 + 2 Ammonia,C01010 + C00001 <=> 2 C00011 + 2 C00014,3.5.1.54,RHEA: 19032,The yeast enzyme (but not that from green alga...,RC02756 C00011_C01010,rn00220 Arginine biosynthesis | rn00791-Atrazi...,K01457 allophanate hydrolase [EC:3.5.1.54] | K...,,,,,,,
4,R00006 Reaction,pyruvate:pyruvate acetaldehydetransferase (dec...,2-Acetolactate + CO2 <=> 2 Pyruvate,C00900 + C00011 <=> 2 C00022,2.2.1.6,,TPP-dependent enzymatic reaction (R00014+R03050),RC00106 C00022_C00900,,K01652 acetolactate synthase I/II/III large su...,,,,,,,


In [154]:
keggReactionsDF.to_csv('data/KEGG-Reactions.tsv', sep='\t', index=False)

### Pathways

In [None]:
keggPathwaysDF = makeDataFrame(pathwayDF, 'pathwayId')
keggPathwaysDF.head()

In [None]:
keggPathwaysDF.to_csv('data/KEGG-Pathways.tsv', sep='\t', index=False)

## CONV

In [16]:
# Compound <-> CHEBI

cols = ['CHEBI', 'compoundId']
compoundChebiConvDF = parseResponse(cols, 'conv', 'chebi', 'cpd')
print('CompoundCHEBI Conversions:' + str(len(compoundChebiConvDF)))
compoundChebiConvDF.to_csv('data/compoundChebiConv.tsv', sep='\t', index=False)
compoundChebiConvDF.head()

CompoundCHEBI Conversions:16993


Unnamed: 0,CHEBI,compoundId
0,cpd:C00462,chebi:16042
1,cpd:C00742,chebi:17051
2,cpd:C08142,chebi:28741
3,cpd:C00698,chebi:17996
4,cpd:C07755,chebi:6636
