## <h1><center>7. KEGG Enrichment Analysis</center></h1>

In [4]:
import os
import re
from collections import OrderedDict

### scipy, numpy, pandas, matplotlib ...
from scipy.stats import hypergeom
import numpy as np
import pandas as pd

### rpy2
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
from rpy2.robjects import pandas2ri
stats = importr('stats')

### Data organization
**1. Store the KEGG pathway information in a dictionary** (pathway label → member genes) --> SUCCESSES
> kegg['hsa01210:2-Oxocarboxylic acid metabolism'] = [CS,ACO2,ACO1]

**2. Store all genes that belong to at least one pathway in a list** --> POPULATION

**3. Store the DE genes that are part of the population in a list** --> SAMPLE

In [5]:
# Build the collections used by the enrichment test:
#   kegg -- pathway label (3rd tab-separated field) -> list of member genes
#   pop  -- every distinct gene that occurs in at least one pathway (background)
#   DEGs -- every distinct DE gene that is also part of the background
kegg = OrderedDict()
pop = []
popSeen = set()  # mirrors `pop`; gives O(1) membership tests
with open('hsa00001.cleaned.keg') as f:
    for line in f:
        lst = line.rstrip().split('\t')
        if len(lst) > 3:
            # The gene field is the second-to-last column, ';'-separated.
            # De-duplicate within the pathway (keeping first-seen order) and
            # drop empty names so that neither the pathway size nor the
            # background size is inflated.
            geneList = [gene for gene in OrderedDict.fromkeys(lst[-2].split(';')) if gene]
            kegg[lst[2]] = geneList
            for gene in geneList:
                if gene not in popSeen:
                    popSeen.add(gene)
                    pop.append(gene)

DEGs = []
degSeen = set()  # mirrors `DEGs`; prevents counting a gene twice
with open('limma_001_2X_RNAseq_II2N.txt') as f:
    for line in f:
        geneId = line.rstrip().split('\t')[0]
        if geneId == '':
            # Blank lines and rows without a gene id in the first column.
            continue
        # Keep a gene only if it is in the background, and only once: the
        # sample size must count distinct genes, exactly like the per-pathway
        # hit counts do.
        if geneId in popSeen and geneId not in degSeen:
            degSeen.add(geneId)
            DEGs.append(geneId)

popTotal = len(pop)
listTotal = len(DEGs)

print('Number of genes in the background list: %d' % popTotal)
print('Number of DE genes involved in any pathways: %d' % listTotal)

Number of genes in the background list: 7234
Number of DE genes involved in any pathways: 870


### Hypergeometric Test

In [6]:
# Over-representation test for every pathway.
# `hypergeom` is imported in the notebook's import cell.
degLookup = set(DEGs)  # set lookup instead of scanning the DEG list per gene

keggEnrich = OrderedDict()
for ke, val in kegg.items():
    hits = [gene for gene in val if gene in degLookup]
    hitCount = len(hits)
    popHits = len(val)

    if hitCount == 0:
        # Pathways without a single DE gene are only listed, not tested.
        print (ke)
    else:
        # hypergeom.sf(k, M, n, N) is P(X > k) with
        #   M = population size, n = successes in the population, N = draws.
        # Passing hitCount - 1 therefore yields P(X >= hitCount).
        pVal = hypergeom.sf(hitCount - 1, popTotal, popHits, listTotal)
        keggEnrich[ke] = [hitCount, listTotal, popHits, popTotal, pVal, ';'.join(hits)]

hsa00660:C5-Branched dibasic acid metabolism
hsa00073:Cutin, suberine and wax biosynthesis
hsa00290:Valine, leucine and isoleucine biosynthesis
hsa00400:Phenylalanine, tyrosine and tryptophan biosynthesis
hsa00430:Taurine and hypotaurine metabolism
hsa00440:Phosphonate and phosphinate metabolism
hsa00471:D-Glutamine and D-glutamate metabolism
hsa00472:D-Arginine and D-ornithine metabolism
hsa00563:Glycosylphosphatidylinositol(GPI)-anchor biosynthesis
hsa00740:Riboflavin metabolism
hsa00780:Biotin metabolism
hsa00785:Lipoic acid metabolism
hsa00130:Ubiquinone and other terpenoid-quinone biosynthesis
hsa01051:Biosynthesis of ansamycins
hsa00523:Polyketide sugar unit biosynthesis
hsa01055:Biosynthesis of vancomycin group antibiotics
hsa00232:Caffeine metabolism
hsa00311:Penicillin and cephalosporin biosynthesis
hsa00524:Neomycin, kanamycin and gentamicin biosynthesis
hsa00525:Acarbose and validamycin biosynthesis
hsa03022:Basal transcription factors
hsa03010:Ribosome
hsa03060:Protein expo

In [1]:
#for ke, val in keggEnrich.items():
#    print (ke)
#    print (val)

### Output organization

In [7]:
# One row per tested pathway, indexed by the pathway label.
# Building the frame row-wise (instead of column-wise followed by a transpose)
# lets pandas infer a dtype per column, so the counts and p-values are numeric
# rather than generic objects, and an empty result still yields a frame that
# has the expected columns.
keggOutput = pd.DataFrame(
    list(keggEnrich.values()),
    index=list(keggEnrich.keys()),
    columns=['Count','List Total','pop Hits','pop Total','pVal','Genes'],
)
# Smallest p-value first.
keggOutput = keggOutput.sort_values(by='pVal', axis=0)

In [9]:
# Preview the first rows; keggOutput was sorted by ascending pVal, so these
# are the pathways with the smallest p-values.
keggOutput.head()

Unnamed: 0,Count,List Total,pop Hits,pop Total,pVal,Genes
hsa04110:Cell cycle,45,870,124,7234,1.63946e-12,CCND2;RBL1;ABL1;E2F1;E2F2;TGFB2;TGFB3;MYC;CDKN...
hsa04510:Focal adhesion,53,870,199,7234,9.30412e-09,COL1A2;COL4A6;COL6A1;COL6A2;COL6A3;LAMA2;LAMA4...
hsa05200:Pathways in cancer,86,870,395,7234,1.28301e-08,CDH1;AXIN2;TCF7L1;BIRC5;MYC;WNT2B;WNT3;WNT5B;W...
hsa04022:cGMP - PKG signaling pathway,45,870,163,7234,3.90382e-08,EDNRA;EDNRB;ADRA2A;PPP3CC;MEF2C;MEF2D;NFATC1;N...
hsa05166:HTLV-I infection,60,870,256,7234,1.59522e-07,TGFB2;TGFB3;VCAM1;CD3E;TLN1;TLN2;MYC;CCND2;PPP...


### FDR correction

In [11]:
# rpy2 bridge to R's `stats` package, which provides p.adjust.
from rpy2.robjects.packages import importr
from rpy2.robjects.vectors import FloatVector
stats = importr('stats')


# Multiple-testing correction of the pathway p-values with R's
# p.adjust(method = 'fdr').
pVal = keggOutput['pVal']
fdr = stats.p_adjust(FloatVector([float(p) for p in pVal]), method = 'fdr')
# Convert by plain iteration: pandas2ri.ri2py does not exist under that name
# in rpy2 3.x, whereas iterating the returned R vector works in both
# generations of rpy2.
fdrPD = np.array(list(fdr), dtype=float)

# DataFrame.insert raises if the column already exists, so drop a column left
# over from a previous run of this cell before inserting.
if 'FDR' in keggOutput.columns:
    del keggOutput['FDR']
# Place the adjusted values directly after the raw p-values.
keggOutput.insert(keggOutput.columns.get_loc('pVal') + 1, 'FDR', fdrPD)

In [12]:
# Preview again; the frame now carries the FDR column inserted next to pVal.
keggOutput.head()

Unnamed: 0,Count,List Total,pop Hits,pop Total,pVal,FDR,Genes
hsa04110:Cell cycle,45,870,124,7234,1.63946e-12,4.82e-10,CCND2;RBL1;ABL1;E2F1;E2F2;TGFB2;TGFB3;MYC;CDKN...
hsa04510:Focal adhesion,53,870,199,7234,9.30412e-09,1.257355e-06,COL1A2;COL4A6;COL6A1;COL6A2;COL6A3;LAMA2;LAMA4...
hsa05200:Pathways in cancer,86,870,395,7234,1.28301e-08,1.257355e-06,CDH1;AXIN2;TCF7L1;BIRC5;MYC;WNT2B;WNT3;WNT5B;W...
hsa04022:cGMP - PKG signaling pathway,45,870,163,7234,3.90382e-08,2.869307e-06,EDNRA;EDNRB;ADRA2A;PPP3CC;MEF2C;MEF2D;NFATC1;N...
hsa05166:HTLV-I infection,60,870,256,7234,1.59522e-07,9.379905e-06,TGFB2;TGFB3;VCAM1;CD3E;TLN1;TLN2;MYC;CCND2;PPP...


In [295]:
# Write the enrichment table to an Excel workbook (sheet 'kegg') in the
# current working directory.
keggOutput.to_excel('kegg_pathway_enrichment.xlsx', sheet_name='kegg')