# GO analysis using GOATOOLS

In [1]:
import functools
import pandas as pd
import collections as cx
from pybiomart import Dataset
# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

In [2]:
@functools.lru_cache()
def get_database():
    dataset = Dataset(name="hsapiens_gene_ensembl", 
                      host="http://www.ensembl.org",
                      use_cache=True)
    db = dataset.query(attributes=["ensembl_gene_id", 
                                   "external_gene_name", 
                                   "entrezgene_id"], 
                       use_attr_names=True).dropna(subset=['entrezgene_id'])
    return db


@functools.lru_cache()
def get_deg():
    fn = '../../_m/genes/diffExpr_maleVfemale_FDR05.txt'
    return pd.read_csv(fn, sep='\t')


@functools.lru_cache()
def convert2entrez():
    df = get_deg()
    if 'EntrezID' in df.columns:
        return df.rename(columns={'EntrezID': 'entrezgene_id'})
    else: 
        return df.merge(get_database(), left_on='ensemblID', 
                        right_on='ensembl_gene_id')
    

@functools.lru_cache()
def get_upregulated():
    df = convert2entrez()
    return df.loc[(df['t'] > 0)]
    
    
@functools.lru_cache()
def get_downregulated():
    df = convert2entrez()
    return df.loc[(df['t'] < 0)]


In [3]:
def obo_annotation(alpha=0.05):
    # database annotation
    fn_obo = download_go_basic_obo()
    fn_gene2go = download_ncbi_associations() # must be gunzip to work
    obodag = GODag(fn_obo) # downloads most up-to-date
    anno_hs = Gene2GoReader(fn_gene2go, taxids=[9606])
    # get associations
    ns2assoc = anno_hs.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    goeaobj = GOEnrichmentStudyNS(
        get_database()['entrezgene_id'], # List of human genes with entrez IDs
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = alpha, # default significance cut-off
        methods = ['fdr_bh'])
    return goeaobj


def run_goea(direction):
    df = get_upregulated() if direction == 'Up' else get_downregulated()
    if direction == 'all':
        df = convert2entrez()
    geneids_study = {z[0]:z[1] for z in zip(df['entrezgene_id'], df['Symbol'])}
    goeaobj = obo_annotation()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]

    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC'])) # cellular_component

    label = "upregulated" if direction == "Up" else "downregulated"
    if direction == 'all':
        label = 'all'
    goeaobj.wr_xlsx("GO_analysis_%s.xlsx" % label, goea_results_sig)
    goeaobj.wr_txt("GO_analysis_%s.txt" % label, goea_results_sig)

## Upregulated DEG

In [4]:
direction = "Up"
run_goea(direction)

requests.get(http://purl.obolibrary.org/obo/go/go-basic.obo, stream=True)
  WROTE: go-basic.obo

FTP RETR ftp.ncbi.nlm.nih.gov gene/DATA gene2go.gz -> gene2go.gz
  gunzip gene2go.gz
go-basic.obo: fmt(1.2) rel(2020-08-11) 47,277 GO Terms
HMS:0:00:04.772754 340,575 annotations, 20,693 genes, 18,417 GOs, 1 taxids READ: gene2go 
CC 19,420 annotated human genes
BP 18,665 annotated human genes
MF 18,165 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,386 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 52 IDs ... 74%     23 of     31 study items found in association
 60%     31 of     52 study items found in population(29114)
Calculating 12,358 uncorrected p-values using fisher
  12,358 GO terms are associated with 18,029 of 29,114 po

## Downregulated DEG

In [5]:
direction = "Down"
run_goea(direction)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-08-11) 47,277 GO Terms
HMS:0:00:05.264281 340,575 annotations, 20,693 genes, 18,417 GOs, 1 taxids READ: gene2go 
CC 19,420 annotated human genes
BP 18,665 annotated human genes
MF 18,165 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,386 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 56 IDs ... 78%     29 of     37 study items found in association
 66%     37 of     56 study items found in population(29114)
Calculating 12,358 uncorrected p-values using fisher
  12,358 GO terms are associated with 18,029 of 29,114 population items
     179 GO terms are associated with     29 of     56 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.0

## All DEG

In [6]:
direction = "all"
run_goea(direction)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-08-11) 47,277 GO Terms
HMS:0:00:05.150650 340,575 annotations, 20,693 genes, 18,417 GOs, 1 taxids READ: gene2go 
CC 19,420 annotated human genes
BP 18,665 annotated human genes
MF 18,165 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,386 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 108 IDs ... 76%     52 of     68 study items found in association
 63%     68 of    108 study items found in population(29114)
Calculating 12,358 uncorrected p-values using fisher
  12,358 GO terms are associated with 18,029 of 29,114 population items
     219 GO terms are associated with     52 of    108 study items
  METHOD fdr_bh:
       1 GO terms found significant (< 0.