# GO analysis using GOATOOLS

In [1]:
import functools
import errno, os
import pandas as pd
import collections as cx
from pybiomart import Dataset
# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

In [2]:
os.environ['NUMEXPR_MAX_THREADS'] = '10'

In [3]:
@functools.lru_cache()
def get_database():
    dataset = Dataset(name="hsapiens_gene_ensembl", 
                      host="http://www.ensembl.org",
                      use_cache=True)
    db = dataset.query(attributes=["ensembl_gene_id", 
                                   "external_gene_name", 
                                   "entrezgene_id"], 
                       use_attr_names=True).dropna(subset=['entrezgene_id'])
    return db


@functools.lru_cache()
def get_deg_with_modality():
    fn = '../../_m/degs_with_modality.txt'
    return pd.read_csv(fn, sep='\t', index_col=0)


@functools.lru_cache()
def get_uni_modality():
    df = get_deg_with_modality()
    return df[(df['Modality_AA'] == 1) & (df['Modality_EA'] == 1)]


@functools.lru_cache()
def get_AAonly_modality():
    df = get_deg_with_modality()
    return df[(df['Modality_AA'] >= 2) & (df['Modality_EA'] == 1)]


@functools.lru_cache()
def get_EAonly_modality():
    df = get_deg_with_modality()
    return df[(df['Modality_AA'] == 1) & (df['Modality_EA'] >= 2)]


@functools.lru_cache()
def get_multi_modality():
    df = get_deg_with_modality()
    return df[(df['Modality_AA'] > 1) & (df['Modality_EA'] > 1)]


@functools.lru_cache()
def convert2entrez(deg_func):
    df = deg_func()
    if 'EntrezID' in df.columns:
        return df.rename(columns={'EntrezID': 'entrezgene_id'})
    else: 
        return df.merge(get_database(), left_on='ensemblID', 
                        right_on='ensembl_gene_id')
    

@functools.lru_cache()
def get_upregulated(deg_func):
    df = convert2entrez(deg_func)
    return df.loc[(df['t'] > 0)]
    
    
@functools.lru_cache()
def get_downregulated(deg_func):
    df = convert2entrez(deg_func)
    return df.loc[(df['t'] < 0)]

In [4]:
def mkdir_p(directory):
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
            

def obo_annotation(alpha=0.05):
    # database annotation
    fn_obo = download_go_basic_obo()
    fn_gene2go = download_ncbi_associations() # must be gunzip to work
    obodag = GODag(fn_obo) # downloads most up-to-date
    anno_hs = Gene2GoReader(fn_gene2go, taxids=[9606])
    # get associations
    ns2assoc = anno_hs.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    goeaobj = GOEnrichmentStudyNS(
        get_database()['entrezgene_id'], # List of human genes with entrez IDs
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = alpha, # default significance cut-off
        methods = ['fdr_bh'])
    return goeaobj


def run_goea(deg_func, outdir):
    df = convert2entrez(deg_func)
    geneids_study = {z[0]:z[1] for z in zip(df['entrezgene_id'], df['Symbol'])}
    goeaobj = obo_annotation()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC'])) # cellular_component
    label = "all"
    goeaobj.wr_xlsx("%s/GO_analysis_%s.xlsx" % (outdir, label), goea_results_sig)
    goeaobj.wr_txt("%s/GO_analysis_%s.txt" % (outdir, label), goea_results_sig)
    
    
def run_goea_direction(direction, deg_func, outdir):
    df = get_upregulated(deg_func) if direction == 'Up' else get_downregulated(deg_func)
    geneids_study = {z[0]:z[1] for z in zip(df['entrezgene_id'], df['Symbol'])}
    goeaobj = obo_annotation()
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < 0.05]
    ctr = cx.Counter([r.NS for r in goea_results_sig])
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC'])) # cellular_component
    label = "upregulated" if direction == "Up" else "downregulated"
    goeaobj.wr_xlsx("%s/GO_analysis_%s.xlsx" % (outdir, label), goea_results_sig)
    goeaobj.wr_txt("%s/GO_analysis_%s.txt" % (outdir, label), goea_results_sig)

## Multimodal AA only

### All DEG

In [5]:
directory = 'AAonly'
mkdir_p(directory)
run_goea(get_AAonly_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.077761 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 854 IDs ... 80%    509 of    633 study items found in association
 74%    633 of    854 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
   2,391 GO terms are associated with    509 of    854 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

### Upregulated DEG

In [6]:
direction = "Up"
run_goea_direction(direction, get_AAonly_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.314919 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 483 IDs ... 78%    262 of    337 study items found in association
 70%    337 of    483 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
   1,425 GO terms are associated with    262 of    483 study items
  METHOD fdr_bh:
       1 GO terms found significant (< 0.

### Downregulated DEG

In [7]:
direction = "Down"
run_goea_direction(direction, get_AAonly_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.219546 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 371 IDs ... 83%    247 of    296 study items found in association
 80%    296 of    371 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
   1,464 GO terms are associated with    247 of    371 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

## Multimodal EA only

### All DEG

In [8]:
directory = 'EAonly'
mkdir_p(directory)
run_goea(get_EAonly_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.198410 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 557 IDs ... 79%    314 of    399 study items found in association
 72%    399 of    557 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
   1,635 GO terms are associated with    314 of    557 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

### Upregulated DEG

In [9]:
direction = "Up"
run_goea_direction(direction, get_EAonly_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.807447 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 307 IDs ... 80%    162 of    202 study items found in association
 66%    202 of    307 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
     945 GO terms are associated with    162 of    307 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

### Downregulated DEG

In [10]:
direction = "Down"
run_goea_direction(direction, get_EAonly_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.276081 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 250 IDs ... 77%    152 of    197 study items found in association
 79%    197 of    250 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
     954 GO terms are associated with    152 of    250 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

## Multimodal both ethnicity

### All DEG

In [11]:
directory = 'both_multimodal'
mkdir_p(directory)
run_goea(get_multi_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.512838 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 586 IDs ... 79%    278 of    352 study items found in association
 60%    352 of    586 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
   1,587 GO terms are associated with    278 of    586 study items
  METHOD fdr_bh:
       6 GO terms found significant (< 0.

### Upregulated DEG

In [12]:
direction = "Up"
run_goea_direction(direction, get_multi_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.454834 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 339 IDs ... 74%    148 of    199 study items found in association
 59%    199 of    339 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
     875 GO terms are associated with    148 of    339 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

### Downregulated DEG

In [13]:
direction = "Down"
run_goea_direction(direction, get_multi_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.182071 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 247 IDs ... 85%    130 of    153 study items found in association
 62%    153 of    247 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
     950 GO terms are associated with    130 of    247 study items
  METHOD fdr_bh:
       6 GO terms found significant (< 0.

## Unimodal both ethnicity

### All DEG

In [11]:
directory = 'unimodal'
mkdir_p(directory)
run_goea(get_uni_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.512838 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 586 IDs ... 79%    278 of    352 study items found in association
 60%    352 of    586 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
   1,587 GO terms are associated with    278 of    586 study items
  METHOD fdr_bh:
       6 GO terms found significant (< 0.

### Upregulated DEG

In [12]:
direction = "Up"
run_goea_direction(direction, get_uni_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.454834 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 339 IDs ... 74%    148 of    199 study items found in association
 59%    199 of    339 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
     875 GO terms are associated with    148 of    339 study items
  METHOD fdr_bh:
       0 GO terms found significant (< 0.

### Downregulated DEG

In [13]:
direction = "Down"
run_goea_direction(direction, get_uni_modality, directory)

  EXISTS: go-basic.obo
  EXISTS: gene2go
go-basic.obo: fmt(1.2) rel(2020-09-10) 47,314 GO Terms
HMS:0:00:05.182071 340,574 annotations, 20,692 genes, 18,417 GOs, 1 taxids READ: gene2go 
MF 18,165 annotated human genes
CC 19,419 annotated human genes
BP 18,665 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,474 of 29,114 population items found in association

Load CC Gene Ontology Analysis ...
 73% 21,385 of 29,114 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,291 of 29,114 population items found in association

Run BP Gene Ontology Analysis: current study set of 247 IDs ... 85%    130 of    153 study items found in association
 62%    153 of    247 study items found in population(29114)
Calculating 12,344 uncorrected p-values using fisher
  12,344 GO terms are associated with 18,029 of 29,114 population items
     950 GO terms are associated with    130 of    247 study items
  METHOD fdr_bh:
       6 GO terms found significant (< 0.