# GO module enrichment

In [1]:
import functools
import numpy as np
import pandas as pd
import collections as cx
from pybiomart import Dataset
from gtfparse import read_gtf
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

## Functions

### Cached functions

In [2]:
@functools.lru_cache()
def get_gtf_genes_df():
    """Return the ``gene_id`` / ``gene_name`` columns for gene-level GTF records.

    The GENCODE v25 annotation is parsed with ``read_gtf`` and only rows whose
    ``feature`` equals ``"gene"`` are kept. The result is memoised, so the GTF
    is parsed at most once per kernel and every caller receives the same object.
    """
    annotation = read_gtf("/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf")
    is_gene = annotation["feature"] == "gene"
    gene_records = annotation[is_gene]
    return gene_records[['gene_id', 'gene_name']]


@functools.lru_cache()
def get_wgcna_modules():
    """Load the WGCNA module table from ``modules.csv``.

    The first CSV column is used as the index. The frame is memoised, so
    repeated calls return the same object without re-reading the file.
    """
    modules_csv = "../../_m/modules.csv"
    return pd.read_csv(modules_csv, index_col=0)


@functools.lru_cache()
def get_database():
    """Fetch the Ensembl BioMart ID mapping for human genes.

    Queries the ``hsapiens_gene_ensembl`` dataset for the Ensembl gene ID,
    external gene name and Entrez gene ID, and drops every row that has no
    Entrez ID. The result is memoised, so the query runs once per kernel.
    """
    wanted_attributes = ["ensembl_gene_id",
                         "external_gene_name",
                         "entrezgene_id"]
    mart = Dataset(name="hsapiens_gene_ensembl",
                   host="http://www.ensembl.org",
                   use_cache=True)
    # use_attr_names=True keeps the attribute names as column labels.
    mapping = mart.query(attributes=wanted_attributes, use_attr_names=True)
    return mapping.dropna(subset=['entrezgene_id'])

### Simple functions

In [3]:
def convert2entrez(mod):
    """Return the genes of WGCNA module ``mod`` joined to the BioMart ID table.

    Rows of the module table whose ``module`` equals ``mod`` are selected, an
    ``ensemblID`` column is derived from the index by removing everything from
    the first ``.`` onwards, and the result is inner-merged with
    ``get_database()`` on ``ensemblID`` == ``ensembl_gene_id``.
    """
    modules = get_wgcna_modules()
    members = modules[modules.module == mod]
    # Strip the suffix after the first dot so the IDs match BioMart's IDs.
    members = members.assign(
        ensemblID=members.index.str.replace("\\..*", "", regex=True)
    )
    return members.merge(get_database(),
                         left_on='ensemblID',
                         right_on='ensembl_gene_id')


@functools.lru_cache()
def obo_annotation(alpha=0.05):
    """Build the GO enrichment study object for the human gene population.

    Obtains the GO basic ontology and the NCBI gene2go associations, reads the
    human (taxid 9606) associations, prints the number of annotated genes per
    namespace, and constructs a ``GOEnrichmentStudyNS`` whose population is the
    Entrez IDs returned by ``get_database()``. Multiple-testing correction uses
    ``fdr_bh``.

    The result is memoised per ``alpha``: building the DAG and the associations
    is expensive and does not depend on the module being tested, so callers
    that loop over modules reuse one study object instead of rebuilding it for
    every module. The per-namespace counts are therefore printed only on the
    first call for a given ``alpha``.

    Parameters
    ----------
    alpha : float
        Significance cut-off handed to ``GOEnrichmentStudyNS``.

    Returns
    -------
    GOEnrichmentStudyNS
        Shared (cached) enrichment object; call ``run_study`` on it.
    """
    # database annotation
    fn_obo = download_go_basic_obo()
    fn_gene2go = download_ncbi_associations() # must be gunzip to work
    obodag = GODag(fn_obo) # parses the OBO file obtained above
    anno_hs = Gene2GoReader(fn_gene2go, taxids=[9606])
    # get associations
    ns2assoc = anno_hs.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    goeaobj = GOEnrichmentStudyNS(
        get_database()['entrezgene_id'], # List of human genes with entrez IDs
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = alpha, # default significance cut-off
        methods = ['fdr_bh'])
    return goeaobj


def run_goea(mod, alpha=0.05):
    """Run GO enrichment for one WGCNA module and write the significant terms.

    The module's genes are mapped to Entrez IDs with ``convert2entrez``, tested
    with the study object from ``obo_annotation``, and filtered on the
    Benjamini-Hochberg FDR. A per-namespace summary is printed and the
    significant results are written to ``GO_analysis_module_<mod>.xlsx`` and
    ``GO_analysis_module_<mod>.txt`` in the working directory.

    Parameters
    ----------
    mod : str
        Module label, as found in the ``module`` column of the WGCNA table.
    alpha : float, optional
        Significance threshold. It is used both for the enrichment object and
        for the ``p_fdr_bh`` filter, so the two cannot drift apart.
        Defaults to 0.05, which matches the previous hard-coded value.
    """
    df = convert2entrez(mod)
    # Entrez ID -> gene symbol; the study set is the dict's keys.
    geneids_study = dict(zip(df['entrezgene_id'], df['external_gene_name']))
    goeaobj = obo_annotation(alpha=alpha)
    goea_results_all = goeaobj.run_study(geneids_study)
    goea_results_sig = [r for r in goea_results_all if r.p_fdr_bh < alpha]
    ctr = cx.Counter(r.NS for r in goea_results_sig)
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(goea_results_sig),
        BP=ctr['BP'],  # biological_process
        MF=ctr['MF'],  # molecular_function
        CC=ctr['CC'])) # cellular_component
    goeaobj.wr_xlsx("GO_analysis_module_%s.xlsx" % mod, goea_results_sig)
    goeaobj.wr_txt("GO_analysis_module_%s.txt" % mod, goea_results_sig)

## Gene annotation

In [4]:
# Gene-level annotation table (gene_id, gene_name); preview the first two rows.
gtf = get_gtf_genes_df()
gtf.head(n=2)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_status', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_status', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


Unnamed: 0,gene_id,gene_name
0,ENSG00000223972.5,DDX11L1
12,ENSG00000227232.5,WASH7P


### Load WGCNA modules

In [5]:
# Left-join the module table (indexed by gene ID) to the gene annotation,
# so every gene in the module table is kept.
wgcna_df = pd.merge(get_wgcna_modules(), gtf,
                    how="left", left_index=True, right_on="gene_id")
wgcna_df.head(n=2)

Unnamed: 0,module,gene_id,gene_name
12,greenyellow,ENSG00000227232.5,WASH7P
25,brown,ENSG00000278267.1,MIR6859-1


In [6]:
# Show any rows whose gene_id starts with "chr".
wgcna_df[wgcna_df["gene_id"].str.startswith("chr")]

Unnamed: 0,module,gene_id,gene_name


In [7]:
# Look up which module DRD2 was assigned to.
wgcna_df.loc[wgcna_df["gene_name"] == 'DRD2']

Unnamed: 0,module,gene_id,gene_name
1445279,brown,ENSG00000149295.13,DRD2


In [8]:
# Persist the annotated module table; the DataFrame index is not written.
wgcna_df.to_csv("module_annotated.csv", index=False)

## GO enrichment for each module

In [9]:
# Run the enrichment once per distinct module label.
for mod in get_wgcna_modules()["module"].unique():
    run_goea(mod)

requests.get(http://purl.obolibrary.org/obo/go/go-basic.obo, stream=True)
  WROTE: go-basic.obo

FTP RETR ftp.ncbi.nlm.nih.gov gene/DATA gene2go.gz -> gene2go.gz
  gunzip gene2go.gz
go-basic.obo: fmt(1.2) rel(2021-07-02) 47,229 GO Terms
HMS:0:00:05.345673 330,320 annotations, 20,687 genes, 18,684 GOs, 1 taxids READ: gene2go 
MF 18,191 annotated human genes
CC 19,424 annotated human genes
BP 18,506 annotated human genes

Load BP Gene Ontology Analysis ...
 70% 20,237 of 29,107 population items found in association

Load CC Gene Ontology Analysis ...
 74% 21,430 of 29,107 population items found in association

Load MF Gene Ontology Analysis ...
 70% 20,355 of 29,107 population items found in association

Run BP Gene Ontology Analysis: current study set of 406 IDs ... 90%    366 of    406 study items found in association
100%    406 of    406 study items found in population(29107)
Calculating 12,438 uncorrected p-values using fisher
  12,438 GO terms are associated with 17,849 of 29,107 p