# Enrichment in GWAS, TWAS, and DE 

In [1]:
import functools
import numpy as np
import pandas as pd
import collections as cx
from pybiomart import Dataset
from gtfparse import read_gtf
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

# GO analysis
from goatools.base import download_go_basic_obo
from goatools.base import download_ncbi_associations
from goatools.obo_parser import GODag
from goatools.anno.genetogo_reader import Gene2GoReader
from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS

## Functions

In [2]:
@functools.lru_cache()
def get_gtf_genes_df():
    """Return the gene rows of the GENCODE v25 annotation.

    The GTF is parsed with ``read_gtf`` and reduced to the records whose
    ``feature`` is ``"gene"``, keeping only the ``gene_id`` and ``gene_name``
    columns. The result is memoized, so the file is parsed at most once per
    session; callers share the same object and should not modify it in place.
    """
    annotation = read_gtf("/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf")
    is_gene = annotation["feature"] == "gene"
    gene_rows = annotation[is_gene]
    return gene_rows[['gene_id', 'gene_name']]


@functools.lru_cache()
def get_wgcna_modules():
    """Load the WGCNA module assignments from ``../../_m/modules.csv``.

    The first CSV column becomes the index. The table is read once and the
    same DataFrame is handed back on every later call, so callers that need
    to modify it must work on a copy.
    """
    assignments = pd.read_csv("../../_m/modules.csv", index_col=0)
    return assignments


@functools.lru_cache()
def get_database():
    """Return the Ensembl -> gene symbol -> Entrez ID mapping from BioMart.

    Queries the ``hsapiens_gene_ensembl`` dataset for the Ensembl gene ID,
    the external gene name and the Entrez gene ID, then discards rows that
    have no Entrez ID. The result is memoized, so BioMart is queried at most
    once per session.
    """
    wanted = ["ensembl_gene_id",
              "external_gene_name",
              "entrezgene_id"]
    ensembl = Dataset(name="hsapiens_gene_ensembl",
                      host="http://www.ensembl.org",
                      use_cache=True)
    mapping = ensembl.query(attributes=wanted, use_attr_names=True)
    # Only genes with an Entrez ID are kept.
    return mapping.dropna(subset=['entrezgene_id'])

In [3]:
def fet(a, b, u):
    """Fisher's exact test for the overlap of two gene sets within a universe.

    Parameters
    ----------
    a, b : set
        The two sets being compared. Members that are not in ``u`` are
        ignored.
    u : set
        The universe (background) every count is taken from.

    Returns
    -------
    The result of ``scipy.stats.fisher_exact`` on the 2x2 table
    ``[[a and b, b only], [a only, neither]]``; it unpacks as
    ``(odds_ratio, p_value)``.
    """
    in_a = u & a
    in_b = u & b
    out_a = u - a
    out_b = u - b
    table = [
        [len(in_a & in_b), len(out_a & in_b)],
        [len(in_a & out_b), len(out_a & out_b)],
    ]
    return fisher_exact(table)


def enrichment_rows():
    """Yield one GWAS/TWAS/DE enrichment record per WGCNA module.

    The universe is every feature in the module table. For each module, in
    order of first appearance, the generator yields

        (module, n_genes, gwas_or, gwas_p, twas_or, twas_p, de_or, de_p)

    where each ``*_or`` / ``*_p`` pair is the result of ``fet`` for the
    module's members against the notebook-level sets ``gwas_genes``,
    ``twas_genes`` and ``de_genes``.
    """
    # Fetch the table once and iterate the module labels directly, rather
    # than re-calling the loader and indexing by position on every pass.
    modules = get_wgcna_modules()
    universe = set(modules.index)
    for module_id in modules.module.unique():
        members = set(modules[modules.module == module_id].index)
        yield (module_id,
               len(members),
               *fet(members, gwas_genes, universe),
               *fet(members, twas_genes, universe),
               *fet(members, de_genes, universe),
               )
        

def enrichment_rows_nomhc():
    """Yield one enrichment record per WGCNA module, excluding MHC genes.

    This is the same test as ``enrichment_rows``, except that the members of
    the notebook-level ``mhc_genes`` set are removed from the universe, from
    each module and from each trait gene set before testing. For each
    module, in order of first appearance, the generator yields

        (module, n_genes, gwas_or, gwas_p, twas_or, twas_p, de_or, de_p)

    with ``n_genes`` counted after the MHC genes have been removed.
    """
    modules = get_wgcna_modules()
    universe = set(modules.index) - mhc_genes
    # The MHC-filtered trait sets do not depend on the module, so build them
    # once instead of recomputing three set differences per module.
    trait_sets = [genes - mhc_genes
                  for genes in (gwas_genes, twas_genes, de_genes)]
    for module_id in modules.module.unique():
        members = set(modules[modules.module == module_id].index) - mhc_genes
        row = [module_id, len(members)]
        for trait in trait_sets:
            # fet() unpacks to (odds_ratio, p_value).
            row.extend(fet(members, trait, universe))
        yield tuple(row)
        

def convert2entrez(mod):
    """Return the members of module ``mod`` annotated with Entrez IDs.

    The module's rows are copied, so the cached module table is left
    untouched. An ``ensemblID`` column is added that holds the index value
    with everything from the first ``.`` onward removed, and the rows are
    joined to the ``get_database()`` table on ``ensembl_gene_id``. Rows
    without a match in that table do not appear in the result.
    """
    modules = get_wgcna_modules()
    members = modules[modules.module == mod].copy()
    # Drop the version suffix so the IDs can match unversioned Ensembl IDs.
    members["ensemblID"] = members.index.str.replace("\\..*", "", regex=True)
    return pd.merge(members, get_database(),
                    left_on='ensemblID', right_on='ensembl_gene_id')


@functools.lru_cache()
def obo_annotation(alpha=0.05):
    """Build the GO enrichment object for human genes (memoized).

    Parsing the ontology and the gene2go associations is expensive, and the
    result does not depend on which module is being tested. The object is
    therefore cached per distinct call argument, like the other loaders in
    this notebook, and shared by every caller. The annotation summary is
    printed only when the object is actually built.

    Parameters
    ----------
    alpha : float
        Significance cut-off passed to ``GOEnrichmentStudyNS``.

    Returns
    -------
    GOEnrichmentStudyNS
        Enrichment object using the ``fdr_bh`` correction, with
        ``propagate_counts`` disabled.
    """
    # database annotation: local file names of the ontology and gene2go files
    fn_obo = download_go_basic_obo()
    fn_gene2go = download_ncbi_associations() # must be gunzip to work
    obodag = GODag(fn_obo) # parses the OBO file obtained above
    anno_hs = Gene2GoReader(fn_gene2go, taxids=[9606])
    # get associations, keyed by namespace (BP / MF / CC)
    ns2assoc = anno_hs.get_ns2assc()
    for nspc, id2gos in ns2assoc.items():
        print("{NS} {N:,} annotated human genes".format(NS=nspc, N=len(id2gos)))
    # NOTE(review): the background population is every Ensembl gene with an
    # Entrez ID, not only the genes present in the WGCNA table -- confirm
    # this is the intended universe.
    goeaobj = GOEnrichmentStudyNS(
        get_database()['entrezgene_id'], # List of human genes with entrez IDs
        ns2assoc, # geneid/GO associations
        obodag, # Ontologies
        propagate_counts = False,
        alpha = alpha, # default significance cut-off
        methods = ['fdr_bh'])
    return goeaobj


def run_goea(mod):
    """Run GO enrichment for one WGCNA module and write the results.

    The module's genes are mapped to Entrez IDs with ``convert2entrez`` and
    tested with the object returned by ``obo_annotation()``. Results with
    ``p_fdr_bh < 0.05`` are kept; their counts per namespace are printed,
    and they are written to ``GO_analysis_module_<mod>.xlsx`` and
    ``GO_analysis_module_<mod>.txt`` (paths relative to the working
    directory). Nothing is returned.
    """
    module_genes = convert2entrez(mod)
    # Entrez ID -> gene symbol for the study set.
    geneids_study = dict(zip(module_genes['entrezgene_id'],
                             module_genes['external_gene_name']))
    goeaobj = obo_annotation()
    all_results = goeaobj.run_study(geneids_study)
    significant = [res for res in all_results if res.p_fdr_bh < 0.05]
    per_namespace = cx.Counter(res.NS for res in significant)
    print('Significant results[{TOTAL}] = {BP} BP + {MF} MF + {CC} CC'.format(
        TOTAL=len(significant),
        BP=per_namespace['BP'],  # biological_process
        MF=per_namespace['MF'],  # molecular_function
        CC=per_namespace['CC'])) # cellular_component
    goeaobj.wr_xlsx("GO_analysis_module_%s.xlsx" % mod, significant)
    goeaobj.wr_txt("GO_analysis_module_%s.txt" % mod, significant)

## Gene annotation

In [4]:
# Gene-level annotation (gene_id / gene_name); preview the first two rows.
gtf = get_gtf_genes_df()
gtf.head(2)

INFO:root:Extracted GTF attributes: ['gene_id', 'gene_type', 'gene_status', 'gene_name', 'level', 'havana_gene', 'transcript_id', 'transcript_type', 'transcript_status', 'transcript_name', 'transcript_support_level', 'tag', 'havana_transcript', 'exon_number', 'exon_id', 'ont', 'protein_id', 'ccdsid']


Unnamed: 0,gene_id,gene_name
0,ENSG00000223972.5,DDX11L1
12,ENSG00000227232.5,WASH7P


## GWAS, TWAS and DE enrichment

### Load DE, TWAS, and GWAS genes

In [5]:
# DE gene set: only the first column of the table is read and used as the
# index, so the set holds that column's values.
# NOTE(review): the file name suggests SZ-vs-control genes at FDR < 0.05 --
# confirm against the differential expression analysis.
de_genes = set(pd.read_csv('../../../differential_expression/_m/genes/diffExpr_szVctl_FDR05.txt',
                           sep='\t', usecols=[0], index_col=0).index)
len(de_genes)

2701

In [6]:
# GWAS gene set: unique values of the `gene_id` column.
gwas_genes = set(pd.read_csv('/ceph/projects/v3_phase3_paper/inputs/gwas/PGC2_CLOZUK/table_s3/hg38/genes/_m/gwas_genes.csv')['gene_id'])
len(gwas_genes)

2000

In [7]:
# MHC-region gene set (`gene_id` column); enrichment_rows_nomhc() subtracts
# it from the universe, the modules and the trait sets.
mhc_genes = set(pd.read_csv('/ceph/projects/v4_phase3_paper/inputs/counts/mhc_region_genes/_m/mhc_genes.csv')['gene_id'])
len(mhc_genes)

383

In [8]:
# Gene annotation used to map TWAS features back to `gene_id` values.
annot = pd.read_csv("/ceph/projects/v4_phase3_paper/inputs/counts/text_files_counts/_m/caudate/gene.bed", 
                    sep='\t', index_col=0)
# "Feature" is gene_id with everything from the first "." onward removed,
# which is the key the TWAS table is joined on below.
annot["Feature"] = annot.gene_id.str.replace("\\..*", "", regex=True)
twas = pd.read_csv("../../../twas/feature_comparison/manuscript_supp_data/_m/"+\
                   "BrainSeq_Phase3_Caudate_TWAS_associations_allFeatures.txt.gz", sep='\t')
# Keep gene-level associations with FDR < 0.05; the merge (inner join by
# default) drops any feature that has no match in the annotation.
twas = twas[(twas["FDR"] < 0.05) & (twas["Type"] == "Gene")].merge(annot, on="Feature")
twas_genes = set(twas['gene_id'])
len(twas_genes)

INFO:numexpr.utils:Note: NumExpr detected 60 cores but "NUMEXPR_MAX_THREADS" not set, so enforcing safe limit of 8.
INFO:numexpr.utils:NumExpr defaulting to 8 threads.


684

### Load WGCNA modules

In [9]:
# Attach gene symbols to the module table. The left join keeps every module
# row, so features that are absent from the GTF end up without a gene_name.
wgcna_df = get_wgcna_modules().merge(gtf, left_index=True, right_on="gene_id", how="left")
wgcna_df.head(2)

Unnamed: 0,module,gene_id,gene_name
12.0,grey,ENSG00000227232.5,WASH7P
25.0,yellow,ENSG00000278267.1,MIR6859-1


In [10]:
# Features whose ID starts with "chr" (coordinate-style IDs rather than
# Ensembl gene IDs).
wgcna_df[(wgcna_df.gene_id.str.startswith("chr"))]

Unnamed: 0,module,gene_id,gene_name
,lightgreen,chr11:113412884-113414374(-),
,lightgreen,chr11:113412884-113415420(-),
,lightgreen,chr11:113414462-113415420(-),


In [11]:
# Look up the module that DRD2 was assigned to.
wgcna_df[(wgcna_df.gene_name == 'DRD2')]

Unnamed: 0,module,gene_id,gene_name
1445279.0,lightcyan,ENSG00000149295.13,DRD2


In [12]:
# Look up the module that SETD1A was assigned to.
wgcna_df[(wgcna_df.gene_name == 'SETD1A')]

Unnamed: 0,module,gene_id,gene_name
1914770.0,yellow,ENSG00000099381.16,SETD1A


### Enrichment

In [13]:
# Per-module enrichment over all genes: one row per module from
# enrichment_rows(), plus a Benjamini-Hochberg FDR across modules per trait.
record_cols = ['module_id', 'n_genes', 'gwas_or', 'gwas_p',
               'twas_or', 'twas_p', 'de_or', 'de_p']
edf1 = pd.DataFrame.from_records(enrichment_rows(),
                                 columns=record_cols,
                                 index='module_id')
# Order matters only for the column order of edf1 itself.
for p_col, fdr_col in (('twas_p', 'twas_fdr_bh'),
                       ('gwas_p', 'gwas_fdr_bh'),
                       ('de_p', 'de_fdr_bh')):
    edf1[fdr_col] = multipletests(edf1[p_col], method='fdr_bh')[1]
# Column layout shared by the CSV and the displayed table.
report_cols = ['n_genes', 'gwas_or', 'gwas_p', 'gwas_fdr_bh', 'twas_or', 'twas_p',
               'twas_fdr_bh', 'de_or', 'de_p', 'de_fdr_bh']
edf1[report_cols].to_csv('wgcna_module_enrichment.csv')
edf1[report_cols]

Unnamed: 0_level_0,n_genes,gwas_or,gwas_p,gwas_fdr_bh,twas_or,twas_p,twas_fdr_bh,de_or,de_p,de_fdr_bh
module_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
grey,3940,0.892699,0.214026,0.523174,1.082137,0.43989,0.739413,0.67633,2.572857e-11,8.086121e-11
yellow,1587,1.127312,0.308776,0.617552,1.111805,0.444625,0.739413,0.5676,1.189145e-09,2.906798e-09
turquoise,3334,0.864652,0.142167,0.390959,0.877289,0.270678,0.739413,0.303677,3.095585e-57,3.405144e-56
cyan,375,1.172909,0.522217,0.691577,0.983961,1.0,1.0,1.867858,7.274883e-06,1.150205e-05
green,1568,1.245199,0.063396,0.232452,0.889608,0.537755,0.739413,0.379828,4.231908e-21,1.862039e-20
brown,2253,0.767852,0.025971,0.142843,0.8641,0.327368,0.739413,2.935529,7.804697e-79,1.717033e-77
black,1022,1.108971,0.480575,0.691577,1.127104,0.509117,0.739413,2.301877,1.466545e-23,8.065996e-23
midnightblue,370,1.463223,0.093245,0.293057,1.189579,0.535008,0.739413,1.491874,0.007027371,0.008136956
lightyellow,226,0.908099,1.0,1.0,0.584393,0.426782,0.739413,1.826881,0.0008007519,0.0009786968
tan,411,1.003817,0.9034,0.97369,1.415578,0.184201,0.707588,1.956593,3.82487e-07,7.649741e-07


### No MHC region

In [14]:
# Per-module enrichment with MHC-region genes removed: one row per module
# from enrichment_rows_nomhc(), plus a Benjamini-Hochberg FDR across modules
# per trait.
record_cols = ['module_id', 'n_genes', 'gwas_or', 'gwas_p',
               'twas_or', 'twas_p', 'de_or', 'de_p']
edf2 = pd.DataFrame.from_records(enrichment_rows_nomhc(),
                                 columns=record_cols,
                                 index='module_id')
# Order matters only for the column order of edf2 itself.
for p_col, fdr_col in (('twas_p', 'twas_fdr_bh'),
                       ('gwas_p', 'gwas_fdr_bh'),
                       ('de_p', 'de_fdr_bh')):
    edf2[fdr_col] = multipletests(edf2[p_col], method='fdr_bh')[1]
# Column layout shared by the CSV and the displayed table.
report_cols = ['n_genes', 'gwas_or', 'gwas_p', 'gwas_fdr_bh', 'twas_or', 'twas_p',
               'twas_fdr_bh', 'de_or', 'de_p', 'de_fdr_bh']
edf2[report_cols].to_csv('wgcna_module_enrichment_excluding_mhc_region.csv')
edf2[report_cols]

Unnamed: 0_level_0,n_genes,gwas_or,gwas_p,gwas_fdr_bh,twas_or,twas_p,twas_fdr_bh,de_or,de_p,de_fdr_bh
module_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
grey,3909,0.850262,0.102202,0.281057,1.03299,0.746956,0.887359,0.673036,1.78829e-11,5.62034e-11
yellow,1569,1.025675,0.834775,0.931585,1.103254,0.521159,0.814904,0.558072,4.513232e-10,1.241139e-09
turquoise,3313,0.865699,0.177595,0.390709,0.834726,0.15014,0.471869,0.305742,2.263035e-56,2.4893380000000002e-55
cyan,373,1.256521,0.331638,0.561233,0.976909,1.0,1.0,1.880481,6.734897e-06,1.229777e-05
green,1563,1.415003,0.006436,0.070794,0.901696,0.629698,0.814904,0.381027,7.589136999999999e-21,3.33922e-20
brown,2245,0.829071,0.155947,0.381205,0.916562,0.586088,0.814904,2.945581,6.003082e-79,1.3206779999999999e-77
black,1014,1.109538,0.49498,0.725971,1.124947,0.491598,0.814904,2.31284,1.036007e-23,5.698037e-23
midnightblue,367,1.522798,0.090576,0.281057,1.309193,0.331022,0.809166,1.506621,0.005327744,0.006168966
lightyellow,226,1.087747,0.722394,0.882926,0.636869,0.535715,0.814904,1.82675,0.0008010333,0.0009790407
tan,408,1.000158,1.0,1.0,1.557566,0.089448,0.471869,1.944618,5.322901e-07,1.06458e-06


## GO enrichment for each WGCNA module

In [15]:
# Run the GO enrichment for every module label in the module table; each
# call writes GO_analysis_module_<module>.xlsx and .txt.
for mod in get_wgcna_modules().module.unique():
    run_goea(mod)

requests.get(http://purl.obolibrary.org/obo/go/go-basic.obo, stream=True)
  WROTE: go-basic.obo

FTP RETR ftp.ncbi.nlm.nih.gov gene/DATA gene2go.gz -> gene2go.gz
  gunzip gene2go.gz
go-basic.obo: fmt(1.2) rel(2021-09-01) 47,191 GO Terms
HMS:0:00:05.711431 330,404 annotations, 20,688 genes, 18,642 GOs, 1 taxids READ: gene2go 
CC 19,433 annotated human genes
BP 18,501 annotated human genes
MF 18,194 annotated human genes

Load BP Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 70% 20,231 of 29,107 population items found in association

Load CC Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 74% 21,438 of 29,107 population items found in association

Load MF Gene Ontology Analysis ...
fisher module not installed.  Falling back on scipy.stats.fisher_exact
 70% 20,357 of 29,107 population items found in association

Run BP Gene Ontology Analysis: current study set of 2789 IDs ... 76%  2,113