# Enrichment in GWAS, TWAS, and DE 

In [28]:
import functools
import numpy as np
import pandas as pd
import collections as cx
from gtfparse import read_gtf
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import multipletests

## Functions

In [39]:
@functools.lru_cache()
def get_gtf_genes_df():
    """Return the gene-level records of the GENCODE v25 annotation GTF.

    The GTF is parsed with ``read_gtf`` and reduced to the rows whose
    ``feature`` equals ``"gene"``, keeping only the ``gene_id``,
    ``gene_name`` and ``seqname`` columns.  The result is memoized, so the
    file is parsed once and every caller receives the same object.
    """
    annotation = read_gtf("/ceph/genome/human/gencode25/gtf.CHR/_m/gencode.v25.annotation.gtf")
    is_gene = annotation["feature"] == "gene"
    wanted_columns = ['gene_id', 'gene_name', 'seqname']
    return annotation[is_gene][wanted_columns]


@functools.lru_cache()
def get_X_linked():
    """Return the gene annotation rows whose ``seqname`` is ``'chrX'``.

    Built from the memoized table returned by ``get_gtf_genes_df`` and
    memoized itself.
    """
    genes = get_gtf_genes_df()
    on_chr_x = genes['seqname'] == 'chrX'
    return genes[on_chr_x]


@functools.lru_cache()
def get_de(tissue):
    """Read the full differential-expression results table for ``tissue``.

    ``tissue`` must be one of ``'caudate'``, ``'dlpfc'`` or
    ``'hippocampus'``; any other value raises ``KeyError``.  The
    tab-separated file is read with pandas and the resulting DataFrame is
    memoized per tissue.
    """
    paths_by_tissue = dict(
        caudate="/ceph/projects/v3_phase3_paper/analysis/differential_expression/_m/genes/diffExpr_szVctl_full.txt",
        dlpfc="/ceph/users/jbenja13/phase3_paper/phase2/extract_de/_m/dlpfc_diffExpr_szVctl_full.txt",
        hippocampus="/ceph/users/jbenja13/phase3_paper/phase2/extract_de/_m/hippo_diffExpr_szVctl_full.txt",
    )
    results_path = paths_by_tissue[tissue]
    return pd.read_csv(results_path, sep='\t')


@functools.lru_cache()
def get_degs(tissue):
    """Return the significant rows (``adj.P.Val < 0.05``) of a DE table.

    For every tissue except ``'caudate'`` the rows are additionally
    restricted to ``type == 'gene'``.  The result is memoized per tissue.
    """
    results = get_de(tissue)
    keep = results['adj.P.Val'] < 0.05
    if tissue != 'caudate':
        # The caudate table is not filtered on 'type'.
        # NOTE(review): presumably that file is already gene-level only
        # (its path contains "genes") - confirm.
        keep = keep & (results['type'] == 'gene')
    return results[keep]
    

In [51]:
def fet(a, b, u):
    """Run Fisher's exact test on the overlap of ``a`` and ``b`` within ``u``.

    Parameters
    ----------
    a, b : iterable of hashable
        The two collections being compared.  Members that are not in ``u``
        are ignored.  Any iterable is accepted (set, list, tuple, pandas
        Series, ...); previously non-set inputs failed on the ``u - a``
        set difference.
    u : iterable of hashable
        The universe every count is taken from.

    Returns
    -------
    The result of ``scipy.stats.fisher_exact`` called with default
    arguments on the 2x2 table below; it unpacks as
    ``(odds_ratio, p_value)``.

        [[in a and in b,  not in a and in b],
         [in a and not b, not in a and not b]]
    """
    universe = set(u)
    # Restrict both collections to the universe first, so the complements
    # can be taken with plain set arithmetic whatever the input types were.
    in_a = universe.intersection(a)
    in_b = universe.intersection(b)
    out_a = universe - in_a
    out_b = universe - in_b
    table = [[len(in_a & in_b), len(out_a & in_b)],
             [len(in_a & out_b), len(out_a & out_b)]]
    return fisher_exact(table)


def enrichment_rows():
    """Yield one enrichment row per tissue for the X-linked DEGs.

    For each of caudate, DLPFC and hippocampus, the X-linked genes that are
    also DEGs in that tissue are tested with ``fet`` against the GWAS,
    caudate-TWAS, DLPFC-TWAS and hippocampus-TWAS gene sets, using every
    annotated gene as the universe.

    Each yielded tuple is ``(tissue, n_genes, OR, P, OR, P, OR, P, OR, P)``
    with the (OR, P) pairs in the order listed above.  The gene sets are
    read from the notebook globals ``gwas_genes``, ``caudate_twas``,
    ``dlpfc_twas`` and ``hippo_twas``.
    """
    universe = set(get_gtf_genes_df().gene_id)
    for tissue in ("caudate", "dlpfc", "hippocampus"):
        x_linked_degs = set(get_X_linked().gene_id).intersection(
            get_degs(tissue).gencodeID
        )
        row = [tissue, len(x_linked_degs)]
        for other in (gwas_genes, caudate_twas, dlpfc_twas, hippo_twas):
            # fet() yields (odds ratio, p-value); flatten both into the row.
            row.extend(fet(x_linked_degs, other, universe))
        yield tuple(row)


def enrichment_rows_nomhc():
    """Yield one enrichment row per tissue with MHC genes removed.

    Same computation as the MHC-inclusive enrichment, except that
    ``mhc_genes`` is subtracted from each of the GWAS, caudate-TWAS,
    DLPFC-TWAS and hippocampus-TWAS gene sets before it is tested with
    ``fet``.  The X-linked DEG set and the universe are not modified.

    Each yielded tuple is ``(tissue, n_genes, OR, P, OR, P, OR, P, OR, P)``
    with the (OR, P) pairs in the order listed above.
    """
    universe = set(get_gtf_genes_df().gene_id)
    for tissue in ("caudate", "dlpfc", "hippocampus"):
        x_linked_degs = set(get_X_linked().gene_id).intersection(
            get_degs(tissue).gencodeID
        )
        row = [tissue, len(x_linked_degs)]
        for other in (gwas_genes, caudate_twas, dlpfc_twas, hippo_twas):
            # fet() yields (odds ratio, p-value); flatten both into the row.
            row.extend(fet(x_linked_degs, other - mhc_genes, universe))
        yield tuple(row)

## GWAS, TWAS and DE enrichment

### Load TWAS and GWAS genes

In [41]:
# Input files, keyed by short name:
#   mhc / gwas  - gene lists (CSV) under the PGC2_CLOZUK GWAS inputs
#   twas_C/H/D  - TWAS result tables (CSV) for caudate, hippocampus, DLPFC
config = dict(
    mhc='/ceph/projects/v3_phase3_paper/inputs/gwas/PGC2_CLOZUK/table_s3/hg38/mhc_region_genes/_m/mhc_genes.csv',
    gwas='/ceph/projects/v3_phase3_paper/inputs/gwas/PGC2_CLOZUK/table_s3/hg38/genes/_m/gwas_genes.csv',
    twas_C='/ceph/users/apua/projects/caudate_twas_reader/genes/_m/twas_significant_genes.csv',
    twas_H='/ceph/users/jbenja13/phase3_paper/phase2/twas/extract_twas/_m/hippo_twas_assocations_fusion.csv',
    twas_D='/ceph/users/jbenja13/phase3_paper/phase2/twas/extract_twas/_m/dlpfc_twas_assocations_fusion.csv',
)

In [42]:
# Gene IDs listed in the MHC-region gene file; subtracted from the GWAS and
# TWAS sets in the "No MHC region" analysis.
mhc_genes = set(pd.read_csv(config["mhc"]).gene_id)
len(mhc_genes)

383

In [43]:
# Gene IDs listed in the GWAS gene file.
gwas_genes = set(pd.read_csv(config["gwas"]).gene_id)
len(gwas_genes)

2000

In [44]:
# DLPFC TWAS: keep gene-level associations for the DLPFC region with
# TWAS.FDR < 0.05, drop the 'FILE' column, and collect the gene IDs.
dlpfc0 = (
    pd.read_csv(config['twas_D'], low_memory=False)
    .loc[lambda df: (df['feature'] == 'gene')
                    & (df['region'] == 'DLPFC')
                    & (df['TWAS.FDR'] < 0.05)]
    .drop(columns='FILE')
    .copy()
)
dlpfc_twas = set(dlpfc0['ID'])
len(dlpfc_twas)

406

In [45]:
# Hippocampus TWAS: keep gene-level associations for the HIPPO region with
# TWAS.FDR < 0.05, drop the 'FILE' column, and collect the gene IDs.
hippo0 = (
    pd.read_csv(config['twas_H'], low_memory=False)
    .loc[lambda df: (df['feature'] == 'gene')
                    & (df['region'] == 'HIPPO')
                    & (df['TWAS.FDR'] < 0.05)]
    .drop(columns='FILE')
    .copy()
)
hippo_twas = set(hippo0['ID'])
len(hippo_twas)

270

In [46]:
# Caudate only for now.
# No feature/region/FDR filter is applied here, unlike the DLPFC and
# hippocampus TWAS tables.
# NOTE(review): presumably the file already holds significant genes only
# (it is named twas_significant_genes.csv) - confirm.
caudate_twas = set(pd.read_csv(config["twas_C"]).gene_id)
len(caudate_twas)

489

### Enrichment

In [50]:
# Enrichment of X-linked DEGs (one row per tissue) in the GWAS and TWAS gene
# sets.  An FDR column (multipletests, method 'fdr_bh') is derived from each
# P column, i.e. the correction runs across the tissue rows separately for
# every gene source.
edf1 = (
    pd.DataFrame.from_records(
        enrichment_rows(),
        columns=[
            'Tissue', 'N_genes',
            'GWAS_OR', 'GWAS_P',
            'Caudate_TWAS_OR', 'Caudate_TWAS_P',
            'DLPFC_TWAS_OR', 'DLPFC_TWAS_P',
            'Hippocampus_TWAS_OR', 'Hippocampus_TWAS_P',
        ],
    )
    .assign(
        GWAS_FDR=lambda df: multipletests(df['GWAS_P'], method='fdr_bh')[1],
        Caudate_TWAS_FDR=lambda df: multipletests(df['Caudate_TWAS_P'], method='fdr_bh')[1],
        DLPFC_TWAS_FDR=lambda df: multipletests(df['DLPFC_TWAS_P'], method='fdr_bh')[1],
        Hippocampus_TWAS_FDR=lambda df: multipletests(df['Hippocampus_TWAS_P'], method='fdr_bh')[1],
    )
    # Put each FDR column right after its OR / P pair.
    .loc[:, [
        'Tissue', 'N_genes',
        'GWAS_OR', 'GWAS_P', 'GWAS_FDR',
        'Caudate_TWAS_OR', 'Caudate_TWAS_P', 'Caudate_TWAS_FDR',
        'DLPFC_TWAS_OR', 'DLPFC_TWAS_P', 'DLPFC_TWAS_FDR',
        'Hippocampus_TWAS_OR', 'Hippocampus_TWAS_P', 'Hippocampus_TWAS_FDR',
    ]]
)
#edf1.to_csv('deg_xlinked_enrichment.csv', index=False)
edf1

Unnamed: 0,Tissue,N_genes,GWAS_OR,GWAS_P,GWAS_FDR,Caudate_TWAS_OR,Caudate_TWAS_P,Caudate_TWAS_FDR,DLPFC_TWAS_OR,DLPFC_TWAS_P,DLPFC_TWAS_FDR,Hippocampus_TWAS_OR,Hippocampus_TWAS_P,Hippocampus_TWAS_FDR
0,caudate,109,0.0,1.0,1.0,,1.0,1.0,,1.0,1.0,,1.0,1.0
1,dlpfc,10,0.0,1.0,1.0,,1.0,1.0,,1.0,1.0,,1.0,1.0
2,hippocampus,0,,1.0,1.0,,1.0,1.0,,1.0,1.0,,1.0,1.0


In [26]:
# Universe: every annotated gene.  Test set: X-linked genes that are also
# DLPFC DEGs.
# NOTE(review): the test set is restricted to DLPFC DEGs although the output
# file name ('xlinked_enrichment.csv') mentions neither DEGs nor a tissue -
# confirm this is the intended gene set.
u = set(get_gtf_genes_df().gene_id)
a = set(get_X_linked().gene_id).intersection(get_degs("dlpfc").gencodeID)
gene_list = dict(
    GWAS=gwas_genes,
    Caudate_TWAS=caudate_twas,
    DLPFC_TWAS=dlpfc_twas,
    Hippocampus_TWAS=hippo_twas,
)
or_lt, pval_lt, glt = [], [], []
# Dict iteration follows insertion order: GWAS, Caudate, DLPFC, Hippocampus.
for genes in gene_list:
    oddratio, pval = fet(a, gene_list[genes], u)
    glt.append(genes)
    or_lt.append(oddratio)
    pval_lt.append(pval)
dt = pd.DataFrame({'Gene Source': glt, 'Odd_Ratio': or_lt, 'PValue': pval_lt})
dt.to_csv('xlinked_enrichment.csv', index=False)
dt

### No MHC region

In [52]:
# Same per-tissue enrichment table, with MHC-region genes removed from the
# GWAS and TWAS gene sets.  An FDR column (multipletests, method 'fdr_bh')
# is derived from each P column, i.e. the correction runs across the tissue
# rows separately for every gene source.
edf2 = (
    pd.DataFrame.from_records(
        enrichment_rows_nomhc(),
        columns=[
            'Tissue', 'N_genes',
            'GWAS_OR', 'GWAS_P',
            'Caudate_TWAS_OR', 'Caudate_TWAS_P',
            'DLPFC_TWAS_OR', 'DLPFC_TWAS_P',
            'Hippocampus_TWAS_OR', 'Hippocampus_TWAS_P',
        ],
    )
    .assign(
        GWAS_FDR=lambda df: multipletests(df['GWAS_P'], method='fdr_bh')[1],
        Caudate_TWAS_FDR=lambda df: multipletests(df['Caudate_TWAS_P'], method='fdr_bh')[1],
        DLPFC_TWAS_FDR=lambda df: multipletests(df['DLPFC_TWAS_P'], method='fdr_bh')[1],
        Hippocampus_TWAS_FDR=lambda df: multipletests(df['Hippocampus_TWAS_P'], method='fdr_bh')[1],
    )
    # Put each FDR column right after its OR / P pair.
    .loc[:, [
        'Tissue', 'N_genes',
        'GWAS_OR', 'GWAS_P', 'GWAS_FDR',
        'Caudate_TWAS_OR', 'Caudate_TWAS_P', 'Caudate_TWAS_FDR',
        'DLPFC_TWAS_OR', 'DLPFC_TWAS_P', 'DLPFC_TWAS_FDR',
        'Hippocampus_TWAS_OR', 'Hippocampus_TWAS_P', 'Hippocampus_TWAS_FDR',
    ]]
)
edf2.to_csv('deg_xlinked_enrichment_nomhc.csv', index=False)
edf2

Unnamed: 0,Tissue,N_genes,GWAS_OR,GWAS_P,GWAS_FDR,Caudate_TWAS_OR,Caudate_TWAS_P,Caudate_TWAS_FDR,DLPFC_TWAS_OR,DLPFC_TWAS_P,DLPFC_TWAS_FDR,Hippocampus_TWAS_OR,Hippocampus_TWAS_P,Hippocampus_TWAS_FDR
0,caudate,109,0.0,0.078917,0.23675,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
1,dlpfc,10,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,1.0
2,hippocampus,0,,1.0,1.0,,1.0,1.0,,1.0,1.0,,1.0,1.0


In [29]:
# Repeat the per-source Fisher tests with MHC-region genes removed from each
# gene set.  Reuses `a`, `u` and `gene_list` from the notebook namespace.
or_lt, pval_lt, glt = [], [], []
for genes in ('GWAS', 'Caudate_TWAS', 'DLPFC_TWAS', "Hippocampus_TWAS"):
    oddratio, pval = fet(a, gene_list[genes] - mhc_genes, u)
    glt.append(genes)
    or_lt.append(oddratio)
    pval_lt.append(pval)
dt2 = pd.DataFrame({'Gene Source': glt, 'Odd_Ratio': or_lt, 'PValue': pval_lt})
dt2.to_csv('xlinked_enrichment_nomhc.csv', index=False)
dt2

Unnamed: 0,Gene Source,Odd_Ratio,PValue
0,GWAS,0.028405,1.5477599999999998e-26
1,Caudate_TWAS,0.0,1.770576e-08
2,DLPFC_TWAS,0.0,8.552157e-08
3,Hippocampus_TWAS,0.0,3.146627e-05


#### Significant depletion of X-linked genes for GWAS and TWAS