# XCI status enrichment analysis

In [None]:
import os, errno
import functools
import numpy as np
import pandas as pd
import session_info
from pyhere import here
from functools import lru_cache
from matplotlib_venn import venn3
from scipy.stats import fisher_exact
from matplotlib import pyplot as plt
from statsmodels.stats.multitest import multipletests

## Functions

### Cached functions

In [None]:
@lru_cache()
def get_deg(tissue):
    """Load the male-vs-female differential expression table for a tissue.

    Reads the tab-separated file
    ``differential_expression/<tissue>/_m/genes/diffExpr_maleVfemale_full.txt``
    (resolved with ``here``), using the first column as the index, and adds
    an ``ensemblID`` column derived from ``gencodeID``.

    The result is memoized per ``tissue`` by ``lru_cache``, so repeated calls
    return the same DataFrame object; callers should not modify it in place.
    """
    deg_path = here(f'differential_expression/{tissue}',
                    '_m/genes/diffExpr_maleVfemale_full.txt')
    deg = pd.read_csv(deg_path, sep='\t', index_col=0)
    # Drop everything from the first "." onward in the gencode ID.
    stripped_ids = deg.gencodeID.str.replace("\\..*", "", regex=True)
    return deg.assign(ensemblID=stripped_ids)


@lru_cache()
def get_xci():
    """Load the XCI status annotation table.

    Reads the tab-separated file ``../_h/xci_status_hg19.txt`` and adds an
    ``ensemblID`` column derived from the ``Gene ID`` column.

    The result is memoized by ``lru_cache``, so every call returns the same
    DataFrame object; callers should not modify it in place.
    """
    annotation = pd.read_csv('../_h/xci_status_hg19.txt', sep='\t')
    # Drop everything from the first "." onward in the gene ID.
    stripped_ids = annotation['Gene ID'].str.replace("\\..*", "", regex=True)
    return annotation.assign(ensemblID=stripped_ids)

### Simple functions

In [None]:
def tissue_annotation(tissue):
    """Return the display label for a lowercase tissue key.

    Accepts ``'caudate'``, ``'dlpfc'`` or ``'hippocampus'``; any other key
    raises ``KeyError``.
    """
    labels = {
        'caudate': "Caudate",
        "dlpfc": "DLPFC",
        "hippocampus": "Hippocampus",
    }
    return labels[tissue]


def mkdir_p(directory):
    """Create ``directory`` and any missing parents (like ``mkdir -p``).

    Does nothing when the directory already exists.

    Raises:
        FileExistsError: if ``directory`` exists but is not a directory.
        OSError: for any other failure reported by ``os.makedirs``.
    """
    # exist_ok=True tolerates an existing directory only; an existing regular
    # file at this path is reported instead of being silently accepted.
    os.makedirs(directory, exist_ok=True)


def _fisher_for_status(df, status):
    """Run Fisher's exact test of DE significance against one XCI status.

    ``df`` must have the columns ``adj.P.Val`` and ``Combined XCI status``.
    The 2x2 table is laid out as::

                            status    not status
        adj.P.Val <  0.05     a           b
        adj.P.Val >= 0.05     c           d

    Rows whose status is missing (NaN) count as "not status"; rows whose
    ``adj.P.Val`` is NaN fall in neither row of the table.

    Returns whatever ``scipy.stats.fisher_exact`` returns for that table
    (callers unpack it as ``odds_ratio, p_value``).
    """
    has_status = df['Combined XCI status'] == status
    is_deg = df['adj.P.Val'] < 0.05
    # `>=` rather than `>` so a gene with adj.P.Val exactly 0.05 is counted as
    # not significant instead of being dropped from the table.
    not_deg = df['adj.P.Val'] >= 0.05
    table = [[np.sum(is_deg & has_status), np.sum(is_deg & ~has_status)],
             [np.sum(not_deg & has_status), np.sum(not_deg & ~has_status)]]
    return fisher_exact(table)


def cal_fishers(status, tissue):
    """Test whether DE genes in ``tissue`` are associated with an XCI status.

    Every gene in the tissue's differential expression table is used; XCI
    annotation is attached with a left merge on ``ensemblID``, so genes
    without an annotation are kept and count as "not ``status``".

    Returns the ``fisher_exact`` result (odds ratio, p-value).
    """
    df = get_deg(tissue).merge(get_xci(), on='ensemblID', how='left')
    return _fisher_for_status(df, status)


def cal_fishers_direction(status, direction, tissue):
    """Same test as ``cal_fishers``, restricted to one direction of effect.

    ``direction == 'Up'`` keeps genes with ``t > 0``; any other value keeps
    genes with ``t < 0``. Genes with ``t == 0`` are excluded either way.

    Returns the ``fisher_exact`` result (odds ratio, p-value).
    """
    deg = get_deg(tissue)
    if direction == 'Up':
        deg = deg[deg['t'] > 0]
    else:
        deg = deg[deg['t'] < 0]
    df = deg.merge(get_xci(), on='ensemblID', how='left')
    return _fisher_for_status(df, status)


def cal_fisher_by_xci_status(tissue):
    """Run the XCI enrichment tests for every XCI status in one tissue.

    For each unique value of ``Combined XCI status`` three tests are run:
    all genes (``cal_fishers``), then ``t > 0`` genes and ``t < 0`` genes
    (``cal_fishers_direction`` with "Up" / "Down"). A message is printed for
    every test whose unadjusted p-value is below 0.05.

    Returns a DataFrame with the columns ``Tissue``, ``XCI status``, ``OR``,
    ``PValue`` and ``Direction``. Rows are ordered by direction ("All",
    "Male Bias", "Female Bias") and, within each direction, by status.
    """
    xci_status = get_xci().loc[:, 'Combined XCI status'].unique()
    # (Direction label, test to run for a status, message when p < 0.05)
    analyses = [
        ('All',
         lambda status: cal_fishers(status, tissue),
         "There is a significant enrichment (p-value < %.1e) of %s!"),
        ('Male Bias',
         lambda status: cal_fishers_direction(status, "Up", tissue),
         "There is a significant enrichment of male bias genes (p-value < %.1e) of %s!"),
        ('Female Bias',
         lambda status: cal_fishers_direction(status, "Down", tissue),
         "There is a significant enrichment of female bias genes (p-value < %.1e) of %s!"),
    ]
    xci_lt = []
    pval_lt = []
    oddratio_lt = []
    dir_lt = []
    for label, run_test, message in analyses:
        for status in xci_status:
            odd_ratio, pval = run_test(status)
            xci_lt.append(status)
            pval_lt.append(pval)
            oddratio_lt.append(odd_ratio)
            dir_lt.append(label)
            if pval < 0.05:
                print(message % (pval, status))
    return pd.DataFrame({"Tissue": tissue_annotation(tissue),
                         'XCI status': xci_lt, 'OR': oddratio_lt,
                         'PValue': pval_lt, 'Direction': dir_lt})

## Gene set overlaps

In [None]:
## Plot, per tissue and XCI status, the overlap between female-biased DEGs,
## genes with that XCI status, and male-biased DEGs.
plt.rcParams.update({'font.size': 18})
for tissue in ["caudate", "dlpfc", "hippocampus"]:
    mkdir_p(tissue)
    print(tissue_annotation(tissue))
    deg = get_deg(tissue)
    xci = get_xci()
    xci_status = xci.loc[:, 'Combined XCI status'].unique()
    # The sex-biased gene sets depend only on the tissue, so build them once
    # instead of once per XCI status.
    sig = deg["adj.P.Val"] < 0.05
    setA = set(deg[sig & (deg['t'] < 0)].ensemblID)  # plotted as "Female Bias"
    setC = set(deg[sig & (deg['t'] > 0)].ensemblID)  # plotted as "Male Bias"
    ## Plot venn diagram
    for status in xci_status:
        setB = set(xci[(xci["Combined XCI status"] == status)].ensemblID)
        fig = plt.figure(figsize=(7,7))
        venn3([setA, setB, setC], ("Female Bias", "XCI", "Male Bias"))
        plt.title(status.title())
        fn = f'{tissue}/venn_DEGs_{status}'
        for ext in ['png', 'pdf', 'svg']:
            plt.savefig(fn + '.' + ext)
        plt.show()
        # Release the figure explicitly so figures do not accumulate when the
        # backend does not close them on show().
        plt.close(fig)

## Calculate Fisher's exact test for each XCI status

In [None]:
# Per-tissue enrichment tables, keyed by the lowercase tissue name.
d = {}
for tissue in ("caudate", "dlpfc", "hippocampus"):
    print(tissue_annotation(tissue))
    df = cal_fisher_by_xci_status(tissue)
    # The second element returned by multipletests holds the corrected
    # p-values; the correction is applied within each tissue's table.
    df['Bonferroni'] = multipletests(df.PValue, method='bonferroni')[1]
    d[tissue] = df

In [None]:
# Stack the per-tissue tables row-wise, then show the rows whose
# Bonferroni-corrected p-value is at most 0.05.
dft = pd.concat(d.values())
dft.loc[dft['Bonferroni'] <= 0.05]

In [None]:
# Save the combined table as tab-separated text without the row index.
dft.to_csv(
    'xci_enrichment_analysis_3brainRegions.txt',
    index=False,
    sep='\t',
)

## Session information

In [None]:
# Report the session information for this run via the session_info package.
session_info.show()