# Cell type enrichment analysis

In [1]:
import functools
import numpy as np
import pandas as pd
from scipy.stats import fisher_exact
from statsmodels.stats.multitest import fdrcorrection

In [2]:
# Root of the project tree that holds the cell type reference inputs.
basename = '/ceph/users/jbenja13/projects/aanri/racial_diff'

# Input files used throughout the notebook (all are read as tab-separated):
#   gene_background   -- table providing the background gene symbols
#   deg_file          -- differential expression results (path is relative
#                        to the notebook's working directory)
#   ki{1,2}_{high,low} -- KI level 1 / level 2 cell type tables; "high" and
#                        "low" point at the *_specificity_0.4 and
#                        *_specificity_0.2 files respectively
config = {
    'gene_background': f'{basename}/input/celltypes/_h/cell_type/mouse2human_gene.txt',
    'deg_file': '../../_m/genes/diffExpr_maleVfemale_full.txt',
    'ki1_high': f'{basename}/input/celltypes/_m/KI_level1_human_specificity_0.4.txt',
    'ki1_low': f'{basename}/input/celltypes/_m/KI_level1_human_specificity_0.2.txt',
    'ki2_high': f'{basename}/input/celltypes/_m/KI_level2_human_specificity_0.4.txt',
    'ki2_low': f'{basename}/input/celltypes/_m/KI_level2_human_specificity_0.2.txt',
}

In [3]:
@functools.lru_cache()
def get_gene_background():
    """Load the background gene table from ``config['gene_background']``.

    The file is parsed as tab-separated text. The result is memoised, so the
    file is read once and every call returns the same DataFrame object;
    callers should not modify it in place.
    """
    background_path = config['gene_background']
    return pd.read_csv(background_path, sep='\t')


@functools.lru_cache()
def get_deg():
    """Load the differential expression table from ``config['deg_file']``.

    The file is parsed as tab-separated text. The result is memoised, so the
    file is read once and every call returns the same DataFrame object;
    callers should filter into new frames rather than modify it in place.
    """
    deg_path = config['deg_file']
    return pd.read_csv(deg_path, sep='\t')


@functools.lru_cache()
def get_celltype_ki(fn):
    """Load a cell type table from the tab-separated file ``fn``.

    Results are memoised per path, so each file is read once and repeated
    calls with the same ``fn`` return the same DataFrame object; callers
    should not modify it in place.
    """
    celltype_table = pd.read_csv(fn, sep='\t')
    return celltype_table

In [4]:
def _fisher_from_deg(deg, celltype, fn):
    """Build the 2x2 DE-by-cell-type table from ``deg`` and run Fisher's exact test.

    Shared implementation behind ``cal_fishers`` and ``cal_fishers_direction``.

    The contingency table counts unique gene symbols::

                        in cell type    in background, not in cell type
        adj.P < 0.05         a                      b
        adj.P >= 0.05        c                      d
    """
    ct = get_celltype_ki(fn)
    # Missing symbols (NaN) are dropped everywhere: a NaN present in both the
    # DE table and the background would otherwise be counted as a shared
    # "gene" and inflate the second column of the table.
    celltype_genes = set(ct.loc[ct['cell_type'] == celltype, 'Symbol_human'].dropna())
    other_genes = set(get_gene_background().Symbol_human.dropna()) - celltype_genes

    adj_p = deg['adj.P.Val']
    de_genes = set(deg.loc[adj_p < 0.05, 'Symbol'].dropna())
    # ">=" (not ">") so a gene sitting exactly on the threshold is counted as
    # not significant instead of silently dropping out of the table. Rows
    # with a missing adjusted p-value match neither filter and stay excluded.
    non_de_genes = set(deg.loc[adj_p >= 0.05, 'Symbol'].dropna())

    table = [[len(de_genes & celltype_genes), len(de_genes & other_genes)],
             [len(non_de_genes & celltype_genes), len(non_de_genes & other_genes)]]
    return fisher_exact(table)


def cal_fishers(celltype, fn):
    """Test whether genes of ``celltype`` are enriched among all DE genes.

    Parameters
    ----------
    celltype : str
        Value of the ``cell_type`` column selecting the gene set.
    fn : str
        Path of the cell type table (loaded through ``get_celltype_ki``).

    Returns
    -------
    The result of ``scipy.stats.fisher_exact`` on the 2x2 table, which
    unpacks as ``(odds_ratio, p_value)``.
    """
    return _fisher_from_deg(get_deg(), celltype, fn)


def cal_fishers_direction(celltype, direction, fn):
    """Same test as ``cal_fishers``, restricted to one direction of effect.

    Parameters
    ----------
    celltype : str
        Value of the ``cell_type`` column selecting the gene set.
    direction : str
        ``'Up'`` keeps genes with ``t > 0``; any other value keeps genes
        with ``t < 0``. Genes with ``t == 0`` are never included.
    fn : str
        Path of the cell type table (loaded through ``get_celltype_ki``).

    Returns
    -------
    The result of ``scipy.stats.fisher_exact`` on the 2x2 table, which
    unpacks as ``(odds_ratio, p_value)``. Both the significant and the
    non-significant rows of the table are drawn from the selected direction.
    """
    deg = get_deg()
    # Filtering creates a new frame, so the cached table is left untouched.
    if direction == 'Up':
        deg = deg.loc[deg['t'] > 0]
    else:
        deg = deg.loc[deg['t'] < 0]
    return _fisher_from_deg(deg, celltype, fn)

In [5]:
def cal_fisher_by_celltype(fn):
    """Run the overall, up- and down-regulated enrichment tests for every cell type.

    Parameters
    ----------
    fn : str
        Path of the cell type table; forwarded to ``get_celltype_ki``,
        ``cal_fishers`` and ``cal_fishers_direction``.

    Returns
    -------
    pandas.DataFrame
        One row per (direction, cell type) with columns ``Cell_type``,
        ``OR``, ``PValue`` and ``Direction`` (``'All'``, ``'Up'`` or
        ``'Down'``). Rows are grouped by direction in that order; within a
        group, cell types keep the order returned by ``unique()``.

    Side effects
    ------------
    Prints one line for every test whose unadjusted p-value is below 0.05.
    """
    unique_celltypes = get_celltype_ki(fn).cell_type.unique()

    # (Direction label, text spliced into the printed message, test to run)
    analyses = [
        ('All', "",
         lambda celltype: cal_fishers(celltype, fn)),
        ('Up', " of up-regulated genes",
         lambda celltype: cal_fishers_direction(celltype, "Up", fn)),
        ('Down', " of down-regulated genes",
         lambda celltype: cal_fishers_direction(celltype, "Down", fn)),
    ]

    # Column-wise accumulation keeps the column set (and order) stable even
    # when there are no cell types to test.
    results = {'Cell_type': [], 'OR': [], 'PValue': [], 'Direction': []}
    for label, qualifier, run_test in analyses:
        for celltype in unique_celltypes:
            odd_ratio, pval = run_test(celltype)
            results['Cell_type'].append(celltype)
            results['OR'].append(odd_ratio)
            results['PValue'].append(pval)
            results['Direction'].append(label)
            if pval < 0.05:
                print("There is a significant enrichment%s (p-value < %.1e) of %s!" %
                      (qualifier, pval, celltype))
    return pd.DataFrame(results)

## Calculate Fisher's exact test for each cell type annotation version

### KI level1, high specificity

In [10]:
# KI level 1 table from the *_specificity_0.4 file: test every cell type and
# tag the rows so they stay identifiable once the result tables are merged.
df1 = cal_fisher_by_celltype(config['ki1_high']).assign(List='KI1', Specificity=0.4)
# fdrcorrection returns a pair; the second element holds the corrected
# p-values. The correction is applied across all rows of this table only.
fdr = fdrcorrection(df1.PValue)[1]
df1['FDR'] = fdr

There is a significant enrichment (p-value < 3.0e-02) of endothelial-mural!
There is a significant enrichment (p-value < 3.5e-02) of microglia!
There is a significant enrichment (p-value < 1.7e-02) of HypothalamicDopaminergicNeurons!
There is a significant enrichment (p-value < 7.6e-03) of Radialglialikecells!
There is a significant enrichment of up-regulated genes (p-value < 3.2e-03) of HypothalamicDopaminergicNeurons!
There is a significant enrichment of up-regulated genes (p-value < 4.8e-02) of interneurons!
There is a significant enrichment of down-regulated genes (p-value < 1.3e-02) of endothelial-mural!
There is a significant enrichment of down-regulated genes (p-value < 1.5e-02) of Radialglialikecells!


### KI level1, low specificity

In [12]:
# KI level 1 table from the *_specificity_0.2 file: test every cell type and
# tag the rows so they stay identifiable once the result tables are merged.
df2 = cal_fisher_by_celltype(config['ki1_low']).assign(List='KI1', Specificity=0.2)
# fdrcorrection returns a pair; the second element holds the corrected
# p-values. The correction is applied across all rows of this table only.
fdr = fdrcorrection(df2.PValue)[1]
df2['FDR'] = fdr

There is a significant enrichment (p-value < 2.6e-03) of endothelial-mural!
There is a significant enrichment (p-value < 1.6e-02) of HypothalamicGABAergicNeurons!
There is a significant enrichment (p-value < 3.5e-02) of NeuralProgenitors!
There is a significant enrichment (p-value < 1.8e-02) of EmbryonicGABAergicNeuron!
There is a significant enrichment (p-value < 1.3e-02) of interneurons!
There is a significant enrichment (p-value < 1.3e-03) of EmbryonicDopaminergicNeuron!
There is a significant enrichment of up-regulated genes (p-value < 4.8e-02) of Radialglialikecells!
There is a significant enrichment of up-regulated genes (p-value < 8.9e-03) of NeuralProgenitors!
There is a significant enrichment of up-regulated genes (p-value < 2.3e-02) of EmbryonicGABAergicNeuron!
There is a significant enrichment of up-regulated genes (p-value < 1.7e-03) of interneurons!
There is a significant enrichment of down-regulated genes (p-value < 2.5e-03) of endothelial-mural!
There is a significant en

### KI level2, high specificity

In [13]:
# KI level 2 table from the *_specificity_0.4 file: test every cell type and
# tag the rows so they stay identifiable once the result tables are merged.
df3 = cal_fisher_by_celltype(config['ki2_high']).assign(List='KI2', Specificity=0.4)
# fdrcorrection returns a pair; the second element holds the corrected
# p-values. The correction is applied across all rows of this table only.
fdr = fdrcorrection(df3.PValue)[1]
df3['FDR'] = fdr

There is a significant enrichment (p-value < 1.4e-02) of HypothalamicGABA2Neuron!
There is a significant enrichment (p-value < 2.8e-02) of HypothalamicOxt;Avp-low;Th;Cacna1HNeuron!
There is a significant enrichment (p-value < 1.4e-02) of Int4!
There is a significant enrichment of up-regulated genes (p-value < 1.3e-02) of HypothalamicDopamine;Tac1;Ghrh;Pnoc;Dat+and-;GABANeuron!
There is a significant enrichment of up-regulated genes (p-value < 1.3e-02) of Int4!
There is a significant enrichment of down-regulated genes (p-value < 3.0e-02) of HypothalamicVglut2;Cnr1;Ninl;Rfx5;Zfp346Neuron!
There is a significant enrichment of down-regulated genes (p-value < 1.5e-02) of HypothalamicGABA2Neuron!
There is a significant enrichment of down-regulated genes (p-value < 3.0e-02) of HypothalamicOxt;Avp-low;Th;Cacna1HNeuron!


### KI level2, low specificity

In [14]:
# KI level 2 table from the *_specificity_0.2 file: test every cell type and
# tag the rows so they stay identifiable once the result tables are merged.
df4 = cal_fisher_by_celltype(config['ki2_low']).assign(List='KI2', Specificity=0.2)
# fdrcorrection returns a pair; the second element holds the corrected
# p-values. The correction is applied across all rows of this table only.
fdr = fdrcorrection(df4.PValue)[1]
df4['FDR'] = fdr

There is a significant enrichment (p-value < 4.1e-02) of HypothalamicVglut2;Cnr1;Ninl;Rfx5;Zfp346Neuron!
There is a significant enrichment (p-value < 2.8e-03) of Int10!
There is a significant enrichment (p-value < 2.8e-02) of DopaminergicAdult-Ventraltegmentalarea3!
There is a significant enrichment of up-regulated genes (p-value < 3.9e-02) of Int14!
There is a significant enrichment of up-regulated genes (p-value < 3.9e-02) of SubPyr!
There is a significant enrichment of up-regulated genes (p-value < 1.0e-03) of Int10!
There is a significant enrichment of up-regulated genes (p-value < 2.6e-02) of Int16!
There is a significant enrichment of up-regulated genes (p-value < 1.3e-02) of HypothalamicDopamine;Tac1;Ghrh;Pnoc;Dat+and-;GABANeuron!
There is a significant enrichment of up-regulated genes (p-value < 2.6e-02) of Int4!
There is a significant enrichment of down-regulated genes (p-value < 4.4e-02) of DopaminergicAdult-Ventraltegmentalarea2!
There is a significant enrichment of down-reg

## Merge data

In [16]:
# Stack the four result tables row-wise. Each table keeps its own row labels,
# so index values repeat across the merged frame.
df = pd.concat((df1, df2, df3, df4), axis=0)
# Display the tests whose FDR (computed separately per table) is at most 0.05.
df.loc[df['FDR'].le(0.05)]

Unnamed: 0,Cell_type,OR,PValue,Direction,List,Specificity,FDR
0,endothelial-mural,2.520993,0.002552,All,KI1,0.2,0.036746
21,EmbryonicDopaminergicNeuron,9.541533,0.001256,All,KI1,0.2,0.036746
44,interneurons,4.473277,0.001688,Up,KI1,0.2,0.036746
48,endothelial-mural,2.810696,0.00251,Down,KI1,0.2,0.036746
69,EmbryonicDopaminergicNeuron,15.640271,0.001573,Down,KI1,0.2,0.036746


In [30]:
# Write the merged results as a tab-separated file (row labels omitted), then
# preview the first rows.
df.to_csv('celltype_enrichment_analysis.txt', index=False, sep='\t')
df.head(n=5)

Unnamed: 0,Cell_type,OR,PValue,Direction,List,Specificity
0,endothelial-mural,1.523901,0.015064,All,KI1,0.4
1,Oligodendrocytes,0.612161,0.078419,All,KI1,0.4
2,VascularLeptomeningealCells,1.106483,0.605932,All,KI1,0.4
3,microglia,0.898144,0.54862,All,KI1,0.4
4,MediumSpinyNeuron,0.991856,1.0,All,KI1,0.4
