# Tissue comparison for differential expression analysis

In [None]:
import functools
import numpy as np
import pandas as pd
from plotnine import *
from scipy.stats import binom_test, fisher_exact, linregress

from warnings import filterwarnings
from matplotlib.cbook import mplDeprecation
filterwarnings('ignore', category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

In [None]:
# Maps each dataset label to the results table that `get_deg` loads for it.
# The labels are the `tissue1` / `tissue2` arguments accepted by the merge
# helpers, and every file is read as tab-separated text.
# NOTE(review): the paths are relative, so they presumably resolve against the
# notebook's working directory -- confirm before running from elsewhere.
config = {
    'caudate': '../../../caudate/_m/genes/diffExpr_maleVfemale_full.txt',
    'dlpfc': '../../../dlpfc/_m/genes/diffExpr_maleVfemale_full.txt',
    'hippo': '../../../hippocampus/_m/genes/diffExpr_maleVfemale_full.txt',
    'cmc_dlpfc': '../../../cmc_dlpfc/_m/mssm_penn_pitt_maleVfemale.tsv',
    'cmc_hbcc': "../../../cmc_dlpfc/_m/nimh_hbcc_maleVfemale.tsv",
}

In [None]:
@functools.lru_cache()
def get_deg(filename):
    """Load one differential-expression table and add helper columns.

    The file is read as tab-separated text with its first column as the
    index. Three columns are derived:

      - ``Feature``: a copy of the index
      - ``Dir``: the sign of the ``t`` column
      - ``ensemblID``: ``Feature`` with everything from the first ``.``
        onwards removed

    Returns a DataFrame holding ``Feature``, ``ensemblID``, ``adj.P.Val``,
    ``logFC``, ``t`` and ``Dir``, in that order. The result is memoised per
    ``filename``, so repeated calls hand back the very same object.
    """
    wanted = ['Feature', 'ensemblID', 'adj.P.Val', 'logFC', 't', 'Dir']
    table = pd.read_csv(filename, sep='\t', index_col=0)
    table = table.assign(Feature=table.index, Dir=np.sign(table['t']))
    table = table.assign(
        ensemblID=table['Feature'].str.replace('\\..*', '', regex=True))
    return table[wanted]


@functools.lru_cache()
def get_deg_sig(filename, fdr):
    """Return the rows of ``get_deg(filename)`` with ``adj.P.Val`` below ``fdr``.

    The comparison is strict (``<``). Results are memoised per
    ``(filename, fdr)`` pair.
    """
    table = get_deg(filename)
    passes = table['adj.P.Val'] < fdr
    return table[passes]


@functools.lru_cache()
def merge_dataframes(tissue1, tissue2):
    """Join the full result tables of two datasets on ``Feature``.

    ``tissue1`` and ``tissue2`` are keys of ``config``. Columns other than
    ``Feature`` are suffixed with ``_<tissue1>`` and ``_<tissue2>``. Only
    features present in both tables survive (pandas' default inner join).

    The result is memoised per argument pair, so every call with the same
    labels returns the same DataFrame object; in-place changes made by a
    caller are therefore visible to later callers.
    """
    left = get_deg(config[tissue1])
    right = get_deg(config[tissue2])
    labels = ['_%s' % tissue1, '_%s' % tissue2]
    return left.merge(right, on='Feature', suffixes=labels)


@functools.lru_cache()
def merge_dataframes_sig(tissue1, tissue2):
    """Join two datasets on ``Feature``, keeping only significant rows.

    Each table is first restricted to ``adj.P.Val < 0.05`` via
    ``get_deg_sig``, so the result holds features that pass that cut-off in
    both datasets. Columns other than ``Feature`` are suffixed with
    ``_<tissue1>`` and ``_<tissue2>``.

    The result is memoised per argument pair; repeated calls return the
    same DataFrame object.
    """
    fdr = 0.05
    left = get_deg_sig(config[tissue1], fdr)
    right = get_deg_sig(config[tissue2], fdr)
    labels = ['_%s' % tissue1, '_%s' % tissue2]
    return left.merge(right, on='Feature', suffixes=labels)

In [None]:
def enrichment_binom(tissue1, tissue2, merge_fnc):
    """Binomial (sign) test for agreement of effect direction between datasets.

    Inputs:
      tissue1, tissue2: dataset labels; the merged frame must contain the
        columns ``Dir_<tissue1>`` and ``Dir_<tissue2>``
      merge_fnc: callable ``(tissue1, tissue2) -> DataFrame`` giving that frame
    Outputs:
      Two-sided binomial p-value (null: agreement probability 0.5) for the
      number of features whose directions agree, out of all features whose
      directions either agree or disagree. Returns ``None`` (after printing a
      message) when every such feature agrees, or when there is none at all.

    Side effects:
      - adds an ``agree`` column to the frame returned by ``merge_fnc``
      - prints the number of features per ``agree`` value

    Features whose direction product is 0 (a zero t-statistic) or missing are
    neither agreements nor disagreements and are left out of the test.
    """
    df = merge_fnc(tissue1, tissue2)
    # +1: same direction, -1: opposite direction, 0: at least one zero sign.
    # Written onto the frame returned by merge_fnc (as the original code did)
    # so callers that inspect `agree` afterwards keep working.
    df['agree'] = df['Dir_%s' % tissue1] * df['Dir_%s' % tissue2]
    print(df.groupby('agree').size().reset_index())

    # Count by value instead of by group position: positional indexing picks
    # the wrong row as soon as a 0 group is present or a group is absent.
    n_agree = int((df['agree'] > 0).sum())
    n_disagree = int((df['agree'] < 0).sum())
    n_total = n_agree + n_disagree

    if n_total == 0:
        print("No features with a defined direction in both datasets!")
        return None
    if n_disagree == 0:
        print("All directions agree!")
        return None

    try:
        # binom_test was removed from recent SciPy; binomtest replaces it.
        from scipy.stats import binomtest
    except ImportError:
        return binom_test(n_agree, n_total)
    return binomtest(n_agree, n_total).pvalue


def cal_fishers(tissue1, tissue2):
    """Fisher's exact test on significance overlap between two datasets.

    Builds a 2x2 table over the features returned by ``merge_dataframes``:
    rows are significant / not significant in ``tissue1``, columns the same
    for ``tissue2``, with significance meaning ``adj.P.Val < 0.05``.

    Prints the table and returns the result of ``fisher_exact`` on it.
    """
    fdr = 0.05
    df = merge_dataframes(tissue1, tissue2)
    padj1 = df['adj.P.Val_%s' % tissue1]
    padj2 = df['adj.P.Val_%s' % tissue2]

    sig1, sig2 = padj1 < fdr, padj2 < fdr
    ns1, ns2 = padj1 >= fdr, padj2 >= fdr

    table = [[np.sum(sig1 & sig2), np.sum(sig1 & ns2)],
             [np.sum(ns1 & sig2), np.sum(ns1 & ns2)]]
    print(table)
    return fisher_exact(table)


def calculate_corr(xx, yy):
    '''Correlation between two arrays via simple linear regression.

        Inputs:
          xx: array of x values
          yy: array of y values (same length as xx)
        Outputs:
          1. r: correlation coefficient reported by linregress
            - NOT squared; callers that want R^2 square it themselves
          2. p-value reported by linregress for the fitted slope
    '''
    fit = linregress(xx, yy)
    return fit.rvalue, fit.pvalue

    
def corr_annotation(tissue1, tissue2, merge_fnc):
    """Format the t-statistic correlation between two datasets as a label.

    The ``t_<tissue1>`` and ``t_<tissue2>`` columns of
    ``merge_fnc(tissue1, tissue2)`` are passed to ``calculate_corr``; the
    squared correlation and the p-value are returned as a two-line string.
    """
    merged = merge_fnc(tissue1, tissue2)
    r_value, p_value = calculate_corr(merged['t_%s' % tissue1],
                                      merged['t_%s' % tissue2])
    r_squared = r_value ** 2
    return 'R2: %.2f\nP-value: %.2e' % (r_squared, p_value)


def tissue_annotation(tissue):
    """Return the display name for a dataset label.

    Raises ``KeyError`` for a label that has no display name.
    """
    display_names = {
        'dlpfc': 'DLPFC',
        'hippo': 'Hippocampus',
        'caudate': 'Caudate',
        'cmc_dlpfc': 'CMC DLPFC',
        "cmc_hbcc": "CMC DLPFC: HBCC",
    }
    return display_names[tissue]

In [None]:
def plot_corr_impl(tissue1, tissue2, merge_fnc):
    """Scatter plot of one dataset's t-statistics against another's.

    Plots ``t_<tissue1>`` (x) against ``t_<tissue2>`` (y) from
    ``merge_fnc(tissue1, tissue2)``. The axes carry the display names from
    ``tissue_annotation`` and the title is the text from ``corr_annotation``.

    Returns the plotnine plot object.
    """
    merged = merge_fnc(tissue1, tissue2)
    title = corr_annotation(tissue1, tissue2, merge_fnc)
    x_label = 'T-statistic (%s)' % tissue_annotation(tissue1)
    y_label = 'T-statistic (%s)' % tissue_annotation(tissue2)

    text_theme = theme(
        axis_text=element_text(size=18),
        axis_title=element_text(size=20, face='bold'),
        plot_title=element_text(size=22),
    )
    scatter = (
        ggplot(merged, aes(x='t_%s'%tissue1, y='t_%s' % tissue2))
        + geom_point(alpha=0.75, size=3)
        + theme_matplotlib()
        + text_theme
    )
    scatter += labs(x=x_label, y=y_label, title=title)
    return scatter


def plot_corr(tissue1, tissue2, merge_fnc):
    """Public entry point for the t-statistic scatter plot.

    Forwards its arguments unchanged to ``plot_corr_impl`` and returns the
    plot that function builds.
    """
    plot = plot_corr_impl(tissue1, tissue2, merge_fnc)
    return plot


def save_plot(p, fn, width=7, height=7):
    '''Write plot `p` to `fn` + ".svg", ".png" and ".pdf" (in that order).

    `width` and `height` are handed to `p.save` unchanged.
    '''
    extensions = ('.svg', '.png', '.pdf')
    targets = [fn + ext for ext in extensions]
    for target in targets:
        p.save(target, width=width, height=height)

## BrainSeq Tissue Comparison

In [None]:
# Caudate: number of features per sign of the t-statistic (`Dir` column).
caudate = get_deg(config['caudate'])
caudate.groupby('Dir').size()

In [None]:
# Caudate: (rows, columns) of the features with adj.P.Val < 0.05.
caudate[(caudate['adj.P.Val'] < 0.05)].shape

In [None]:
# DLPFC: number of features per sign of the t-statistic.
dlpfc = get_deg(config['dlpfc'])
dlpfc.groupby('Dir').size()

In [None]:
# DLPFC: (rows, columns) of the features with adj.P.Val < 0.05.
dlpfc[(dlpfc['adj.P.Val'] < 0.05)].shape

In [None]:
# Hippocampus: number of features per sign of the t-statistic.
hippo = get_deg(config['hippo'])
hippo.groupby('Dir').size()

In [None]:
# Hippocampus: (rows, columns) of the features with adj.P.Val < 0.05.
hippo[(hippo['adj.P.Val'] < 0.05)].shape

### Enrichment of DEG

In [None]:
# Fisher's exact test: is significance (adj.P.Val < 0.05) in caudate
# associated with significance in DLPFC? Also prints the 2x2 table.
cal_fishers('caudate', 'dlpfc')

In [None]:
# Same test, caudate vs hippocampus.
cal_fishers('caudate', 'hippo')

In [None]:
# Same test, DLPFC vs hippocampus.
cal_fishers('dlpfc', 'hippo')

### Correlation

In [None]:
# t-statistic scatter over every feature present in both tables:
# caudate (x) vs DLPFC (y).
pp = plot_corr('caudate', 'dlpfc', merge_dataframes)
pp

In [None]:
# Caudate (x) vs hippocampus (y), all shared features.
qq = plot_corr('caudate', 'hippo', merge_dataframes)
qq

In [None]:
# DLPFC (x) vs hippocampus (y), all shared features.
ww = plot_corr('dlpfc', 'hippo', merge_dataframes)
ww

### Correlation of significant DEGs (FDR < 0.05)

In [None]:
# Same scatter plots, restricted to features with adj.P.Val < 0.05 in both
# datasets: caudate (x) vs DLPFC (y).
pp = plot_corr('caudate', 'dlpfc', merge_dataframes_sig)
pp

In [None]:
# Caudate (x) vs hippocampus (y), significant in both.
qq = plot_corr('caudate', 'hippo', merge_dataframes_sig)
qq

In [None]:
# DLPFC (x) vs hippocampus (y), significant in both.
ww = plot_corr('dlpfc', 'hippo', merge_dataframes_sig)
ww

In [None]:
# Export is disabled; uncomment to write the three plots assigned above
# (pp, qq, ww) as svg/png/pdf.
#save_plot(pp, 'dlpfc_caudate_tstatistic_corr_sig')
#save_plot(qq, 'hippo_caudate_tstatistic_corr_sig')
#save_plot(ww, 'hippo_dlpfc_tstatistic_corr_sig')

### Directionality test

#### All genes

In [None]:
# Binomial test on direction agreement (product of the two `Dir` signs)
# over all features shared by caudate and DLPFC.
enrichment_binom('caudate', 'dlpfc', merge_dataframes)

In [None]:
# Same test, caudate vs hippocampus.
enrichment_binom('caudate', 'hippo', merge_dataframes)

In [None]:
# Same test, DLPFC vs hippocampus.
enrichment_binom('dlpfc', 'hippo', merge_dataframes)

#### Significant DEG (FDR < 0.05)

In [None]:
# Direction agreement among features with adj.P.Val < 0.05 in both caudate
# and DLPFC.
enrichment_binom('caudate', 'dlpfc', merge_dataframes_sig)

In [None]:
# Features significant in both caudate and DLPFC whose directions disagree.
# `agree` is computed here instead of being read from the cached merge result,
# so this cell no longer depends on `enrichment_binom` having run first.
df = merge_dataframes_sig("caudate", "dlpfc").assign(
    agree=lambda d: d['Dir_caudate'] * d['Dir_dlpfc'])
df[(df['agree']<0)]

In [None]:
# Direction agreement among features with adj.P.Val < 0.05 in both caudate
# and hippocampus.
enrichment_binom('caudate', 'hippo', merge_dataframes_sig)

In [None]:
# Same test, DLPFC vs hippocampus.
enrichment_binom('dlpfc', 'hippo', merge_dataframes_sig)

## Common Mind Comparison: MSSM Penn Pitt

In [None]:
# CMC DLPFC (MSSM/Penn/Pitt table): number of features per sign of the
# t-statistic.
cmc_dlpfc = get_deg(config['cmc_dlpfc'])
cmc_dlpfc.groupby('Dir').size()

In [None]:
# CMC DLPFC: (rows, columns) of the features with adj.P.Val < 0.05.
cmc_dlpfc[(cmc_dlpfc['adj.P.Val'] < 0.05)].shape

### Enrichment of DEG

In [None]:
# Fisher's exact test on significance overlap (adj.P.Val < 0.05):
# DLPFC vs CMC DLPFC. Also prints the 2x2 table.
cal_fishers('dlpfc', 'cmc_dlpfc')

In [None]:
# Same test, hippocampus vs CMC DLPFC.
cal_fishers('hippo', 'cmc_dlpfc')

In [None]:
# Same test, caudate vs CMC DLPFC.
cal_fishers('caudate', 'cmc_dlpfc')

### Correlation

In [None]:
# t-statistic scatter over every shared feature: CMC DLPFC (x) vs DLPFC (y).
pp = plot_corr('cmc_dlpfc', 'dlpfc', merge_dataframes)
pp

In [None]:
# CMC DLPFC (x) vs hippocampus (y), all shared features.
qq = plot_corr('cmc_dlpfc', 'hippo', merge_dataframes)
qq

In [None]:
# CMC DLPFC (x) vs caudate (y), all shared features.
ww = plot_corr('cmc_dlpfc', 'caudate', merge_dataframes)
ww

### Correlation of significant DEGs (FDR < 0.05)

In [None]:
# Scatter restricted to features with adj.P.Val < 0.05 in both datasets:
# CMC DLPFC (x) vs DLPFC (y).
pp = plot_corr('cmc_dlpfc', 'dlpfc', merge_dataframes_sig)
pp

In [None]:
# CMC DLPFC (x) vs hippocampus (y), significant in both.
qq = plot_corr('cmc_dlpfc', 'hippo', merge_dataframes_sig)
qq

In [None]:
# CMC DLPFC (x) vs caudate (y), significant in both.
ww = plot_corr('cmc_dlpfc', 'caudate', merge_dataframes_sig)
ww

### Directionality

#### All genes

In [None]:
# Binomial test on direction agreement over all features shared by
# CMC DLPFC and DLPFC.
enrichment_binom('cmc_dlpfc', 'dlpfc', merge_dataframes)

In [None]:
# Same test, CMC DLPFC vs hippocampus.
enrichment_binom('cmc_dlpfc', 'hippo', merge_dataframes)

In [None]:
# Same test, CMC DLPFC vs caudate.
enrichment_binom('cmc_dlpfc', 'caudate', merge_dataframes)

#### Significant DEG (FDR < 0.05)

In [None]:
# Direction agreement among features with adj.P.Val < 0.05 in both
# CMC DLPFC and DLPFC.
enrichment_binom('cmc_dlpfc', 'dlpfc', merge_dataframes_sig)

In [None]:
# Same test, CMC DLPFC vs hippocampus.
enrichment_binom('cmc_dlpfc', 'hippo', merge_dataframes_sig)

In [None]:
# Same test, CMC DLPFC vs caudate.
enrichment_binom('cmc_dlpfc', 'caudate', merge_dataframes_sig)

## Common Mind Comparison: NIMH HBCC

In [None]:
# NOTE: this rebinds `cmc_dlpfc` to the HBCC table (config['cmc_hbcc']); from
# here on the name no longer refers to the MSSM/Penn/Pitt results.
# Number of features per sign of the t-statistic.
cmc_dlpfc = get_deg(config['cmc_hbcc'])
cmc_dlpfc.groupby('Dir').size()

In [None]:
# HBCC: (rows, columns) of the features with adj.P.Val < 0.05.
cmc_dlpfc[(cmc_dlpfc['adj.P.Val'] < 0.05)].shape

### Enrichment of DEG

In [None]:
# Fisher's exact test on significance overlap (adj.P.Val < 0.05):
# DLPFC vs CMC HBCC. Also prints the 2x2 table.
cal_fishers('dlpfc', 'cmc_hbcc')

In [None]:
# Same test, hippocampus vs CMC HBCC.
cal_fishers('hippo', 'cmc_hbcc')

In [None]:
# Same test, caudate vs CMC HBCC.
cal_fishers('caudate', 'cmc_hbcc')

### Correlation of significant DEGs (FDR < 0.05)

In [None]:
# Scatter restricted to features with adj.P.Val < 0.05 in both datasets:
# CMC HBCC (x) vs DLPFC (y).
pp = plot_corr('cmc_hbcc', 'dlpfc', merge_dataframes_sig)
pp

In [None]:
# CMC HBCC (x) vs hippocampus (y), significant in both.
qq = plot_corr('cmc_hbcc', 'hippo', merge_dataframes_sig)
qq

In [None]:
# CMC HBCC (x) vs caudate (y), significant in both.
ww = plot_corr('cmc_hbcc', 'caudate', merge_dataframes_sig)
ww

### Directionality

#### All genes

In [None]:
# Binomial test on direction agreement over all features shared by
# CMC HBCC and DLPFC.
enrichment_binom('cmc_hbcc', 'dlpfc', merge_dataframes)

In [None]:
# Same test, CMC HBCC vs hippocampus.
enrichment_binom('cmc_hbcc', 'hippo', merge_dataframes)

In [None]:
# Same test, CMC HBCC vs caudate.
enrichment_binom('cmc_hbcc', 'caudate', merge_dataframes)

#### Significant DEG (FDR < 0.05)

In [None]:
# Direction agreement among features with adj.P.Val < 0.05 in both
# CMC HBCC and DLPFC.
enrichment_binom('cmc_hbcc', 'dlpfc', merge_dataframes_sig)

In [None]:
# Same test, CMC HBCC vs hippocampus.
enrichment_binom('cmc_hbcc', 'hippo', merge_dataframes_sig)

In [None]:
# Same test, CMC HBCC vs caudate.
enrichment_binom('cmc_hbcc', 'caudate', merge_dataframes_sig)