# Tissue comparison for differential expression analysis

In [None]:
import functools
import numpy as np
import pandas as pd
from plotnine import *
from scipy.stats import binom_test, fisher_exact, linregress

# Suppress matplotlib deprecation warnings, plus UserWarning and
# DeprecationWarning messages raised from plotnine modules.
from warnings import filterwarnings
from matplotlib.cbook import mplDeprecation
filterwarnings('ignore', category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

In [None]:
# Paths of the differential expression tables, keyed by the tissue / data set
# label used throughout this notebook.  'cmc_sva' and 'cmc' point at the
# adjustedSVA and adjustedNoSVA CommonMind (CMC) DLPFC result files.
config = {
    'caudate': '../../../_m/genes/diffExpr_szVctl_full.txt',
    'dlpfc': '/ceph/projects/v4_phase3_paper/inputs/public_data/_m/phase2/dlpfc_diffExpr_szVctl_full.txt',
    'hippo': '/ceph/projects/v4_phase3_paper/inputs/public_data/_m/phase2/hippo_diffExpr_szVctl_full.txt',
    'cmc_sva': (
        '/ceph/projects/v4_phase3_paper/inputs/public_data/_m/cmc/CMC_MSSM-Penn-Pitt_DLPFC_mRNA_'
        'IlluminaHiSeq2500_gene-adjustedSVA-differentialExpression-includeAncestry-DxSCZ-DE.tsv'
    ),
    'cmc': (
        '/ceph/projects/v4_phase3_paper/inputs/public_data/_m/cmc/CMC_MSSM-Penn-Pitt_DLPFC_mRNA_'
        'IlluminaHiSeq2500_gene-adjustedNoSVA-differentialExpression-includeAncestry-DxSCZ-DE.tsv'
    ),
}

In [None]:
@functools.lru_cache()
def get_cmc(SVA=True):
    '''Load the CMC DLPFC differential expression table.

    Inputs:
      SVA: when true, read config["cmc_sva"]; otherwise read config["cmc"].
    Outputs:
      DataFrame with the columns Feature, ensemblID, adj.P.Val, t, Dir and
      Symbol, where Dir is the sign of t and Feature duplicates ensemblID.

    The result is memoised by lru_cache, so repeated calls with the same
    argument return the same DataFrame object.
    '''
    path = config["cmc_sva"] if SVA else config["cmc"]
    table = pd.read_csv(path, sep='\t')
    table = table.rename(columns={'MAPPED_genes': 'Symbol', 'genes': 'ensemblID'})
    table['Dir'] = np.sign(table['t'])
    table['Feature'] = table['ensemblID']
    wanted = ['Feature', 'ensemblID', 'adj.P.Val', 't', 'Dir', 'Symbol']
    return table[wanted]


@functools.lru_cache()
def get_deg(filename):
    '''Load one differential expression table.

    Inputs:
      filename: path of a tab-separated file whose first column is the
        feature identifier (it becomes the index and the Feature column).
    Outputs:
      DataFrame with the columns Feature, ensemblID, adj.P.Val, logFC, t
      and Dir, where Dir is the sign of t.
    Raises:
      KeyError: when the file provides none of the gene identifier columns
        'gene_id', 'ensembl_gene_id' or 'ensemblID'.

    ensemblID comes from 'gene_id' with everything from the first dot
    onwards removed, or from 'ensembl_gene_id' taken as is.  The result is
    memoised by lru_cache, so callers share one DataFrame object per file.
    '''
    dft = pd.read_csv(filename, sep='\t', index_col=0)
    dft['Feature'] = dft.index
    dft['Dir'] = np.sign(dft['t'])
    if 'gene_id' in dft.columns:
        # Raw string: the pattern is a literal dot followed by anything.
        dft['ensemblID'] = dft['gene_id'].str.replace(r'\..*', '', regex=True)
    elif 'ensembl_gene_id' in dft.columns:
        dft = dft.rename(columns={'ensembl_gene_id': 'ensemblID'})
    if 'ensemblID' not in dft.columns:
        # Fail with a message that names the file instead of the generic
        # pandas "not in index" error from the column selection below.
        raise KeyError(
            "%s has no 'gene_id', 'ensembl_gene_id' or 'ensemblID' column"
            % filename
        )
    return dft[['Feature', 'ensemblID', 'adj.P.Val', 'logFC', 't', 'Dir']]


@functools.lru_cache()
def get_deg_sig(filename, fdr):
    '''Return the rows of get_deg(filename) whose adj.P.Val is below fdr.

    Inputs:
      filename: path passed through to get_deg.
      fdr: exclusive upper bound on adj.P.Val.
    Outputs:
      The filtered DataFrame (memoised per (filename, fdr) pair).
    '''
    table = get_deg(filename)
    passes = table['adj.P.Val'] < fdr
    return table[passes]


@functools.lru_cache()
def merge_dataframes(tissue1, tissue2):
    '''Join the full result tables of two tissues on ensemblID.

    Inputs:
      tissue1, tissue2: keys of config.
    Outputs:
      The merged DataFrame (pandas' default join type); columns present in
      both tables are suffixed with _<tissue1> and _<tissue2>.  The result
      is memoised, so callers share one object per tissue pair.
    '''
    left = get_deg(config[tissue1])
    right = get_deg(config[tissue2])
    labels = ['_%s' % tissue1, '_%s' % tissue2]
    return left.merge(right, on='ensemblID', suffixes=labels)


@functools.lru_cache()
def merge_dataframes_sig(tissue1, tissue2):
    '''Join the significant results of two tissues on ensemblID.

    Inputs:
      tissue1, tissue2: keys of config.
    Outputs:
      The merged DataFrame of rows with adj.P.Val < 0.05 in both tissues;
      shared columns are suffixed with _<tissue1> and _<tissue2>.
    '''
    # Every tissue, 'dlpfc' included, is filtered at the same threshold.
    fdr1 = 0.05
    fdr2 = 0.05
    left = get_deg_sig(config[tissue1], fdr1)
    right = get_deg_sig(config[tissue2], fdr2)
    labels = ['_%s' % tissue1, '_%s' % tissue2]
    return left.merge(right, on='ensemblID', suffixes=labels)

@functools.lru_cache()
def merge_cmc(tissue1, sig=False, SVA=True):
    '''Join one tissue's results with the CMC DLPFC results on ensemblID.

    Inputs:
      tissue1: key of config for the tissue table.
      sig: when true, keep only rows with adj.P.Val < 0.05 on both sides
        before merging.
      SVA: forwarded to get_cmc to choose the CMC file.
    Outputs:
      The merged DataFrame; shared columns are suffixed with _<tissue1>
      and _cmc.
    '''
    cmc_table = get_cmc(SVA)
    if sig:
        cmc_table = cmc_table[cmc_table["adj.P.Val"] < 0.05]
        tissue_table = get_deg_sig(config[tissue1], 0.05)
    else:
        tissue_table = get_deg(config[tissue1])
    labels = ["_%s" % tissue1, '_cmc']
    return tissue_table.merge(cmc_table, on="ensemblID", suffixes=labels)
        

In [None]:
def enrichment_binom(tissue1, tissue2, merge_fnc, sig=False, sva=True):
    '''Binomial (sign) test of direction agreement between two data sets.

    Inputs:
      tissue1, tissue2: labels used as the Dir_<label> column suffixes.
      merge_fnc: callable returning the merged DataFrame.  It is called as
        merge_fnc(tissue1, tissue2), or as merge_fnc(tissue1, sig, sva)
        when tissue2 is "cmc".
      sig, sva: only forwarded to merge_fnc in the "cmc" case.
    Outputs:
      The two-sided P-value (null probability 0.5) for the number of genes
      whose directions agree, out of the genes with a non-zero direction in
      both data sets.  Returns None, after printing a message, when the
      genes do not fall into both an agreeing and a disagreeing group.

    The table of counts per agreement value (-1, 0, 1) is printed first.
    '''
    if tissue2 != "cmc":
        df = merge_fnc(tissue1, tissue2)
    else:
        df = merge_fnc(tissue1, sig, sva)
    # Keep the product in its own Series: the merge helpers in this notebook
    # are memoised, so adding a column to df would alter the cached frame.
    agree = df['Dir_%s' % tissue1] * df['Dir_%s' % tissue2]
    dft = agree.to_frame('agree').groupby('agree').size().reset_index()
    print(dft)
    # Count by sign rather than by row position in dft, so that a zero
    # direction (t == 0) cannot be mistaken for the "agree" group.
    n_agree = int((agree > 0).sum())
    n_disagree = int((agree < 0).sum())
    total = n_agree + n_disagree
    if total == 0:
        print("No genes with a non-zero direction in both data sets!")
        return None
    if n_disagree == 0:
        print("All directions agree!")
        return None
    if n_agree == 0:
        print("All directions disagree!")
        return None
    try:
        from scipy.stats import binomtest
    except ImportError:
        # Older SciPy releases only provide binom_test.
        from scipy.stats import binom_test
        return binom_test(n_agree, total)
    return binomtest(n_agree, total).pvalue


def cal_fishers(tissue1, tissue2, fnc, sva=True):
    '''Fisher's exact test on the overlap of significant calls.

    Inputs:
      tissue1, tissue2: labels used as the adj.P.Val_<label> suffixes.
      fnc: callable returning the merged DataFrame.  It is called as
        fnc(tissue1, tissue2), or as fnc(tissue1, False, sva) when tissue2
        is 'cmc'.
      sva: only forwarded to fnc in the 'cmc' case.
    Outputs:
      The result of scipy.stats.fisher_exact on the 2x2 table
      [[both < 0.05, only tissue1 < 0.05],
       [only tissue2 < 0.05, neither < 0.05]].

    The table is printed before the test is run.
    '''
    if tissue2 == 'cmc':
        df = fnc(tissue1, False, sva)
    else:
        df = fnc(tissue1, tissue2)
    # Every data set, 'dlpfc' included, uses the same threshold.
    fdr1 = fdr2 = 0.05
    pvals1 = df['adj.P.Val_%s' % tissue1]
    pvals2 = df['adj.P.Val_%s' % tissue2]
    sig1, rest1 = pvals1 < fdr1, pvals1 >= fdr1
    sig2, rest2 = pvals2 < fdr2, pvals2 >= fdr2
    table = [[np.sum(sig1 & sig2), np.sum(sig1 & rest2)],
             [np.sum(rest1 & sig2), np.sum(rest1 & rest2)]]
    print(table)
    return fisher_exact(table)


def calculate_corr(xx, yy):
    '''Correlate two equally long arrays with a simple linear regression.

    Inputs:
      xx: array-like of values used as the regressor.
      yy: array-like of values used as the response.
    Outputs:
      1. r: the correlation coefficient reported by scipy's linregress
         (NOT squared; callers square it when they want R^2)
      2. p-value reported by linregress, whose null hypothesis is that
         the slope is zero
    '''
    fit = linregress(xx, yy)
    return fit.rvalue, fit.pvalue

    
def corr_annotation(tissue1, tissue2, merge_fnc, sig=False, sva=True):
    '''Build the "R2 / P-value" text for a pair of t-statistic columns.

    Inputs:
      tissue1, tissue2: labels used as the t_<label> column suffixes.
      merge_fnc: callable returning the merged DataFrame.  It is called as
        merge_fnc(tissue1, tissue2), or as merge_fnc(tissue1, sig, sva)
        when tissue2 is 'cmc'.
      sig, sva: only forwarded to merge_fnc in the 'cmc' case.
    Outputs:
      Two-line string with the squared correlation coefficient and the
      P-value from calculate_corr.
    '''
    if tissue2 == 'cmc':
        merged = merge_fnc(tissue1, sig, sva)
    else:
        merged = merge_fnc(tissue1, tissue2)
    r_value, p_value = calculate_corr(merged['t_%s' % tissue1],
                                      merged['t_%s' % tissue2])
    return 'R2: %.2f\nP-value: %.2e' % (r_value ** 2, p_value)


def tissue_annotation(tissue):
    '''Return the display name for a tissue label.

    Raises KeyError for a label other than 'dlpfc', 'hippo', 'caudate'
    or 'cmc'.
    '''
    display_names = {
        'dlpfc': 'DLPFC',
        'hippo': 'Hippocampus',
        'caudate': 'Caudate',
        'cmc': "CMC DLPFC",
    }
    return display_names[tissue]

In [None]:
def plot_corr_impl(tissue1, tissue2, merge_fnc, sig, sva):
    '''Scatter plot of the t-statistics of two data sets.

    Inputs:
      tissue1, tissue2: labels for the x and y axes (t_<label> columns).
      merge_fnc: callable returning the merged DataFrame.  It is called as
        merge_fnc(tissue1, tissue2), or as merge_fnc(tissue1, sig, sva)
        when tissue2 is "cmc".
      sig, sva: only used in the "cmc" case.
    Outputs:
      A plotnine ggplot object whose title is the corr_annotation text.
    '''
    if tissue2 == "cmc":
        dft = merge_fnc(tissue1, sig, sva)
        title = corr_annotation(tissue1, tissue2, merge_fnc, sig, sva)
    else:
        dft = merge_fnc(tissue1, tissue2)
        title = corr_annotation(tissue1, tissue2, merge_fnc)
    axis_labels = labs(x='T-statistic (%s)' % tissue_annotation(tissue1),
                       y='T-statistic (%s)' % tissue_annotation(tissue2),
                       title=title)
    text_sizes = theme(axis_text=element_text(size=18),
                       axis_title=element_text(size=20, face='bold'),
                       plot_title=element_text(size=22))
    return (ggplot(dft, aes(x='t_%s' % tissue1, y='t_%s' % tissue2))
            + geom_point(alpha=0.75, size=3)
            + theme_matplotlib()
            + text_sizes
            + axis_labels)


def plot_corr(tissue1, tissue2, merge_fnc, sig=False, sva=True):
    '''Call plot_corr_impl with sig and sva defaulting to False and True.'''
    plot = plot_corr_impl(tissue1, tissue2, merge_fnc, sig, sva)
    return plot


def save_plot(p, fn, width=7, height=7):
    '''Write the plot to fn.svg, fn.png and fn.pdf, in that order.

    Inputs:
      p: object with a save(path, width=..., height=...) method.
      fn: output path without extension.
      width, height: forwarded unchanged to p.save.
    '''
    extensions = ('.svg', '.png', '.pdf')
    for suffix in extensions:
        target = fn + suffix
        p.save(target, width=width, height=height)

## Sample summary

In [None]:
# Load the merged phenotype table and keep the rows with Age > 17,
# Dx of SZ or CTL, and Race of AA or EA.
pheno_file = '/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv'
pheno = pd.read_csv(pheno_file, index_col=0)
age_ok = pheno['Age'] > 17
dx_ok = pheno['Dx'].isin(['SZ', 'CTL'])
race_ok = pheno['Race'].isin(['AA', "EA"])
pheno = pheno[age_ok & dx_ok & race_ok].copy()
pheno.head(2)

In [None]:
# Row counts of the filtered phenotype table per Region.
pheno.groupby(['Region']).size()

In [None]:
# Row counts per Region and Race.
pheno.groupby(['Region', 'Race']).size()

In [None]:
# Row counts per Region, Race and Sex.
pheno.groupby(['Region', 'Race', 'Sex']).size()

## BrainSeq Tissue Comparison

In [None]:
# Caudate results: number of features per direction (Dir is the sign of t).
caudate = get_deg(config['caudate'])
caudate.groupby('Dir').size()

In [None]:
# Shape of the caudate subset with adj.P.Val < 0.05.
caudate[(caudate['adj.P.Val'] < 0.05)].shape

In [None]:
# DLPFC results: number of features per direction (Dir is the sign of t).
dlpfc = get_deg(config['dlpfc'])
dlpfc.groupby('Dir').size()

In [None]:
# Shape of the DLPFC subset with adj.P.Val < 0.05.
dlpfc[(dlpfc['adj.P.Val'] < 0.05)].shape

In [None]:
# Hippocampus results: number of features per direction (Dir is the sign of t).
hippo = get_deg(config['hippo'])
hippo.groupby('Dir').size()

In [None]:
# Shape of the hippocampus subset with adj.P.Val < 0.05.
hippo[(hippo['adj.P.Val'] < 0.05)].shape

### Enrichment of DEG

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls, caudate vs. DLPFC, over the merged table.
cal_fishers('caudate', 'dlpfc', merge_dataframes)

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls, caudate vs. hippocampus, over the merged table.
cal_fishers('caudate', 'hippo', merge_dataframes)

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls, DLPFC vs. hippocampus, over the merged table.
cal_fishers('dlpfc', 'hippo', merge_dataframes)

### Correlation

In [None]:
# Caudate (x) vs. DLPFC (y) t-statistics over the full merged table.
pp = plot_corr('caudate', 'dlpfc', merge_dataframes)
pp

In [None]:
# Caudate (x) vs. hippocampus (y) t-statistics over the full merged table.
qq = plot_corr('caudate', 'hippo', merge_dataframes)
qq

In [None]:
# DLPFC (x) vs. hippocampus (y) t-statistics over the full merged table.
ww = plot_corr('dlpfc', 'hippo', merge_dataframes)
ww

### Significant correlation, FDR < 0.05

In [None]:
# Caudate (x) vs. DLPFC (y) t-statistics, rows with adj.P.Val < 0.05 in both.
pp = plot_corr('caudate', 'dlpfc', merge_dataframes_sig)
pp

In [None]:
# Caudate (x) vs. hippocampus (y) t-statistics, rows with adj.P.Val < 0.05 in both.
qq = plot_corr('caudate', 'hippo', merge_dataframes_sig)
qq

In [None]:
# DLPFC (x) vs. hippocampus (y) t-statistics, rows with adj.P.Val < 0.05 in both.
ww = plot_corr('dlpfc', 'hippo', merge_dataframes_sig)
ww

### Directionality test

#### All genes

In [None]:
# Binomial test of direction agreement, caudate vs. DLPFC, full merged table.
enrichment_binom('caudate', 'dlpfc', merge_dataframes)

In [None]:
# Binomial test of direction agreement, caudate vs. hippocampus, full merged table.
enrichment_binom('caudate', 'hippo', merge_dataframes)

In [None]:
# Binomial test of direction agreement, DLPFC vs. hippocampus, full merged table.
enrichment_binom('dlpfc', 'hippo', merge_dataframes)

#### Significant DEG (FDR < 0.05)

In [None]:
# Binomial test of direction agreement, caudate vs. DLPFC, rows with adj.P.Val < 0.05 in both.
enrichment_binom('caudate', 'dlpfc', merge_dataframes_sig)

In [None]:
# Binomial test of direction agreement, caudate vs. hippocampus, rows with adj.P.Val < 0.05 in both.
enrichment_binom('caudate', 'hippo', merge_dataframes_sig)

In [None]:
# Binomial test of direction agreement, DLPFC vs. hippocampus, rows with adj.P.Val < 0.05 in both.
enrichment_binom('dlpfc', 'hippo', merge_dataframes_sig)

## CMC comparison

### SVA-adjusted

In [None]:
# CMC DLPFC results from the adjustedSVA file: number of features per direction.
cmc = get_cmc(SVA=True)
cmc.groupby('Dir').size()

In [None]:
# Shape of the SVA-adjusted CMC subset with adj.P.Val < 0.05.
cmc[(cmc['adj.P.Val'] < 0.05)].shape

### Not SVA-adjusted

In [None]:
# CMC DLPFC results from the adjustedNoSVA file (SVA=False): number of features per direction.
cmc_dlpfc2 = get_cmc(False)
cmc_dlpfc2.groupby('Dir').size()

In [None]:
# Shape of the non-SVA CMC subset with adj.P.Val < 0.05.
cmc_dlpfc2[(cmc_dlpfc2['adj.P.Val'] < 0.05)].shape

### Enrichment of DEG

#### SVA corrected

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls: caudate vs. CMC (SVA-adjusted file).
cal_fishers("caudate", "cmc", merge_cmc, sva=True)

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls: DLPFC vs. CMC (SVA-adjusted file).
cal_fishers("dlpfc", "cmc", merge_cmc, sva=True)

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls: hippocampus vs. CMC (SVA-adjusted file).
cal_fishers("hippo", "cmc", merge_cmc, sva=True)

#### No SVA correction

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls: caudate vs. CMC (non-SVA file).
cal_fishers("caudate", "cmc", merge_cmc, sva=False)

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls: DLPFC vs. CMC (non-SVA file).
cal_fishers("dlpfc", "cmc", merge_cmc, sva=False)

In [None]:
# Fisher's exact test on adj.P.Val < 0.05 calls: hippocampus vs. CMC (non-SVA file).
cal_fishers("hippo", "cmc", merge_cmc, sva=False)

### Correlation

#### SVA correction 

In [None]:
# Caudate (x) vs. CMC DLPFC (y) t-statistics: all merged rows, SVA-adjusted CMC file.
pp = plot_corr('caudate', 'cmc', merge_cmc, sig=False, sva=True)
pp

#### No SVA correction

In [None]:
# Caudate (x) vs. CMC DLPFC (y) t-statistics: all merged rows, non-SVA CMC file.
qq = plot_corr('caudate', 'cmc', merge_cmc, sig=False, sva=False)
qq

### Significant correlation, FDR < 0.05

#### SVA correction

In [None]:
# Caudate (x) vs. CMC DLPFC (y) t-statistics: rows with adj.P.Val < 0.05 on both sides, SVA-adjusted CMC file.
pp = plot_corr('caudate', 'cmc', merge_cmc, sig=True, sva=True)
pp

#### No SVA correction

In [None]:
# Caudate (x) vs. CMC DLPFC (y) t-statistics: rows with adj.P.Val < 0.05 on both sides, non-SVA CMC file.
qq = plot_corr('caudate', 'cmc', merge_cmc, sig=True, sva=False)
qq

### Directionality test

#### All genes

##### SVA correction

In [None]:
# Direction agreement, caudate vs. CMC: all merged rows, SVA-adjusted CMC file.
enrichment_binom('caudate', 'cmc', merge_cmc, sig=False, sva=True)

In [None]:
# Direction agreement, DLPFC vs. CMC: all merged rows, SVA-adjusted CMC file.
enrichment_binom('dlpfc', 'cmc', merge_cmc, sig=False, sva=True)

In [None]:
# Direction agreement, hippocampus vs. CMC: all merged rows, SVA-adjusted CMC file.
enrichment_binom('hippo', 'cmc', merge_cmc, sig=False, sva=True)

##### No SVA correction

In [None]:
# Direction agreement, caudate vs. CMC: all merged rows, non-SVA CMC file.
enrichment_binom('caudate', 'cmc', merge_cmc, sig=False, sva=False)

In [None]:
# Direction agreement, DLPFC vs. CMC: all merged rows, non-SVA CMC file.
enrichment_binom('dlpfc', 'cmc', merge_cmc, sig=False, sva=False)

In [None]:
# Direction agreement, hippocampus vs. CMC: all merged rows, non-SVA CMC file.
enrichment_binom('hippo', 'cmc', merge_cmc, sig=False, sva=False)

#### Significant DEG (FDR < 0.05)

##### SVA correction

In [None]:
# Direction agreement, caudate vs. CMC: rows with adj.P.Val < 0.05 on both sides, SVA-adjusted CMC file.
enrichment_binom('caudate', 'cmc', merge_cmc, sig=True, sva=True)

In [None]:
# Direction agreement, DLPFC vs. CMC: rows with adj.P.Val < 0.05 on both sides, SVA-adjusted CMC file.
enrichment_binom('dlpfc', 'cmc', merge_cmc, sig=True, sva=True)

In [None]:
# NOTE(review): the hippocampus call below is disabled in the source and the
# reason is not recorded — confirm why before re-enabling it.
#enrichment_binom('hippo', 'cmc', merge_cmc, True, True)

##### No SVA correction

In [None]:
# Direction agreement, caudate vs. CMC: rows with adj.P.Val < 0.05 on both sides, non-SVA CMC file.
enrichment_binom('caudate', 'cmc', merge_cmc, sig=True, sva=False)

In [None]:
# Direction agreement, DLPFC vs. CMC: rows with adj.P.Val < 0.05 on both sides, non-SVA CMC file.
enrichment_binom('dlpfc', 'cmc', merge_cmc, sig=True, sva=False)

In [None]:
# Direction agreement, hippocampus vs. CMC: rows with adj.P.Val < 0.05 on both sides, non-SVA CMC file.
enrichment_binom('hippo', 'cmc', merge_cmc, sig=True, sva=False)