# Comparison with other datasets

In [None]:
import functools
import numpy as np
import pandas as pd

## BrainSeq functions

In [None]:
# Per-region input tables, keyed by the short region name used throughout.
# All paths are relative to the notebook's working directory.

# `male_specific_DE_4features.txt` from each region's metrics summary.
config = dict(
    caudate='../../../caudate/male_analysis/metrics_summary/_m/male_specific_DE_4features.txt',
    dlpfc='../../../dlpfc/male_analysis/metrics_summary/_m/male_specific_DE_4features.txt',
    hippo='../../../hippocampus/male_analysis/metrics_summary/_m/male_specific_DE_4features.txt',
)

# `diffExpr_szVctl_full.txt` from each region's gene-level output
# (presumably the full SZ-vs-control gene table -- confirm).
config2 = dict(
    caudate='../../../caudate/male_analysis/_m/genes/diffExpr_szVctl_full.txt',
    dlpfc='../../../dlpfc/male_analysis/_m/genes/diffExpr_szVctl_full.txt',
    hippo='../../../hippocampus/male_analysis/_m/genes/diffExpr_szVctl_full.txt',
)

In [None]:
@functools.lru_cache()
def get_deg(filename):
    """Load a differential-expression table and return its core columns.

    The file is read as tab-separated text with the first column as the
    index. When a ``Type`` column exists, only rows with ``Type == 'gene'``
    are kept. Three columns are then derived:

    * ``Feature``   -- copy of the index.
    * ``Dir``       -- sign of the ``t`` statistic.
    * ``ensemblID`` -- ``gene_id`` with everything from the first ``.``
      onwards removed (only when a ``gene_id`` column exists).

    Returns a DataFrame with the columns ``Feature``, ``ensemblID``,
    ``Symbol``, ``adj.P.Val``, ``logFC``, ``t`` and ``Dir``. A ``KeyError``
    is raised if any of them is unavailable (for instance when the file has
    neither ``gene_id`` nor ``ensemblID``).

    Results are memoised per ``filename``, so callers receive the same
    DataFrame object on repeated calls and should not modify it in place.
    """
    table = pd.read_csv(filename, sep='\t', index_col=0)
    if 'Type' in table.columns:
        is_gene = table['Type'] == 'gene'
        # Copy so the column assignments below act on an independent frame.
        table = table.loc[is_gene].copy()
    table['Feature'] = table.index
    table['Dir'] = np.sign(table['t'])
    if 'gene_id' in table.columns:
        # Drop the version suffix: everything from the first '.' onwards.
        table['ensemblID'] = table['gene_id'].str.replace('\\..*', '', regex=True)
    wanted = ['Feature', 'ensemblID', 'Symbol', 'adj.P.Val', 'logFC', 't', 'Dir']
    return table[wanted]


@functools.lru_cache()
def get_deg_sig(filename, fdr):
    """Return the rows of ``get_deg(filename)`` with ``adj.P.Val`` below ``fdr``.

    The comparison is strict (``<``). Results are memoised per
    ``(filename, fdr)`` pair.
    """
    table = get_deg(filename)
    passes_fdr = table['adj.P.Val'] < fdr
    return table[passes_fdr]


In [None]:
def tissue_annotation(tissue):
    """Map a short tissue key to its display label.

    Raises ``KeyError`` for a key that is not in the table.
    """
    labels = {
        'dlpfc': 'DLPFC',
        'hippo': 'Hippocampus',
        'caudate': 'Caudate',
        'cmc_dlpfc': 'CMC DLPFC',
    }
    return labels[tissue]

## Qin comparison

In [None]:
# Load the Qin results table (presumably probeset-level, per the file name).
# NOTE(review): absolute, user-specific path -- this cell only runs where that
# location is available.
qin_file = '/ceph/users/jbenja13/projects/sex_sz_ria/input/public_results/_m/qin/qin_results_probesets.csv'
qin = pd.read_csv(qin_file)
# NOTE(review): `tissue` is not read before the `for tissue in ...` loop in the
# next cell rebinds it.
tissue = 'caudate'
# Preview the first two rows.
qin.head(2)

In [None]:
# Overlap between our significant genes (tables in `config`) and the Qin genes.
#
# The Qin symbol column carries stray spaces (e.g. 'USE1 '), so the symbols
# are normalised once, outside the loop, and the same normalised set is used
# for both the intersection and the denominator.
qin_symbols = set(qin['Gene symbol '].str.replace(' ', ''))
tot = len(qin_symbols)
for tissue in ['caudate', 'dlpfc', 'hippo']:
    # DLPFC uses a stricter cutoff (0.01) than the other regions (0.05).
    fdr = 0.05 if tissue != 'dlpfc' else 0.01
    overlap = len(set(get_deg_sig(config[tissue], fdr).Symbol) & qin_symbols)
    xx = overlap / tot  # fraction of Qin symbols that we also report
    print("There is %d (%.1f%%) overlap between %s and PFC!" %
          (overlap, xx * 100, tissue_annotation(tissue)))

In [None]:
# Symbols significant in caudate (FDR < 0.05) that also appear in the Qin
# table; spaces are removed from the Qin symbols before matching.
shared = set(get_deg_sig(config['caudate'], 0.05)['Symbol']).intersection(
    qin['Gene symbol '].str.replace(' ', '')
)
shared

In [None]:
# Symbols significant in hippocampus (FDR < 0.05) that also appear in the Qin
# table; spaces are removed from the Qin symbols before matching.
shared = set(get_deg_sig(config['hippo'], 0.05)['Symbol']).intersection(
    qin['Gene symbol '].str.replace(' ', '')
)
shared

In [None]:
# Qin rows for the listed genes. The raw symbols keep their trailing space,
# so the lookup values must include it.
qin.loc[qin['Gene symbol '].isin(['USE1 ', 'BBX '])]

In [None]:
# Rows of the caudate FDR < 0.05 table for the listed symbols.
get_deg_sig(config['caudate'], 0.05).loc[
    lambda df: df['Symbol'].isin(["USE1", "BBX"])
]

In [None]:
# Rows of the hippocampus FDR < 0.05 table for USE1.
get_deg_sig(config['hippo'], 0.05).loc[lambda df: df['Symbol'] == 'USE1']

In [None]:
# Overlap between our significant genes (tables in `config2`) and the Qin genes.
#
# The Qin symbol column carries stray spaces (e.g. 'USE1 '), so the symbols
# are normalised once, outside the loop, and the same normalised set is used
# for both the intersection and the denominator.
qin_symbols = set(qin['Gene symbol '].str.replace(' ', ''))
tot = len(qin_symbols)
# Every region uses the same cutoff here; the original per-tissue conditional
# returned 0.05 on both branches.
fdr = 0.05
for tissue in ['caudate', 'dlpfc', 'hippo']:
    overlap = len(set(get_deg_sig(config2[tissue], fdr).Symbol) & qin_symbols)
    xx = overlap / tot  # fraction of Qin symbols that we also report
    print("There is %d (%.1f%%) overlap between %s and PFC!" %
          (overlap, xx * 100, tissue_annotation(tissue)))

In [None]:
# Symbols significant in the caudate `config2` table (FDR < 0.05) that also
# appear in the Qin table; spaces are removed from the Qin symbols first.
shared = set(get_deg_sig(config2['caudate'], 0.05)['Symbol']).intersection(
    qin['Gene symbol '].str.replace(' ', '')
)
shared

In [None]:
# Qin rows for the listed genes. The raw symbols keep their trailing space,
# so the lookup values must include it.
qin.loc[
    qin['Gene symbol '].isin(['ABCG2 ', 'GABARAPL1 ', 'PARD3 ', 'USE1 ', 'BBX '])
]

In [None]:
# Rows of the caudate `config2` FDR < 0.05 table for the listed symbols.
get_deg_sig(config2['caudate'], 0.05).loc[
    lambda df: df['Symbol'].isin(['ABCG2', 'GABARAPL1', 'PARD3', 'USE1', "BBX"])
]

In [None]:
# Symbols significant in the DLPFC `config2` table (FDR < 0.05) that also
# appear in the Qin table; spaces are removed from the Qin symbols first.
set(get_deg_sig(config2['dlpfc'], 0.05)['Symbol']).intersection(
    qin['Gene symbol '].str.replace(' ', '')
)

In [None]:
# Qin rows for ABCG2 (the raw symbol keeps its trailing space).
qin.loc[qin['Gene symbol '].isin(['ABCG2 '])]

In [None]:
# Rows of the DLPFC `config2` FDR < 0.05 table for ABCG2.
get_deg_sig(config2['dlpfc'], 0.05).loc[
    lambda df: df['Symbol'].isin(['ABCG2'])
]

In [None]:
# Symbols significant in the hippocampus `config2` table (FDR < 0.05) that
# also appear in the Qin table; spaces are removed from the Qin symbols first.
set(get_deg_sig(config2['hippo'], 0.05)['Symbol']).intersection(
    qin['Gene symbol '].str.replace(' ', '')
)

In [None]:
# Qin rows for the listed genes (the raw symbols keep their trailing space).
qin.loc[qin['Gene symbol '].isin(['ABCG2 ', 'USE1 '])]

In [None]:
# Rows of the hippocampus `config2` FDR < 0.05 table for the listed symbols.
get_deg_sig(config2['hippo'], 0.05).loc[
    lambda df: df['Symbol'].isin(['ABCG2', 'USE1'])
]

#### GABARAPL1: the direction of effect does not agree between the Qin results and this analysis