# Enrichment and Overlap of PGC2+CLOZUK

In [1]:
import re
import os, errno
import functools
import subprocess
import numpy as np
import pandas as pd
from plotnine import *
from pandas_plink import read_plink
from warnings import filterwarnings
from matplotlib.cbook import mplDeprecation
from scipy.stats import fisher_exact, binom_test

filterwarnings("ignore",category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

## Config and Functions

In [2]:
# Input locations that do not depend on the feature type being analysed.
config = {
    # CSV read by get_biomart_df(); its 'ensembl_gene_id', 'external_gene_name'
    # and 'description' columns are used by get_gene_symbol()
    'biomart_file': '../_h/biomart.csv',
    # CSV read by get_pheno_df(); must contain a 'New_Dx' column
    'phenotype_file': '/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv',
    # prefix handed to pandas_plink.read_plink() in get_plink_tuple()
    'plink_file_prefix': '/ceph/projects/v4_phase3_paper/inputs/genotypes/_m/LIBD_Brain_TopMed',
    # tab-separated table read by get_gwas_snps()
    'gwas_snp_file': '/ceph/projects/v4_phase3_paper/inputs/sz_gwas/pgc2_clozuk/map_phase3/_m/libd_hg38_pgc2sz_snps.tsv'
}

In [3]:
@functools.lru_cache()
def get_de_df():
    '''
    Load the differential-expression table named by config_feature['de_file'].

    The file is tab separated and its first column becomes the index.
    The result is cached, so every caller receives the same object.
    '''
    return pd.read_csv(config_feature['de_file'], sep='\t', index_col=0)


@functools.lru_cache()
def get_eqtl_df(fdr=0.05):
    '''
    Load the eQTL results whose sixth column is below ``fdr``.

    The file named by config_feature['matrixeqtl_output_file'] is
    pre-filtered with awk so that only the header line and the rows with
    column 6 < fdr are parsed by pandas.

    Parameters:
        fdr: numeric threshold applied to column 6 of the file.

    Returns:
        DataFrame with the header of the input file and the retained rows.

    Raises:
        subprocess.CalledProcessError: if awk exits with a non-zero status.
    '''
    awk_cmd = [
        'awk',
        # repr() keeps the full precision of the threshold; '%f' rounds to six
        # decimals and would turn e.g. 1e-8 into 0.000000.
        '-v', 'fdr=%r' % float(fdr),
        '($6 < fdr) || (NR == 1) {print}',
        config_feature['matrixeqtl_output_file'],
    ]
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact.
    with subprocess.Popen(awk_cmd, stdout=subprocess.PIPE) as p:
        eqtl_df = pd.read_csv(p.stdout, sep='\t')
    # Leaving the with-block waits for awk, so returncode is set here.
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, awk_cmd)
    return eqtl_df


@functools.lru_cache()
def get_gwas_snps():
    '''
    Load the GWAS SNP table named by config['gwas_snp_file'] (cached).

    The file is tab separated; its first column becomes the index.
    '''
    gwas_path = config['gwas_snp_file']
    return pd.read_csv(gwas_path, sep='\t', index_col=0, low_memory=False)


@functools.lru_cache()
def get_integration_df(fdr=0.05):
    '''
    Join GWAS SNPs, eQTL results and differential expression (cached).

    GWAS rows are matched to eQTL rows on our_snp_id == SNP; columns that
    exist in both tables get the suffixes '_PGC2' and '_eqtl'.  The result
    is then matched to the DE table through the eQTL 'gene' column and the
    DE index.  ``fdr`` is forwarded to get_eqtl_df().
    '''
    gwas_with_eqtl = get_gwas_snps().merge(
        get_eqtl_df(fdr),
        left_on='our_snp_id',
        right_on='SNP',
        suffixes=['_PGC2', '_eqtl'],
    )
    return gwas_with_eqtl.merge(get_de_df(), left_on='gene', right_index=True)


@functools.lru_cache()
def get_residual_expression_df():
    '''
    Load config_feature['residual_expression_file'] and transpose it (cached).

    The file is tab separated with its first column used as the index, so
    after transposition the file's columns become the rows.
    '''
    as_stored = pd.read_csv(config_feature['residual_expression_file'],
                            sep='\t', index_col=0)
    return as_stored.T


@functools.lru_cache()
def get_pheno_df():
    '''
    Load the phenotype table named by config['phenotype_file'] (cached).

    'New_Dx' is converted to a categorical whose categories are ordered
    Control, Schizo_noAP, Schizo_AP.
    '''
    dx_order = ['Control', 'Schizo_noAP', 'Schizo_AP']
    phenotypes = pd.read_csv(config['phenotype_file'], index_col=0)
    new_dx = phenotypes.New_Dx.astype('category')
    phenotypes['New_Dx'] = new_dx.cat.reorder_categories(dx_order)
    return phenotypes

In [4]:
def agree_direction(row):
    '''
    Sign product telling whether eQTL, GWAS and DE directions agree.

    ``row`` must provide 'pgc2_a1_same_as_our_counted' (truthy/falsy),
    'OR', 't-stat' and 't'.  The result is the product of
      * +1 if the flag is truthy, otherwise -1,
      * sign(OR - 1),
      * sign(t-stat),
      * sign(t),
    i.e. 1, -1, or 0 when any of the three signs is zero.
    '''
    # An explicit conditional instead of [-1, 1][flag]: using a numpy boolean
    # as a list index is deprecated.
    allele_sign = 1 if row['pgc2_a1_same_as_our_counted'] else -1
    return allele_sign * np.sign(row['OR'] - 1) * np.sign(row['t-stat']) * np.sign(row['t'])


def letter_snp(number, a0, a1):
    '''
    Spell out a genotype from an allele count.

    ``number`` is the number of copies of a0; the remaining (2 - number)
    copies are a1.  The alleles are sorted and joined without a separator
    when both are single characters, otherwise with a space.
    A NaN count gives NaN.

    Example:
    letter_snp(0, 'A', 'G') is 'GG'
    letter_snp(1, 'A', 'G') is 'AG'
    letter_snp(2, 'A', 'G') is 'AA'
    letter_snp(1, 'AT', 'G') is 'AT G'
    '''
    if np.isnan(number):
        return np.nan
    both_single_base = len(a0) == 1 and len(a1) == 1
    joiner = '' if both_single_base else ' '
    copies_a0 = int(number)
    alleles = [a0] * copies_a0 + [a1] * (2 - copies_a0)
    alleles.sort()
    return joiner.join(alleles)


def get_gwas_snp(snp_id):
    '''
    Return the one-row slice of the GWAS table whose our_snp_id is snp_id.

    Fails an assertion unless exactly one row matches.
    '''
    table = get_gwas_snps()
    matches = table.loc[table['our_snp_id'] == snp_id]
    assert len(matches) == 1
    return matches

In [5]:
@functools.lru_cache()
def get_expression_and_pheno_df():
    '''
    Phenotypes joined to residualized expression on their indexes (cached).

    Phenotype columns come first, followed by the expression columns.
    '''
    phenotypes = get_pheno_df()
    expression = get_residual_expression_df()
    return phenotypes.merge(expression, left_index=True, right_index=True)


@functools.lru_cache()
def get_plink_tuple():
    '''
    Read the plink file set at config['plink_file_prefix'] once and cache it.

    Usage: (bim, fam, bed) = get_plink_tuple()
    '''
    prefix = config['plink_file_prefix']
    return read_plink(prefix)


@functools.lru_cache()
def get_snp_df(snp_id):
    '''
    Returns a dataframe containing the genotype on snp snp_id.
    The allele count is the same as in the plink files.

    Only brains present both in the expression/phenotype table ('BrNum')
    and in the plink fam table ('fid') are included, and samples whose
    genotype is missing are dropped.  The result is cached per snp_id, so
    callers that want to modify it should work on a copy.
    
    Example: 
    get_snp_df('rs653953').head(5)
    
            rs653953_num rs653953_letter rs653953
    Br5168             0              GG    0\nGG
    Br2582             1              AG    1\nAG
    Br2378             1              AG    1\nAG
    Br5155             2              AA    2\nAA
    Br5182             2              AA    2\nAA
    '''
    (bim, fam, bed) = get_plink_tuple()
    # brains that have both expression/phenotype data and a genotype record
    # (set intersection, so the order of brain_ids is not defined)
    brain_ids = list(set(get_expression_and_pheno_df()['BrNum']).intersection(set(fam['fid'])))
    snp_info = bim[bim['snp']==snp_id]
    # 'i' of the first matching bim row selects the SNP in bed;
    # iloc[0] raises IndexError when snp_id is not in the bim table
    snp_pos = snp_info.iloc[0]['i']
    # 'i' of each selected fam row selects the sample in bed
    # NOTE(review): assumes 'fid' is unique in fam, otherwise fam_pos is
    # longer than brain_ids -- TODO confirm
    fam_pos = list(fam.set_index('fid').loc[brain_ids]['i'])
    # one row of bed (the SNP) restricted to the selected samples, then
    # transposed so that each brain is a row; dropna removes missing calls
    dfsnp = (pd.DataFrame(bed[[snp_pos]].compute()[:,fam_pos], columns=brain_ids, index=[snp_id + '_num'])
             .transpose().dropna())
    my_letter_snp = functools.partial(letter_snp, a0=snp_info.iloc[0]['a0'], a1=snp_info.iloc[0]['a1'])
    # the 2 - in next line is to workaround a possible bug in pandas_plink? a1 and a0 inverted
    dfsnp[[snp_id + '_num']] = 2 - dfsnp[[snp_id + '_num']].astype('int')
    # genotype spelled with allele letters, from the (flipped) count
    dfsnp[snp_id + '_letter'] = dfsnp[snp_id + '_num'].apply(my_letter_snp)
    # "<count>\n<letters>" label stored as a categorical under the bare snp_id
    dfsnp[snp_id] = (dfsnp[snp_id + '_num'].astype('str') + '\n' + 
                     dfsnp[snp_id + '_letter'].astype('str')).astype('category')
    return dfsnp


@functools.lru_cache()
def get_gwas_ordered_snp_df(snp_id):
    '''
    Returns a dataframe containing the genotype on snp snp_id.
    The allele count is the number of risk alleles according to GWAS.

    Works on a copy of get_snp_df(snp_id).  The '<snp_id>_num' column is
    replaced by 2 - count when exactly one of the following holds for the
    GWAS row of the SNP:
      * 'pgc2_a1_same_as_our_counted' is truthy,
      * OR > 1 (A1 is then the risk allele, see get_risk_allele).
    The '<snp_id>_letter' column is left as is and the combined
    '<snp_id>' label is rebuilt from the two.

    Example: 
    get_gwas_ordered_snp_df('rs653953').head(5)

            rs653953_num rs653953_letter rs653953
    Br5168             2              GG    2\nGG
    Br2582             1              AG    1\nAG
    Br2378             1              AG    1\nAG
    Br5155             0              AA    0\nAA
    Br5182             0              AA    0\nAA
    '''
    # copy: get_snp_df() is cached and must not be modified in place
    dfsnp = get_snp_df(snp_id).copy()
    gwas_snp = get_gwas_snp(snp_id)
    num_col = snp_id + '_num'
    a1_is_counted = bool(gwas_snp['pgc2_a1_same_as_our_counted'].iloc[0])
    a1_is_risk = bool(gwas_snp['OR'].iloc[0] > 1)
    # The stored count already refers to the risk allele when both flags are
    # true or both are false; otherwise count the other allele instead.
    if a1_is_counted != a1_is_risk:
        dfsnp[[num_col]] = 2 - dfsnp[[num_col]]
    dfsnp[snp_id] = (dfsnp[num_col].astype('str') + '\n' +
                     dfsnp[snp_id + '_letter'].astype('str')).astype('category')
    return dfsnp


@functools.lru_cache()
def get_biomart_df():
    '''
    Load the biomart CSV named by config['biomart_file'] (cached).

    Everything from '[Source' to the end of each 'description' value is
    removed.
    '''
    biomart = pd.read_csv(config['biomart_file'])
    # raw string: '\[' in a normal literal is an invalid escape sequence
    biomart['description'] = biomart['description'].str.replace(r'\[Source.*$', '', regex=True)
    return biomart


@functools.lru_cache()
def get_risk_allele(snp_id):
    '''
    Return the GWAS risk allele of snp_id (cached).

    This is the 'A1' value of the SNP's GWAS row when its OR is above 1,
    and the 'A2' value otherwise.
    '''
    record = get_gwas_snp(snp_id)
    allele_column = 'A1' if record['OR'].iloc[0] > 1 else 'A2'
    return record[allele_column].iloc[0]

In [6]:
def get_gene_symbol(gene_id, biomart=None):
    '''
    Look up the symbol and description of a gene.

    Parameters:
        gene_id: gene identifier; everything from the first '.' on (the
            version suffix) is ignored for the lookup.
        biomart: table with 'ensembl_gene_id', 'external_gene_name' and
            'description' columns.  Defaults to get_biomart_df(), which is
            fetched when the function is called rather than when it is
            defined.

    Returns:
        (symbol, description) of the first matching row, or ('', '') when
        no row matches.  A description that is not a string (e.g. NaN)
        becomes '', and any trailing '[Source:...' part is removed.
    '''
    if biomart is None:
        biomart = get_biomart_df()
    ensembl_id = re.sub(r'\..+$', '', gene_id)
    hits = biomart[biomart['ensembl_gene_id'] == ensembl_id]
    if hits.shape[0] == 0:
        return '', ''
    symbol = hits['external_gene_name'].iloc[0]
    description = hits['description'].iloc[0]
    if not isinstance(description, str):
        description = ''
    description = re.sub(r'\[Source:.*$', '', description)
    return symbol, description


def save_plot(p, fn):
    '''
    Save plot p as fn.png, fn.pdf and fn.svg (in that order) via p.save().
    '''
    targets = [fn + '.' + suffix for suffix in ('png', 'pdf', 'svg')]
    for target in targets:
        p.save(target)
        

def get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func):
    '''
    Phenotypes, expression of one gene and genotype of one SNP per sample.

    Takes the phenotype columns plus the gene_id column from the combined
    expression/phenotype table and joins the frame returned by
    snp_df_func(snp_id) on 'BrNum' (left) against its index (right).
    '''
    wanted_columns = [*get_pheno_df().columns, gene_id]
    expression = get_expression_and_pheno_df()[wanted_columns]
    genotypes = snp_df_func(snp_id)
    return expression.merge(genotypes, left_on='BrNum', right_index=True)


def simple_snp_expression_pheno_plot_impl(snp_id, gene_id, snp_df_func, pheno_var):
    '''
    Box plot (with jittered points) of gene expression by genotype.

    x is the snp_id column of the frame built by get_snp_gene_pheno_df(),
    y is the gene_id column and the fill is pheno_var.  'Dx' is relabelled
    Control -> CTL and Schizo -> SZ.  The y axis is limited to the 1st-99th
    percentile of the expression values, widened by 0.26 on each side.

    NOTE(review): the plotting cell at the end of this notebook records
    "TypeError: __init__() got an unexpected keyword argument 'fill'";
    the components assembled here are the first place to look -- TODO
    confirm against the installed plotnine / matplotlib versions.
    '''
    plot_df = get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func)
    short_dx = {'Control': 'CTL', 'Schizo': 'SZ'}
    plot_df['Dx'] = plot_df.Dx.astype('category').cat.rename_categories(short_dx)

    expression = plot_df[gene_id]
    lower = expression.quantile(.01) - 0.26
    upper = expression.quantile(.99) + 0.26

    jitter_position = position_jitterdodge(jitter_width=0.27)
    plot = ggplot(plot_df, aes(x=snp_id, y=gene_id, fill=pheno_var))
    plot = plot + geom_boxplot(alpha=0.4, outlier_alpha=0)
    plot = plot + geom_jitter(position=jitter_position, stroke=0, alpha=0.6)
    plot = plot + ylim(lower, upper)
    plot = plot + labs(y='Residualized expression', fill='Diagnosis')
    plot = plot + theme_matplotlib(base_size=20)
    plot = plot + theme(legend_title=element_text(face='bold'))
    return plot


def simple_gwas_ordered_snp_expression_pheno_plot(snp_id, gene_id, pheno_var):
    '''
    Expression-by-genotype plot with genotypes from get_gwas_ordered_snp_df.
    '''
    return simple_snp_expression_pheno_plot_impl(
        snp_id,
        gene_id,
        snp_df_func=get_gwas_ordered_snp_df,
        pheno_var=pheno_var,
    )


def gwas_annotation(snp_id):
    '''
    Title line with the 'P' value of the SNP's GWAS row ('%.1e' format).
    '''
    p_value = get_gwas_snp(snp_id).iloc[0]['P']
    return 'SZ GWAS pvalue: %.1e' % p_value


def eqtl_annotation(snp_id, gene_id):
    '''
    Title line with the 'FDR' of the eQTL between snp_id and gene_id.

    Uses get_eqtl_df() with its default threshold and fails an assertion
    unless exactly one row matches the pair.
    '''
    eqtls = get_eqtl_df()
    is_pair = (eqtls['SNP'] == snp_id) & (eqtls['gene'] == gene_id)
    hit = eqtls[is_pair]
    assert len(hit) == 1
    fdr_value = hit.iloc[0]['FDR']
    return 'eQTL FDR: %.1e' % fdr_value


def de_annotation(gene_id):
    '''
    Title line with the 'adj.P.Val' of the first DE row whose gencodeID
    equals gene_id ('%.3f' format).
    '''
    de_table = get_de_df()
    gene_rows = de_table[de_table['gencodeID'] == gene_id]
    adjusted_p = gene_rows.iloc[0]['adj.P.Val']
    return 'DE adj.P.Val: %.3f' % adjusted_p


def risk_allele_annotation(snp_id):
    '''
    Title line naming the risk allele returned by get_risk_allele(snp_id).
    '''
    risk_allele = get_risk_allele(snp_id)
    return 'SZ risk allele: %s' % risk_allele


def gwas_annotated_eqtl_pheno_plot(snp_id, gene_id, pheno_var):
    '''
    GWAS-ordered expression plot with an annotated multi-line title.

    The title lists, one per line: gene symbol, gene id, GWAS p-value,
    risk allele, eQTL FDR and DE adjusted p-value.
    '''
    p = simple_gwas_ordered_snp_expression_pheno_plot(snp_id, gene_id, pheno_var)
    # only the symbol is shown; the description is not used
    gene_symbol = get_gene_symbol(gene_id)[0]
    title_lines = [
        gene_symbol,
        gene_id,
        gwas_annotation(snp_id),
        risk_allele_annotation(snp_id),
        eqtl_annotation(snp_id, gene_id),
        de_annotation(gene_id),
    ]
    p += ggtitle("\n".join(title_lines))
    return p

## Genes

In [10]:
# Inputs specific to the feature type analysed below (genes).
config_feature = {
    # tab-separated table read by get_de_df()
    'de_file': '../../differential_expression/_m/genes/diffExpr_szVctl_full.txt',
    # tab-separated matrix read (and transposed) by get_residual_expression_df()
    'residual_expression_file': '../../differential_expression/_m/genes/residualized_expression.tsv',
    # tab-separated eQTL results filtered on column 6 by get_eqtl_df()
    'matrixeqtl_output_file': '../../../eQTL_analysis/caudate_eqtl/_m/cis_eqtls_genes.ctxt',
}

In [11]:
# Name of the feature type; also the directory that receives the outputs.
feature = 'genes'
# exist_ok=True makes re-running the cell a no-op when the directory exists.
os.makedirs(feature, exist_ok=True)

### Enrichment

#### Integrate DEG with PGC2+CLOZUK SNPs

In [12]:
# get_integration_df() is cached and returns the same object on every call;
# work on a copy because the following cells add and overwrite columns.
dft = get_integration_df().copy()
dft.shape

  mask |= (ar1 == a)


(1206995, 44)

In [13]:
# Row-wise sign product from agree_direction(): 1, -1, or 0 when one of the
# signs is zero.  Evaluated once per row of dft.
dft['agree_direction'] = dft.apply(agree_direction, axis=1)

In [14]:
# Replace the numeric agreement code by a label (0 is kept as 0).
agreement = {1: 'Yes', -1: 'No', 0: 0}
dft['agree_direction'] = [agreement[code] for code in dft['agree_direction']]

In [15]:
# 2x2 contingency table: GWAS significance (rows) by DE significance (columns).
gwas_hit = dft['P'] < 5e-8
gwas_miss = dft['P'] >= 5e-8
de_hit = dft['adj.P.Val'] < .05
de_miss = dft['adj.P.Val'] >= .05
table = [
    [np.sum(gwas_hit & de_hit), np.sum(gwas_hit & de_miss)],
    [np.sum(gwas_miss & de_hit), np.sum(gwas_miss & de_miss)],
]
print(table)
fisher_exact(table)

[[2373, 23710], [132748, 1048164]]


(0.790255336066623, 5.953721058780092e-29)

In [16]:
# Rows significant in both GWAS and DE, counted per agreement label.
significant_in_both = (dft['P'] < 5e-8) & (dft['adj.P.Val'] < .05)
dft1 = dft[significant_in_both]
df = dft1.groupby('agree_direction').size().reset_index()
df

Unnamed: 0,agree_direction,0
0,No,449
1,Yes,1924


In [17]:
# Binomial test of the second count in df against the total count.
# NOTE(review): relies on the second row of df being the 'Yes' group.
n_second_row = df[0].iloc[1]
n_total = df[0].sum()
binom_test(n_second_row, n_total)

2.017014918564934e-216

In [18]:
# GWAS-significant rows with a non-zero 'CtrlvsSZ' value.
# The threshold is strict (P < 5e-8) to match the cut-off used for the
# contingency table and the agreement counts above.
# .copy() makes dft2 an independent frame, so adding a column below does not
# write to a slice of dft (pandas SettingWithCopyWarning).
dft2 = dft[(dft['P'] < 5e-8) & (dft['CtrlvsSZ'] != 0)].copy()
dft2['risk_allele'] = dft2['our_snp_id'].apply(get_risk_allele)

In [19]:
# Labels for a sign, and +1/-1 encoding of the allele-match flag.
# NOTE(review): direction has no entry for 0 (or NaN), so a zero t-stat, a
# zero t, or OR == 1 raises KeyError in the list comprehensions below.
direction = {-1: 'Down', 1: 'Up'}
boolean_conv = {True: 1, False: -1}
# the flag column is overwritten in place with +1/-1 and used as a sign below
dft2.pgc2_a1_same_as_our_counted = [boolean_conv[item] for item in dft2['pgc2_a1_same_as_our_counted']]
# eQTL direction: sign of the eQTL t-stat, flipped by the allele-match sign and by sign(OR - 1)
dft2['eqtl_gwas_dir'] = [direction[item] for item in np.sign(dft2['pgc2_a1_same_as_our_counted']) * np.sign(dft2['t-stat']) * np.sign(dft2['OR'] - 1)]
# DE direction: sign of the DE t value
dft2['de_dir'] = [direction[item] for item in np.sign(dft2['t'])]
# eQTL t-stat with the same sign flips applied
dft2['eqtl_t'] = np.sign(dft2['pgc2_a1_same_as_our_counted']) * np.sign(dft2['OR'] - 1) * dft2['t-stat']

# keep the reporting columns and write every SNP-gene row to <feature>/integration_by_symbol.txt
dft2 = dft2[['gene', 'Symbol', 'SNP_eqtl', 'A1', 'A2', 'risk_allele', 'OR', 
             'P', 'FDR', 'adj.P.Val', 'logFC', 't', 'eqtl_t', 
             'de_dir', 'eqtl_gwas_dir', 'agree_direction']]
dft2.to_csv('%s/integration_by_symbol.txt' % feature, sep='\t', index=False)
# one row per gene, ordered by GWAS P
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# within a gene, in current row order -- not the row with the smallest P.
# Confirm this is the intended representative SNP per gene.
df2 = dft2.groupby(['gene']).first().reset_index().sort_values('P')

In [20]:
# Number of genes per agreement label.
df2.groupby('agree_direction').size()

agree_direction
No     14
Yes    18
dtype: int64

In [21]:
# Per-gene table indexed by symbol, with column names that say which
# analysis each statistic comes from.
display_names = {
    't': 'de_t',
    'P': 'GWAS_P',
    'FDR': 'eQTL_FDR',
    'adj.P.Val': 'de_adj.P.Val',
}
df2.set_index('Symbol').rename(columns=display_names)

Unnamed: 0_level_0,gene,SNP_eqtl,A1,A2,risk_allele,OR,GWAS_P,eQTL_FDR,de_adj.P.Val,logFC,de_t,eqtl_t,de_dir,eqtl_gwas_dir,agree_direction
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ZNF391,ENSG00000124613.8,rs35715914:27592003:C:T,C,T,C,1.2619,2.8e-40,0.0009039846,0.04140334,0.074953,2.862466,-4.332547,Up,Down,No
ZSCAN9,ENSG00000137185.11,rs66868086:27865902:A:T,A,T,A,1.2619,1.4600000000000002e-39,0.006888964,0.01365665,-0.082397,-3.337802,-3.764447,Down,Down,Yes
HCG4,ENSG00000176998.4,rs3117425,C,T,C,1.2614,6.89e-39,0.001794875,0.01468069,0.224649,3.307039,4.150195,Up,Up,Yes
BRD2,ENSG00000204256.12,rs1794282,C,T,C,1.2138,1.58e-28,0.003908755,0.02260331,-0.050859,-3.130279,-3.932347,Down,Down,Yes
C4A,ENSG00000244731.7,rs3131642:31450637:G:A,G,A,G,1.1846,2.08e-28,5.615194e-09,0.002714423,0.392615,3.928587,6.786279,Up,Up,Yes
ZNF204P,ENSG00000204789.4,rs9357045:27688927:G:A,G,A,G,1.1286,1.63e-21,0.005685491,0.01394713,0.097111,3.326268,-3.822067,Up,Down,No
ELFN1,ENSG00000225968.7,rs871924:2047845:A:G,A,G,A,1.0827,3.61e-16,0.005791652,0.0483215,-0.119752,-2.786812,-3.816539,Down,Down,Yes
BAG6,ENSG00000204463.12,rs3868082,A,G,G,0.92414,2.23e-14,0.006738295,0.01906708,-0.041937,-3.20179,3.77114,Down,Up,No
CKB,ENSG00000166165.12,rs4900592:104177098:G:T,G,T,T,0.92884,3.28e-13,0.002496139,0.00582281,-0.082683,-3.660539,-4.059289,Down,Down,Yes
NGEF,ENSG00000066248.14,rs778353:233788350:C:T,C,T,T,0.93531,2.68e-12,3.678208e-05,0.01511964,0.10901,3.294629,-5.096546,Up,Down,No


### Plot with PGC2 risk allele

In [35]:
# One annotated plot per gene in df2, printed and saved as
# <feature>/eqtl_gwas_<Symbol>.{png,pdf,svg}.
for snp_name, gene_name, symbol in zip(df2['SNP_eqtl'], df2['gene'], df2['Symbol']):
    gg = gwas_annotated_eqtl_pheno_plot(snp_name, gene_name, 'Dx')
    print(gg)
    label = '%s/eqtl_gwas_%s' % (feature, symbol)
    save_plot(gg, label)

TypeError: __init__() got an unexpected keyword argument 'fill'