# Enrichment and Overlap of PGC2+CLOZUK

In [1]:
import re
import os, errno
import functools
import numpy as np
import pandas as pd
from plotnine import *
from pandas_plink import read_plink
from warnings import filterwarnings
from matplotlib.cbook import mplDeprecation
from scipy.stats import fisher_exact, binom_test

# Keep notebook output readable: silence matplotlib deprecation notices and
# the UserWarning / DeprecationWarning messages raised from plotnine modules.
filterwarnings("ignore",category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

## Config and Functions

In [2]:
# Inputs shared by every feature type: gene annotation, phenotypes,
# genotypes (plink prefix) and the GWAS SNP table.
config = dict(
    biomart_file='../_h/biomart.csv',
    phenotype_file='/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv',
    plink_file_prefix='/ceph/projects/v4_phase3_paper/inputs/genotypes/_m/LIBD_Brain_TopMed',
    gwas_snp_file='/ceph/projects/v4_phase3_paper/inputs/sz_gwas/pgc2_clozuk/map_phase3/_m/libd_hg38_pgc2sz_snps.tsv',
)

# Inputs that belong to the feature type analysed below.
config_feature = dict(
    de_file='../../differential_expression/_m/genes/diffExpr_szVctl_full.txt',
    residual_expression_file='../../differential_expression/_m/genes/residualized_expression.tsv',
    fastqtl_output_file='../../eqtl/caudate/summary_table/_m/Brainseq_LIBD_caudate_4features.signifpairs.txt.gz',
)

# Feature key: selects the eQTL rows (via feature_map) and names the
# output directory created further down.
feature = "genes"

In [3]:
@functools.lru_cache()
def feature_map(feature):
    """
    Return the capitalised singular label for a feature key
    (e.g. "genes" -> "Gene").

    Raises KeyError for a key other than genes, transcripts, exons
    or junctions.
    """
    labels = {
        "genes": "Gene",
        "transcripts": "Transcript",
        "exons": "Exon",
        "junctions": "Junction",
    }
    return labels[feature]


@functools.lru_cache()
def get_de_df():
    """
    Read the differential-expression results table.

    The file is tab separated and its first column becomes the index.
    The result is memoised, so every caller shares one DataFrame.
    """
    de_path = config_feature['de_file']
    return pd.read_csv(de_path, sep='\t', index_col=0)


@functools.lru_cache()
def get_eqtl_df():
    """
    Read the significant eQTL pairs and keep the rows of the current feature.

    Rows are kept when their ``Type`` column equals ``feature_map(feature)``,
    where ``feature`` is the module-level setting. The result is memoised.
    """
    all_pairs = pd.read_csv(config_feature['fastqtl_output_file'], sep='\t')
    wanted_type = feature_map(feature)
    return all_pairs[all_pairs["Type"] == wanted_type]


@functools.lru_cache()
def get_gwas_snps():
    """
    Read the GWAS SNP table (tab separated, first column as index).

    The result is memoised, so every caller shares one DataFrame.
    """
    gwas_path = config['gwas_snp_file']
    return pd.read_csv(gwas_path, sep='\t', index_col=0, low_memory=False)


@functools.lru_cache()
def get_integration_df():
    """
    Join GWAS SNPs, eQTL pairs and DE results into one table.

    GWAS rows are matched to eQTL rows on ``our_snp_id == variant_id``;
    columns present in both get the ``_PGC2`` / ``_eQTL`` suffixes. The DE
    statistics are then attached by matching the eQTL ``gene_id`` against
    the index of the DE table. Both joins use pandas' default join type.

    The result is memoised: callers that modify the returned frame modify
    the cached object.
    """
    gwas_eqtl = get_gwas_snps().merge(
        get_eqtl_df(),
        left_on='our_snp_id',
        right_on='variant_id',
        suffixes=['_PGC2', '_eQTL'],
    )
    return gwas_eqtl.merge(get_de_df(), left_on='gene_id', right_index=True)


@functools.lru_cache()
def get_residual_expression_df():
    """
    Read the residualized expression table and return it transposed.

    The file is tab separated with its first column as index; after the
    transpose the file's columns are the rows of the returned frame.
    The result is memoised.
    """
    as_stored = pd.read_csv(
        config_feature['residual_expression_file'],
        sep='\t',
        index_col=0,
    )
    return as_stored.transpose()


@functools.lru_cache()
def get_pheno_df():
    """
    Read the phenotype table (comma separated, first column as index).

    The result is memoised, so every caller shares one DataFrame.
    """
    pheno_path = config['phenotype_file']
    return pd.read_csv(pheno_path, index_col=0)

In [4]:
def agree_direction(row):
    """
    Return the product of four signs for one row of the integration table.

    The factors are:
      * +1 when ``pgc2_a1_same_as_our_counted`` is truthy, otherwise -1
      * sign of ``OR - 1``
      * sign of the eQTL ``slope``
      * sign of the DE ``t`` statistic

    The result is +1, -1, or 0 when any numeric factor is zero; later
    cells label these values 'Yes', 'No' and 0.

    ``row`` is anything indexable by those four keys (a DataFrame row
    or a dict).
    """
    # An explicit conditional instead of indexing [-1, 1] with the flag:
    # using a NumPy bool as a sequence index is deprecated.
    orientation = 1 if row['pgc2_a1_same_as_our_counted'] else -1
    return (orientation
            * np.sign(row['OR'] - 1)
            * np.sign(row['slope'])
            * np.sign(row['t']))


def letter_snp(number, a0, a1):
    '''
    Spell out a genotype as its two alleles.

    ``number`` is taken as the count of ``a0`` copies; the remaining
    ``2 - number`` copies are ``a1``. The two alleles are sorted and joined
    with no separator when both alleles are a single character, otherwise
    with a space. Returns ``np.nan`` when ``number`` is NaN.

    Example:
    letter_snp(0, 'A', 'G') is 'GG'
    letter_snp(1, 'A', 'G') is 'AG'
    letter_snp(2, 'A', 'G') is 'AA'
    letter_snp(1, 'AT', 'G') is 'AT G'
    '''
    if np.isnan(number):
        return np.nan
    n_a0 = int(number)
    alleles = sorted([a0] * n_a0 + [a1] * (2 - n_a0))
    sep = '' if len(a0) == 1 and len(a1) == 1 else ' '
    return sep.join(alleles)


def get_gwas_snp(snp_id):
    """
    Return the GWAS row for ``snp_id`` as a one-row DataFrame.

    The row is looked up by the ``our_snp_id`` column. An AssertionError is
    raised unless exactly one row matches.
    """
    table = get_gwas_snps()
    hits = table.loc[table['our_snp_id'] == snp_id]
    assert len(hits) == 1
    return hits

In [5]:
@functools.lru_cache()
def get_expression_and_pheno_df():
    """
    Join phenotypes with residualized expression on their indexes.

    Uses pandas' default join type, with the phenotype columns on the left
    and the expression columns on the right. The result is memoised.
    """
    phenotypes = get_pheno_df()
    expression = get_residual_expression_df()
    return phenotypes.merge(expression, left_index=True, right_index=True)


@functools.lru_cache()
def get_plink_tuple():
    '''
    Load the plink file set named by config['plink_file_prefix'].

    Usage: (bim, fam, bed) = get_plink_tuple()

    The result is memoised, so the files are read once.
    '''
    prefix = config['plink_file_prefix']
    return read_plink(prefix)


@functools.lru_cache()
def subset_bed():
    """
    Subset the genotype matrix to the eQTL variants and to the individuals
    that also have expression/phenotype data.

    This is to speed up accessing the bed file.

    Returns a tuple ``(new_bed, new_bim, brain_ids)``:
      * new_bed:   computed genotype array, rows = kept variants,
                   columns = kept individuals
      * new_bim:   bim rows of the kept variants with a fresh 0..n-1 index
                   and an ``ii`` column holding that index
      * brain_ids: fam ``fid`` of each column of ``new_bed``, in column order

    The result is memoised.
    """
    (bim, fam, bed) = get_plink_tuple()
    shared_ids = set(get_expression_and_pheno_df()['BrNum']).intersection(set(fam['fid']))
    # One fam row per shared individual, in fam-file order. Column positions
    # and ids are both read from this frame so they stay aligned; taking
    # the ids from the set would give an arbitrary order that does not match
    # the columns selected by fam_pos.
    fam_subset = fam[fam["fid"].isin(list(shared_ids))].drop_duplicates(subset="fid")
    fam_pos = list(fam_subset.loc[:, 'i'])
    brain_ids = list(fam_subset.loc[:, 'fid'])
    unique_snps = get_eqtl_df().variant_id.unique()
    snp_info = bim[bim["snp"].isin(unique_snps)]
    snp_pos = list(snp_info.loc[:, "i"])
    new_bed = bed[snp_pos].compute()[:, fam_pos]
    # NOTE(review): new_bim row k is taken to describe new_bed row k, which
    # assumes bim rows are ordered by their 'i' column -- confirm.
    new_bim = bim[bim["i"].isin(snp_pos)].reset_index(drop=True)
    new_bim['ii'] = new_bim.index
    return new_bed, new_bim, brain_ids


@functools.lru_cache()
def get_snp_df(snp_id):
    '''
    Build the per-individual genotype table for one SNP.

    The returned frame is indexed by the ids supplied by subset_bed() and
    has three columns: <snp_id>_num (2 minus the stored bed value),
    <snp_id>_letter (the genotype spelled with letter_snp) and <snp_id>
    (both joined by a newline, as a categorical, used as plot label).
    Individuals with a missing genotype are dropped.

    Example: 
    get_snp_df('rs653953').head(5)
    
            rs653953_num rs653953_letter rs653953
    Br5168             0              GG    0\nGG
    Br2582             1              AG    1\nAG
    Br2378             1              AG    1\nAG
    Br5155             2              AA    2\nAA
    Br5182             2              AA    2\nAA
    '''
    genotypes, variants, sample_ids = subset_bed()
    variant = variants[variants['snp'] == snp_id].iloc[0]
    num_col = snp_id + '_num'
    letter_col = snp_id + '_letter'

    row_idx = variant['ii']
    dfsnp = pd.DataFrame(genotypes[[row_idx]], columns=sample_ids, index=[num_col])
    dfsnp = dfsnp.transpose().dropna()

    # the 2 - in next line is to workaround a possible bug in pandas_plink? a1 and a0 inverted
    dfsnp[[num_col]] = 2 - dfsnp[[num_col]].astype('int')

    to_letters = functools.partial(letter_snp, a0=variant['a0'], a1=variant['a1'])
    dfsnp[letter_col] = dfsnp[num_col].apply(to_letters)

    label = dfsnp[num_col].astype('str') + '\n' + dfsnp[letter_col].astype('str')
    dfsnp[snp_id] = label.astype('category')
    return dfsnp


@functools.lru_cache()
def get_gwas_ordered_snp_df(snp_id):
    '''
    Returns a dataframe containing the genotype on snp snp_id.
    The allele count is the number of risk alleles according to GWAS.

    Starts from a copy of get_snp_df(snp_id) and replaces <snp_id>_num by
    2 - <snp_id>_num when exactly one of these holds for the GWAS row:
    ``pgc2_a1_same_as_our_counted`` is truthy, or ``OR`` is greater than 1.
    The <snp_id>_letter column is left unchanged and the combined
    <snp_id> label column is rebuilt from the (possibly flipped) count.

    Example: 
    get_gwas_ordered_snp_df('rs653953').head(5)
    
            rs653953_num rs653953_letter rs653953
    Br5168             2              GG    2\nGG
    Br2582             1              AG    1\nAG
    Br2378             1              AG    1\nAG
    Br5155             0              AA    0\nAA
    Br5182             0              AA    0\nAA
    '''
    num_col = snp_id + '_num'
    # Work on a copy: get_snp_df() is memoised and must not be modified.
    dfsnp = get_snp_df(snp_id).copy()
    gwas_snp = get_gwas_snp(snp_id)
    a1_is_counted = bool(gwas_snp['pgc2_a1_same_as_our_counted'].iloc[0])
    a1_is_risk = bool(gwas_snp['OR'].iloc[0] > 1)
    # Keep the count when the counted allele is the risk allele (both flags
    # agree); otherwise count the other allele instead.
    if a1_is_counted != a1_is_risk:
        dfsnp[[num_col]] = 2 - dfsnp[[num_col]]
    dfsnp[snp_id] = (dfsnp[num_col].astype('str') + '\n' + 
                     dfsnp[snp_id + '_letter'].astype('str')).astype('category')
    return dfsnp


@functools.lru_cache()
def get_biomart_df():
    """
    Read the biomart annotation CSV and strip the trailing "[Source..."
    part from every value of the ``description`` column.

    The result is memoised, so the file is read once.
    """
    biomart = pd.read_csv(config['biomart_file'])
    # Raw string: '\[' is not a valid escape in a normal string literal and
    # triggers an invalid-escape warning on recent Python versions.
    biomart['description'] = biomart['description'].str.replace(r'\[Source.*$', '', regex=True)
    return biomart


@functools.lru_cache()
def get_risk_allele(snp_id):
    """
    Return the GWAS risk allele of ``snp_id``.

    This is the ``A1`` value of the SNP's GWAS row when its ``OR`` is
    greater than 1, and the ``A2`` value otherwise. Results are memoised
    per SNP id.
    """
    gwas_row = get_gwas_snp(snp_id)
    allele_col = 'A1' if gwas_row['OR'].iloc[0] > 1 else 'A2'
    return gwas_row[allele_col].iloc[0]

In [14]:
def get_gene_symbol(gene_id, biomart=None):
    """
    Look up the gene symbol and description of a gene id.

    Parameters:
        gene_id: gene identifier; everything from the first '.' onwards
            (the version suffix) is removed before the lookup.
        biomart: DataFrame with ``ensembl_gene_id``, ``external_gene_name``
            and ``description`` columns. When omitted, get_biomart_df()
            is used.

    Returns:
        ``(symbol, description)`` taken from the first matching row, with
        any trailing "[Source:..." removed from the description. A
        description that is not a string (e.g. missing) becomes ''.
        Returns ``('', '')`` when the id is not found.
    """
    if biomart is None:
        # Loaded on first use rather than as a default argument, which
        # would read the file as soon as this function is defined.
        biomart = get_biomart_df()
    ensge = re.sub(r'\..+$', '', gene_id)
    ggg = biomart[biomart['ensembl_gene_id'] == ensge]
    if ggg.shape[0] == 0:
        return '', ''
    gs = ggg['external_gene_name'].values[0]
    de = ggg['description'].values[0]
    if not isinstance(de, str):
        de = ''
    de = re.sub(r'\[Source:.*$', '', de)
    return gs, de


def save_plot(p, fn):
    """
    Save plot ``p`` three times, as ``fn``.png, ``fn``.pdf and ``fn``.svg,
    in that order, by calling ``p.save`` with each file name.
    """
    targets = [fn + '.' + ext for ext in ['png', 'pdf', 'svg']]
    for target in targets:
        p.save(target)
        

def get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func):
    """
    Combine phenotypes, the expression of one gene and the genotype of
    one SNP.

    ``snp_df_func(snp_id)`` must return a genotype frame whose index
    matches the ``BrNum`` column of the phenotype/expression table; the two
    are joined on that column with pandas' default join type.
    """
    keep_cols = list(get_pheno_df().columns) + [gene_id]
    expr_and_pheno = get_expression_and_pheno_df()[keep_cols]
    genotype = snp_df_func(snp_id)
    return expr_and_pheno.merge(genotype, left_on='BrNum', right_index=True)


def simple_snp_expression_pheno_plot_impl(snp_id, gene_id, snp_df_func, pheno_var):
    """
    Box plot with jittered points of one gene's residualized expression
    by SNP genotype, filled by ``pheno_var``.

    The genotype column comes from ``snp_df_func(snp_id)``. The ``Dx``
    categories 'Control' and 'Schizo' are shown as 'CTL' and 'SZ'. The
    y limits are the 1st and 99th percentile of the expression values,
    widened by 0.26 on each side. Returns the plotnine plot object.
    """
    data = get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func)
    short_dx = {'Control': 'CTL', 'Schizo': 'SZ'}
    data['Dx'] = data.Dx.astype('category').cat.rename_categories(short_dx)

    pad = 0.26
    lower = data[gene_id].quantile(.01) - pad
    upper = data[gene_id].quantile(.99) + pad

    components = [
        geom_boxplot(alpha=0.4, outlier_alpha=0),
        geom_jitter(position=position_jitterdodge(jitter_width=0.27),
                    stroke=0, alpha=0.6),
        ylim(lower, upper),
        labs(y='Residualized expression', fill='Diagnosis'),
        theme_bw(base_size=20),
        theme(legend_title=element_text(face='bold'),
              panel_grid_major=element_blank(),
              panel_grid_minor=element_blank()),
    ]
    plot = ggplot(data, aes(x=snp_id, y=gene_id, fill=pheno_var))
    for component in components:
        plot = plot + component
    return plot


def simple_gwas_ordered_snp_expression_pheno_plot(snp_id, gene_id, pheno_var):
    """
    Expression-by-genotype plot with genotypes taken from
    get_gwas_ordered_snp_df (count of GWAS risk alleles).
    """
    return simple_snp_expression_pheno_plot_impl(
        snp_id,
        gene_id,
        snp_df_func=get_gwas_ordered_snp_df,
        pheno_var=pheno_var,
    )


def gwas_annotation(snp_id):
    """Return a title line with the SNP's GWAS ``P`` value in %.1e format."""
    p_value = get_gwas_snp(snp_id).iloc[0]['P']
    return 'SZ GWAS pvalue: %.1e' % p_value


def eqtl_annotation(snp_id, gene_id):
    """
    Return a title line with the nominal eQTL p-value of a SNP-gene pair.

    Raises AssertionError unless exactly one eQTL row has this
    ``variant_id`` and ``gene_id``.
    """
    pairs = get_eqtl_df()
    is_pair = (pairs['variant_id'] == snp_id) & (pairs['gene_id'] == gene_id)
    r = pairs[is_pair]
    assert len(r)==1
    return 'eQTL nominal p-value: %.1e' % r.iloc[0]['pval_nominal']


def de_annotation(gene_id):
    """
    Return a title line with the gene's DE ``adj.P.Val`` (3 decimals).

    The value is read from the first DE row whose ``gencodeID`` column
    equals ``gene_id``.
    """
    de_table = get_de_df()
    matches = de_table[de_table['gencodeID'] == gene_id]
    return 'DE adj.P.Val: %.3f' % matches.iloc[0]['adj.P.Val']


def risk_allele_annotation(snp_id):
    """Return a title line naming the SNP's risk allele (see get_risk_allele)."""
    risk_allele = get_risk_allele(snp_id)
    return 'SZ risk allele: %s' % risk_allele


def gwas_annotated_eqtl_pheno_plot(snp_id, gene_id, pheno_var):
    """
    Risk-allele-ordered expression plot with a multi-line title.

    The title lists, one per line: gene symbol, gene id, GWAS p-value,
    risk allele, eQTL nominal p-value and DE adjusted p-value.
    """
    plot = simple_gwas_ordered_snp_expression_pheno_plot(snp_id, gene_id, pheno_var)
    # Only the symbol is shown; the description is not used.
    gene_symbol, _ = get_gene_symbol(gene_id)
    title_lines = [
        gene_symbol,
        gene_id,
        gwas_annotation(snp_id),
        risk_allele_annotation(snp_id),
        eqtl_annotation(snp_id, gene_id),
        de_annotation(gene_id),
    ]
    plot += ggtitle("\n".join(title_lines))
    return plot

## Genes

In [7]:
# Create the output directory for this feature type; an already existing
# directory is fine.
os.makedirs(feature, exist_ok=True)

### Enrichment

#### Integrate DEG with PGC2+CLOZUK SNPs

In [None]:
# Work on a copy: get_integration_df() is memoised and the following cells
# add/overwrite columns, which would otherwise modify the cached table.
dft = get_integration_df().copy()
dft.shape

In [None]:
# Label every GWAS-SNP / eQTL / DE row by its sign product (see agree_direction).
agreement = {1: 'Yes', -1: 'No', 0: 0}
dft['agree_direction'] = dft.apply(agree_direction, axis=1)
dft['agree_direction'] = [agreement[item] for item in dft['agree_direction']]
## Enrichment test
# 2x2 contingency table: rows = GWAS P below / not below 5e-8,
# columns = DE adj.P.Val below / not below 0.05.
_gwas_hit, _gwas_miss = dft['P'] < 5e-8, dft['P'] >= 5e-8
_de_hit, _de_miss = dft['adj.P.Val'] < .05, dft['adj.P.Val'] >= .05
table = [[np.sum(_gwas_hit & _de_hit), np.sum(_gwas_hit & _de_miss)],
         [np.sum(_gwas_miss & _de_hit), np.sum(_gwas_miss & _de_miss)]]
print(table)
fisher_exact(table)

In [None]:
# Rows that pass both thresholds (GWAS P < 5e-8 and DE adj.P.Val < 0.05),
# counted per agree_direction label. After reset_index() the counts are in
# the column named 0.
dft1 = dft[(dft['P']<5e-8) & ((dft['adj.P.Val']<.05))]
df = dft1.groupby('agree_direction').size().reset_index()
df

In [None]:
# Binomial test of the count in the second row of df against the total count.
# NOTE(review): the second row is presumably the 'Yes' group (groups come out
# sorted, 'No' before 'Yes'); this breaks if a 0 label or only one label is
# present -- confirm.
# NOTE(review): scipy.stats.binom_test is deprecated in recent SciPy in favour
# of binomtest -- confirm against the installed version.
binom_test(df[0].iloc[1], df[0].sum())

In [None]:
# Same selection as the enrichment cells above: GWAS P < 5e-8 (strict, to
# match the threshold used for the contingency table and for dft1) and
# DE adj.P.Val < 0.05. Copied so columns can be added safely.
dft2 = dft[(dft['P'] < 5e-8) & (dft['adj.P.Val'] < 0.05)].copy()
# Number of distinct genes in the selection (rows) and columns.
dft2.groupby("gene_id").first().reset_index().shape

In [None]:
# Annotate every row with the GWAS risk allele of its SNP
# (A1 when OR > 1, otherwise A2 -- see get_risk_allele).
dft2['risk_allele'] = dft2['our_snp_id'].apply(get_risk_allele)

In [None]:
direction = {-1: 'Down', 1: 'Up'}
boolean_conv = {True: 1, False: -1}
# Recode the allele-orientation flag as +1 / -1.
dft2['pgc2_a1_same_as_our_counted'] = [boolean_conv[item] for item in dft2['pgc2_a1_same_as_our_counted']]
# +1 when the counted allele is the GWAS risk allele, -1 otherwise.
risk_sign = np.sign(dft2['pgc2_a1_same_as_our_counted']) * np.sign(dft2['OR'] - 1)
# Direction of the eQTL effect of the risk allele, and direction of DE.
dft2['eqtl_gwas_dir'] = [direction[item] for item in risk_sign * np.sign(dft2['slope'])]
dft2['de_dir'] = [direction[item] for item in np.sign(dft2['t'])]
# eQTL slope re-expressed per risk allele.
dft2['eqtl_slope'] = risk_sign * dft2['slope']
# Explicit copy, then assign the filled column back: calling
# fillna(inplace=True) on dft3['Symbol'] acts on a temporary column of a
# slice and does not reliably update dft3 (no effect under copy-on-write).
dft3 = dft2[['gene_id', 'Symbol', 'variant_id', 'A1', 'A2', 'risk_allele', 'OR', 
             'P', 'pval_nominal', 'adj.P.Val', 'logFC', 't', 'eqtl_slope', 
             'de_dir', 'eqtl_gwas_dir', 'agree_direction']].copy()
# Genes without a symbol are labelled by their gene id.
dft3['Symbol'] = dft3['Symbol'].fillna(dft3['gene_id'])
dft3.to_csv('%s/integration_by_symbol.txt' % feature, sep='\t', index=False)

In [8]:
# Optional shortcut: reload the exported table instead of recomputing it.
#dft3 = pd.read_csv("../_m/genes/integration_by_symbol.txt", sep='\t')
#dft3['Symbol'].fillna(dft3['gene_id'], inplace=True)
# One row per gene, ordered by GWAS P value.
# NOTE(review): first() takes the first non-null value of each column within
# a gene, in the current row order of dft3 (not sorted by P beforehand), so
# the kept SNP is presumably not guaranteed to be the most significant one
# for the gene -- confirm this is intended.
df2 = dft3.groupby(['gene_id']).first().reset_index().sort_values('P')
# Number of genes per agreement label.
df2.groupby(['agree_direction']).size()

agree_direction
No     13
Yes    22
dtype: int64

In [9]:
# Display-only view of the per-gene table: indexed by gene symbol, with the
# statistic columns renamed to say which analysis they come from.
# df2 itself is not modified.
df2.set_index('Symbol').rename(columns={'t': 'de_t', 'P': 'GWAS_P', 'pval_nominal': 'eQTL_pvalue', 
                                        'adj.P.Val': 'de_FDR'})

Unnamed: 0_level_0,gene_id,variant_id,A1,A2,risk_allele,OR,GWAS_P,eQTL_pvalue,de_FDR,logFC,de_t,eqtl_slope,de_dir,eqtl_gwas_dir,agree_direction
Symbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
ZSCAN9,ENSG00000137185.11,chr6:27731058:C:T,C,T,C,1.2622,1.2100000000000002e-39,0.000894661,0.02262693,-0.07713,-3.126621,-0.307218,Down,Down,Yes
HCG4,ENSG00000176998.4,chr6:29298706:G:A,G,A,G,1.2636,2.5000000000000002e-39,4.44311e-05,0.01090935,0.231187,3.425733,0.374783,Up,Up,Yes
BRD2,ENSG00000204256.12,chr6:32669525:G:A,G,A,G,1.2197,6.82e-30,1.13181e-05,0.01718619,-0.051562,-3.239197,-0.404029,Down,Down,Yes
FLOT1,ENSG00000137312.14,chr6:30406434:A:G,A,G,A,1.2363,2.8000000000000003e-27,0.000628976,0.01347849,-0.047316,-3.33723,-0.193544,Down,Down,Yes
HCG11,ENSG00000228223.2,chr6:26466161:G:A,G,A,A,0.91432,1.02e-14,1.3741e-05,0.001181958,0.114043,4.1936,0.21117,Up,Up,Yes
BAG6,ENSG00000204463.12,chr6:31222761:G:A,G,A,A,0.92265,6.79e-14,8.19924e-05,0.01241493,-0.044002,-3.373864,0.148294,Down,Up,No
NELFE,ENSG00000204356.12,chr6:31868665:A:G,A,G,A,1.0793,1.66e-13,0.000730582,0.02507792,-0.046326,-3.081897,0.090267,Down,Up,No
ZNF204P,ENSG00000204789.4,chr6:27343879:A:G,A,G,G,0.93306,4.31e-13,0.00105804,0.01603175,0.095817,3.267501,0.089913,Up,Up,Yes
NGEF,ENSG00000066248.14,chr2:232843683:G:A,G,A,A,0.91758,7.13e-13,0.000241545,0.009144005,0.104229,3.49355,-0.144167,Up,Down,No
PRRC2A,ENSG00000204469.12,chr6:31379292:C:T,C,T,C,1.0861,7.93e-12,0.000402146,0.001283462,-0.066383,-4.167993,-0.131287,Down,Down,Yes


### Plot with PGC2 risk allele

In [None]:
# One annotated plot per gene in df2, shown inline and saved as
# <feature>/eqtl_gwas_<Symbol>.{png,pdf,svg}.
for variant_id, gene_id, symbol in zip(df2['variant_id'], df2['gene_id'], df2['Symbol']):
    gg = gwas_annotated_eqtl_pheno_plot(variant_id, gene_id, 'Dx')
    print(gg)
    label = '%s/eqtl_gwas_%s' % (feature, symbol)
    save_plot(gg, label)