# Enrichment and Overlap of PGC2+CLOZUK

In [1]:
import re
import os, errno
import functools
import numpy as np
import pandas as pd
from plotnine import *
from pandas_plink import read_plink
from warnings import filterwarnings
from matplotlib.cbook import mplDeprecation
from scipy.stats import fisher_exact, binom_test

filterwarnings("ignore",category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

## Config and Functions

In [2]:
# Inputs read by the loaders below that do not depend on the feature type:
# gene annotation, sample phenotypes, genotypes (PLINK prefix) and GWAS SNPs.
config = {
    'biomart_file': '../_h/biomart.csv',
    'phenotype_file': '/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv',
    'plink_file_prefix': '/ceph/projects/v4_phase3_paper/inputs/genotypes/_m/LIBD_Brain_TopMed',
    'gwas_snp_file': '/ceph/projects/v4_phase3_paper/inputs/sz_gwas/pgc2_clozuk/map_phase3/_m/libd_hg38_pgc2sz_snps.tsv'
}

# Inputs for the feature type analysed in this notebook: differential
# expression results, residualized expression and significant eQTL pairs.
config_feature = {
    'de_file': '../../differential_expression/_m/junctions/diffExpr_szVctl_full.txt',
    'residual_expression_file': '../../differential_expression/_m/junctions/residualized_expression.tsv',
    'fastqtl_output_file': '../../eqtl/caudate/summary_table/_m/Brainseq_LIBD_caudate_4features.signifpairs.txt.gz',
}

# Feature type; used as the key for feature_map() and as the name of the
# output directory for tables and plots.
feature = "junctions"

In [3]:
@functools.lru_cache()
def feature_map(feature):
    """
    Translate a plural feature name into the label used in the eQTL
    table's "Type" column.

    Accepts "genes", "transcripts", "exons" or "junctions"; any other
    value raises KeyError.
    """
    labels = {
        "genes": "Gene",
        "transcripts": "Transcript",
        "exons": "Exon",
        "junctions": "Junction",
    }
    return labels[feature]


@functools.lru_cache()
def get_de_df():
    """
    Read the differential expression table (tab-separated) named by
    config_feature['de_file'], indexed by its first column.

    The result is cached, so every caller receives the same DataFrame
    object.
    """
    de_path = config_feature['de_file']
    de_table = pd.read_csv(de_path, sep='\t', index_col=0)
    return de_table


@functools.lru_cache()
def get_eqtl_df():
    """
    Read the significant eQTL pairs table and keep only the rows whose
    "Type" matches the feature type selected for this notebook.

    The result is cached.
    """
    all_pairs = pd.read_csv(config_feature['fastqtl_output_file'], sep='\t')
    is_selected_type = all_pairs["Type"] == feature_map(feature)
    return all_pairs[is_selected_type]


@functools.lru_cache()
def get_gwas_snps():
    """
    Read the GWAS SNP table (tab-separated) named by
    config['gwas_snp_file'], indexed by its first column.

    The result is cached.
    """
    gwas_path = config['gwas_snp_file']
    gwas_table = pd.read_csv(gwas_path, sep='\t', index_col=0,
                             low_memory=False)
    return gwas_table


@functools.lru_cache()
def get_integration_df():
    """
    Combine GWAS SNPs, eQTL pairs and differential expression results.

    GWAS SNPs are joined to eQTL pairs on the SNP identifier
    ('our_snp_id' vs 'variant_id'); clashing column names get the
    suffixes '_PGC2' and '_eQTL'. The result is then joined to the DE
    table through 'gene_id' and the DE table's index.

    The result is cached.
    """
    gwas_with_eqtl = get_gwas_snps().merge(
        get_eqtl_df(),
        left_on='our_snp_id',
        right_on='variant_id',
        suffixes=['_PGC2', '_eQTL'],
    )
    return gwas_with_eqtl.merge(get_de_df(), left_on='gene_id',
                                right_index=True)


@functools.lru_cache()
def get_residual_expression_df():
    """
    Read the residualized expression table (tab-separated, indexed by
    its first column) and return it transposed.

    The result is cached.
    """
    expr_path = config_feature['residual_expression_file']
    as_stored = pd.read_csv(expr_path, sep='\t', index_col=0)
    return as_stored.transpose()


@functools.lru_cache()
def get_pheno_df():
    """
    Read the phenotype CSV named by config['phenotype_file'], indexed
    by its first column.

    The result is cached.
    """
    pheno_path = config['phenotype_file']
    pheno_table = pd.read_csv(pheno_path, index_col=0)
    return pheno_table


@functools.lru_cache()
def get_biomart_df():
    """
    Read the biomart annotation CSV named by config['biomart_file'] and
    strip the trailing "[Source..." suffix from the 'description' column.

    The result is cached.
    """
    biomart = pd.read_csv(config['biomart_file'])
    # Raw string: '\[' in a plain literal is an invalid escape sequence that
    # newer Python versions warn about; the pattern itself is unchanged.
    biomart['description'] = biomart['description'].str.replace(
        r'\[Source.*$', '', regex=True)
    return biomart

In [4]:
def agree_direction(row):
    """
    Return the product of four signs for one row of the integration table.

    The factors are: +1 if 'pgc2_a1_same_as_our_counted' is truthy, else -1;
    the sign of 'OR' - 1; the sign of the eQTL 'slope'; and the sign of the
    DE statistic 't'. The result is 1, -1, or 0 when any factor is zero.

    `row` may be anything that supports lookup by those four keys
    (e.g. a pandas Series or a dict).
    """
    # A conditional instead of `[-1, 1][flag]`: indexing a list with a NumPy
    # boolean scalar is rejected by recent NumPy versions.
    allele_sign = 1 if row['pgc2_a1_same_as_our_counted'] else -1
    return (allele_sign
            * np.sign(row['OR'] - 1)
            * np.sign(row['slope'])
            * np.sign(row['t']))


def save_plot(p, fn):
    """
    Save plot `p` three times, as `fn`.png, `fn`.pdf and `fn`.svg, by
    calling `p.save` once per file name.
    """
    targets = [fn + '.' + suffix for suffix in ('png', 'pdf', 'svg')]
    for target in targets:
        p.save(target)
        

def letter_snp(number, a0, a1):
    '''
    Spell out a genotype from the number of copies of allele `a0`.

    The genotype holds `number` copies of `a0` and `2 - number` copies of
    `a1`; the two alleles are sorted alphabetically and joined. Single
    character alleles are joined without a separator, longer alleles
    (e.g. indels) with a space. A NaN `number` yields NaN.

    Example:
    letter_snp(0, 'A', 'G') is 'GG'
    letter_snp(1, 'A', 'G') is 'AG'
    letter_snp(2, 'A', 'G') is 'AA'
    letter_snp(1, 'G', 'GC') is 'G GC'
    '''
    if np.isnan(number):
        return np.nan
    both_single_letter = len(a0) == 1 and len(a1) == 1
    joiner = '' if both_single_letter else ' '
    copies_a0 = int(number)
    alleles = [a0] * copies_a0 + [a1] * (2 - copies_a0)
    alleles.sort()
    return joiner.join(alleles)


def get_gwas_snp(snp_id):
    """
    Return the one-row slice of the GWAS table whose 'our_snp_id' equals
    `snp_id`.

    Asserts that exactly one row matches.
    """
    all_snps = get_gwas_snps()
    matches = all_snps[all_snps['our_snp_id'] == snp_id]
    assert len(matches) == 1
    return matches

In [5]:
@functools.lru_cache()
def get_expression_and_pheno_df():
    """
    Join the phenotype table with the residualized expression table on
    their indexes (default inner join).

    The result is cached.
    """
    phenotypes = get_pheno_df()
    expression = get_residual_expression_df()
    return pd.merge(phenotypes, expression, left_index=True,
                    right_index=True)


@functools.lru_cache()
def get_plink_tuple():
    '''
    Load the PLINK file set found under config['plink_file_prefix'].

    Usage: (bim, fam, bed) = get_plink_tuple()

    The result is cached.
    '''
    prefix = config['plink_file_prefix']
    plink_data = read_plink(prefix)
    return plink_data


@functools.lru_cache()
def subset_bed():
    """
    Subset the genotype matrix to the samples and SNPs used in this analysis.

    Samples are the fam entries whose 'fid' also appears in the 'BrNum'
    column of the expression/phenotype table (first entry per 'fid').
    SNPs are the bim entries whose 'snp' appears among the eQTL variant ids.

    This is to speed up accessing the bed file.

    Returns a tuple (new_bed, new_bim, brain_ids):
      new_bed   -- computed genotype array, rows = selected SNPs,
                   columns = selected samples
      new_bim   -- the selected bim rows with a fresh index and a column
                   'ii' giving each SNP's row position in new_bed
      brain_ids -- list of sample ids, in the same order as the columns
                   of new_bed

    The result is cached.
    """
    (bim, fam, bed) = get_plink_tuple()
    expressed_ids = set(get_expression_and_pheno_df()['BrNum'])
    fam_subset = fam[fam["fid"].isin(expressed_ids)].drop_duplicates(subset="fid")
    fam_pos = list(fam_subset.loc[:, 'i'])
    # Take the ids from the same fam rows as the column positions. Building
    # them from a set would return them in arbitrary order and mislabel the
    # genotype columns.
    brain_ids = list(fam_subset.loc[:, 'fid'])
    unique_snps = get_eqtl_df().variant_id.unique()
    snp_pos = list(bim.loc[bim["snp"].isin(unique_snps), "i"])
    new_bed = bed[snp_pos].compute()[:, fam_pos]
    new_bim = bim[bim["i"].isin(snp_pos)].reset_index(drop=True)
    new_bim['ii'] = new_bim.index
    return new_bed, new_bim, brain_ids


@functools.lru_cache()
def get_snp_df(snp_id):
    '''
    Build a per-sample genotype table for the SNP `snp_id`.

    The returned DataFrame is indexed by sample id and has three columns:
      <snp_id>_num    -- allele count as an integer (0, 1 or 2)
      <snp_id>_letter -- the genotype spelled out by letter_snp()
      <snp_id>        -- categorical label: the count, a newline, then
                         the letters
    Samples with a missing genotype are dropped.

    Example:
    get_snp_df('rs653953').head(5)

            rs653953_num rs653953_letter
    Br5168             0              GG
    Br2582             1              AG
    Br2378             1              AG
    Br5155             2              AA
    Br5182             2              AA

    The result is cached, so callers share one DataFrame object.
    '''
    genotypes, variants, sample_ids = subset_bed()
    variant_matches = variants[variants['snp'] == snp_id]
    first_match = variant_matches.iloc[0]
    row_in_bed = first_match['ii']
    num_col = snp_id + '_num'
    letter_col = snp_id + '_letter'

    one_row = pd.DataFrame(genotypes[[row_in_bed]], columns=sample_ids,
                           index=[num_col])
    dfsnp = one_row.transpose().dropna()
    to_letters = functools.partial(letter_snp, a0=first_match['a0'],
                                   a1=first_match['a1'])
    # the 2 - in next line is to workaround a possible bug in pandas_plink? a1 and a0 inverted
    dfsnp[[num_col]] = 2 - dfsnp[[num_col]].astype('int')
    dfsnp[letter_col] = dfsnp[num_col].apply(to_letters)
    count_text = dfsnp[num_col].astype('str')
    letter_text = dfsnp[letter_col].astype('str')
    dfsnp[snp_id] = (count_text + '\n' + letter_text).astype('category')
    return dfsnp


@functools.lru_cache()
def get_gwas_ordered_snp_df(snp_id):
    '''
    Returns a dataframe containing the genotype on snp snp_id.
    The allele count is the number of risk alleles according to GWAS.

    Works on a copy of get_snp_df(snp_id). The '<snp_id>_num' column is
    replaced by 2 - count when exactly one of these holds:
      * the GWAS row's 'pgc2_a1_same_as_our_counted' flag is true
      * the GWAS row's 'OR' is greater than 1
    Otherwise the count is left as is. The '<snp_id>_letter' column is not
    changed; the categorical '<snp_id>' label is rebuilt from the
    (possibly flipped) count and the letters.

    Example:
    get_gwas_ordered_snp_df('rs653953').head(5)

            rs653953_num rs653953_letter
    Br5168             2              GG
    Br2582             1              AG
    Br2378             1              AG
    Br5155             0              AA
    Br5182             0              AA

    The result is cached.
    '''
    dfsnp = get_snp_df(snp_id).copy()
    gwas_snp = get_gwas_snp(snp_id)
    num_col = snp_id + '_num'

    counted_is_a1 = bool(gwas_snp['pgc2_a1_same_as_our_counted'].iloc[0])
    or_above_one = bool(gwas_snp['OR'].iloc[0] > 1)
    # Flip when the flag and the OR direction disagree (same-allele with
    # OR <= 1, or different-allele with OR > 1).
    if counted_is_a1 != or_above_one:
        dfsnp[[num_col]] = 2 - dfsnp[[num_col]]

    dfsnp[snp_id] = (dfsnp[num_col].astype('str') + '\n' +
                     dfsnp[snp_id + '_letter'].astype('str')).astype('category')
    return dfsnp

In [6]:
def get_gene_symbol(gene_id, biomart=None):
    """
    Look up the gene symbol and description for an Ensembl gene id.

    Parameters:
      gene_id -- Ensembl gene id; everything from the first '.' onwards
                 (the version suffix) is ignored for the lookup
      biomart -- annotation table with columns 'ensembl_gene_id',
                 'external_gene_name' and 'description'; when omitted the
                 table from get_biomart_df() is used

    Returns a tuple (symbol, description). Both are '' when the id is not
    in the table; the description is '' when it is not a string (e.g.
    missing), and any trailing "[Source:..." suffix is removed.
    """
    # Resolve the default lazily. A call in the signature would read the
    # annotation file as soon as the function is defined.
    if biomart is None:
        biomart = get_biomart_df()

    ensge = re.sub(r'\..+$', '', gene_id)
    hits = biomart[biomart['ensembl_gene_id'] == ensge]
    if len(hits) == 0:
        return '', ''

    gs = hits['external_gene_name'].values[0]
    de = hits['description'].values[0]
    if not isinstance(de, str):
        de = ''

    de = re.sub(r'\[Source:.*$', '', de)
    return gs, de


@functools.lru_cache()
def get_risk_allele(snp_id):
    """
    Return the allele treated as the risk allele for `snp_id`: the GWAS
    row's 'A1' when its 'OR' is greater than 1, otherwise its 'A2'.

    The result is cached.
    """
    gwas_row = get_gwas_snp(snp_id)
    allele_column = 'A1' if gwas_row['OR'].iloc[0] > 1 else 'A2'
    return gwas_row[allele_column].iloc[0]


In [7]:
def get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func):
    """
    Assemble one table holding phenotypes, the expression of `gene_id`
    and the genotype of `snp_id`.

    `snp_df_func` is called with `snp_id` and must return a genotype
    table indexed by sample id; it is joined to the 'BrNum' column of the
    expression/phenotype table.
    """
    wanted_columns = list(get_pheno_df().columns) + [gene_id]
    expr_df = get_expression_and_pheno_df()[wanted_columns]
    genotype_df = snp_df_func(snp_id)
    return expr_df.merge(genotype_df, left_on='BrNum', right_index=True)


def simple_snp_expression_pheno_plot_impl(snp_id, gene_id, snp_df_func, pheno_var):
    """
    Draw residualized expression of `gene_id` by genotype of `snp_id`,
    with boxes and jittered points filled by `pheno_var`.

    Genotypes come from `snp_df_func(snp_id)`. The 'Dx' categories
    'Control' and 'Schizo' are relabelled 'CTL' and 'SZ'. The y axis runs
    from the 1st to the 99th percentile of the expression values, padded
    by 0.26 on each side. Returns the plotnine plot object.
    """
    df = get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func)
    short_labels = {'Control': 'CTL', 'Schizo': 'SZ'}
    df['Dx'] = df.Dx.astype('category').cat.rename_categories(short_labels)

    expression = df[gene_id]
    y0 = expression.quantile(.01) - 0.26
    y1 = expression.quantile(.99) + 0.26
    pjd = position_jitterdodge(jitter_width=0.27)

    p = (
        ggplot(df, aes(x=snp_id, y=gene_id, fill=pheno_var))
        + geom_boxplot(alpha=0.4, outlier_alpha=0)
        + geom_jitter(position=pjd, stroke=0, alpha=0.6)
        + ylim(y0, y1)
        + labs(y='Residualized expression', fill='Diagnosis')
        + theme_bw(base_size=20)
        + theme(legend_title=element_text(face='bold'),
                panel_grid_major=element_blank(),
                panel_grid_minor=element_blank())
    )
    return p


def simple_gwas_ordered_snp_expression_pheno_plot(snp_id, gene_id, pheno_var):
    """
    Expression-by-genotype plot with genotypes taken from
    get_gwas_ordered_snp_df().
    """
    return simple_snp_expression_pheno_plot_impl(
        snp_id,
        gene_id,
        snp_df_func=get_gwas_ordered_snp_df,
        pheno_var=pheno_var,
    )

In [8]:
def gwas_annotation(snp_id):
    """
    Return a title line giving the GWAS 'P' value of `snp_id` in
    scientific notation with one decimal.
    """
    gwas_row = get_gwas_snp(snp_id).iloc[0]
    p_value = gwas_row['P']
    return 'SZ GWAS pvalue: %.1e' % p_value


def eqtl_annotation(snp_id, gene_id):
    """
    Return a title line giving the nominal eQTL p-value of the
    (`snp_id`, `gene_id`) pair in scientific notation with one decimal.

    Asserts that exactly one eQTL row matches the pair.
    """
    eqtl = get_eqtl_df()
    same_snp = eqtl['variant_id'] == snp_id
    same_gene = eqtl['gene_id'] == gene_id
    pair = eqtl[same_snp & same_gene]
    assert len(pair)==1
    return 'eQTL nominal p-value: %.1e' % pair.iloc[0]['pval_nominal']


def de_annotation(gene_id):
    """
    Return a title line giving the DE 'adj.P.Val' of `gene_id` with three
    decimals.

    `gene_id` is matched against the index of the DE table; the first
    matching row is used and IndexError is raised when there is none.
    """
    de_df = get_de_df()
    # Filter on the index directly. Adding a helper column would modify the
    # cached DE table that every other caller shares.
    matches = de_df[de_df.index == gene_id]
    return 'DE adj.P.Val: %.3f' % matches.iloc[0]['adj.P.Val']


def risk_allele_annotation(snp_id):
    """
    Return a title line naming the allele reported by
    get_risk_allele(snp_id).
    """
    allele = get_risk_allele(snp_id)
    return 'SZ risk allele: %s' % allele


def gwas_annotated_eqtl_pheno_plot(snp_id, gene_id, pheno_var):
    """
    Build the GWAS-ordered expression-by-genotype plot and give it a
    multi-line title.

    The title lines are, in order: the feature's 'newGeneSymbol' from the
    DE table, `gene_id`, the GWAS p-value, the risk allele, the eQTL
    nominal p-value and the DE adjusted p-value.

    `gene_id` is matched against the index of the DE table; IndexError is
    raised when it is absent. Returns the plotnine plot object.
    """
    p = simple_gwas_ordered_snp_expression_pheno_plot(snp_id, gene_id, pheno_var)
    de_df = get_de_df()
    # Filter on the index directly. Adding a helper column would modify the
    # cached DE table that every other caller shares.
    matches = de_df[de_df.index == gene_id]
    gene_symbol = matches.iloc[0]['newGeneSymbol']
    # A missing symbol is read as NaN (a float), which str.join rejects;
    # fall back to the feature id so the plot can still be titled.
    if not isinstance(gene_symbol, str):
        gene_symbol = gene_id

    title_lines = [
        gene_symbol,
        gene_id,
        gwas_annotation(snp_id),
        risk_allele_annotation(snp_id),
        eqtl_annotation(snp_id, gene_id),
        de_annotation(gene_id),
    ]
    p += ggtitle("\n".join(title_lines))
    return p

## Junctions

In [9]:
# Create the output directory for this feature type; an already existing
# directory is not an error.
os.makedirs(feature, exist_ok=True)

### Enrichment

#### Integrate differentially expressed features (junctions) with PGC2+CLOZUK SNPs

In [10]:
# Integration table: GWAS SNPs joined to eQTL pairs and DE statistics.
# NOTE(review): get_integration_df() is cached, so `dft` is the cached object
# itself; columns added to `dft` later also appear in the cached table.
dft = get_integration_df()
# Displayed as (rows, columns).
dft.shape



(3190613, 57)

In [11]:
agreement = {1: 'Yes', -1: 'No', 0: 0}
# Direction product for every row at once: allele orientation (+1 when
# 'pgc2_a1_same_as_our_counted' is true, else -1) x sign(OR - 1)
# x sign(eQTL slope) x sign(DE t). Computed column-wise because a row-wise
# DataFrame.apply over millions of rows is very slow.
allele_sign = np.where(dft['pgc2_a1_same_as_our_counted'], 1, -1)
direction_sign = (allele_sign
                  * np.sign(dft['OR'] - 1)
                  * np.sign(dft['slope'])
                  * np.sign(dft['t']))
dft['agree_direction'] = [agreement[item] for item in direction_sign]

# 2x2 contingency table: rows = GWAS genome-wide significant (P < 5e-8) or
# not, columns = differentially expressed (adj.P.Val < 0.05) or not.
gwas_sig = dft['P'] < 5e-8
gwas_not_sig = dft['P'] >= 5e-8
de_sig = dft['adj.P.Val'] < .05
de_not_sig = dft['adj.P.Val'] >= .05
table = [[np.sum(gwas_sig & de_sig), np.sum(gwas_sig & de_not_sig)],
         [np.sum(gwas_not_sig & de_sig), np.sum(gwas_not_sig & de_not_sig)]]
print(table)
fisher_exact(table)

[[5196, 74475], [122145, 2988797]]


(1.7071801613298905, 4.8917286718013116e-254)

In [12]:
# Rows that are both GWAS genome-wide significant and differentially
# expressed, counted by direction agreement.
is_gwas_sig = dft['P'] < 5e-8
is_de_sig = dft['adj.P.Val'] < .05
dft1 = dft[is_gwas_sig & is_de_sig]
agreement_counts = dft1.groupby('agree_direction').size()
df = agreement_counts.reset_index()
df

Unnamed: 0,agree_direction,0
0,No,295
1,Yes,4901


In [13]:
# Two-sided binomial test (default p = 0.5) of the number of rows whose
# directions agree, out of all counted rows. The 'Yes' row is selected by
# label rather than by position so the test does not depend on row order.
n_agree = df.loc[df['agree_direction'] == 'Yes', 0].sum()
binom_test(n_agree, df[0].sum())

0.0

In [14]:
# Same selection as the agreement counts above (strict P < 5e-8, matching
# the threshold used for the contingency table), copied so that new columns
# do not touch `dft`.
dft2 = dft[(dft['P'] < 5e-8) & (dft['adj.P.Val'] < 0.05)].copy()
dft2['risk_allele'] = dft2['our_snp_id'].apply(get_risk_allele)

In [None]:
direction = {-1: 'Down', 1: 'Up'}
boolean_conv = {True: 1, False: -1}
# Recode the allele-orientation flag as +1 / -1 so it can be multiplied.
dft2['pgc2_a1_same_as_our_counted'] = [boolean_conv[item] for item in dft2['pgc2_a1_same_as_our_counted']]
# Sign that orients the eQTL effect by the allele flag and the GWAS OR.
gwas_orientation = dft2['pgc2_a1_same_as_our_counted'] * np.sign(dft2['OR'] - 1)
dft2['eqtl_gwas_dir'] = [direction[item] for item in gwas_orientation * np.sign(dft2['slope'])]
dft2['de_dir'] = [direction[item] for item in np.sign(dft2['t'])]
dft2['eqtl_slope'] = gwas_orientation * dft2['slope']
dft2 = dft2[['gene_id', 'newGeneSymbol', 'variant_id', 'A1', 'A2', 'risk_allele', 'OR',
             'P', 'pval_nominal', 'adj.P.Val', 'logFC', 't', 'eqtl_slope',
             'de_dir', 'eqtl_gwas_dir', 'agree_direction']].rename(columns={"newGeneSymbol": "Symbol"})
# Features without a symbol fall back to their id. Assign the result back:
# an in-place fillna on a column selection is not guaranteed to update dft2.
dft2['Symbol'] = dft2['Symbol'].fillna(dft2['gene_id'])
dft2.to_csv('%s/integration_by_symbol.txt' % feature, sep='\t', index=False)

In [15]:
# One row per symbol (the first row of each group), ordered by GWAS P value,
# then counted by direction agreement.
first_per_symbol = dft2.groupby(['Symbol']).first()
df2 = first_per_symbol.reset_index().sort_values('P')
df2.groupby('agree_direction').size()

agree_direction
No     10
Yes    11
dtype: int64

In [16]:
# Display the per-symbol table with column names that say which analysis
# each statistic comes from.
display_names = {
    't': 'de_t',
    'P': 'GWAS_P',
    'pval_nominal': 'eQTL_pvalue',
    'adj.P.Val': 'de_FDR',
}
df2.set_index('Symbol').rename(columns=display_names)

Unnamed: 0_level_0,chrN,our_snp_id,cm,pos,our_counted,our_alt,chrom,SNP,Freq.A1,CHR,...,endExon,newGeneID,isFusion,logFC,AveExpr,de_t,P.Value,de_FDR,B,agree_direction
newGeneSymbol,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
HLA-C,6,chr6:30770669:G:A,0,30770669,A,G,chr6,rs3095336:30738446:G:A,0.9085,6,...,182798.0,ENSG00000204525.15,False,-1.434498,-1.062368,-3.191459,0.001533957,0.037406,-1.448857,Yes
C4B,6,chr6:31530467:G:GC,0,31530467,GC,G,chr6,rs9281523:31498244:G:GC,0.0825,6,...,184126.0,ENSG00000224389.8,False,0.273063,1.175654,3.088621,0.002159476,0.046496,-1.588252,Yes
ZSCAN26,6,chr6:27772521:C:A,0,27772521,A,C,chr6,rs760587:27740300:C:A,0.7803,6,...,,ENSG00000197062.11,False,-0.26177,0.397751,-3.493012,0.0005341323,0.018932,-0.337857,Yes
ZFYVE21,14,chr14:103633871:T:A,0,103633871,A,T,chr14,rs12147645:104100208:T:A,0.7028,14,...,386973.0,ENSG00000100711.13,False,0.137121,2.583755,4.331524,1.899676e-05,0.002033,2.543318,Yes
CKB,14,chr14:103710761:G:T,0,103710761,T,G,chr14,rs4900592:104177098:G:T,0.7048,14,...,386769.0,ENSG00000166165.12,False,-0.094478,6.95792,-4.201159,3.3161e-05,0.002976,1.672783,Yes
BAG6,6,chr6:31610995:T:C,0,31610995,C,T,chr6,rs2844477:31578772:T:C,0.5974,6,...,183144.0,ENSG00000204463.12,False,-0.111484,2.471439,-3.769427,0.0001897734,0.009575,0.440874,No
NGEF,2,chr2:232926898:C:G,0,232926898,G,C,chr2,rs1878289:233791608:C:G,0.5596,2,...,91921.0,ENSG00000066248.14,False,0.115109,5.582328,3.772815,0.0001873065,0.009518,0.159948,No
PLCH2,1,chr1:2440958:A:G,0,2440958,G,A,chr1,rs6688934:2372397:A:G,0.508,1,...,1980.0,ENSG00000149527.17,False,-0.254239,-0.320352,-3.941689,9.639145e-05,0.006178,0.891407,Yes
HLA-DPB1,6,chr6:33020111:A:G,0,33020111,G,A,chr6,rs17214290:32987888:A:G,0.9563,6,...,184945.0,ENSG00000223865.10,False,0.452891,-1.123835,3.159284,0.001708866,0.040099,-1.508125,Yes
DRD2,11,chr11:113447023:G:C,0,113447023,C,G,chr11,rs17601612:113317745:G:C,0.6272,11,...,320189.0,ENSG00000149295.13,False,0.302443,-0.618759,4.079084,5.51663e-05,0.004208,1.310019,No


### Plot with PGC2 risk allele

In [None]:
# One annotated plot per row of df2, shown inline and saved as
# <feature>/eqtl_gwas_<Symbol>.{png,pdf,svg}.
for variant_id, gene_id, symbol in zip(df2['variant_id'], df2['gene_id'], df2['Symbol']):
    gg = gwas_annotated_eqtl_pheno_plot(variant_id, gene_id, 'Dx')
    print(gg)
    label = '%s/eqtl_gwas_%s' % (feature, symbol)
    save_plot(gg, label)