# Plotting eQTLs with increased font sizes

### Kynon Jade Benjamin and Apuã Paquola

In [1]:
import re
import functools
import subprocess
import numpy as np
import pandas as pd
from plotnine import *
from pandas_plink import read_plink
from warnings import filterwarnings
from matplotlib.cbook import mplDeprecation

filterwarnings("ignore",category=mplDeprecation)
filterwarnings('ignore', category=UserWarning, module='plotnine.*')
filterwarnings('ignore', category=DeprecationWarning, module='plotnine.*')

## Configuration

In [2]:
# Brain region and feature type analysed by this notebook.
tissue = "hippocampus"
feature = "genes"

# Input file locations, keyed by role. The residualized-expression path is
# derived from the tissue and feature chosen above.
config = dict(
    biomart_file='../_h/biomart.csv',
    residual_expression_file=(
        "../../../../prep_eqtl_analysis/%s/%s/covariates/" % (tissue, feature)
        + "residualized_expression/_m/%s_residualized_expression.csv" % feature
    ),
    phenotype_file='/ceph/projects/v4_phase3_paper/inputs/phenotypes/_m/merged_phenotypes.csv',
    plink_file_prefix='/ceph/projects/v4_phase3_paper/inputs/genotypes/_m/LIBD_Brain_TopMed',
    eqtl_output_file='../../../summary_table/_m/Brainseq_sex_interacting_4features_3regions.eFeatures.txt.gz',
    gwas_snp_file='/ceph/projects/v4_phase3_paper/inputs/sz_gwas/pgc2_clozuk/map_phase3/_m/libd_hg38_pgc2sz_snps_p5e_minus8.tsv',
)

## Functions

### Expression functions

In [3]:
@functools.lru_cache()
def tissue_map(tissue):
    '''
    Translate a lower-case tissue key into the label used in the eQTL table.

    Raises KeyError for a tissue that is not one of the three known keys.
    '''
    display_names = {
        "caudate": "Caudate",
        "dlpfc": "DLPFC",
        "hippocampus": "Hippocampus",
    }
    return display_names[tissue]


@functools.lru_cache()
def feature_map(feature):
    '''
    Translate a plural feature key into the singular label used in the
    eQTL table.

    Raises KeyError for a feature that is not one of the four known keys.
    '''
    display_names = {
        "genes": "Gene",
        "transcripts": "Transcript",
        "exons": "Exon",
        "junctions": "Junction",
    }
    return display_names[feature]


@functools.lru_cache()
def get_biomart_df():
    '''
    Load the biomart annotation table named by config['biomart_file'].

    The first CSV column becomes the index, and everything from "[Source"
    to the end of each description is removed. The result is cached, so
    the file is read once per session.
    '''
    biomart = pd.read_csv(config['biomart_file'], index_col=0)
    # Raw string: "\[" is an invalid escape sequence in a normal literal.
    biomart['description'] = biomart['description'].str.replace(r'\[Source.*$', '', regex=True)
    return biomart


@functools.lru_cache()
def get_residual_expression_df():
    '''
    Load the residualized expression CSV (first column as index) and return
    it transposed, so the CSV's rows become the columns of the result.
    Cached after the first call.
    '''
    expression = pd.read_csv(config['residual_expression_file'], index_col=0)
    return expression.T


@functools.lru_cache()
def get_pheno_df():
    '''
    Load the phenotype table indexed by "BrNum", keeping only the
    "RNum", "Sex" and "Dx" columns. Cached after the first call.
    '''
    phenotypes = pd.read_csv(config['phenotype_file'])
    by_brain = phenotypes.set_index("BrNum")
    return by_brain.loc[:, ["RNum", "Sex", "Dx"]]


@functools.lru_cache()
def get_expression_and_pheno_df():
    '''
    Join phenotypes and residualized expression on their indexes.

    Phenotype columns come first; only index values present in both
    tables are kept (default inner merge). Cached after the first call.
    '''
    phenotypes = get_pheno_df()
    expression = get_residual_expression_df()
    return phenotypes.merge(expression, left_index=True, right_index=True)


@functools.lru_cache()
def get_gene_id_df():
    '''
    Map each expression column name to its id with the suffix removed.

    Returns a dataframe with two columns:
      gene_id          the column names of the residualized expression table
      ensembl_gene_id  the same names with the first "." and everything
                       after it removed
    Cached after the first call.
    '''
    feature_ids = get_residual_expression_df().columns
    # Raw string: "\." is an invalid escape sequence in a normal literal.
    return pd.DataFrame({'gene_id': feature_ids,
                         'ensembl_gene_id': feature_ids.str.replace(r'\..+$', '', regex=True)})


@functools.lru_cache()
def gene_info_from_symbol(gene_symbol):
    '''
    Return the biomart rows whose external_gene_name equals gene_symbol,
    left-joined on ensembl_gene_id with the table from get_gene_id_df().
    Cached per symbol.
    '''
    biomart = get_biomart_df()
    matches = biomart[biomart['external_gene_name'] == gene_symbol]
    return matches.merge(get_gene_id_df(), on='ensembl_gene_id', how='left')


@functools.lru_cache()
def gene_id_from_symbol(gene_symbol):
    '''
    Return the gene_id for a gene symbol.

    Asserts that gene_info_from_symbol() yields exactly one row, then
    returns that row's gene_id value. Cached per symbol.
    '''
    info = gene_info_from_symbol(gene_symbol)
    n_rows = info.shape[0]
    assert n_rows == 1
    only_row = info[['gene_id']].iloc[0]
    return only_row.values[0]

### Genotype and eQTL functions

In [4]:
def letter_snp(number, a0, a1):
    '''
    Spell out a genotype from an allele count.

    number is the count of a0 alleles (0, 1 or 2); the remaining
    2 - number alleles are a1. The two alleles are sorted and joined,
    with no separator when both are single characters and with a space
    otherwise. A NaN count returns NaN.

    Example:
    letter_snp(0, 'A', 'G') is 'GG'
    letter_snp(1, 'A', 'G') is 'AG'
    letter_snp(2, 'A', 'G') is 'AA'
    letter_snp(1, 'AT', 'A') is 'A AT'
    '''
    if np.isnan(number):
        return np.nan
    n_a0 = int(number)
    # Multi-character alleles (e.g. indels) need a visible separator.
    sep = '' if len(a0) == 1 and len(a1) == 1 else ' '
    return sep.join(sorted([a0] * n_a0 + [a1] * (2 - n_a0)))


@functools.lru_cache()
def get_plink_tuple():
    '''
    Read the plink file set at config['plink_file_prefix'] and cache it.

    Usage: (bim, fam, bed) = get_plink_tuple()
    '''
    prefix = config['plink_file_prefix']
    return read_plink(prefix)


@functools.lru_cache()
def get_eFeature_df():
    '''
    Load the tab-separated eFeature summary table and keep only the rows
    whose "Type" and "Tissue" match the notebook-level feature and tissue
    (translated through feature_map / tissue_map). Cached after first call.
    '''
    table = pd.read_csv(config["eqtl_output_file"], sep='\t')
    is_feature = table["Type"] == feature_map(feature)
    is_tissue = table["Tissue"] == tissue_map(tissue)
    return table[is_feature & is_tissue]


@functools.lru_cache()
def get_gwas_snps():
    '''
    Load the tab-separated GWAS SNP table named by config['gwas_snp_file'],
    using its first column as the index. Cached after the first call.
    '''
    gwas_path = config['gwas_snp_file']
    return pd.read_csv(gwas_path, sep='\t', index_col=0)


@functools.lru_cache()
def get_risk_allele(snp_id):
    '''
    Return the risk allele for snp_id from its GWAS row: the 'A1' value
    when the row's 'OR' is greater than 1, otherwise the 'A2' value.
    Cached per SNP.
    '''
    gwas_row = get_gwas_snp(snp_id)
    allele_column = 'A1' if gwas_row['OR'].iloc[0] > 1 else 'A2'
    return gwas_row[allele_column].iloc[0]


@functools.lru_cache()
def get_snp_df(snp_id):
    '''
    Build a per-sample genotype dataframe for the SNP snp_id.

    Samples are the index values of the expression/phenotype table that
    also occur in fam['fid']; samples with a missing genotype are dropped.
    The result is cached per SNP, so callers that modify it should work
    on a copy.

    Returned columns:
      <snp_id>_num     2 minus the value read from the bed matrix
                       (original author's note: "The allele count is the
                       same as in the plink files" -- TODO confirm against
                       the pandas_plink counting convention)
      <snp_id>_letter  letter_snp() of that count with the SNP's a0/a1
      <snp_id>         categorical; count and letters joined by a newline

    Example (first two columns):
    get_snp_df('rs653953').head(5)

            rs653953_num rs653953_letter
    Br5168             0              GG
    Br2582             1              AG
    Br2378             1              AG
    Br5155             2              AA
    Br5182             2              AA

    NOTE(review): if snp_id is absent from bim['snp'], snp_info is empty
    and snp_info.iloc[0] fails with an IndexError.
    '''
    (bim, fam, bed) = get_plink_tuple()
    # Samples present both in the expression/phenotype index and in fam['fid'].
    brain_ids = list(set(get_expression_and_pheno_df().index).intersection(set(fam['fid'])))
    snp_info = bim[bim['snp']==snp_id]
    # 'i' is used below as the positional index of the variant in bed.
    snp_pos = snp_info.iloc[0]['i']
    # One 'i' per fid (first occurrence kept), in the same order as brain_ids.
    fam_pos = list(fam.drop_duplicates(subset="fid").set_index('fid').loc[brain_ids]['i'])
    # Select the variant row, materialize it, pick the sample columns, then
    # reshape to one row per sample and drop missing genotypes.
    dfsnp = (pd.DataFrame(bed[[snp_pos]].compute()[:,fam_pos], 
                          columns=brain_ids, index=[snp_id + '_num'])
             .transpose().dropna())
    my_letter_snp = functools.partial(letter_snp, a0=snp_info.iloc[0]['a0'], a1=snp_info.iloc[0]['a1'])
    # Flip the count so that it is the number letter_snp multiplies a0 by.
    dfsnp[[snp_id + '_num']] = 2 - dfsnp[[snp_id + '_num']].astype('int')
    dfsnp[snp_id + '_letter'] = dfsnp[snp_id + '_num'].apply(my_letter_snp)
    # Combined "count<newline>letters" label, stored as a categorical.
    dfsnp[snp_id] = (dfsnp[snp_id + '_num'].astype('str') + '\n' + 
                     dfsnp[snp_id + '_letter'].astype('str')).astype('category')
    return dfsnp


@functools.lru_cache()
def get_gwas_ordered_snp_df(snp_id):
    '''
    Return a copy of get_snp_df(snp_id) with the count column re-oriented
    using the SNP's GWAS row.

    The <snp_id>_num column is replaced by 2 - count when exactly one of
    the following holds for the GWAS row:
      * 'pgc2_a1_same_as_our_counted' is truthy
      * 'OR' is greater than 1 (the condition under which
        get_risk_allele() reports A1 as the risk allele)
    Otherwise the count is left as is. The <snp_id>_letter column is
    untouched, and the combined <snp_id> label is rebuilt from the
    (possibly flipped) count. Original author's note: the resulting count
    is the number of risk alleles according to GWAS.

    Example (first two columns):
    get_gwas_ordered_snp_df('rs653953').head(5)

            rs653953_num rs653953_letter
    Br5168             2              GG
    Br2582             1              AG
    Br2378             1              AG
    Br5155             0              AA
    Br5182             0              AA
    '''
    num_col = snp_id + '_num'
    # Copy: get_snp_df() is cached and its result must not be modified.
    dfsnp = get_snp_df(snp_id).copy()
    gwas_snp = get_gwas_snp(snp_id)
    a1_is_counted = bool(gwas_snp['pgc2_a1_same_as_our_counted'].iloc[0])
    a1_is_risk = bool(gwas_snp['OR'].iloc[0] > 1)
    # Flip only when the two flags disagree.
    if a1_is_counted != a1_is_risk:
        dfsnp[[num_col]] = 2 - dfsnp[[num_col]]
    dfsnp[snp_id] = (dfsnp[num_col].astype('str') + '\n' +
                     dfsnp[snp_id + '_letter'].astype('str')).astype('category')
    return dfsnp


### Plotting functions

In [5]:
def get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func):
    '''
    Combine phenotypes, one expression column and one SNP's genotypes.

    snp_df_func is called with snp_id and must return a dataframe whose
    index can be merged with the expression/phenotype index.
    '''
    wanted_columns = [*get_pheno_df().columns, gene_id]
    expression = get_expression_and_pheno_df()[wanted_columns]
    genotypes = snp_df_func(snp_id)
    return expression.merge(genotypes, left_index=True, right_index=True)
    

def simple_snp_expression_plot_impl(snp_id, gene_id, snp_df_func):
    '''
    Box plot with jittered points of gene_id expression per genotype of
    snp_id, filled by Sex.

    The y axis runs from the 1st percentile minus 0.26 to the 99th
    percentile plus 0.26 of the expression values.
    '''
    plot_df = get_snp_gene_pheno_df(snp_id, gene_id, snp_df_func)
    expression = plot_df[gene_id]
    lower = expression.quantile(.01) - 0.26
    upper = expression.quantile(.99) + 0.26
    return (
        ggplot(plot_df, aes(x=snp_id, y=gene_id, fill='Sex'))
        + geom_boxplot(alpha=0.4, outlier_alpha=0)
        + geom_jitter(position=position_jitterdodge(jitter_width=0.25),
                      stroke=0, alpha=0.6)
        + ylim(lower, upper)
        + theme_bw(base_size=15)
        + theme(panel_grid=element_blank(),
                axis_title=element_text(face="bold"))
    )
    

def simple_snp_expression_plot(snp_id, gene_id):
    '''Expression-by-genotype plot using get_snp_df as the genotype source.'''
    return simple_snp_expression_plot_impl(snp_id, gene_id,
                                           snp_df_func=get_snp_df)


def simple_gwas_ordered_snp_expression_plot(snp_id, gene_id):
    '''
    Expression-by-genotype plot using get_gwas_ordered_snp_df as the
    genotype source.
    '''
    return simple_snp_expression_plot_impl(snp_id, gene_id,
                                           snp_df_func=get_gwas_ordered_snp_df)


def get_gene_symbol(gene_id, biomart=None):
    '''
    Look up the gene symbol and description for a gene id.

    gene_id: id whose first "." and everything after it is ignored.
    biomart: dataframe with 'ensembl_gene_id', 'external_gene_name' and
             'description' columns; defaults to get_biomart_df().

    Returns (symbol, description). Both are '' when the id is not in the
    table; the description is '' when it is not a string (e.g. NaN), and
    anything from "[Source:" onwards is removed from it.
    '''
    if biomart is None:
        # Looked up at call time: a default evaluated in the signature
        # would read the file as soon as this function is defined.
        biomart = get_biomart_df()
    ensembl_id = re.sub(r'\..+$', '', gene_id)
    hits = biomart[biomart['ensembl_gene_id'] == ensembl_id]
    if hits.shape[0] == 0:
        return '', ''
    symbol = hits['external_gene_name'].values[0]
    description = hits['description'].values[0]
    if not isinstance(description, str):
        description = ''
    description = re.sub(r'\[Source:.*$', '', description)
    return symbol, description


def get_gwas_snp(snp_id):
    '''
    Return the single-row dataframe of the GWAS table whose 'our_snp_id'
    equals snp_id. Asserts that exactly one row matches.
    '''
    gwas_table = get_gwas_snps()
    is_match = gwas_table['our_snp_id'] == snp_id
    matches = gwas_table[is_match]
    assert len(matches) == 1
    return matches


def gwas_annotation(snp_id):
    '''Title line giving the SNP's GWAS 'P' value in scientific notation.'''
    gwas_row = get_gwas_snp(snp_id).iloc[0]
    return 'SZ GWAS pvalue: %.1e' % gwas_row['P']


def eqtl_annotation(snp_id, gene_id):
    '''
    Title line giving the 'BF' value of the (variant, gene) pair from the
    eFeature table. Asserts that exactly one row matches the pair.
    '''
    efeatures = get_eFeature_df()
    same_variant = efeatures['variant_id'] == snp_id
    same_gene = efeatures['gene_id'] == gene_id
    matches = efeatures[same_variant & same_gene]
    assert len(matches) == 1
    return 'eQTL adjusted p-value: %.1e' % matches.iloc[0]['BF']


def risk_allele_annotation(snp_id):
    '''Title line naming the allele returned by get_risk_allele(snp_id).'''
    risk_allele = get_risk_allele(snp_id)
    return 'SZ risk allele: %s' % risk_allele


def annotated_eqtl_plot(snp_id, gene_id):
    '''
    Expression-by-genotype plot titled with the gene symbol and the eQTL
    annotation line, with the y axis labelled 'Residualized Expression'.
    '''
    plot = simple_snp_expression_plot(snp_id, gene_id)
    # Only the symbol goes into the title; the description is not used.
    symbol, _description = get_gene_symbol(gene_id)
    title_lines = [symbol, eqtl_annotation(snp_id, gene_id)]
    title = "\n".join(title_lines)
    plot += ggtitle(title) + ylab('Residualized Expression') 
    return plot


def gwas_annotated_eqtl_plot(snp_id, gene_id):
    '''
    GWAS-ordered expression-by-genotype plot titled with four lines: gene
    symbol, eQTL annotation, GWAS annotation and risk-allele annotation.
    The y axis is labelled 'Residualized Expression'.
    '''
    plot = simple_gwas_ordered_snp_expression_plot(snp_id, gene_id)
    # Only the symbol goes into the title; the description is not used.
    symbol, _description = get_gene_symbol(gene_id)
    title_lines = [
        symbol,
        eqtl_annotation(snp_id, gene_id),
        gwas_annotation(snp_id),
        risk_allele_annotation(snp_id),
    ]
    title = "\n".join(title_lines)
    plot += ggtitle(title) + ylab('Residualized Expression') 
    return plot


def save_plot(p, fn):
    '''
    Save plot p three times, as fn.png, fn.pdf and fn.svg (in that order),
    by calling p.save() with each file name.
    '''
    for extension in ('png', 'pdf', 'svg'):
        target = fn + '.' + extension
        p.save(target)
    

## Plot eQTLs

### DRD2

In [6]:
# Display the eFeature rows for DRD2 (an empty table when there are none).
get_eFeature_df().loc[lambda df: df["gene_id"] == gene_id_from_symbol('DRD2')]

Unnamed: 0,variant_id,gene_id,gencodeID,slope,statistic,pval_nominal,BF,eigenMT_BH,TESTS,Type,Tissue


### Top 5 eQTLs

In [7]:
# Keep the filtered eFeature table under a notebook-level name; the cells
# below read it as eqtl_df.
eqtl_df = get_eFeature_df()
# Preview the first rows.
eqtl_df.head()

Unnamed: 0,variant_id,gene_id,gencodeID,slope,statistic,pval_nominal,BF,eigenMT_BH,TESTS,Type,Tissue
55266,chrX:100404152:G:A,ENSG00000000005.5,ENSG00000000005.5,0.497189,17.628291,1.4e-05,0.00442,0.478076,323,Gene,Hippocampus
55267,chr1:196610909:G:A,ENSG00000000971.15,ENSG00000000971.15,-0.476843,-11.561317,1.3e-05,0.001964,0.443931,151,Gene,Hippocampus
55268,chr1:24888756:T:C,ENSG00000001461.16,ENSG00000001461.16,-0.311761,-11.705099,4.2e-05,0.022525,0.634349,536,Gene,Hippocampus
55269,chr7:42749666:G:C,ENSG00000002746.14,ENSG00000002746.14,0.314297,5.959216,7.7e-05,0.033827,0.663881,440,Gene,Hippocampus
55270,chr17:38495906:AT:A,ENSG00000002834.17,ENSG00000002834.17,0.477555,11.40226,2.5e-05,0.015118,0.598413,608,Gene,Hippocampus


In [None]:
# Five rows with the smallest nominal p-value; reset_index(drop=True) makes
# x.Index the 0-based rank used in the file names.
top_5 = eqtl_df.sort_values('pval_nominal').reset_index(drop=True).head(5)
for x in top_5.itertuples():
    filename = "top_%d_eqtl_%s" % (x.Index, tissue)
    p = annotated_eqtl_plot(x.variant_id, x.gene_id)
    print(filename, x.Index, x.variant_id, x.gene_id)
    # Printing the plot object presumably renders it inline -- TODO confirm.
    print(p)
    # Writes filename.png, filename.pdf and filename.svg.
    save_plot(p, filename)
    

### Top 5 eQTLs with X-linked variants

In [None]:
# Restrict to rows whose variant_id contains "chrX", then take the five
# smallest nominal p-values; x.Index is the 0-based rank after the reset.
top_5_x = eqtl_df[eqtl_df['variant_id'].str.contains("chrX")].sort_values("pval_nominal").reset_index(drop=True).head(5)
for x  in top_5_x.itertuples():
    filename = "top_%d_eqtl_xlinked_%s" % (x.Index, tissue)
    p = annotated_eqtl_plot(x.variant_id, x.gene_id)
    print(filename, x.Index, x.variant_id, x.gene_id)
    # Printing the plot object presumably renders it inline -- TODO confirm.
    print(p)
    # Writes filename.png, filename.pdf and filename.svg.
    save_plot(p, filename)

### Top 5 eQTLs with GWAS-significant index SNPs

In [None]:
# Keep only eFeature rows whose variant_id appears as our_snp_id in the GWAS
# table (default inner merge); clashing GWAS column names get a "_gwas" suffix.
gwas_eqtl_df = pd.merge(
    eqtl_df,
    get_gwas_snps(),
    left_on='variant_id',
    right_on='our_snp_id',
    suffixes=['', '_gwas'],
)
print(gwas_eqtl_df.shape)
gwas_eqtl_df.head()

In [None]:
# Rows flagged by is_index_snp, ordered by BF and then by P.
top_gwas_eqtl_df = gwas_eqtl_df[(gwas_eqtl_df['is_index_snp'])].sort_values(['BF', 'P'])
print(top_gwas_eqtl_df.shape)
top_gwas_eqtl_df.head()

In [None]:
# All merged rows (no is_index_snp filter), ordered by BF and then by P, with
# a fresh 0-based index.
# NOTE(review): this reassigns top_gwas_eqtl_df, so any earlier value of that
# name is replaced -- confirm the unfiltered table is the intended one.
top_gwas_eqtl_df = gwas_eqtl_df.sort_values(['BF', 'P']).reset_index(drop=True)
print(top_gwas_eqtl_df.shape)
top_gwas_eqtl_df.head(10)

In [None]:
# First five rows of top_gwas_eqtl_df as it currently stands; x.Index is
# whatever index that table carries.
top_5_gwas = top_gwas_eqtl_df.head(5)
for x  in top_5_gwas.itertuples():
    filename = "top_%d_eqtl_in_gwas_significant_snps_%s" % (x.Index, tissue)
    # The GWAS-annotated variant adds GWAS p-value and risk-allele title lines.
    p = gwas_annotated_eqtl_plot(x.variant_id, x.gene_id)
    print(filename, x.Index, x.variant_id, x.gene_id)
    # Printing the plot object presumably renders it inline -- TODO confirm.
    print(p)
    # Writes filename.png, filename.pdf and filename.svg.
    save_plot(p, filename)