# eQTpLot for eQTL-GWAS colocalization

In [1]:
import pandas as pd
import numpy as np
import subprocess
import functools

In [2]:
%load_ext rpy2.ipython

## Functions

In [3]:
def flip_slope_by_allele(row):
    """Return the row's ``NES`` with its sign set by the allele-match flag.

    ``row`` must support key lookup for ``"pgc2_a1_same_as_our_counted"``
    and ``"NES"``.  A truthy flag leaves the slope as is (multiplied by 1);
    a falsy flag multiplies it by -1.
    """
    sign = 1 if bool(row["pgc2_a1_same_as_our_counted"]) else -1
    return sign * row["NES"]


def get_ld(fn, eqtl_dfx, gwas_dfx, label):
    """Run plink ``--r2`` on the SNPs shared by a GWAS and an eQTL table.

    Parameters
    ----------
    fn : str
        Path handed to plink's ``--keep-fam`` option.
    eqtl_dfx : pandas.DataFrame
        eQTL table; must have a ``SNP.Id`` column.
    gwas_dfx : pandas.DataFrame
        GWAS table; must have ``SNP`` and ``P`` columns.
    label : str
        Suffix used for the working files written to the current directory:
        ``snps_<label>.txt`` and the plink outputs ``shared_snps_<label>.*``.

    Returns
    -------
    pandas.DataFrame
        The first seven columns of ``shared_snps_<label>.ld``.

    Raises
    ------
    subprocess.CalledProcessError
        If plink or sed exits with a non-zero status.  Without this check a
        failed run would go unnoticed and a stale ``.ld`` file left over from
        an earlier run could be read instead.
    """
    shared_df = gwas_dfx.merge(eqtl_dfx, left_on='SNP', right_on='SNP.Id')\
                        .sort_values('P', ascending=True)
    snp_list = 'snps_%s.txt' % label
    out_prefix = 'shared_snps_%s' % label
    ld_file = out_prefix + '.ld'
    shared_df[['SNP.Id']].to_csv(snp_list, index=False, header=False)
    # Argument lists (no shell) so paths and labels are passed verbatim.
    plink_cmd = [
        '/ceph/opt/plink-ng/1.9/plink',
        '--bfile', '/ceph/users/jbenja13/github_projects/sex_differences_sz/input/genotypes/subset_by_sex/shared_snps/_m/LIBD_Brain_TopMed',
        '--extract', snp_list,
        '--keep-fam', fn, '--r2', 'inter-chr',
        '--write-snplist', '--ld-window-r2', '0',
        '--out', out_prefix,
    ]
    subprocess.run(plink_cmd, check=True)
    # Rewrite the space-padded output in place: remove the first run of
    # spaces on each line, then turn every remaining run into one tab.
    subprocess.run(['sed', '-i', 's/ \\+//; s/ \\+/\t/g', ld_file], check=True)
    return pd.read_csv(ld_file, sep='\t', usecols=[*range(7)])

In [4]:
@functools.lru_cache()
def get_gwas():
    """Read the GWAS summary-statistics TSV and memoise the result.

    The file is parsed as tab-separated with its first column as the index
    and ``chrN`` kept as strings.  Because of ``lru_cache`` every call
    returns the same DataFrame object, so callers should not modify it.
    """
    gwas_path = ('/ceph/projects/v4_phase3_paper/inputs/sz_gwas/'
                 'pgc2_clozuk/map_phase3/zscore/_m/libd_hg38_pgc2sz_snps.tsv')
    table = pd.read_csv(gwas_path, sep="\t", dtype=dict(chrN=str), index_col=0)
    return table


@functools.lru_cache()
def _subset_gwas_cached(chrom, pos, window):
    """Build (once per argument triple) the GWAS slice used by ``subset_gwas``."""
    full = get_gwas()
    in_window = ((full['chrN'] == chrom) &
                 (full['pos'] > pos - window) &
                 (full['pos'] < pos + window))
    gwas_df = full.loc[in_window, ['chrN', 'pos', 'our_snp_id', 'P']]\
                  .rename(columns={'chrN': 'CHR', 'pos': 'BP', 'our_snp_id': 'SNP'})
    ## Effect size as log(OR), computed for the selected rows only.  No sign
    ## flipping happens here; the allele-match flag is carried along below.
    gwas_df['BETA'] = np.log(full.loc[in_window, 'OR'])
    gwas_df['PHE'] = 'SCZD'
    gwas_df['CHR'] = gwas_df['CHR'].astype(int)
    gwas_df['pgc2_a1_same_as_our_counted'] = full.loc[in_window, 'pgc2_a1_same_as_our_counted']
    return gwas_df


def subset_gwas(chrom, pos, window):
    """Return GWAS rows on ``chrom`` with ``pos - window < pos < pos + window``.

    Parameters
    ----------
    chrom : str
        Chromosome label, compared against the string-typed ``chrN`` column.
    pos : int
        Centre of the window.
    window : int or float
        Half-width of the window; both bounds are exclusive.

    Returns
    -------
    pandas.DataFrame
        Columns ``CHR`` (int), ``BP``, ``SNP``, ``P``, ``BETA`` (log of
        ``OR``), ``PHE`` (constant ``'SCZD'``) and
        ``pgc2_a1_same_as_our_counted``.

    The slice itself is memoised, but each call returns a fresh copy so that
    in-place edits by the caller (e.g. ``drop(..., inplace=True)``) cannot
    corrupt the cached table.
    """
    return _subset_gwas_cached(chrom, pos, window).copy()


@functools.lru_cache()
def get_eqtl(fn, feature):
    """Read the header plus all rows for one feature from a gzipped table.

    Parameters
    ----------
    fn : str
        Path to a gzip-compressed, tab-separated file with a header line.
    feature : str
        Value the first whitespace-delimited field of a row must equal.

    Returns
    -------
    pandas.DataFrame
        Header-named columns and the matching rows.  The result is memoised
        per ``(fn, feature)``, so repeated calls return the same object.

    Raises
    ------
    subprocess.CalledProcessError
        If ``zcat`` or ``awk`` exits with a non-zero status.

    The file is streamed through ``zcat | awk`` in a single pass (it used to
    be decompressed twice), and the commands are argument lists rather than a
    shell string, so ``fn`` and ``feature`` are never parsed by a shell.
    """
    # NR == 1 keeps the header line; (feature "") forces a string comparison
    # even when the value looks numeric.  awk still processes backslash
    # escapes in a -v value, as it did for the quoted literal used before.
    awk_program = 'NR == 1 || $1 == (feature "")'
    zcat_cmd = ['zcat', fn]
    awk_cmd = ['awk', '-v', 'feature=%s' % feature, awk_program]
    with subprocess.Popen(zcat_cmd, stdout=subprocess.PIPE) as unzip, \
         subprocess.Popen(awk_cmd, stdin=unzip.stdout,
                          stdout=subprocess.PIPE) as picker:
        # awk now owns the read end of the pipe; release our handle to it.
        unzip.stdout.close()
        df = pd.read_csv(picker.stdout, sep='\t')
    for proc in (unzip, picker):
        if proc.returncode != 0:
            raise subprocess.CalledProcessError(proc.returncode, proc.args)
    return df


@functools.lru_cache()
def annotate_eqtls(fn, feature, tissue):
    """Reshape the rows returned by ``get_eqtl`` into a five-column table.

    The result has columns ``SNP.Id``, ``Gene.Symbol``, ``P.Value`` and
    ``NES`` (taken from ``variant_id``, ``gene_id``, ``pval_nominal`` and
    ``slope``) plus a constant ``Tissue`` column, on the same index as the
    input.  Results are memoised per ``(fn, feature, tissue)``.
    """
    raw = get_eqtl(fn, feature)
    # target column -> source column, in output order
    column_map = {
        'SNP.Id': 'variant_id',
        'Gene.Symbol': 'gene_id',
        'P.Value': 'pval_nominal',
        'NES': 'slope',
    }
    data = {target: raw[source] for target, source in column_map.items()}
    data['Tissue'] = tissue
    return pd.DataFrame(data, index=raw.index)

In [5]:
def get_eqtl_by_genes(sex, tissue, gene):
    """Return the annotated eQTL table for ``gene`` in one tissue/sex run.

    The all-pairs file is located under
    ``prep_eqtl_analysis/by_sex/<tissue>/<sex>/`` relative to the working
    directory, and the lookup is delegated to ``annotate_eqtls``.
    """
    allpairs_template = ('../../../../prep_eqtl_analysis/by_sex/%s/%s/'
                         'prepare_expression/fastqtl_nominal/_m/Brainseq_LIBD.allpairs.txt.gz')
    allpairs_path = allpairs_template % (tissue, sex)
    return annotate_eqtls(allpairs_path, gene, tissue)


def merge_gwas(eqtl_df, gwas_df):
    """Orient eQTL slopes using the GWAS allele-match flag.

    Parameters
    ----------
    eqtl_df : pandas.DataFrame
        eQTL table with at least ``SNP.Id`` and ``NES`` columns.
    gwas_df : pandas.DataFrame
        GWAS table with ``CHR``, ``SNP``, ``BP``, ``P``, ``BETA``, ``PHE``
        and ``pgc2_a1_same_as_our_counted`` columns.

    Returns
    -------
    pandas.DataFrame
        The eQTL rows (left join on ``SNP.Id`` == ``SNP``) without any of the
        GWAS columns.  ``NES`` is negated where the flag is false.  SNPs
        absent from ``gwas_df`` are treated as matching and keep their sign.

    Only the flag is defaulted for unmatched SNPs.  A frame-wide
    ``fillna(True)`` would also overwrite genuine missing values in other
    columns (e.g. ``P.Value`` or ``NES``) with ``True``.
    """
    merged = pd.merge(eqtl_df, gwas_df, left_on="SNP.Id", right_on="SNP", how="left")\
               .drop(["CHR", "SNP", "BP", "P", "BETA", "PHE"], axis=1)
    flag = merged.pop("pgc2_a1_same_as_our_counted")
    # A missing flag (SNP not in the GWAS table) counts as "same allele".
    same_allele = flag.isna() | flag.astype(bool)
    # Vectorised sign flip; unlike a row-wise apply this also works on an
    # empty frame.
    merged["NES"] = merged["NES"].where(same_allele, -merged["NES"])
    return merged


def get_ld_by_tissue(eqtl_df, gwas_df, tissue, sex, label):
    """Call ``get_ld`` with the keep-family file of one tissue/sex run.

    The working-file label passed on is ``<label>_<tissue>_<sex>``.
    """
    keep_fam = "../../../../prep_eqtl_analysis/by_sex/%s/%s/_m/keepFam.txt" % (tissue, sex)
    run_label = "%s_%s_%s" % (label, tissue, sex)
    return get_ld(keep_fam, eqtl_df, gwas_df, run_label)

## Prepare data

### GWAS summary statistics data frame

In [6]:
## Parameters: chromosome, centre position and half-width of the window
chrom = 15
pos = 90868592
window = 2e5
## Load GWAS summary stats for that window (chromosome passed as a string)
gwas_df = subset_gwas(str(chrom), pos, window)
print(gwas_df.dtypes, gwas_df.shape, sep="\n")
gwas_df.head(2)

  mask |= (ar1 == a)


CHR                              int64
BP                               int64
SNP                             object
P                              float64
BETA                           float64
PHE                             object
pgc2_a1_same_as_our_counted       bool
dtype: object
(1188, 7)


Unnamed: 0,CHR,BP,SNP,P,BETA,PHE,pgc2_a1_same_as_our_counted
1797695,15,90668725,chr15:90668725:TTTCAGAGATAG:T,0.4831,-0.0085,SCZD,True
1797696,15,90669048,chr15:90669048:T:C,0.6677,0.004789,SCZD,False


### Gene data frame

In [None]:
# One-row gene annotation table (coordinates, gene ID, genome build)
genes_df = pd.DataFrame([
    {
        'CHR': 15,
        'Start': 90868592,
        'Stop': 90883458,
        'Gene': "ENSG00000140564.10",
        'Build': 'hg38',
    },
])
genes_df

### Significant p-values

In [None]:
## Nominal p-value threshold per feature, taken from the permutation analysis
perm_pval = pd.DataFrame(
    [("ENSG00000140564.10", 0.01)],
    columns=['Gene', 'perm_pval'],
)
perm_pval

### eQTL data frame

In [None]:
# Load the eQTL rows for the gene of interest in every tissue/sex run.
# Order: 1 = DLPFC female, 2 = DLPFC male,
#        3 = hippocampus female, 4 = hippocampus male.
eqtl_df1, eqtl_df2, eqtl_df3, eqtl_df4 = (
    get_eqtl_by_genes(sex, tissue, genes_df.Gene[0])
    for tissue in ("dlpfc", "hippocampus")
    for sex in ("female", "male")
)

In [None]:
## DLPFC, female: orient slopes by the GWAS allele flag
# NOTE: not idempotent -- re-running this cell passes the already adjusted
# frame back into merge_gwas.
eqtl_df1 = merge_gwas(eqtl_df1, gwas_df)
print(eqtl_df1.shape)
eqtl_df1.sort_values(by="P.Value").iloc[:5]

In [None]:
## DLPFC, male: orient slopes by the GWAS allele flag
# NOTE: not idempotent -- re-running this cell passes the already adjusted
# frame back into merge_gwas.
eqtl_df2 = merge_gwas(eqtl_df2, gwas_df)
print(eqtl_df2.shape)
eqtl_df2.sort_values(by="P.Value").iloc[:5]

In [None]:
## Hippocampus, female: orient slopes by the GWAS allele flag
# NOTE: not idempotent -- re-running this cell passes the already adjusted
# frame back into merge_gwas.
eqtl_df3 = merge_gwas(eqtl_df3, gwas_df)
print(eqtl_df3.shape)
eqtl_df3.sort_values(by="P.Value").iloc[:5]

In [None]:
## Hippocampus, male: orient slopes by the GWAS allele flag
# NOTE: not idempotent -- re-running this cell passes the already adjusted
# frame back into merge_gwas.
eqtl_df4 = merge_gwas(eqtl_df4, gwas_df)
print(eqtl_df4.shape)
eqtl_df4.sort_values(by="P.Value").iloc[:5]

In [None]:
# The allele-orientation flag is no longer needed once the eQTL slopes have
# been oriented.  Rebind rather than drop in place, so that any other
# reference to this table (e.g. a memoised result of subset_gwas) keeps the
# column; errors="ignore" makes the cell safe to re-run.
gwas_df = gwas_df.drop(columns=["pgc2_a1_same_as_our_counted"], errors="ignore")
print(gwas_df.shape)

### LD data frame

In [None]:
# Pairwise LD for the SNPs shared with the GWAS table, one run per
# tissue/sex combination; working files are labelled "furin_<tissue>_<sex>".
ld_df1 = get_ld_by_tissue(eqtl_df=eqtl_df1, gwas_df=gwas_df,
                          tissue="dlpfc", sex="female", label="furin")
ld_df2 = get_ld_by_tissue(eqtl_df=eqtl_df2, gwas_df=gwas_df,
                          tissue="dlpfc", sex="male", label="furin")
ld_df3 = get_ld_by_tissue(eqtl_df=eqtl_df3, gwas_df=gwas_df,
                          tissue="hippocampus", sex="female", label="furin")
ld_df4 = get_ld_by_tissue(eqtl_df=eqtl_df4, gwas_df=gwas_df,
                          tissue="hippocampus", sex="male", label="furin")

## Plot

In [None]:
%%R -i gwas_df,genes_df,perm_pval
# The -i option above passes the Python objects gwas_df, genes_df and
# perm_pval into the R session under the same names.
library(eQTpLot)
# Echo the threshold table to confirm the transfer.
perm_pval

### DLPFC

#### Female

In [None]:
%%R -i ld_df1,eqtl_df1
# DLPFC, female: bring in the matching LD and eQTL tables.
ld_df = ld_df1
eqtl_df = eqtl_df1
# eQTL significance threshold and gene ID from the permutation table
pval = perm_pval$perm_pval[1]
gene = perm_pval$Gene[1]

# `tissue` must match a value in eqtl_df$Tissue. These tables were built
# with Tissue = "dlpfc", so 'caudate' (a leftover from another analysis)
# would select no rows.
p = eQTpLot(GWAS.df = gwas_df, eQTL.df = eqtl_df, Genes.df = genes_df, getplot=FALSE,
            LD.df = ld_df, LDmin = 10, R2min = 0.25, LDcolor = 'black', gene = gene, 
            trait = 'SCZD', gbuild = 'hg38', tissue = 'dlpfc', sigpvalue_eQTL = pval, 
            CollapseMethod = "min", congruence = FALSE)

#### Male

In [None]:
%%R -i ld_df2,eqtl_df2
# DLPFC, male: bring in the matching LD and eQTL tables.
ld_df = ld_df2
eqtl_df = eqtl_df2
# eQTL significance threshold and gene ID from the permutation table
pval = perm_pval$perm_pval[1]
gene = perm_pval$Gene[1]

# `tissue` must match a value in eqtl_df$Tissue. These tables were built
# with Tissue = "dlpfc", so 'caudate' (a leftover from another analysis)
# would select no rows.
p = eQTpLot(GWAS.df = gwas_df, eQTL.df = eqtl_df, Genes.df = genes_df, getplot=FALSE,
            LD.df = ld_df, LDmin = 10, R2min = 0.25, LDcolor = 'black', gene = gene, 
            trait = 'SCZD', gbuild = 'hg38', tissue = 'dlpfc', sigpvalue_eQTL = pval, 
            CollapseMethod = "min", congruence = FALSE)