# eQTL boxplot: Enrichment and Overlap of PGC2+CLOZUK

This script was ported from Python to work around an unexplained plotting error.

In [None]:
suppressPackageStartupMessages({
    library(tidyverse)
    library(ggpubr)
})

## Functions

In [None]:
# Feature level analysed in this notebook; read as a global by the loaders
# below and used as the output directory name.
feature <- "genes"

### Cached functions

In [None]:
# Read the full SZ-vs-control differential expression table for the global
# `feature` from ../../differential_expression/_m/<feature>/.
# Returns whatever data.table::fread() parses from that file.
get_de_df <- function(){
    de_path <- paste0("../../differential_expression/_m/", feature,
                      "/diffExpr_szVctl_full.txt")
    data.table::fread(de_path)
}
# Memoised wrapper so repeated calls reuse the first result.
memDE <- memoise::memoise(get_de_df)

# Read the caudate significant eQTL pairs, keep only rows whose `Type`
# matches the global `feature` (translated through feature_map()), and sort
# them by ascending nominal p-value.
get_eqtl_df <- function(){
    pairs_path <- paste0('../../eqtl/caudate/summary_table/_m/',
                         'Brainseq_LIBD_caudate_4features.signifpairs.txt.gz')
    all_pairs <- data.table::fread(pairs_path)
    all_pairs %>%
        filter(Type == feature_map(feature)) %>%
        arrange(pval_nominal)
}
# Memoised wrapper so repeated calls reuse the first result.
memEQTL <- memoise::memoise(get_eqtl_df)

# Read the merged phenotype table from its fixed location on /ceph.
# Returns whatever data.table::fread() parses from that file.
get_pheno_df <- function(){
    pheno_path <- paste0('/ceph/projects/v4_phase3_paper/inputs/',
                         'phenotypes/_m/merged_phenotypes.csv')
    data.table::fread(pheno_path)
}
# Memoised wrapper so repeated calls reuse the first result.
memPHENO <- memoise::memoise(get_pheno_df)

# Read the residualized expression matrix for the global `feature` and move
# its first column (named "V1" by fread) into the row names, so features can
# be looked up by row name.
get_residualized_df <- function(){
    resid_path <- paste0("../../differential_expression/_m/", feature,
                         "/residualized_expression.tsv")
    resid <- data.table::fread(resid_path)
    resid %>% column_to_rownames("V1")
}
# Memoised wrapper so repeated calls reuse the first result.
memRES <- memoise::memoise(get_residualized_df)

# Read the TopMed genotype .traw file and shorten every column name by
# dropping everything from the first underscore onwards.
# NOTE(review): presumably this turns "<FID>_<IID>" sample columns into a bare
# BrNum -- confirm against the .traw header.
get_genotypes <- function(){
    traw_path <- paste0("/ceph/projects/brainseq/genotype/download/topmed/convert2plink/",
                        "filter_maf_01/a_transpose/_m/LIBD_Brain_TopMed.traw")
    genotypes <- data.table::fread(traw_path)
    genotypes %>% rename_with(~ gsub('\\_.*', '', .x))
}
# Memoised wrapper so repeated calls reuse the first result.
memSNPs <- memoise::memoise(get_genotypes)

# Read the PGC2+CLOZUK GWAS SNP table (hg38-mapped) and sort it by ascending
# GWAS p-value (`P`).
get_gwas_snps <- function(){
    gwas_path <- paste0('/ceph/projects/v4_phase3_paper/inputs/sz_gwas/',
                        'pgc2_clozuk/map_phase3/_m/libd_hg38_pgc2sz_snps.tsv')
    data.table::fread(gwas_path) %>% arrange(P)
}
# Memoised wrapper so repeated calls reuse the first result.
memGWAS <- memoise::memoise(get_gwas_snps)

# Combine GWAS SNPs, eQTL pairs and differential expression results.
#
# GWAS rows are matched to eQTL rows on our_snp_id == variant_id (clashing
# column names get the "_PGC2" / "_eQTL" suffixes), then to DE rows on
# gene_id == V1. `agree_direction` is the product of the signs of (OR - 1),
# the eQTL slope and the DE t-statistic, negated when the GWAS A1 allele is
# not the allele counted in our genotypes.
get_integration_df <- function(){
    gwas_eqtl <- inner_join(memGWAS(), memEQTL(),
                            by=c("our_snp_id"="variant_id"),
                            suffix=c("_PGC2", "_eQTL"))
    gwas_eqtl_de <- inner_join(gwas_eqtl, memDE(), by=c("gene_id"="V1"))
    gwas_eqtl_de %>%
        mutate(agree_direction=sign(OR -1) * sign(slope) * sign(t) *
                   ifelse(pgc2_a1_same_as_our_counted, 1, -1))
}
# Memoised wrapper so repeated calls reuse the first result.
memMERGE <- memoise::memoise(get_integration_df)

# Build the per-sample table used for eQTL boxplots of one SNP/feature pair.
#
# variant_id: SNP identifier as found in the `SNP` column of the genotypes.
# gene_id:    row name of the feature in the residualized expression matrix.
#
# Returns the genotype table (BrNum, SNP dosage, COUNTED, ALT, LETTER, ID)
# inner-joined by BrNum to residualized expression plus phenotypes, with all
# character columns converted to factors.
get_snp_df <- function(variant_id, gene_id){
    annot <- filter(get_geno_annot(), SNP == variant_id)
    # One row per sample: transpose the single SNP row and attach its alleles.
    geno <- get_snps_df() %>% filter(SNP == variant_id) %>%
        column_to_rownames("SNP") %>% t %>% as.data.frame %>%
        rownames_to_column("BrNum") %>%
        mutate(COUNTED=annot$COUNTED, ALT=annot$ALT) %>%
        rename("SNP"=all_of(variant_id))
    # Expression of the requested feature, one row per RNum, with phenotypes.
    expr <- memRES()[gene_id, ] %>% t %>% as.data.frame %>%
        rownames_to_column("RNum") %>%
        inner_join(memPHENO(), by="RNum")
    ## Annotated SNPs
    genotype_labels <- unlist(Map(letter_snp, geno$SNP, geno$COUNTED, geno$ALT),
                              use.names=FALSE)
    geno <- geno %>%
        mutate(LETTER=genotype_labels, ID=paste(SNP, LETTER, sep="\n"))
    merged <- inner_join(geno, expr, by="BrNum") %>%
        mutate_if(is.character, as.factor)
    return(merged)
}
# Memoised wrapper: results are cached per (variant_id, gene_id) pair.
memDF <- memoise::memoise(get_snp_df)

### Simple functions

In [None]:
# Translate a feature directory name ("genes", "transcripts", "exons",
# "junctions") into the matching singular label; unknown names give NULL.
feature_map <- function(feature){
    labels <- list("genes"="Gene", "transcripts"= "Transcript",
                   "exons"= "Exon", "junctions"= "Junction")
    labels[[feature]]
}

# SNP annotation columns (CHR, SNP, POS, COUNTED, ALT) of the cached genotypes.
get_geno_annot <- function(){
    genotypes <- memSNPs()
    select(genotypes, CHR, SNP, POS, COUNTED, ALT)
}

# SNP identifier plus every column whose name starts with "Br" from the
# cached genotypes.
get_snps_df <- function(){
    genotypes <- memSNPs()
    select(genotypes, "SNP", starts_with("Br"))
}

# Spell out a genotype as letters from an allele count.
#
# number: copies of `a0` carried (0, 1 or 2); NA gives NA.
# a0:     allele that `number` counts.
# a1:     the other allele; it fills the remaining 2 - number copies.
#
# Single-base alleles are concatenated directly ("AG"); when either allele is
# longer than one character the two copies are separated by a space
# ("AT A") so the label stays readable. The previous check used length(),
# which is always 1 for a scalar allele, so the separated form was never
# produced.
letter_snp <- function(number, a0, a1){
    if(is.na(number)){ return(NA) }
    single_base <- isTRUE(nchar(as.character(a0)) == 1 &&
                          nchar(as.character(a1)) == 1)
    separator <- if(single_base) "" else " "
    copies <- c(rep(a0, number), rep(a1, 2 - number))
    return(paste(copies, collapse = separator))
}

# Save plot `p` as <fn>.pdf, <fn>.png and <fn>.svg with width `w` and
# height `h` (passed straight to ggsave()).
save_ggplots <- function(fn, p, w, h){
    for(suffix in c('.pdf', '.png', '.svg')){
        out_path <- paste0(fn, suffix)
        ggsave(out_path, plot=p, width=w, height=h)
    }
}

# Read the biomart annotation table from ../_h/biomart.csv.
# The table is returned explicitly (the old body ended on an assignment and
# therefore returned it invisibly), matching the other loaders.
get_biomart_df <- function(){
    return(data.table::fread("../_h/biomart.csv"))
}
# Memoised wrapper so repeated calls reuse the first result.
memMART <- memoise::memoise(get_biomart_df)

# Look up the gene symbol for an Ensembl gene ID in the biomart table.
#
# gene_id: Ensembl ID, with or without a ".version" suffix (the suffix is
#          stripped before matching against `ensembl_gene_id`).
#
# Returns the first matching `external_gene_name`, or "" when there is no
# match. Only the first match is returned so callers always get one string,
# even if the biomart table lists the gene on several rows.
get_gene_symbol <- function(gene_id){
    ensemblID = gsub("\\..*", "", gene_id)
    mart = memMART()
    # which() drops NA comparisons, as dplyr::filter() did.
    hits = mart$external_gene_name[which(mart$ensembl_gene_id == ensemblID)]
    if(length(hits) == 0){
        return("")
    }
    return(hits[1])
}

# Draw, print and save a genotype-vs-expression boxplot for one eQTL.
#
# fn:         output path without extension (see save_ggplots()).
# gene_id:    feature to plot; also the y-axis column name.
# variant_id: SNP whose genotype labels form the x axis.
# eqtl_annot: text placed on the third line of the title, under the gene
#             symbol and gene_id.
plot_simple_eqtl <- function(fn, gene_id, variant_id, eqtl_annot){
    plot_df <- memDF(variant_id, gene_id)
    plot_title <- paste(get_gene_symbol(gene_id), gene_id, eqtl_annot, sep='\n')
    fig <- ggboxplot(plot_df, x="ID", y=gene_id, fill="red", add="jitter",
                     xlab="", ylab="Residualized Expression",
                     outlier.shape=NA, add.params=list(alpha=0.5), alpha=0.4,
                     ggtheme=theme_pubr(base_size=20, border=TRUE)) +
        font("xy.title", face="bold") +
        ggtitle(plot_title) +
        theme(plot.title = element_text(hjust = 0.5, face="bold"))
    print(fig)
    save_ggplots(fn, fig, 7, 7)
}

### GWAS plots

In [None]:
# Pick the risk allele from a GWAS odds ratio: A1 when OR > 1, otherwise A2.
# Vectorised through ifelse().
get_risk_allele <- function(OR, A1, A2){
    ifelse(OR > 1, A1, A2)
}

# eQTL pairs inner-joined to the GWAS SNP table.
# The GWAS table identifies SNPs by `our_snp_id` (the key used in
# get_integration_df()), not `variant_id`, so the join maps the eQTL
# `variant_id` onto it; the result keeps the `variant_id` column name.
get_df <- function(){
    return(memEQTL() %>%
           inner_join(memGWAS(), by=c("variant_id"="our_snp_id")))
}

# Per-sample genotype/expression table with the dosage re-oriented for GWAS
# plots.
#
# The dosage is replaced by 2 - SNP (and the ID label rebuilt from the new
# number and the unchanged LETTER) when
#   - GWAS A1 is NOT our counted allele and OR < 1, or
#   - GWAS A1 IS our counted allele and OR > 1.
# NOTE(review): in both cases the counted allele looks like the risk allele,
# so the flipped dosage would count the other allele -- confirm this is the
# intended orientation.
get_gwas_ordered_snp_df <- function(variant_id, gene_id, pgc2_a1_same_as_our_counted, OR){
    geno_expr <- memDF(variant_id, gene_id)
    # Fix bug with matching alleles!
    flip_dosage <- if(!pgc2_a1_same_as_our_counted) OR < 1 else OR > 1
    if(flip_dosage){
        geno_expr <- geno_expr %>%
            mutate(SNP = 2-SNP, ID=paste(SNP, LETTER, sep="\n"))
    }
    return(geno_expr)
}

# Draw, print and save a genotype-vs-expression boxplot split by diagnosis.
#
# fn:         output path without extension (see save_ggplots()).
# gene_id:    feature to plot; also the y-axis column name.
# variant_id: SNP to plot; also used as the x-axis label.
# pgc2_a1_same_as_our_counted, OR: passed to get_gwas_ordered_snp_df() to
#             orient the dosage.
# title:      plot title.
#
# Only samples with Dx "CTL" or "SZ" and Age > 17 are plotted.
plot_gwas_eqtl_pheno <- function(fn, gene_id, variant_id, pgc2_a1_same_as_our_counted, OR, title){
    plot_df <- get_gwas_ordered_snp_df(variant_id, gene_id,
                                       pgc2_a1_same_as_our_counted, OR) %>%
        mutate_if(is.character, as.factor) %>%
        filter(Dx %in% c("CTL", "SZ"), Age > 17)
    fig <- ggboxplot(plot_df, x="ID", y=gene_id, fill="Dx", color="Dx",
                     add="jitter", xlab=variant_id,
                     ylab="Residualized Expression", outlier.shape=NA,
                     add.params=list(alpha=0.5), alpha=0.4, legend="bottom",
                     ggtheme=theme_pubr(base_size=20, border=TRUE)) +
        font("xy.title", face="bold") +
        ggtitle(title) +
        theme(plot.title = element_text(hjust = 0.5, face="bold"))
    print(fig)
    save_ggplots(fn, fig, 7, 9)
}

## Integration analysis

In [None]:
# Output directory for this feature; skipped when it already exists so
# re-running the notebook does not raise a warning.
if(!dir.exists(feature)){
    dir.create(feature, recursive=TRUE)
}

### Enrichment

#### Integrate DEG with PGC2+CLOZUK SNPs

In [None]:
# Relabel the direction product: 1 -> "Yes", -1 -> "No", anything else -> 0
# (stored as "0" because the column becomes character).
dft <- memMERGE() %>%
    mutate(agree_direction=ifelse(agree_direction == 1, "Yes",
                                  ifelse(agree_direction == -1, "No", 0)))
dim(dft)

In [None]:
# Count of SNP-feature pairs per agree_direction label.
table(dft$agree_direction)

In [None]:
# 2x2 contingency table of genome-wide GWAS significance (rows) against
# differential expression significance (columns), tested with Fisher's exact
# test. Stored under its own name so it does not shadow base::table().
gwas_sig <- dft$P < 5e-8
de_sig <- dft$adj.P.Val < 0.05
enrichment_table <- matrix(c(sum(gwas_sig  & de_sig),
                             sum(!gwas_sig & de_sig),
                             sum(gwas_sig  & !de_sig),
                             sum(!gwas_sig & !de_sig)),
                           nrow=2,
                           dimnames=list(GWAS=c("P<5e-8", "P>=5e-8"),
                                         DE=c("adj.P<0.05", "adj.P>=0.05")))
print(enrichment_table)
fisher.test(enrichment_table)

In [None]:
# Genome-wide significant GWAS SNPs that are eQTLs for DE features.
# The GWAS threshold is strict (P < 5e-8) to match the enrichment table.
dft2 = dft %>% filter(P < 5e-8, `adj.P.Val` < 0.05) %>%
    # NOTE(review): eqtl_slope is computed with the same expression as
    # eqtl_gwas_dir -- confirm whether it was meant to be sign(slope) alone.
    mutate(eqtl_gwas_dir=sign(OR -1) * sign(slope) * ifelse(pgc2_a1_same_as_our_counted, 1, -1), 
           de_dir=sign(t), eqtl_slope=sign(OR -1)*sign(slope)*ifelse(pgc2_a1_same_as_our_counted, 1, -1)) %>% 
    #rowwise() %>% mutate(risk_allele=get_risk_allele(our_snp_id)) %>%
    select(gene_id, Symbol, our_snp_id, rsid, A1, A2, OR, P, pval_nominal, adj.P.Val, logFC, 
           t, eqtl_slope, de_dir, eqtl_gwas_dir, agree_direction, pgc2_a1_same_as_our_counted) %>% 
    rename("variant_id"="our_snp_id") %>%
    # Turn empty strings into NA in character columns only: na_if() with ""
    # is an error on numeric/logical columns in dplyr >= 1.1.
    mutate(across(where(is.character), ~na_if(.x, ""))) %>%
    # Fall back to the gene ID when no symbol is available.
    mutate(Symbol = coalesce(Symbol,gene_id))
dft2 %>% data.table::fwrite(paste0(feature, "/integration_by_symbol.txt"), sep='\t')
dim(dft2)

In [None]:
# Keep the first row of every gene (in dft2's current row order), then drop
# the grouping so later steps work on a plain table, and sort by GWAS p-value.
df = dft2 %>% group_by(gene_id) %>% slice(1) %>% ungroup() %>% arrange(P)
table(df$agree_direction)

In [None]:
# Display the one-row-per-gene table that drives the plots below.
df

### Plot with PGC2 risk allele

In [None]:
# One GWAS/eQTL/diagnosis boxplot per row of `df`, saved as
# <feature>/eqtl_gwas_<Symbol with dots replaced by underscores>.
for(num in seq_len(nrow(df))){
    gene_id <- df$gene_id[num]
    gene_name <- df$Symbol[num]
    variant_id <- df$variant_id[num]
    OR <- df$OR[num]
    A1 <- df$A1[num]
    A2 <- df$A2[num]
    pgc2_a1_same_as_our_counted <- df$pgc2_a1_same_as_our_counted[num]
    fn <- paste0(feature, "/eqtl_gwas_", gsub("\\.", "_", gene_name))
    # Title lines: symbol, gene ID, GWAS p-value, risk allele, eQTL p-value,
    # DE adjusted p-value.
    gwas_annot <- paste("SZ GWAS pvalue:", signif(df$P[num], 2))
    risk_annot <- paste("SZ risk allele:", get_risk_allele(OR, A1, A2))
    eqtl_annot <- paste("eQTL nominal p-value:", signif(df$pval_nominal[num], 2))
    de_annot <- paste('DE adj.P.Val:', signif(df$adj.P.Val[num], 2))
    title <- paste(get_gene_symbol(gene_id), gene_id, gwas_annot,
                   risk_annot, eqtl_annot, de_annot, sep='\n')
    plot_gwas_eqtl_pheno(fn, gene_id, variant_id,
                         pgc2_a1_same_as_our_counted, OR, title)
    #print(title)
}

## Session Info

In [None]:
# Record when the notebook ran, how long it took, and the package versions.
Sys.time()
proc.time()
# Widen the console so the session info table is not wrapped.
options(width = 120)
sessioninfo::session_info()