#### Summary:
In this notebook we'll run FINRICH on different sets of peaks to test for enrichment of GWAS credible sets. The two peak sets are: phenotype-associated peaks and caQTL feature peaks (all cREs with 1+ significant SNPs for each cell type, plus cell type- or lineage-specific ones based on mashR outputs). Required inputs:
- ATAC DESeq phenotype association outputs
- Summary df of per cRE caQTL cell type results
- MashR raw results
- Background sets of peaks (all peaks tested with DESeq OR all peaks accessible in cell type)

In [1]:
suppressMessages(library(tidyverse))
suppressMessages(library(ggplot2))
suppressMessages(library(data.table))
suppressMessages(library(ggrepel))
suppressMessages(library(vroom))
suppressMessages(library(parallel))

# Set up

In [2]:
# Cell type labels; `major_celltypes` is the subset iterated over by the FINRICH cells below
cell_types <- c(
    "acinar", "alpha", "beta", "bulk", "delta",
    "ductal", "endothelial", "gamma", "immune", "stellate"
)
major_celltypes <- c("beta", "alpha", "delta", "gamma", "acinar", "ductal")

In [3]:
# Color palette: named character vector mapping cell type -> hex color
major_colors <- c(
    "acinar"      = "#09850B",
    "alpha"       = "#F51646",
    "beta"        = "#3F98E0",
    "delta"       = "#FFA946",
    "ductal"      = "#F5DE6C",
    "endothelial" = "#752BBA",
    "gamma"       = "#A1D99B",
    "immune"      = "#F598C9",
    "stellate"    = "#AB7345"
)

In [8]:
# Directory holding the merged peak files; the caQTL FINRICH runs below read their
# background peak sets from here
peak_dir <- '/nfs/lab/projects/multiomic_islet/outputs/multiome/call_peaks/recluster_final_majorCTs_v2'
# Root of the DESeq trait-association outputs (one sub-directory per trait);
# NOTE(review): this looks like a placeholder path -- set it before running
deseq_outdir <- '/path/to/trait/assoc/results/ATAC/DESeq'

In [9]:
# Final credible set BED files, one per GWAS trait, all under `credset_dir`
credset_dir <- "/dir/with/hg38/credible/sets/"
t1d_fp   <- file.path(credset_dir, "T1D_Chiou_2021_cred_set.bed")
t2d_fp   <- file.path(credset_dir, "T2D_DIAMANTE_multiancestry.cred99.hg38.bed")
glu2h_fp <- file.path(credset_dir, "2hGlu_MAGIC_trans_ancestry_pseudo_credset.LDproxyRsq0.8.bed")
fg_fp    <- file.path(credset_dir, "FG_MAGIC_trans_ancestry_pseudo_credset.LDproxyRsq0.8.bed")
fi_fp    <- file.path(credset_dir, "FI_MAGIC_trans_ancestry_pseudo_credset.LDproxyRsq0.8.bed")
hba1c_fp <- file.path(credset_dir, "HbA1c_MAGIC_trans_ancestry_pseudo_credset.LDproxyRsq0.8.bed")

In [10]:
# Named vector: GWAS label -> credible set file path (same order as the labels)
gwas_prefixes <- c("T1D", "T2D", "2hGlu", "FG", "FI", "HbA1c")
gwas_fps <- setNames(
    c(t1d_fp, t2d_fp, glu2h_fp, fg_fp, fi_fp, hba1c_fp),
    gwas_prefixes
)

In [11]:
# Base directory under which every output sub-directory of this notebook is created
outdir <- '/dir/to/write/outputs/to'

# 1. Phenotype associated cREs

## 1a. Collect sets of cREs for FINRICH

In [8]:
### Function to make a bed file of sig trait-associated peaks for one cell type
### PVALUE CUTOFF VERSION -- easy enough to change to using a qvalue cutoff
###
### Args:
###   celltype:  cell type label used in the DESeq results file name
###   trait:     trait name (sub-directory of `deseq_dir` and part of the file name)
###   dir:       'up' (log2FoldChange > 0) or 'down' (log2FoldChange < 0)
###   deseq_dir: directory holding <trait>/ATAC_deseq_v4.<trait>.<celltype>.tsv
###   outdir:    base output dir; beds go to <outdir>/ATAC/sig_assoc_peaks/<trait>/
###   min_peaks: minimum number of sig peaks required before a file is written
###   p_cutoff:  peaks with pvalue < p_cutoff are kept
### Returns (invisibly) the path of the bed file written, or NULL if too few peaks passed.
get_sig_peaks <- function(celltype, trait, dir, deseq_dir, outdir, min_peaks=5, p_cutoff=0.01){
    #fail early on an unknown direction instead of hitting an undefined variable later
    if (!(length(dir) == 1 && dir %in% c('up','down'))) {
        stop("`dir` must be either 'up' or 'down'", call.=FALSE)
    }

    #read the results from the directory passed in (not from a global variable)
    fp <- file.path(deseq_dir, trait, sprintf('ATAC_deseq_v4.%s.%s.tsv', trait, celltype))
    df <- read.table(fp, sep='\t', header=TRUE)

    #which() drops rows whose pvalue/log2FoldChange is NA, like subset() would
    fc_in_dir <- if (dir == 'up') df$log2FoldChange > 0 else df$log2FoldChange < 0
    sig_peaks <- row.names(df)[which(df$pvalue < p_cutoff & fc_in_dir)]

    #peak ids are split on '-' into three bed columns; the full id is kept as a 4th column
    sig_df <- as.data.frame(str_split_fixed(sig_peaks, '-', 3))
    sig_df$V4 <- sig_peaks

    #if not enough sig peaks, report and stop here
    n_sig <- nrow(sig_df)
    if (n_sig < min_peaks) {
        print(sprintf('Not enough sig peaks for %s and %s %s associations (n=%s)', celltype, trait, dir, n_sig))
        return(invisible(NULL))
    }

    out_fp <- file.path(outdir, 'ATAC', 'sig_assoc_peaks', trait, sprintf('%s_%s_sig_peaks_%s.bed', celltype, trait, dir))
    #the nested output directory may not exist yet
    dir.create(dirname(out_fp), showWarnings=FALSE, recursive=TRUE)
    write.table(sig_df, out_fp, sep='\t', row.names=FALSE, col.names=FALSE, quote=FALSE)
    invisible(out_fp)
}

In [13]:
# Traits tested with DESeq and the directory that receives the per-trait cRE bed files
traits <- c('scaled_age', 'scaled_BMI', 'scaled_HbA1c', 'sex')
cre_dir <- file.path(outdir,'trait_assoc_cREs')
dir.create(cre_dir, showWarnings = FALSE, recursive = TRUE)

# `joint_celltypes` is not defined in any cell shown in this notebook, so a fresh
# top-to-bottom run would stop here. Keep it if an earlier session defined it,
# otherwise fall back to `major_celltypes`.
# NOTE(review): confirm major_celltypes is the intended set of cell types.
if (!exists('joint_celltypes')) joint_celltypes <- major_celltypes

# Write one bed file per cell type x trait x direction (sets with too few peaks are skipped)
for(celltype in joint_celltypes){
    for(trait in traits){
        # loop variable is not called `dir`, to avoid shadowing base::dir in the global env
        for(direction in c('up','down')){
             get_sig_peaks(celltype, trait, direction, deseq_outdir, cre_dir)
        }
    }
}

## 1b. Run FINRICH on sets of trait associated cREs

In [7]:
### Function to run FINRICH once and record its result.
### Calls the `finrich` command line tool through the shell, then appends the first
### line of its output, prefixed with "<prefix>|", to <out_dir>/combined_results.txt.
###
### Args:
###   finemap_bed: path to the credible set bed (1st positional finrich argument)
###   prefix:      label for this run; also names the temporary output file
###   cre_bed:     path to the cRE bed (2nd positional finrich argument)
###   bg_bed:      path to the background peak bed (3rd positional finrich argument)
###   out_dir:     existing directory that receives combined_results.txt
### Returns (invisibly) the exit status of the finrich command.
run_FINRICH <- function(finemap_bed, prefix, cre_bed, bg_bed, out_dir){
    #overall output file shared by all runs written to this out_dir
    overall_out_fp <- file.path(out_dir,'combined_results.txt')

    #per-run temp file; always removed on exit, also when finrich fails
    temp_fp <- file.path(out_dir, paste0(prefix,'_temp.txt'))
    on.exit(unlink(temp_fp), add=TRUE)

    #quote every path so spaces or shell metacharacters cannot break the command
    cmd <- sprintf('finrich %s %s %s --permutations 1000 --processes 2 > %s',
                   shQuote(finemap_bed), shQuote(cre_bed), shQuote(bg_bed), shQuote(temp_fp))
    # print(cmd)
    status <- system(cmd)
    if (status != 0) {
        warning(sprintf('finrich exited with status %s for %s; no result recorded', status, prefix), call.=FALSE)
        return(invisible(status))
    }

    #only the first output line is kept; it is tagged with the run prefix
    first_line <- readLines(temp_fp, n=1, warn=FALSE)
    if (length(first_line) == 0) {
        warning(sprintf('finrich produced no output for %s; no result recorded', prefix), call.=FALSE)
        return(invisible(status))
    }
    cat(paste0(prefix, '|', first_line, '\n'), file=overall_out_fp, append=TRUE)
    invisible(status)
}

In [9]:
### Function to run FINRICH on a cell type (so I can parallelize by cell type)
### Tests the trait-associated cREs of one cell type/trait against one GWAS credible set.
###
### Args:
###   celltype:    cell type label (part of the input file names)
###   trait:       trait label (part of the input file names)
###   cre_dir:     directory holding <celltype>_<trait>.p01.bed and <celltype>_<trait>.all.bed
###   gwas_prefix: GWAS label; results go to <outdir>/<gwas_prefix>/
###   gwas_fp:     path to the GWAS credible set bed
###   outdir:      base output directory
### Returns whatever run_FINRICH returns, or NULL (invisibly) when an input file is missing.
###
### NOTE(review): get_sig_peaks in this notebook writes '<ct>_<trait>_sig_peaks_<dir>.bed'
### under ATAC/sig_assoc_peaks/; the .p01.bed / .all.bed files read here are presumably
### produced elsewhere -- confirm.
run_finrich_trait_assoc_cres <- function(celltype, trait, cre_dir, gwas_prefix, gwas_fp, outdir){
    #set up file paths
    gwas_dir <- file.path(outdir,gwas_prefix)
    dir.create(gwas_dir, showWarnings=FALSE, recursive=TRUE)

    #collect necessary file paths for finrich (sig cREs and the background of all tested peaks)
    fp <- file.path(cre_dir, sprintf('%s_%s.p01.bed',celltype, trait))
    bg_peaks <- file.path(cre_dir, sprintf('%s_%s.all.bed',celltype, trait))

    #skip cleanly when an input is missing rather than handing finrich a bad path
    inputs <- c(fp, bg_peaks)
    missing_fps <- inputs[!file.exists(inputs)]
    if (length(missing_fps) > 0) {
        message(sprintf('Skipping FINRICH for %s-%s (%s): missing %s',
                        celltype, trait, gwas_prefix, paste(missing_fps, collapse=', ')))
        return(invisible(NULL))
    }

    #run finrich with helper function
    run_FINRICH(gwas_fp, paste(celltype,trait,sep='-'), fp, bg_peaks, gwas_dir)
}

In [10]:
# Output root for the FINRICH runs on trait-associated cREs
trait_cre_outdir <- file.path(outdir, "trait_assoc_cREs_FINRICH")
dir.create(trait_cre_outdir, showWarnings = FALSE)

In [21]:
# `kg_traits` is not defined in any cell shown in this notebook, so a fresh
# top-to-bottom run would stop here. Keep it if an earlier session defined it,
# otherwise fall back to `traits`.
# NOTE(review): confirm `traits` is the intended set.
if (!exists('kg_traits')) kg_traits <- traits

# For every GWAS x trait, run FINRICH on each major cell type in parallel
for(gwas_prefix in names(gwas_fps)){
    print(paste(gwas_prefix, Sys.time()))
    gwas_fp <- gwas_fps[[gwas_prefix]]

    for(trait in kg_traits){
        finrich_res <- mclapply(major_celltypes, run_finrich_trait_assoc_cres, trait, cre_dir, gwas_prefix, gwas_fp, trait_cre_outdir)

        # mclapply returns 'try-error' objects for workers that failed; surface them
        failed <- vapply(finrich_res, inherits, logical(1), what='try-error')
        if (any(failed)) {
            warning(sprintf('FINRICH failed for %s / %s in: %s', gwas_prefix, trait,
                            paste(major_celltypes[failed], collapse=', ')), call.=FALSE)
        }
    }
}

[1] "T1D"              "1719944928.62769"
[1] "T2D"              "1719945483.56098"
[1] "2hGlu"            "1719945587.26309"
[1] "FG"               "1719945614.01858"
[1] "FI"               "1719945659.58167"
[1] "HbA1c"            "1719945692.64532"


# 2. caQTL cREs

## 2a. Collect sets of cREs for FINRICH
- Make sure peak was called in the cell type, otherwise we didn't even test it and thus it shouldn't be used

In [11]:
# Load the per-cRE caQTL summary table and show its first two rows
caqtl_cres_fp <- "/path/to/all/cREs/tested/for/caQTLs/and/ct/spec/info"
caqtl_cres <- read.table(file = caqtl_cres_fp, header = TRUE, sep = "\t")
head(caqtl_cres, n = 2)

Unnamed: 0_level_0,cre,sig_caqtls_num_cts,sig_caqtls_cts,sig_caqtls_cts_spec,mash_num_cts,mash_cts,mash_cts_spec,peak_num_cts,peak_cts,peak_ct_spec,cpm5_num_cts,cpm5_cts,cpm5_ct_spec,cpm10_num_cts,cpm10_cts,cpm10_ct_spec,entropy2_celltype
Unnamed: 0_level_1,<chr>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<chr>
1,chr10:100006062-100006793,2.0,"beta, bulk",False,,,,4,"alpha, delta, gamma, ductal",False,3,"alpha, gamma, ductal",False,,,,
2,chr10:100009188-100010635,,,,,,,6,"beta, alpha, delta, gamma, acinar, ductal",False,6,"beta, alpha, delta, gamma, acinar, ductal",False,6.0,"beta, alpha, delta, gamma, acinar, ductal",False,


In [13]:
# For each major cell type write two bed files of caQTL cREs:
#   <ct>_all_sig_caQTL_cREs.bed - cREs whose sig_caqtls_cts AND peak_cts both mention the cell type
#   <ct>_ct_spec_caQTL_cREs.bed - cREs whose mash_cts entry equals the cell type
cre_dir <- file.path(outdir, "caqtl_cres")
dir.create(cre_dir, showWarnings = FALSE)

for (celltype in major_celltypes) {
    # set 1: sig caQTL in this cell type, restricted to peaks called in this cell type
    has_sig_caqtl <- grepl(celltype, caqtl_cres$sig_caqtls_cts)
    peak_called <- grepl(celltype, caqtl_cres$peak_cts)
    sig_caqtls1 <- caqtl_cres[which(has_sig_caqtl & peak_called), , drop = FALSE]
    bed1 <- as.data.frame(str_split_fixed(sig_caqtls1$cre, ":|-", 3))
    out_fp1 <- file.path(cre_dir, sprintf("%s_all_sig_caQTL_cREs.bed", celltype))
    write.table(bed1, file = out_fp1, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)

    # set 2: cREs assigned to exactly this cell type in mash_cts (NA rows are dropped)
    sig_caqtls2 <- caqtl_cres[which(caqtl_cres$mash_cts == celltype), , drop = FALSE]
    bed2 <- as.data.frame(str_split_fixed(sig_caqtls2$cre, ":|-", 3))
    out_fp2 <- file.path(cre_dir, sprintf("%s_ct_spec_caQTL_cREs.bed", celltype))
    write.table(bed2, file = out_fp2, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
}

### Collect shared and lineage specific caQTL cREs

In [49]:
# Read the mashR lfsr table and collapse it to one row per cRE:
# a cell type column is 1 when any row sharing that cRE id (the text before '::')
# has lfsr < 0.05 in that cell type, else 0
mash <- data.frame(fread("/path/to/mash/results/240111_WE_run1_all_ash_pca_canonical_mash_lfsr.tsv", sep = "\t"))
row.names(mash) <- mash$V1
mash_fin <- as.data.frame(mash[, 2:7] < 0.05)
mash_fin$cre <- stringr::str_split(row.names(mash_fin), "::", simplify = TRUE)[, 1]

upset_data <- mash_fin %>%
    group_by(cre) %>%
    summarise(across(everything(), any)) %>%
    tibble::column_to_rownames(var = "cre") %>%
    mutate(across(everything(), as.numeric))
head(upset_data)

“Detected 6 column names but the data has 7 columns (i.e. invalid file). Added 1 extra default column name for the first column which is guessed to be row names or an index. Use setnames() afterwards if this guess is not correct, or fix the file write command that created the file to create a valid file.”


Unnamed: 0_level_0,acinar,alpha,beta,delta,ductal,gamma
Unnamed: 0_level_1,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
chr1:100036775-100037124,0,0,0,0,0,0
chr1:100037591-100039086,0,0,0,0,0,0
chr1:100046104-100046480,0,0,0,0,0,0
chr1:100046883-100047599,0,0,0,0,0,0
chr1:100050187-100050403,0,0,0,0,0,0
chr1:100053889-100054089,0,0,0,0,0,0


In [85]:
# Group cREs by the pattern of cell types in which mashR calls a sig caQTL:
#   endo   - all four of beta/alpha/delta/gamma, neither acinar nor ductal
#   exo    - acinar and ductal, none of beta/alpha/delta/gamma
#   shared - all six cell types
endo_on  <- with(upset_data, beta == 1 & alpha == 1 & delta == 1 & gamma == 1)
endo_off <- with(upset_data, beta == 0 & alpha == 0 & delta == 0 & gamma == 0)
exo_on   <- with(upset_data, acinar == 1 & ductal == 1)
exo_off  <- with(upset_data, acinar == 0 & ductal == 0)

# which() drops rows where the condition is NA, as subset() does
all_cre_ids <- row.names(upset_data)
endo_cres   <- all_cre_ids[which(endo_on & exo_off)]
exo_cres    <- all_cre_ids[which(endo_off & exo_on)]
shared_cres <- all_cre_ids[which(endo_on & exo_on)]

lineage_cres <- list("endo" = endo_cres, "exo" = exo_cres, "shared" = shared_cres)
lapply(lineage_cres, length)

In [94]:
cre_dir <- file.path(outdir, "caqtl_cres")

# Write one 3-column bed file per lineage set
for (ct_set in names(lineage_cres)) {
    cres <- lineage_cres[[ct_set]]

    # split each cRE id at the first ':' and then split the remainder at the first '-'
    chrom_and_range <- str_split_fixed(cres, ":", 2)
    chroms <- chrom_and_range[, 1]
    coords <- str_split_fixed(chrom_and_range[, 2], "-", 2)
    bed <- as.data.frame(cbind(chroms, coords))

    bed_fp <- file.path(cre_dir, sprintf("%s_ct_spec_caQTL_cREs.bed", ct_set))
    write.table(bed, file = bed_fp, quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
}

## 2b. Run FINRICH on sets of caQTL cREs

In [91]:
### Function to run FINRICH once and record its result.
### Calls the `finrich` command line tool through the shell, then appends the first
### line of its output, prefixed with "<prefix>|", to <out_dir>/combined_results.txt.
###
### Args:
###   finemap_bed: path to the credible set bed (1st positional finrich argument)
###   prefix:      label for this run; also names the temporary output file
###   cre_bed:     path to the cRE bed (2nd positional finrich argument)
###   bg_bed:      path to the background peak bed (3rd positional finrich argument)
###   out_dir:     existing directory that receives combined_results.txt
### Returns (invisibly) the exit status of the finrich command.
run_FINRICH <- function(finemap_bed, prefix, cre_bed, bg_bed, out_dir){
    #overall output file shared by all runs written to this out_dir
    overall_out_fp <- file.path(out_dir,'combined_results.txt')

    #per-run temp file; always removed on exit, also when finrich fails
    temp_fp <- file.path(out_dir, paste0(prefix,'_temp.txt'))
    on.exit(unlink(temp_fp), add=TRUE)

    #quote every path so spaces or shell metacharacters cannot break the command
    cmd <- sprintf('finrich %s %s %s --permutations 1000 --processes 2 > %s',
                   shQuote(finemap_bed), shQuote(cre_bed), shQuote(bg_bed), shQuote(temp_fp))
    # print(cmd)
    status <- system(cmd)
    if (status != 0) {
        warning(sprintf('finrich exited with status %s for %s; no result recorded', status, prefix), call.=FALSE)
        return(invisible(status))
    }

    #only the first output line is kept; it is tagged with the run prefix
    first_line <- readLines(temp_fp, n=1, warn=FALSE)
    if (length(first_line) == 0) {
        warning(sprintf('finrich produced no output for %s; no result recorded', prefix), call.=FALSE)
        return(invisible(status))
    }
    cat(paste0(prefix, '|', first_line, '\n'), file=overall_out_fp, append=TRUE)
    invisible(status)
}

In [101]:
### Function to run FINRICH on a cell type (so I can parallelize by cell type)
### Tests the '<celltype>_ct_spec_caQTL_cREs.bed' set against one GWAS credible set.
###
### Args:
###   celltype:    a cell type label, or one of 'endo', 'exo', 'shared'
###   cre_dir:     directory holding the *_ct_spec_caQTL_cREs.bed files
###   gwas_prefix: GWAS label; results go to <outdir>/<gwas_prefix>/
###   gwas_fp:     path to the GWAS credible set bed
###   outdir:      base output directory
###   ct_peak_dir: directory with the background peak files; defaults to the
###                notebook-level `peak_dir`, which the function previously read directly
### Returns whatever run_FINRICH returns, or NULL (invisibly) when an input file is missing.
run_finrich_all_caqtls <- function(celltype, cre_dir, gwas_prefix, gwas_fp, outdir, ct_peak_dir=peak_dir){
    #set up file paths
    gwas_dir <- file.path(outdir,gwas_prefix)
    dir.create(gwas_dir, showWarnings=FALSE, recursive=TRUE)

    #collect necessary file paths for finrich
    # fp1 <- file.path(cre_dir, sprintf('%s_all_sig_caQTL_cREs.bed',celltype))
    fp2 <- file.path(cre_dir, sprintf('%s_ct_spec_caQTL_cREs.bed',celltype))

    #background: the merged peak file for the lineage sets, the cell type's own peaks otherwise
    #NOTE(review): mergedPeak.txt is passed to finrich as a bed -- confirm its format
    if(celltype %in% c('endo','exo','shared')){
        ct_peaks <- file.path(ct_peak_dir, 'mergedPeak.txt')
    } else {
        ct_peaks <- file.path(ct_peak_dir, sprintf('%s.merged_peaks.anno.mergedOverlap.bed',celltype))
    }

    #skip cleanly when an input is missing rather than handing finrich a bad path
    inputs <- c(fp2, ct_peaks)
    missing_fps <- inputs[!file.exists(inputs)]
    if (length(missing_fps) > 0) {
        message(sprintf('Skipping FINRICH for ct_spec-%s (%s): missing %s',
                        celltype, gwas_prefix, paste(missing_fps, collapse=', ')))
        return(invisible(NULL))
    }

    #run finrich with helper function
    # run_FINRICH(gwas_fp, paste('all_sig',celltype,sep='-'), fp1, ct_peaks, gwas_dir)
    run_FINRICH(gwas_fp, paste('ct_spec',celltype,sep='-'), fp2, ct_peaks, gwas_dir)
}

In [93]:
# Output root for the caQTL cRE FINRICH runs; it is not created here --
# run_finrich_all_caqtls creates <outdir>/<gwas_prefix> recursively on each call
caqtl_cre_outdir <- file.path(outdir,'caQTL_cREs_FINRICH')

In [102]:
# Major cell types followed by the three lineage-level sets (endo, exo, shared)
celltypes_to_run <- c(
    major_celltypes,
    "endo", "exo", "shared"
)
celltypes_to_run

In [97]:
# For every GWAS, run FINRICH on each cell type / lineage set in parallel
for(gwas_prefix in names(gwas_fps)){
    print(paste(gwas_prefix, Sys.time()))
    gwas_fp <- gwas_fps[[gwas_prefix]]
    finrich_res <- mclapply(celltypes_to_run, run_finrich_all_caqtls, cre_dir, gwas_prefix, gwas_fp, caqtl_cre_outdir)

    # mclapply returns 'try-error' objects for workers that failed; surface them
    failed <- vapply(finrich_res, inherits, logical(1), what='try-error')
    if (any(failed)) {
        warning(sprintf('FINRICH failed for %s in: %s', gwas_prefix,
                        paste(celltypes_to_run[failed], collapse=', ')), call.=FALSE)
    }
}

[1] "T1D 2024-07-25 13:12:04"
[1] "T2D 2024-07-25 13:26:04"
[1] "2hGlu 2024-07-25 13:29:15"
[1] "FG 2024-07-25 13:29:31"
[1] "FI 2024-07-25 13:30:32"
[1] "HbA1c 2024-07-25 13:31:10"


In [2]:
# Print the R version, platform and attached package versions for this session
sessionInfo()

R version 4.1.1 (2021-08-10)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 20.04.2 LTS

Matrix products: default
BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.9.0
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.9.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
 [1] vroom_1.5.7       ggrepel_0.9.4     data.table_1.14.8 forcats_0.5.2    
 [5] stringr_1.4.1     dplyr_1.0.10      purrr_0.3.5       readr_2.1.1      
 [9] tidyr_1.2.1       tibble_3.1.8      ggplot2_3.4.4     tidyverse_1.3.2  

loaded via a namespace