For RASQUAL we need allele-specific counts for the allelic imbalance test.

In [1]:
library(parallel)

In [2]:
# Sample IDs processed by this notebook (28 in total).
samples <- c(
    "R207", "R217", "R218", "R221", "R223", "R226", "R228",
    "R234", "R237", "R238", "R246", "R247", "R275", "R284",
    "R290", "R292", "R316", "R317", "R319", "R325", "R326",
    "R327", "R332", "R353", "R354", "R362", "R363", "R364"
)

# Subset of `samples`; presumably the non-European-ancestry donors, going by
# the variable name -- TODO confirm.
non_eur_samples <- c(
    "R221", "R237", "R246", "R247",
    "R292", "R325", "R363", "R364"
)

In [3]:
# Root of this caQTL run; per-sample BAM directories are addressed as <sampdir><sample>/.
sampdir <- "/nfs/lab/welison/islet_multiome/intermediates/caQTLs/231026_WE_caQTLs/"
# RASQUAL working area; per-cell-type VCF directories are created underneath it.
homedir <- "/nfs/lab/welison/islet_multiome/intermediates/caQTLs/231026_WE_caQTLs/caQTLs_rasqual/"
# Holds the per-cell-type sample lists (samples.<cell>.csv) and peak BED files.
matdir  <- "/nfs/lab/welison/islet_multiome/intermediates/caQTLs/231026_WE_caQTLs/atac_cell_type_matrices/"

In [4]:
#dir.create(homedir)

### sort and index the bulk file

In [5]:
# Directory holding one sub-directory per sample with the deduplicated bulk BAM.
bulksampdir <- "/nfs/lab/paola/islets_snatac_qtls/data/dedup_bams/"

In [6]:
# Symlink each sample's deduplicated bulk BAM into its per-sample directory as
# <sampdir><sample>/<sample>_bulk_orig.bam.
#
# file.symlink() is used instead of a shell `ln -s` string so that paths are
# not subject to shell word-splitting and failures are reported explicitly.
for (s in samples) {
    bam_in  <- paste0(bulksampdir, s, "/atac_possorted_bam.filt.rmdup.bam")
    bam_out <- paste0(sampdir, s, "/", s, "_bulk_orig.bam")

    # Do not create a dangling link when the source BAM is missing.
    if (!file.exists(bam_in)) {
        warning("Source BAM not found, skipping: ", bam_in,
                call. = FALSE, immediate. = TRUE)
        next
    }

    # Re-running the cell must not fail on links created by an earlier run.
    if (file.exists(bam_out)) {
        message("Link already present, skipping: ", bam_out)
        next
    }

    if (!file.symlink(bam_in, bam_out)) {
        warning("Could not link ", bam_in, " -> ", bam_out,
                call. = FALSE, immediate. = TRUE)
    }
}

In [7]:
# Make the run root the working directory for the cells that follow.
setwd(sampdir)

In [8]:
# Per-sample bulk BAM paths, parallel to `samples`:
#   bam        - the symlinked input (<sample>_bulk_orig.bam)
#   sorted_bam - the coordinate-sorted output (<sample>_bulk.bam)
bam        <- sprintf("%s%s/%s_bulk_orig.bam", sampdir, samples, samples)
sorted_bam <- sprintf("%s%s/%s_bulk.bam", sampdir, samples, samples)

In [9]:
# Sort and index one bulk BAM file.
#
# Reads the global character vectors `bam` (inputs) and `sorted_bam` (outputs);
# `b` is the position of the sample in both vectors.
#
# @param b Integer index into `bam` / `sorted_bam`.
# @return The shell exit status (0 on success): that of `samtools sort` when
#   sorting fails, otherwise that of `samtools index`.
sort_and_index <- function(b) {
    in_bam  <- shQuote(bam[b])
    out_bam <- shQuote(sorted_bam[b])

    # -@ 3 asks for three additional threads and -m 2G is a per-thread memory
    # cap, so budget memory accordingly when running several jobs at once.
    sort_status <- system(paste("samtools", "sort", "-m", "2G", "-@", "3",
                                "-o", out_bam, in_bam))

    # Do not index a missing or truncated file when the sort failed.
    if (sort_status != 0) {
        warning("samtools sort failed (exit ", sort_status, ") for ", bam[b],
                call. = FALSE, immediate. = TRUE)
        return(sort_status)
    }

    system(paste("samtools index", out_bam))
}

In [10]:
# Print the sorted-BAM output paths for a visual check.
sorted_bam

In [11]:
# Number of BAM files that will be sorted (one per entry in `samples`).
length(sorted_bam)

In [12]:
# Sort and index every bulk BAM, seven at a time. seq_along() (rather than
# 1:length()) yields no jobs at all when `sorted_bam` is empty.
mclapply(seq_along(sorted_bam), sort_and_index, mc.cores = 7)

We are making a couple of VCFs to test down the road: 1) variants in peaks only (don't use this); 2) variants that meet a minimum number of heterozygous individuals. RASQUAL won't run ASE without enough het individuals, so we filtered by MAF rather than by the number of heterozygous individuals.

In [4]:
# Cell types to process; "bulk" is handled like any other entry.
# Add subtypes later
celltypes <- c(
    "acinar", "alpha", "beta", "bulk", "delta",
    "ductal", "endothelial", "gamma", "immune", "stellate"
)

In [6]:
# Directory with the per-chromosome imputed genotype VCFs (chr<n>.multi.ancestry.vcf.gz.gz).
allvcf_dir <- "/nfs/lab/welison/islet_multiome/intermediates/caQTLs/imputed_genotypes/multiome_28/"

In [7]:
# Print the three directories used by the VCF-filtering functions for a visual check.
matdir
homedir
allvcf_dir

In [8]:
# Build the het-filtered, peak-restricted genotype VCFs for one cell type.
#
# For each autosome (chr1-chr22) the imputed VCF is
#   1. subset to the samples listed in samples.<cell>.csv, in that order,
#   2. restricted to the regions in <cell>.filtered.bed.100kb.merged,
#   3. filtered to sites with more than one heterozygous genotype,
# then written bgzipped to <homedir>/<cell>/vcfs_peaks100kb/chr<n>.filt.vcf.gz
# and indexed with tabix.
#
# Reads the globals `matdir`, `homedir` and `allvcf_dir`.
#
# @param cell Cell-type name used in the input file names and the output path.
# @return NULL, invisibly. Failures are reported as warnings per chromosome.
filter_vcf <- function(cell) {

    sample_file  <- paste0(matdir, "/samples.", cell, ".csv")
    sample_order <- paste(read.table(sample_file, sep = ",")[[1]], collapse = ",")
    print(cell)
    print(sample_order)

    # These do not depend on the chromosome, so compute them once.
    outdir <- paste(homedir, cell, "vcfs_peaks100kb", sep = "/")
    bed    <- paste0(matdir, "/", cell, ".filtered.bed.100kb.merged")
    system(paste("mkdir -p", shQuote(outdir)))

    for (n in 1:22) {
        vcf    <- paste0(allvcf_dir, "chr", n, ".multi.ancestry.vcf.gz.gz")
        outvcf <- paste0(outdir, "/chr", n, ".filt.vcf.gz")

        # bcftools view evaluates -i/-e expressions BEFORE removing samples
        # when both are given in a single call, which would count hets over
        # every sample in the input VCF. Subset first, then filter, so the het
        # count refers to this cell type's samples only.
        cmd <- paste(
            "bcftools view -s", shQuote(sample_order), "-R", shQuote(bed),
            "-Ou", shQuote(vcf),
            "| bcftools view -i", shQuote('COUNT(GT="het")>1'),
            "-Oz -o", shQuote(outvcf)
        )
        status <- system(cmd)
        if (status != 0) {
            warning("bcftools failed (exit ", status, ") for ", cell, " chr", n,
                    call. = FALSE, immediate. = TRUE)
            next
        }

        # -f overwrites an index left by an earlier run; without it tabix
        # refuses and the stale index stays next to the new VCF.
        status <- system(paste("tabix -f", shQuote(outvcf)))
        if (status != 0) {
            warning("tabix failed (exit ", status, ") for ", outvcf,
                    call. = FALSE, immediate. = TRUE)
            next
        }
        message("filter_vcf: ", cell, " chr", n, " done")
    }
    invisible(NULL)
}

In [9]:
# Build the unfiltered genotype VCFs for one cell type.
#
# For each autosome (chr1-chr22) the imputed VCF is subset to the samples
# listed in samples.<cell>.csv (in that order) and written bgzipped to
# <homedir>/<cell>/vcfs_peaks100kb/chr<n>.nofilt.vcf.gz, then indexed with tabix.
# No het-count filter is applied.
# NOTE(review): no -R region restriction is applied either, although the output
# lands in the "vcfs_peaks100kb" directory -- confirm this is intended.
#
# Reads the globals `matdir`, `homedir` and `allvcf_dir`.
#
# @param cell Cell-type name used in the input file name and the output path.
# @return NULL, invisibly. Failures are reported as warnings per chromosome.
no_filter_vcf <- function(cell) {

    sample_file  <- paste0(matdir, "/samples.", cell, ".csv")
    sample_order <- paste(read.table(sample_file, sep = ",")[[1]], collapse = ",")
    print(cell)
    print(sample_order)

    # The output directory does not depend on the chromosome; create it once.
    outdir <- paste(homedir, cell, "vcfs_peaks100kb", sep = "/")
    system(paste("mkdir -p", shQuote(outdir)))

    for (n in 1:22) {
        vcf    <- paste0(allvcf_dir, "chr", n, ".multi.ancestry.vcf.gz.gz")
        outvcf <- paste0(outdir, "/chr", n, ".nofilt.vcf.gz")

        status <- system(paste("bcftools view", shQuote(vcf),
                               "-s", shQuote(sample_order),
                               "-Oz -o", shQuote(outvcf)))
        if (status != 0) {
            warning("bcftools failed (exit ", status, ") for ", cell, " chr", n,
                    call. = FALSE, immediate. = TRUE)
            next
        }

        # -f overwrites an index left by an earlier run; without it tabix
        # refuses and the stale index stays next to the new VCF.
        status <- system(paste("tabix -f", shQuote(outvcf)))
        if (status != 0) {
            warning("tabix failed (exit ", status, ") for ", outvcf,
                    call. = FALSE, immediate. = TRUE)
            next
        }
        message("no_filter_vcf: ", cell, " chr", n, " done")
    }
    invisible(NULL)
}

In [10]:
# Print the cell types that the next cells will process.
celltypes

In [11]:
# Build the het-filtered VCFs, one worker per cell type.
mclapply(celltypes, filter_vcf, mc.cores = length(celltypes))

In [12]:
# Build the unfiltered VCFs, one worker per cell type.
mclapply(celltypes, no_filter_vcf, mc.cores = length(celltypes))

In [13]:
# Write, for every cell type, the list of per-sample BAM paths
# (<sampdir><sample>/<sample>_<cell>.bam) to
# <homedir>/<cell>/vcfs_peaks100kb/bam.list.txt, one path per line, in the
# sample order given by samples.<cell>.csv.
for (cell in celltypes) {
    sample_file           <- paste0(matdir, "/samples.", cell, ".csv")
    cell_specific_samples <- read.table(sample_file, sep = ",")[[1]]

    outdir <- paste(homedir, cell, "vcfs_peaks100kb", sep = "/")
    # The directory is normally created by the VCF-filtering step; make sure it
    # exists so this cell also works when run on its own.
    system(paste("mkdir -p", shQuote(outdir)))

    # Use a dedicated name: assigning to `bam` here would overwrite the global
    # vector of bulk BAM inputs that sort_and_index() reads.
    bam_paths <- paste0(sampdir, cell_specific_samples, "/",
                        cell_specific_samples, "_", cell, ".bam")
    writeLines(bam_paths, paste(outdir, "bam.list.txt", sep = "/"))
}

Add allele-specific counts (ASE; the E stands for effects). I was hitting an error, but I don't think it is caused by createASVCF itself; it looks like a dependency problem, so next time try the original script.

In [14]:
# Add allele-specific counts to the het-filtered VCF of one cell type and
# chromosome.
#
# Runs createASVCF_testing.sh inside <homedir><cell>/vcfs_peaks100kb on
# chr<chr>.filt.vcf.gz with the BAMs listed in bam.list.txt, producing
# chr<chr>.ase.filt.vcf.gz, and indexes the result with tabix.
#
# Reads the global `homedir`.
#
# @param cell Cell-type name (sub-directory of `homedir`).
# @param chr  Chromosome number used in the VCF file names.
# @return The tabix exit status (0 on success); a non-zero status when the
#   script produced no output file.
rasqual_ase <- function(cell, chr) {
    # The script is given relative file names, so it must run inside the
    # cell-type directory; restore the caller's working directory afterwards.
    old_wd <- setwd(paste0(homedir, cell, "/vcfs_peaks100kb"))
    on.exit(setwd(old_wd), add = TRUE)

    vcf1 <- paste0("chr", chr, ".filt.vcf.gz")
    vcf2 <- paste0("chr", chr, ".ase.filt.vcf.gz")

    # Remove output of earlier runs; unlink() is silent when nothing is there.
    unlink(c(vcf2, paste0(vcf2, ".tbi")))

    #system(paste('bash /nfs/lab/welison/islet_multiome/notebooks/caQTL/createASVCF.sh paired_end bam.list.txt', vcf1, vcf2,"atac"))
    status <- system(paste("bash /nfs/lab/welison/islet_multiome/notebooks/caQTL/createASVCF_testing.sh paired_end bam.list.txt",
                           vcf1, vcf2, "atac"))
    if (status != 0) {
        warning("createASVCF_testing.sh exited with status ", status,
                " for ", cell, " chr", chr, call. = FALSE, immediate. = TRUE)
    }

    # The old output was deleted above, so a missing file means this run failed.
    if (!file.exists(vcf2)) {
        warning("No ASE VCF produced for ", cell, " chr", chr,
                call. = FALSE, immediate. = TRUE)
        return(if (status != 0) status else 1L)
    }

    system(paste("tabix -f", vcf2))
}

In [15]:
# Add allele-specific counts to the unfiltered VCF of one cell type and
# chromosome.
#
# Runs createASVCF_testing.sh inside <homedir><cell>/vcfs_peaks100kb on
# chr<chr>.nofilt.vcf.gz with the BAMs listed in bam.list.txt, producing
# chr<chr>.ase.nofilt.vcf.gz, and indexes the result with tabix.
#
# Reads the global `homedir`.
#
# @param cell Cell-type name (sub-directory of `homedir`).
# @param chr  Chromosome number used in the VCF file names.
# @return The tabix exit status (0 on success); a non-zero status when the
#   script produced no output file.
rasqual_ase_no_filt <- function(cell, chr) {
    # The script is given relative file names, so it must run inside the
    # cell-type directory; restore the caller's working directory afterwards.
    old_wd <- setwd(paste0(homedir, cell, "/vcfs_peaks100kb"))
    on.exit(setwd(old_wd), add = TRUE)

    vcf1 <- paste0("chr", chr, ".nofilt.vcf.gz")
    vcf2 <- paste0("chr", chr, ".ase.nofilt.vcf.gz")

    # Remove output of earlier runs; unlink() is silent when nothing is there.
    unlink(c(vcf2, paste0(vcf2, ".tbi")))

    #system(paste('bash /nfs/lab/welison/islet_multiome/notebooks/caQTL/createASVCF.sh paired_end bam.list.txt', vcf1, vcf2,"atac"))
    status <- system(paste("bash /nfs/lab/welison/islet_multiome/notebooks/caQTL/createASVCF_testing.sh paired_end bam.list.txt",
                           vcf1, vcf2, "atac"))
    if (status != 0) {
        warning("createASVCF_testing.sh exited with status ", status,
                " for ", cell, " chr", chr, call. = FALSE, immediate. = TRUE)
    }

    # The old output was deleted above, so a missing file means this run failed.
    if (!file.exists(vcf2)) {
        warning("No ASE VCF produced for ", cell, " chr", chr,
                call. = FALSE, immediate. = TRUE)
        return(if (status != 0) status else 1L)
    }

    system(paste("tabix -f", vcf2))
}

In [None]:
#filter_vcf_peaks = function(cell) {
#    indir = paste(homedir,  cell, "vcfs_peaks100kb", sep="/")    
#    outdir = paste(homedir,  cell, "vcfs_peaksonly", sep="/")  
#    system(paste("rm -r ",outdir  ) ) 
#    system(paste("mkdir -p ",outdir  ) )  
#   
#    for (n in 1:22) {
#        invcf  = paste0(indir, "/chr",n ,'.ase.filt.vcf.gz')
#        outvcf = paste0(outdir, "/chr",n ,'.ase.filt.vcf.gz')
#        bed    = paste0(matdir,  "/" , cell, '.filtered.bed')     
#        system(paste("bcftools view", invcf , '-R' , bed,  "-Oz -o" , outvcf))
#        system(paste("tabix", outvcf))
#    }
#}

In [16]:
# Print the cell types that the ASE loops below will iterate over.
celltypes

In [70]:
#celltypes =  c('acinar','alpha','beta','bulk','delta','ductal','endothelial','gamma',
#               'immune','stellate') #Add subtypes later

In [17]:
# Add allele-specific counts to the het-filtered VCFs: cell types are handled
# one after another, with the 22 autosomes of each processed in parallel.
for (cc in celltypes) {
    mclapply(seq_len(22), rasqual_ase, cell = cc, mc.cores = 22)
    #filter_vcf_peaks(cc)
}

In [18]:
# Add allele-specific counts to the unfiltered VCFs: cell types are handled
# one after another, with the 22 autosomes of each processed in parallel.
for (cc in celltypes) {
    mclapply(seq_len(22), rasqual_ase_no_filt, cell = cc, mc.cores = 22)
    #filter_vcf_peaks(cc)
}

In [19]:
#mclapply(celltypes, function(x) filter_vcf_peaks(x) , mc.cores = 34)

In [17]:
#mclapply(1:5, function (x) rasqual_ase(cell="bulk", chr=x),mc.cores = 5)


In [15]:
#mclapply(11:22, function (x) rasqual_ase(cell="alpha", chr=x),mc.cores = 22)


In [19]:
# Print the cell types space-separated on a single line.
cat (celltypes)

acinar alpha beta bulk delta ductal endothelial gamma immune stellate

In [20]:
# Print the cell-type vector once more as a final check.
celltypes