In [110]:
library(stringr)
library(scales)
library(RColorBrewer)

In [111]:
# Credible-set file names, one per trait. Each name is the trait label followed
# by the shared ".99credset.PPA.bed" suffix.
traits <- paste0(
  c(
    "Asthma_child_onset",
    "Atopic_dermatitis",
    "Autoimmune_vitiligo",
    "Basophil_count",
    "Crohns_disease",
    "Eosinophil_count",
    "Gout",
    "Lymphocyte_count",
    "Monocyte_count",
    "Neutrophil_count",
    "Primary_sclerosing_cholangitis",
    "Rheumatoid_arthritis",
    "Selective_IgA_deficiency",
    "Systemic_lupus_erythematosus",
    "Type_1_diabetes",
    "Ulcerative_colitis"
  ),
  ".99credset.PPA.bed"
)

In [112]:
# Directory containing the credible-set files listed in `traits`. The trailing
# slash matters: file names are appended to this string with paste0().
fmdir <- "/nfs/lab/projects/pbmc_snATAC/data/credible_sets/"

In [113]:
# Read every trait's credible-set file and stack the results into `alltraits`.
#
# From each file only columns 4-7 are kept and renamed to varID, Locus, LNBF
# and PPA. The Trait label is the file name up to its first ".".
#
# The per-file tables are collected in a list and bound once instead of growing
# `alltraits` with rbind() on every iteration. Iterating over `traits` directly
# (rather than 1:length(traits)) also means an empty `traits` gives an empty
# table instead of an attempt to read a file called "NA".
trait_tables <- lapply(traits, function(trait_file) {
  fm <- read.table(paste0(fmdir, trait_file))[, 4:7]
  colnames(fm) <- c("varID", "Locus", "LNBF", "PPA")
  fm$Trait <- str_split_fixed(trait_file, "\\.", 2)[, 1]
  fm
})

alltraits <- if (length(trait_tables) > 0) {
  do.call(rbind, trait_tables)
} else {
  data.frame()
}
    

In [114]:
# Locate the per-cell-type caQTL summary-statistic files of run 3.
caqdir <- "/nfs/lab/projects/pbmc_snATAC/analysis_v2/summarized_caqtls/run3/"
caqtl_suffix <- "_caqtl_sumstats_run3.tsv"

# List the directory once and match the suffix literally (fixed = TRUE); used
# as a regular expression, the "." in the suffix would match any character.
files <- list.files(caqdir)
files <- files[grepl(caqtl_suffix, files, fixed = TRUE)]

# Leave out the cd4_t, cd8_t and bulk summaries.
files <- files[!(files %in% c("cd4_t_caqtl_sumstats_run3.tsv",
                              "cd8_t_caqtl_sumstats_run3.tsv",
                              "bulk_caqtl_sumstats_run3.tsv"))]

# Cell label = file name with the suffix removed; same order as `files`.
cells <- gsub(caqtl_suffix, "", files, fixed = TRUE)

In [115]:
# Columns retained from each cell type's caQTL table when building `M`.
colkeep <- c(
  "Feature", "varID", "rsID", "P_value",
  "Q_value", "Effect_size", "cell", "exclude"
)

In [116]:
# For each cell type, keep the most significant caQTL row(s) of every peak and
# stack them into `M`, tagged with the cell label.
#
# The per-cell tables are gathered in a list and bound once rather than growing
# `M` with rbind() inside the loop. seq_along() replaces 1:length(files) so an
# empty `files` yields an empty table instead of a failed read, and TRUE is
# spelled out instead of the reassignable shorthand T.
lead_caqtls <- lapply(seq_along(files), function(y) {
  sms <- read.table(paste0(caqdir, files[y]), header = TRUE)

  # Only rows flagged as peak caQTLs are considered.
  sss <- subset(sms, Is_peak_caQTL == TRUE)
  sss <- sss[order(sss$Q_value), ]

  # Smallest Q_value per Feature. Merging back on (Feature, Q_value) keeps the
  # rows that attain that minimum, so tied variants are all retained.
  ag <- aggregate(Q_value ~ Feature, sss, min)
  mm <- merge(sss, ag, by = colnames(ag))

  mm$cell <- cells[y]
  mm[, colkeep]
})

M <- if (length(lead_caqtls) > 0) {
  do.call(rbind, lead_caqtls)
} else {
  data.frame()
}

In [117]:
# Keep the fine-mapped variants that are also caQTL variants in `M`.
# Credible-set IDs are compared both with M$varID minus its first three
# characters and with M$rsID; per the original note, the rsID comparison is
# only needed because the T1D credible set (from Tony) is keyed by rsID.
allcaq <- alltraits[
  alltraits$varID %in% c(substring(M$varID, 4), as.character(M$rsID)),
]

In [118]:
# Retain variants whose PPA exceeds 0.01 (rows with a missing PPA are dropped).
allcaq <- allcaq[which(allcaq$PPA > 0.01), ]

In [119]:
# Drop the first three characters of every caQTL variant ID so the IDs can be
# joined to the credible-set varIDs (presumably a "chr" prefix - TODO confirm).
# Not idempotent: re-running this cell strips three more characters.
M[["varID"]] <- substring(M[["varID"]], first = 4)

In [120]:
# Join fine-mapped variants to caQTLs twice: once on the variant ID, and once
# matching the caQTL rsID against the credible-set varID (for credible sets
# keyed by rsID). Both results carry the same column names, so rbind() lines
# them up by name; exact duplicate rows are then removed.
mm <- merge(x = allcaq, y = M, by = "varID")
mmt1d <- merge(x = M, y = allcaq, by.x = "rsID", by.y = "varID")

mm <- unique(rbind(mm, mmt1d))

In [121]:
# Split each Feature string on ":" or "-" into a three-column character matrix
# (the pieces are later written out as BED columns).
bed <- str_split_fixed(string = mm$Feature, pattern = ":|-", n = 3)

In [122]:
# Remove repeated rows. drop = FALSE keeps `bed` a matrix even when only one
# distinct row remains; without it the result collapses to a plain vector and
# would be written out as a single column instead of one three-field line.
bed <- bed[!duplicated(bed), , drop = FALSE]

In [123]:
# Change the working directory for the rest of the script: the relative paths
# used below ("summarized_caqtls/...", "peaks/...") are resolved against it.
# NOTE(review): this is a session-wide side effect; consider building absolute
# paths with file.path() instead.
setwd("/nfs/lab/projects/pbmc_snATAC/analysis_v2/")

In [124]:
# Write the distinct peak coordinates as a tab-separated file with no header,
# no row names and no quoting.
write.table(
  bed,
  file = "summarized_caqtls/finemapped_variants.bed",
  quote = FALSE,
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE
)

In [126]:
# Sort the peak file by chromosome and start, then use `bedtools closest -d` to
# attach the nearest entry of the gencode v19 transcript file and its distance.
#
# system() returns the command's exit status. It is checked here so that a
# failed sort or bedtools run stops the script instead of letting the next
# step read a stale or empty output file.
sort_status <- system("sort -k 1,1 -k2,2n summarized_caqtls/finemapped_variants.bed > summarized_caqtls/finemapped_variants.sorted.bed")
if (sort_status != 0) {
  stop("sorting finemapped_variants.bed failed (exit status ", sort_status, ")",
       call. = FALSE)
}

closest_status <- system('bedtools closest -a summarized_caqtls/finemapped_variants.sorted.bed -b /nfs/lab/publicdata/gencode_v19/gencode.v19.1kb_all_possible_transcripts.sorted.bed -d > summarized_caqtls/clostest_gene.bed')
if (closest_status != 0) {
  stop("bedtools closest failed (exit status ", closest_status, ")",
       call. = FALSE)
}

In [127]:
# Load the bedtools output and rebuild the peak ID from its first three columns
# in the same "<col1>:<col2>-<col3>" form used by Feature elsewhere.
cg <- read.table("summarized_caqtls/clostest_gene.bed")
cg[["Feature"]] <- paste0(cg[[1]], ":", cg[[2]], "-", cg[[3]])

# Order by column V8, then keep only columns 7-9 and drop repeated
# combinations of them.
# NOTE(review): the positional indices assume the file has 8 columns, with V7
# the gene label and V8 the distance, so that column 9 is Feature - confirm
# against the gencode BED layout.
cg <- cg[order(cg[["V8"]]), ]
cg <- cg[!duplicated(cg[, 7:9]), 7:9]

# Build "<col1>_<col2>" labels and collapse them per peak, ";"-separated.
cg[["ClosestGene_Distance"]] <- paste(cg[[1]], cg[[2]], sep = "_")
ag <- aggregate(ClosestGene_Distance ~ Feature, data = cg, FUN = paste, collapse = ";")

In [128]:
# Attach the collapsed ClosestGene_Distance label to every row of `mm`
# (inner join on Feature, so peaks without a label are dropped).
mm <- merge(x = mm, y = ag, by = "Feature")

In [129]:
# Promoter-associated caQTL annotations (file has a header row).
map <- read.table("summarized_caqtls/Promoter_associated_caQTLs.tsv", header = TRUE)

# Colour scheme, read without a header (columns V1 and V2), extended with four
# extra colour/label rows for the b, nk, mono and t groups themselves.
clrs <- read.table("peaks/color_scheme.tsv")
clrs <- rbind(
  clrs,
  matrix(
    c(
      "orange2", "b",
      "tan4", "nk",
      "#7FC97F", "mono",
      "magenta4", "t"
    ),
    ncol = 2,
    byrow = TRUE
  )
)

# Lookup from each label in clrs$V2 to its group. The group vector is
# positional: its 21 entries are expected to line up with the rows of `clrs`
# (the rows from the file followed by the four appended above).
colcol <- data.frame(
  celltypes = clrs$V2,
  group = c(
    "mono", "mono", "mono", "mono", "b", "b", "nk", "nk",
    "t", "t", "t", "t", "t", "t", "mkc", "pDC", "plasma", "b", "nk", "mono", "t"
  )
)

In [130]:
# Look up the group of every row's cell label (first matching entry in colcol;
# labels not present there give NA).
mm[["group"]] <- colcol$group[match(mm$cell, colcol$celltypes)]

# Groups that get annotated below. mkc is left out because it has few caQTLs
# and none of them overlap a fine-mapped variant.
groups <- c("b", "mono", "t", "nk")

In [131]:
# Annotate the fine-mapped caQTLs of each group with the matching rows of `map`
# and stack the per-group results into `annoq`.
#
# Within a group, rows of `map` whose type is not "EP" are reduced per
# feature_gene to the longest annotation present: str_count() gives the number
# of characters (1 for "P", 2 for "PP"), so "PP" wins over "P" when both occur.
# This is the original "remove extra P annot" step. "EP" rows are kept as is.
#
# The per-group tables are collected in a list and bound once instead of
# growing `annoq` with rbind() in the loop, and TRUE is spelled out instead of
# T. A group with no non-EP rows is now handled explicitly; aggregate() would
# otherwise stop with an error because it has no rows to aggregate.
annoq_parts <- lapply(groups, function(gr) {
  test <- subset(map, group == gr)

  is_ep <- test$type == "EP"
  non_ep <- test[!is_ep, ]
  ep <- test[is_ep, ]

  if (nrow(non_ep) > 0) {
    agr <- aggregate(
      type ~ feature_gene, non_ep,
      function(x) c("P", "PP")[max(str_count(x))]
    )
    # Merging on (feature_gene, type) keeps only rows carrying the chosen type.
    test2 <- merge(non_ep, agr, by = colnames(agr))
    test3 <- rbind(test2, ep)
  } else {
    test3 <- ep
  }

  # Left join: caQTLs without any annotation stay in the table with NA fields.
  qt <- subset(mm, group == gr)
  merge(qt, test3, by = c("Feature", "group"), all.x = TRUE)
})

annoq <- if (length(annoq_parts) > 0) {
  do.call(rbind, annoq_parts)
} else {
  data.frame()
}

In [132]:
# Sort rows from highest to lowest PPA.
annoq <- annoq[order(-annoq[["PPA"]]), ]

In [133]:
# Columns, in output order, of the final fine-mapping / caQTL table.
cols <- c(
  "Trait", "Locus", "varID", "rsID", "PPA", "cell",
  "Feature", "P_value", "Q_value", "Effect_size", "exclude", "type",
  "ClosestGene_Distance", "feature_gene", "coacPeak", "coac_gene",
  "coaccess", "dist"
)

In [134]:
# Keep only the output columns, in the order given by `cols`.
annoq <- annoq[cols]

In [135]:
# Replace every missing cell with the placeholder ".". Unannotated rows are
# later counted by testing type == ".".
# Side effect: numeric or logical columns that contained an NA become
# character columns, because a string is assigned into them.
annoq[is.na(annoq)]<-"."

In [139]:
# Drop the rows flagged for exclusion, keeping only those whose `exclude`
# value equals FALSE.
annoq <- annoq[annoq$exclude == FALSE, ]

In [141]:
# Flag rows whose annotation type is "P" or "PP".
annoq[["is_prom"]] <- is.element(annoq$type, c("P", "PP"))

In [142]:
# Export the annotated table as CSV (row names are written as the first column).
write.csv(annoq, file = "summarized_caqtls/Finemap_Table_caQTLs_extended.csv")


In [147]:
# NOTE(review): ad-hoc ratio evaluated interactively; where 5010 and 5908 come
# from is not shown in this file - TODO derive from data or remove.
5010/5908

In [148]:
# NOTE(review): ad-hoc ratio evaluated interactively; where 3841 and 5272 come
# from is not shown in this file - TODO derive from data or remove.
3841/5272 

In [149]:
# One row per (varID, Trait, type) combination: the first occurrence in the
# current row order is kept.
rmdu <- annoq[!duplicated(annoq[c("varID", "Trait", "type")]), ]

In [150]:
# Show the number of rows and columns of the de-duplicated table.
dim(rmdu)

In [151]:
# Count de-duplicated rows whose type is the "." placeholder, i.e. rows that
# had no annotation type before the NA fill.
sum(rmdu$type==".")

In [152]:
# NOTE(review): hard-coded difference; presumably the row count of `rmdu`
# minus the count of "." types printed above - TODO compute from the data
# instead of typing the numbers.
374-139