In [1]:
library(stringr)
library(scales)
library(RColorBrewer)

### Plot credible sets

In [2]:
# Credible-set BED files, one per GWAS trait. All share the same suffix; the
# trait name is the part of the file name before the first ".".
traits <- paste0(
  c("Asthma_child_onset",
    "Atopic_dermatitis",
    "Autoimmune_vitiligo",
    "Basophil_count",
    "Crohns_disease",
    "Eosinophil_count",
    "Gout",
    "Lymphocyte_count",
    "Monocyte_count",
    "Neutrophil_count",
    "Primary_sclerosing_cholangitis",
    "Rheumatoid_arthritis",
    "Selective_IgA_deficiency",
    "Systemic_lupus_erythematosus",
    "Type_1_diabetes",
    "Ulcerative_colitis"),
  ".99credset.PPA.bed"
)

In [3]:
# Directory holding the credible-set files listed in `traits` (trailing slash
# is required because paths are built with paste0).
fmdir <- "/nfs/lab/projects/pbmc_snATAC/data/credible_sets/"

In [4]:
# Output directory for all figures written by this notebook (trailing slash is
# required because paths are built with paste0).
figdir <- "/nfs/lab/projects/pbmc_snATAC/analysis_v2/figures/"

In [5]:
# Count, for every trait, how many loci fall into each credible-set size bin.
#   pl : one row per trait, one column per size bin (number of loci)
#   df : one row per locus (V5) with its variant count (V4), its size bin (br)
#        and the index of the trait in `traits`
#
# The last bin is open-ended (101+). Inf is used as the upper break so that the
# six bins are identical for every trait. With max(V4) + 1 as the upper break,
# a trait whose largest locus has <= 100 variants would get reordered or
# duplicated breaks, because cut() sorts its breaks.
size_breaks <- c(1, 3, 6, 21, 51, 101, Inf)

pl_rows <- vector("list", length(traits))
df_rows <- vector("list", length(traits))

for (i in seq_along(traits)) {
  fm <- read.table(paste0(fmdir, traits[i]))
  # Number of variants (V4) per locus (V5).
  ag <- aggregate(V4 ~ V5, fm, length)
  ag$br <- cut(ag$V4, breaks = size_breaks, right = FALSE)
  pl_rows[[i]] <- table(ag$br)
  ag$trait <- i
  df_rows[[i]] <- ag
}

# Bind once after the loop instead of growing the tables on every iteration.
# `ag` from the last iteration stays defined; it is used to name the columns.
pl <- as.data.frame(do.call(rbind, pl_rows))
df <- do.call(rbind, c(list(data.frame()), df_rows))

In [6]:
# Label the count table: columns are the size bins, rows are the trait names
# (file name up to the first ".").
colnames(pl) <- levels(ag$br)
rownames(pl) <- str_split_fixed(traits, "\\.", 2)[, 1]

# Number of loci per trait, and the per-trait fraction of loci in each bin.
totals <- rowSums(pl)
pct <- pl / totals

# Six purples, darkest first.
pal <- rev(brewer.pal(6, "Purples"))

# Bins x traits matrix for barplot(), traits ordered by increasing locus count.
mat <- as.matrix(t(pl[order(totals), ]))
tots <- totals[order(totals)]

# Legend labels for the six size bins.
leg <- c("1-2", "3-5", "6-20", "21-50", "51-100", "101+")

In [7]:
# Stacked horizontal bar chart: loci per trait, split by credible-set size bin.
pdf(paste0(figdir, "Barplot_loci_smaller.pdf"))
par(mar = c(8, 12, 10, 6), xpd = TRUE)
bp <- barplot(mat, horiz = TRUE, las = 1, col = pal,
              xlim = c(0, 200), xlab = "# loci")
# Print each trait's total number of loci to the right of its bar.
text(x = tots, y = bp, labels = tots, pos = 4)
legend("bottomright", legend = leg, title = "# variants per locus",
       pch = 22, pt.cex = 1.5, pt.bg = pal)
dev.off()

In [8]:
# Total number of loci summed over all traits.
sum(totals)

In [9]:
# Distribution of the number of variants per locus, pooled over all traits
# (column 2 of df is the per-locus variant count).
summary(df[,2])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
    1.0     4.0    16.0    41.5    45.0  2159.0 

In [10]:
# Same summary, one column per trait (column names are indices into `traits`).
ss <- split(df, df$trait)
sapply(ss, function(per_trait) summary(per_trait[[2]]))

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
Min.,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,3,2.0,2.0,1.0
1st Qu.,4.0,33.0,7.0,2.0,14.0,3.0,5.0,3.75,2.0,4.0,4.0,3.5,22,9.0,25.0,9.0
Median,14.0,56.0,18.0,8.0,29.0,12.0,20.5,14.0,6.5,14.0,16.0,9.0,70,30.0,66.0,21.0
Mean,35.23333,81.3913,36.54545,28.78431,53.13913,29.18902,48.52632,30.00676,27.83871,44.12371,36.23529,21.02532,56,60.02439,140.8772,46.1519
3rd Qu.,36.75,97.0,56.0,32.5,67.5,33.0,36.0,35.25,22.75,44.0,41.0,27.5,82,75.0,162.0,59.0
Max.,392.0,365.0,181.0,276.0,332.0,315.0,495.0,358.0,492.0,316.0,235.0,210.0,103,248.0,2159.0,354.0


### caQTLs in credible sets
Also include variants that have the same q-value as the lead variant.

In [11]:
# Stack the credible-set variants of every trait into one table with columns
# varID, Locus, LNBF, PPA (file columns 4-7) and Trait.
# Trait is "<rank>_<trait name>", where rank is the zero-padded rank of the
# trait's locus count (ties broken by order of appearance). Sorting the labels
# alphabetically therefore sorts traits by number of loci.

# Loop-invariant pieces, computed once instead of on every iteration.
trait_rank <- rank(totals, ties.method = "first")
trait_name <- str_split_fixed(traits, "\\.", 2)[, 1]

per_trait <- vector("list", length(totals))
for (i in seq_along(totals)) {
  fm <- read.table(paste0(fmdir, traits[i]))[, 4:7]
  colnames(fm) <- c("varID", "Locus", "LNBF", "PPA")
  fm$Trait <- paste(sprintf("%02d", trait_rank[i]), trait_name[i], sep = "_")
  per_trait[[i]] <- fm
}

# Single rbind at the end; the leading empty data frame keeps the result a
# data frame even when there are no traits.
alltraits <- do.call(rbind, c(list(data.frame()), per_trait))
    

In [12]:
# Locate the per-cell-type caQTL summary-statistics files and derive the cell
# type label from each file name.
caqdir <- "/nfs/lab/projects/pbmc_snATAC/analysis_v2/summarized_caqtls/run3/"

# The suffix is matched literally (fixed = TRUE): as a regular expression its
# "." would match any character.
sumstats_suffix <- "_caqtl_sumstats_run3.tsv"
files <- list.files(caqdir)
files <- files[grepl(sumstats_suffix, files, fixed = TRUE)]

# These three files are excluded from the analysis.
files <- files[!(files %in% c("cd4_t_caqtl_sumstats_run3.tsv",
                              "cd8_t_caqtl_sumstats_run3.tsv",
                              "bulk_caqtl_sumstats_run3.tsv"))]

cells <- gsub(sumstats_suffix, "", files, fixed = TRUE)

In [13]:
# Columns retained from each per-cell-type caQTL table.
colkeep <- c("Feature", "varID", "rsID", "P_value", "Q_value",
             "Effect_size", "cell")

In [14]:
# Collect, for every cell type, the lead caQTL variant(s) of each peak.
# A peak (Feature) keeps every variant whose Q_value equals the peak's minimum
# Q_value, so variants tied with the lead are included.
per_cell <- vector("list", length(files))

# seq_along() makes the loop a no-op when no files were found.
for (y in seq_along(files)) {
  sms <- read.table(paste0(caqdir, files[y]), header = TRUE)
  sss <- subset(sms, Is_peak_caQTL == TRUE & exclude == FALSE)
  sss <- sss[order(sss$Q_value), ]
  # Minimum Q_value per peak, then keep the rows that attain it.
  ag <- aggregate(Q_value ~ Feature, sss, min)
  mm <- merge(sss, ag, by = colnames(ag))
  mm$cell <- cells[y]
  per_cell[[y]] <- mm[, colkeep]
}

# Bind once after the loop; the leading empty data frame keeps M a data frame
# even when there are no files.
M <- do.call(rbind, c(list(data.frame()), per_cell))

In [15]:
# Spot check: rows of M for one variant, across cell types.
subset(M, rsID=="rs34038797")

Unnamed: 0_level_0,Feature,varID,rsID,P_value,Q_value,Effect_size,cell
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<chr>
977,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.01239313,0.02478625,0.234758,act_cd4_t
4765,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.0007802117,0.001560423,0.302812,b
6473,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.01137523,0.01137523,0.28851,cDC
8659,chr12:739181-740753,chr12:740009:C:G,rs34038797,7.186429e-06,3.593214e-05,0.201709,cMono
16271,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.006583025,0.006583025,0.25561,cyto_cd8_t
20544,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.002159517,0.002159517,0.323118,mem_b
21147,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.01933345,0.01933345,0.2592,mem_cd8_t
24230,chr12:739181-740753,chr12:740009:C:G,rs34038797,6.838758e-07,3.419379e-06,0.192756,mono
33178,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.002337266,0.004674532,0.300117,naive_b
34536,chr12:739181-740753,chr12:740009:C:G,rs34038797,0.008408785,0.01681757,0.22529,naive_cd4_t


In [16]:
# Credible-set variants that are also lead caQTLs. caQTL varIDs carry a "chr"
# prefix, which is dropped to match the credible-set IDs. rsIDs are matched as
# well, only because the T1D credible set (from Tony) identifies variants by rsID.
allcaq <- alltraits[
  alltraits$varID %in% c(substring(M$varID, 4), as.character(M$rsID)),
]

In [17]:
# Number of fine-mapped caQTL rows per trait.
agc <- aggregate(varID ~ Trait, data = allcaq, FUN = length)

In [18]:
# Two panels side by side: PPA distribution of fine-mapped caQTLs per trait
# (left) and the number of such variants per trait (right).
pdf(paste0(figdir, "Finemapped_caQTLs_bxpl_newres.pdf"))
par(mar = c(8, 6, 10, 0), xpd = TRUE, mfrow = c(1, 2))
#stripchart(PPA~Trait,allcaq , pch=19, las=1, col=alpha(pal[1],0.5))
boxplot(PPA ~ Trait, data = allcaq, horizontal = TRUE, las = 1,
        col = alpha(pal[1], 0.5), pch = 16, outcol = alpha(pal[1], 0.5))
par(mar = c(8, 6, 10, 0))
# Bars are labelled with the counts themselves rather than the trait names.
barplot(agc[, 2], horiz = TRUE, names.arg = agc[, 2], las = 1)
dev.off()

In [19]:
# Spot check: fine-mapped caQTLs at the BACH2 locus.
allcaq[allcaq$Locus=="BACH2",]

Unnamed: 0_level_0,varID,Locus,LNBF,PPA,Trait
Unnamed: 0_level_1,<chr>,<chr>,<dbl>,<dbl>,<chr>
43134,rs72928038,BACH2,10.544367,0.07356601,08_Type_1_diabetes
43161,rs1504215,BACH2,8.948297,0.01491121,08_Type_1_diabetes
43165,rs56297233,BACH2,10.158814,0.05003034,08_Type_1_diabetes


In [20]:
# Number of distinct variant IDs among the fine-mapped caQTLs.
sum(!duplicated(allcaq$varID))

In [21]:
# Number of distinct Locus labels among the fine-mapped caQTLs.
sum(!duplicated(allcaq$Locus))

### Annotate with promoters and coaccessible promoters

In [22]:
# Relative paths below resolve against the analysis directory.
setwd("/nfs/lab/projects/pbmc_snATAC/analysis_v2/")

# Promoter annotations of caQTL peaks.
map <- read.table("summarized_caqtls/Promoter_associated_caQTLs.tsv",
                  header = TRUE)

# Colour scheme (V1 = colour, V2 = cell type), extended with four extra
# colour/label pairs.
clrs <- read.table("peaks/color_scheme.tsv")
clrs <- rbind(
  clrs,
  matrix(c("orange2", "b",
           "tan4", "nk",
           "#7FC97F", "mono",
           "magenta4", "t"),
         ncol = 2, byrow = TRUE)
)

# Lookup from cell type to lineage group. The group vector is positional: it
# must stay in the row order of `clrs` (file rows first, then the four above).
colcol <- data.frame(
  celltypes = clrs$V2,
  group = c("mono", "mono", "mono", "mono", "b", "b", "nk", "nk",
            "t", "t", "t", "t", "t", "t", "mkc", "pDC", "plasma",
            "b", "nk", "mono", "t")
)

In [23]:
# Attach the lineage group of each caQTL's cell type.
M$group <- colcol$group[match(M$cell, colcol$celltypes)]

# Groups that get annotated. mkc are few and do not overlap any finemapped
# variant, so they are left out.
groups <- c("b", "mono", "t", "nk")

In [24]:
#sum(c(unique(subset(M, cell == "mkc")$varID), unique(subset(M, cell == "mkc")$rsID)) %in% allcaq$varID)

In [25]:
# Annotate lead caQTLs with the promoter annotation of their peak, one lineage
# group at a time; results are stacked into `annoq`.
annoq <- data.frame()
for (gr in groups) {
  test <- subset(map, group == gr)

  # Non-"EP" annotations: per feature_gene keep a single type, "PP" if any row
  # has a two-character type and "P" otherwise (str_count() with no pattern
  # counts characters). Merging on both columns then drops the extra "P" rows.
  agr <- aggregate(type ~ feature_gene, test[test$type != "EP", ],
                   function(x) c("P", "PP")[max(str_count(x))])
  test2 <- merge(test[test$type != "EP", ], agr, by = colnames(agr))

  # "EP" annotations are kept as they are.
  test3 <- rbind(test2, test[test$type == "EP", ])

  qt <- subset(M, group == gr)
  qt <- merge(qt, test3, by = c("Feature", "group"))
  annoq <- rbind(annoq, qt)
}
                 

In [26]:
# Drop the first three characters (the "chr" prefix) so the IDs match the
# credible-set varIDs. Not idempotent: running this cell twice strips again.
annoq$varID <- substring(annoq$varID, 4)

In [27]:
# Keep only fine-mapped caQTLs with PPA above 0.01.
allcaq <- allcaq[which(allcaq$PPA > 0.01), ]

In [28]:
# Join fine-mapped caQTLs to their promoter annotations. Credible-set IDs are
# matched against the caQTL varID and, separately, against the caQTL rsID; the
# two joins are stacked and exact duplicate rows removed.
mm <- merge(allcaq, annoq, by = "varID")
mmt1d <- merge(annoq, allcaq, by.x = "rsID", by.y = "varID")

mm <- unique(rbind(mm, mmt1d))

In [29]:
#subset(mm, rsID=='rs72928038')

In [30]:
#subset(mm, rsID=='rs12365699')

In [31]:
#subset(mm, rsID=='rs7731626')

In [32]:
#subset(mm, rsID=='rs17293632')

In [33]:
#subset(mm,coac_gene=="SMAD3" )

In [34]:
#subset(mm, rsID=='rs34038797')

In [35]:
# One row per variant x trait x annotation type (first occurrence kept).
rmdu <- mm[!duplicated(mm[, c("varID", "Trait", "type")]), ]

In [36]:
dim(rmdu)

In [37]:
#rmdu =mm[!duplicated(mm[,c("varID", "Trait")]),]

In [38]:
#dim(rmdu)

In [39]:
# Contingency table: traits (rows) by annotation type (columns).
TA <- table(rmdu$Trait, rmdu$type)

In [40]:
# Traits with no annotated fine-mapped caQTL are absent from the table. Append
# one all-zero row per missing trait so that every trait appears in the plot.
# This works for any number of missing traits, including none, and does not
# assume where the new rows land in the table.
extra <- setdiff(unique(alltraits$Trait), rownames(TA))
zero_rows <- matrix(0, nrow = length(extra), ncol = ncol(TA),
                    dimnames = list(extra, colnames(TA)))
TA <- rbind(TA, zero_rows)

# Rows sorted by label, i.e. by the rank prefix of the trait label.
TA <- TA[order(rownames(TA)), , drop = FALSE]

In [41]:
# Prefix the annotation types with a sort key so the columns order as
# P, EP, PP. The renaming is done by name rather than by position, and stops
# if the columns are not exactly EP, P and PP: a positional rename would
# silently mislabel them in that case.
type_labels <- c(EP = "2_EP", P = "1_P", PP = "3_PP")
stopifnot(setequal(colnames(TA), names(type_labels)))
colnames(TA) <- unname(type_labels[colnames(TA)])
TA <- TA[, order(colnames(TA)), drop = FALSE]

In [42]:
# Stacked horizontal bars: fine-mapped caQTLs per trait, split by annotation
# category. The left half of the page is left empty by plot.new().
pal2 <- brewer.pal(6, "RdYlBu")[c(1:3, 5)]
pdf(paste0(figdir, "CAtegory_caQTLs_finemap_newresults.pdf"))
par(mar = c(8, 6, 10, 2), xpd = TRUE, mfrow = c(1, 2))
plot.new()
barplot(t(TA), horiz = TRUE, las = 1, col = pal2, xlim = c(0, 35))
legend("bottomright", legend = colnames(TA), title = "#caQTLs\n(PPA>0.01)",
       pch = 22, pt.cex = 1.8, pt.bg = pal2, bty = "n")
dev.off()

### GWAS plots

In [326]:
# The GWAS summary-statistic files read below are given as paths relative to
# this directory.
setwd('/nfs/lab/projects/pbmc_snATAC/analysis_v2/gwas_sumstats')

In [None]:
# loci
# chr6   89207850   92324486
# chr11   117991786   119494785
# chr2 227431691 229908940

In [None]:
# commands:
# - bedtools intersect -a /nfs/lab/publicdata/1kg_snps/ALL.wgs.phase3_sites.bed.ids -b loci -wa > loci_snps.bed
# - awk -v OFS="\t" '{print $4}' loci_snps.bed > loci_snps.rsid_list
# - zgrep -F -f loci_snps.rsid_list T1D.Aylward_2018_biorxiv.txt.gz > T1D.Aylward_2018.select

In [262]:
# GWAS summary statistics restricted to the selected loci: type 1 diabetes,
# lymphocyte count and childhood-onset asthma.
# NOTE: the variable `as` masks base::as() for the rest of the session.
t1d <- read.table("T1D.Aylward_2018.select", sep = "\t")
ly <- read.table("Astle.Cell.2016.lymph.select", sep = "\t")
as <- read.table("Ferreira.2019.AJHG.Child_onset_asthma.select")

In [357]:
# GWAS summary statistics for ulcerative colitis at the selected loci.
uc <- read.table("deLange.NatGenet.2017.Ulcerative_Colitis.select")

In [358]:
# Coordinates of the SNPs in the selected loci; `rsid` and `id` are the two
# identifiers used to match GWAS rows and credible-set variants.
coo <- read.table("lociAll_snps.bed", sep = "\t")
colnames(coo) <- c("chr", "start", "pos", "rsid", "id")

In [267]:
# Attach SNP coordinates (first GWAS column joined to the 4th column of coo),
# then keep only the chromosome of the locus plotted for each trait.
t1d <- subset(merge(t1d, coo, by.x = 1, by.y = 4), chr == 6)
as <- subset(merge(as, coo, by.x = 1, by.y = 4), chr == 11)
ly <- subset(merge(ly, coo, by.x = 1, by.y = 4), chr == 6)

In [359]:
# Same join for ulcerative colitis, restricted to chromosome 2.
uc <- subset(merge(uc, coo, by.x = 1, by.y = 4), chr == 2)

In [366]:
# -log10 of column V9, and membership in the ulcerative colitis credible set
# of locus 2:228670476:C:G.
uc$log10pval <- -log10(uc$V9)
uc_credset_ids <- alltraits$varID[
  alltraits$Trait == "10_Ulcerative_colitis" &
    alltraits$Locus == "2:228670476:C:G"
]
uc$credset <- uc$id %in% uc_credset_ids

In [276]:
# For each trait: -log10 of its p-value column and membership in the credible
# set of the plotted locus. T1D is matched on V1 because its credible set is
# keyed by rsID; the other traits are matched on `id`.
t1d$log10pval <- -log10(t1d$V9)
t1d$credset <- t1d$V1 %in% alltraits$varID[
  alltraits$Trait == "08_Type_1_diabetes" & alltraits$Locus == "BACH2"
]

as$log10pval <- -log10(as$V13)
as$credset <- as$id %in% alltraits$varID[
  alltraits$Trait == "13_Asthma_child_onset" &
    alltraits$Locus == "11:118743286:G:A"
]

ly$log10pval <- -log10(ly$V9)
ly$credset <- ly$id %in% alltraits$varID[
  alltraits$Trait == "14_Lymphocyte_count" &
    alltraits$Locus == "6:90976768:G:A"
]

# Strongest associations first.
t1d <- t1d[order(-t1d$log10pval), ]
as <- as[order(-as$log10pval), ]
ly <- ly[order(-ly$log10pval), ]
uc <- uc[order(-uc$log10pval), ]

In [362]:
# Flag GWAS variants that are fine-mapped caQTLs (present in allcaq).
# The T1D credible set identifies variants by rsID, so its allcaq entries can
# only be matched through the rsID column (V1, the same column used for the
# T1D credible-set flag). Matching on `id` alone would flag T1D variants only
# when another trait happens to list them.
t1d$caQTL <- t1d$id %in% allcaq$varID | t1d$V1 %in% allcaq$varID
as$caQTL <- as$id %in% allcaq$varID
ly$caQTL <- ly$id %in% allcaq$varID
uc$caQTL <- uc$id %in% allcaq$varID

In [375]:
# 2 x 2 grid of regional association plots (-log10 p against V3).
# Point shape: 19 = not a caQTL, 17 = caQTL.
# Point colour: gray = outside the credible set, red = inside.
pdf(paste0(figdir, "GWAS_loci_mainfigure.pdf"))
par(mfrow = c(2, 2), mar = c(4, 4, 10, 0), las = 1)

plot(log10pval ~ V3, data = t1d, cex = 0.6,
     pch = c(19, 17)[t1d$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[t1d$credset + 1],
     xlim = c(90700000, 91200000), main = "BACH2 - T1D")

plot(log10pval ~ V3, data = ly, cex = 0.6,
     pch = c(19, 17)[ly$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[ly$credset + 1],
     xlim = c(90700000, 91200000), main = "BACH2 - Lymphocyte count")

plot(log10pval ~ V3, data = as, cex = 0.6,
     pch = c(19, 17)[as$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[as$credset + 1],
     xlim = c(118400000, 118900000), main = "CXCR5 locus- Asthma")

# Alternative windows tried for the CCL20 panel:
#xlim=c(228563194, 228887101), main="CCL20-Ulcerative Colitis")
#xlim=c(228001852,228793788), main="CCL20-Ulcerative Colitis")
plot(log10pval ~ V3, data = uc, cex = 0.6,
     pch = c(19, 17)[uc$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[uc$credset + 1],
     xlim = c(227830402, 229027101))
dev.off()

In [296]:
# Axis-free PNG of the BACH2 locus for lymphocyte count (4 x 1.6 in, 300 dpi).
png(paste0(figdir, "GWAS_bach2.png"),
    width = 4, height = 1.6, units = "in", res = 300)
par(pin = c(4, 1.6), las = 1)
plot(log10pval ~ V3, data = ly, cex = 0.6, axes = FALSE,
     pch = c(19, 17)[ly$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[ly$credset + 1],
     xlim = c(90700000, 91200000))
dev.off()

In [297]:
# Axis-free PNG of the CXCR5 locus for asthma (4 x 1.6 in, 300 dpi).
png(paste0(figdir, "GWAS_CXCR5.png"),
    width = 4, height = 1.6, units = "in", res = 300)
par(pin = c(4, 1.6), las = 1)
plot(log10pval ~ V3, data = as, cex = 0.6, axes = FALSE,
     pch = c(19, 17)[as$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[as$credset + 1],
     xlim = c(118400000, 118900000))
dev.off()

In [376]:
# Axis-free PNG of the ulcerative colitis locus, written as GWAS_CCL20.png
# (4 x 1.6 in, 300 dpi).
png(paste0(figdir, "GWAS_CCL20.png"),
    width = 4, height = 1.6, units = "in", res = 300)
par(pin = c(4, 1.6), las = 1)
plot(log10pval ~ V3, data = uc, cex = 0.6, axes = FALSE,
     pch = c(19, 17)[uc$caQTL + 1],
     col = alpha(c("gray", "red"), 0.5)[uc$credset + 1],
     xlim = c(227830402, 229027101))
dev.off()

### Supplementary tables

In [45]:
getwd()

In [46]:
# Lead caQTL table for run3 (path is relative to the working directory).
st0 <- read.table("summarized_caqtls/run3/EUR_caqtl_leads_run3.tsv",
                  header = TRUE)

In [47]:
dim(st0)

In [48]:
# Keep the rows flagged by flag_fdr10.
st <- subset(st0, flag_fdr10 == TRUE)

In [49]:
dim(st)

In [50]:
# Sort by cell type, then by increasing P_VAl within each cell type.
st <- st[order(st$cell, st$P_VAl), ]

In [52]:
# Export the filtered, sorted table without row names.
write.csv(st, file = "summarized_caqtls/run3/EUR_caqtl_leads_subset.csv",
          row.names = FALSE)

In [53]:
head(st)

Unnamed: 0_level_0,Feature,varID,Chromosome,position,Ref,Alt,Af,HWEChi_square,IA,Log10_qval,⋯,Convergence_status,r2_fSNPs,r2_rSNP,P_VAl,flag_fdr10,flag_fdr05,flag_fdr01,cell,exclude,rsID
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<int>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<chr>,<lgl>,<chr>
322539,chr10:51502604-51504683,chr10:51503178:G:A,chr10,51503178,G,A,0.45,1.715131,0.991469,-19.52688,⋯,0,0.965503,0.979683,2.632664e-20,True,True,True,act_cd4_t,False,rs2611475
322540,chr6:32635924-32636583,chr6:32636376:C:G,chr6,32636376,C,G,0.45,0.00102,0.995562,-16.97326,⋯,0,0.991991,0.991873,1.8025729999999998e-19,True,True,True,act_cd4_t,False,rs2856698
322541,chr10:27546208-27546939,chr10:27546467:G:C,chr10,27546467,G,C,0.25,1.111111,0.990104,-14.1531,⋯,0,0.951346,0.978134,2.454829e-15,True,True,True,act_cd4_t,False,rs2488369
322542,chr6:43737236-43739826,chr6:43737873:G:A,chr6,43737873,G,A,0.1,0.123457,0.994865,-11.979,⋯,0,0.991476,0.983246,9.541214e-14,True,True,True,act_cd4_t,False,chr6:43737873:G:A
322543,chr6:166672201-166673529,chr6:166672298:G:A,chr6,166672298,G,A,0.4,0.277778,0.989443,-11.39206,⋯,0,0.983409,0.982888,1.928802e-12,True,True,True,act_cd4_t,False,rs911202
322544,chr7:134831931-134833621,chr7:134833233:G:A,chr7,134833233,G,A,0.6,4.444444,0.998298,-10.56247,⋯,0,0.987382,0.98188,2.272994e-11,True,True,True,act_cd4_t,False,rs3735000


In [54]:
# Fraction of the flag_fdr10 rows that are marked `exclude`.
sum(st$exclude)/ nrow(st)

In [393]:
# A row counts as significant when it is flagged by flag_fdr10 and not excluded.
st$sig <- st$flag_fdr10 & !st$exclude

In [398]:
# Number of significant rows per cell type.
ag1 <- aggregate(sig ~ cell, data = st, FUN = sum)

In [399]:
# Number of rows per cell type in the unfiltered table st0.
ag2 <- aggregate(Feature ~ cell, data = st0, FUN = length)

In [401]:
# Combine both counts, joined on the first column of each table (cell type).
agg <- merge(ag1, ag2, by = "cell")

In [407]:
# Significant rows as a fraction of all rows, per cell type.
agg$fract <- with(agg, sig / Feature)

In [409]:
# Show cell types from fewest to most significant rows.
agg[order(agg$sig), ]

Unnamed: 0_level_0,cell,sig,Feature,fract
Unnamed: 0_level_1,<chr>,<int>,<int>,<dbl>
12,mkc,31,11892,0.002606794
9,iMono,39,35454,0.001100017
2,adaptive_NK,40,17199,0.002325717
5,cDC,72,24092,0.002988544
20,tReg,74,20769,0.003563003
10,mem_b,161,27930,0.005764411
11,mem_cd8_t,219,28332,0.007729776
16,naive_cd8_t,320,37433,0.008548607
14,naive_b,376,35363,0.010632582
17,ncMono,481,54300,0.008858195
