In [180]:
suppressPackageStartupMessages(library(circlize))
suppressPackageStartupMessages(library(plotrix))
suppressPackageStartupMessages(library(gplots))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(scales))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library(beeswarm))
suppressPackageStartupMessages(library(reshape))

In [2]:
# Every relative path used below (e.g. "combinedResults.tsv",
# "motifbreak/...", "../cytokine_figures/...") is resolved against this
# directory.
setwd("/nfs/lab/projects/islet_cytok/analysis/selex/")

### DeepSea

In [1]:
# Load the DeepSea score table and the companion e-value table, then set to NA
# every score whose e-value is 0.01 or larger.
deepdiff2 <- read.table(
  "/nfs/lab/projects/selex/selex_t2d_t1d/results/deepsea/Compiled_DeepSea_results",
  header = TRUE
)
evalues <- read.table(
  "/nfs/lab/projects/selex/selex_t2d_t1d/results/deepsea/Compiled_DeepSea_results_evalue",
  header = TRUE
)
deepdiff2[evalues >= 0.01] <- NA

In [4]:
# SELEX results table; the columns used below include snp_name, prot, Family,
# PBSb and PBSb_pv.
selex <- read.table("combinedResults.tsv", header = TRUE)

In [5]:
# Keep only the SELEX rows whose protein name is also a column name of the
# DeepSea table.
mmr = subset(selex, prot %in% colnames(deepdiff2))

In [7]:
# Number of distinct proteins present in both tables.
length(unique(mmr$prot))

In [9]:
# Per-TF agreement between SELEX allelic binding (PBSb) and DeepSea scores.
#
# For every protein in `mmr`, its DeepSea column is merged with the SELEX rows
# by SNP name, and the Pearson correlation between PBSb and the sign-flipped
# DeepSea score is stored in `vi`:
#   vi[["all_SNPs"]][tf]  all merged SNPs (needs >= 5 non-NA DeepSea values)
#   vi[["pbSNPs"]][tf]    only SNPs with PBSb_pv < 0.05 (same >= 5 rule)
# The merged rows of every TF that passes the all-SNP rule are stacked into
# `mermer`, which finally gets a `DeepSea_prediction` column (= -DeepSea).
vi <- list()
# Per-TF frames are collected here and bound once after the loop, instead of
# growing `mermer` with rbind() on every iteration.
merged_by_tf <- list()
for (tf in unique(as.character(mmr$prot))) {
  # Column 1 is selected together with the TF column(s); presumably this keeps
  # `deep` a data.frame (with its row names) when only one TF column matches.
  deep <- deepdiff2[, c(1, which(colnames(deepdiff2) == tf))]
  if (ncol(deep) > 2) {
    # Average the DeepSea results for the same protein.
    deep$DeepSea <- rowMeans(deep[, 2:ncol(deep)])
  } else {
    deep$DeepSea <- deep[, 2]
  }

  # NOTE(review): `tf` is upper-cased here although it was taken from
  # mmr$prot itself; a name containing lower-case letters would not match its
  # own rows. Confirm all protein names are upper case.
  sel <- subset(mmr, prot == toupper(tf))

  # The column selection requires the first column of deepdiff2 to be named
  # "CTCF"; SNP names are taken from the row names of the DeepSea table.
  mer <- merge(sel, deep[, c("CTCF", "DeepSea")],
               by.x = "snp_name", by.y = "row.names")

  if (sum(!is.na(mer$DeepSea)) >= 5) {
    vi[["all_SNPs"]][tf] <- cor.test(-mer$DeepSea, mer$PBSb)$estimate
    merged_by_tf[[length(merged_by_tf) + 1]] <- mer
  }

  mers <- subset(mer, mer[, "PBSb_pv"] < 0.05)
  if (sum(!is.na(mers$DeepSea)) >= 5) {
    vi[["pbSNPs"]][tf] <- cor.test(-mers$DeepSea, mers$PBSb)$estimate
  }
}

# Fall back to an empty data.frame when no TF qualified, as the original
# accumulator did.
mermer <- if (length(merged_by_tf) > 0) {
  do.call(rbind, merged_by_tf)
} else {
  data.frame()
}
mermer$DeepSea_prediction <- -mermer$DeepSea


In [10]:
# Number of TFs with a correlation computed on the PBSb_pv < 0.05 SNPs.
length(vi[["pbSNPs"]])

In [26]:
# Names of those TFs.
names(vi[["pbSNPs"]])

In [11]:
# Number of TFs with a correlation computed on all merged SNPs.
length(vi[["all_SNPs"]])

In [29]:
# Alphabetically sorted, comma-separated list of those TFs.
paste(sort(names(vi[["all_SNPs"]])), collapse=", ")

In [12]:
# Reorder each correlation vector from highest to lowest coefficient.
vi = lapply(vi, function(x) x[order(-x)])

In [13]:
# Distribution of the per-TF correlations over all SNPs.
summary(vi[["all_SNPs"]])

    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
-0.35204  0.09739  0.34948  0.35253  0.66087  0.87856 

In [14]:
# Distribution of the per-TF correlations restricted to PBSb_pv < 0.05 SNPs.
summary(vi[["pbSNPs"]])

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.1303  0.8171  0.8815  0.8122  0.9167  0.9694 

In [15]:
# Working copy of the merged SELEX/DeepSea table used by the PDF figure below.
# NOTE: `mmg` is reassigned later in the PWM section.
mmg = mermer

In [17]:
# Two plot colours: col[1] is used for "all SNPs", col[2] for "pbSNPs".
col = rainbow(4)[3:4]

In [33]:
# Figure (PDF, 3x3 panel layout): density of the per-TF correlations, followed
# by the SELEX-vs-DeepSea scatter with the sign-concordance of the
# PBSb_pv < 0.05 SNPs.
pdf("../cytokine_figures/Corr_Selex_DeepSea.pdf")
par(mfrow = c(3, 3), mar = c(4, 4, 4, 2))

# Panel 1: correlation densities, dashed vertical lines mark the means.
plot(
  density(vi[["pbSNPs"]]),
  col = col[2], lwd = 1.5,
  main = "Correlation per TF", xlab = "Pearson coefficient",
  xlim = c(-0.5, 1)
)
lines(density(vi[["all_SNPs"]]), col = col[1], lwd = 1.5)
abline(v = mean(vi[["pbSNPs"]]), col = col[2], lty = 2)
abline(v = mean(vi[["all_SNPs"]]), col = col[1], lty = 2)
# The counts are taken from `vi` in list order.
legend(
  "topleft",
  col = col,
  legend = paste(c("all SNPs", "pbSNPs"), lapply(vi, length)),
  pch = 15, bty = "n"
)

# Panel 2: scatter of all merged SNPs, PBSb_pv < 0.05 SNPs highlighted.
par(mar = c(4, 4, 4, 4))
tab <- na.omit(mmg[mmg[, "PBSb_pv"] < 0.05, c("PBSb", "DeepSea_prediction")])

# Percentage of significant SNPs whose SELEX and DeepSea changes share a sign.
pc <- with(
  tab,
  round(sum(sign(PBSb) == sign(DeepSea_prediction)) / nrow(tab), 4) * 100
)

plot(
  mmg[, c("DeepSea_prediction", "PBSb")],
  col = alpha(c("gray", col[2]), 0.2)[(mmg[, "PBSb_pv"] < 0.05) + 1],
  pch = 19,
  ylab = "Selex change", xlab = "DeepSea change", las = 1
)

abline(v = 0, h = 0, lty = 2)
legend("topleft", legend = paste(pc, "%\nconcord."), bty = "n")
mtext(side = 3, text = paste("n pbSNPs=", nrow(tab)), cex = 0.8)
mtext(
  side = 3,
  text = paste("n all=", sum(!is.na(mmg$DeepSea_prediction))),
  cex = 0.8, line = 1
)
dev.off()

In [162]:
# Same scatter as a 4x4 inch, 300 dpi PNG with no margins, axes or labels.
png(
  "../cytokine_figures/Corr_Selex_deepsea.png",
  width = 4, height = 4, units = "in", res = 300
)
par(mar = c(0, 0, 0, 0))
plot(
  mermer[, c("DeepSea_prediction", "PBSb")],
  col = alpha(c("gray", col[2]), 0.2)[(mermer$PBSb_pv < 0.05) + 1],
  pch = 19, ylab = "", xlab = "", axes = FALSE
)

dev.off()

### Family correlations

In [38]:
# One row per distinct (protein, family) combination found in `selex`.
fams <- unique(selex[, c("prot", "Family")])

In [72]:
# Count proteins per family, order families from most to least frequent and
# give each one a colour; the counts (Freq) and colours are then joined back
# onto `fams`, whose columns become Family, prot, Freq, color.
ag <- data.frame(table(fams[, 2]))
ag <- ag[order(-ag$Freq), ]

# 37 colours from four ColorBrewer palettes. They are indexed (and recycled
# when there are more families than colours) so the assignment works for any
# number of families instead of failing unless there are exactly 37.
family_palette <- c(
  brewer.pal(12, "Paired"),
  brewer.pal(8, "Set2"),
  brewer.pal(8, "Dark2"),
  brewer.pal(9, "Pastel1")
)
ag$color <- family_palette[(seq_len(nrow(ag)) - 1) %% length(family_palette) + 1]

# Join on the family name: second column of `fams`, first column of `ag`.
fams <- merge(fams, ag, by.x = 2, by.y = 1)

In [76]:
# Draw a circular dendrogram of a correlation matrix with a family-colour ring.
#
# Arguments:
#   cormat      square correlation matrix whose column names are looked up in
#               `fams`; rows/columns flagged incomplete by complete.cases()
#               are removed before clustering.
#   drawlegend  when TRUE, a family legend is drawn to the right of the plot.
#   annot       name of the column of the global `fams` table that is matched
#               against colnames(cormat).
#               NOTE(review): no "well" column is created for `fams` in the
#               visible code; the callers in this file pass annot = "prot".
#
# Reads the global data frame `fams` (columns `Family` and `color`).
# Called for its plotting side effect on the current graphics device; the
# number of clustered entries is printed in the centre of the circle.
circosfun <- function(cormat, drawlegend = TRUE, annot = "well") {
  keep <- complete.cases(cormat)
  cormat <- cormat[keep, keep, drop = FALSE]
  if (nrow(cormat) < 2) {
    stop("circosfun needs at least two complete rows/columns to cluster",
         call. = FALSE)
  }

  n_leaves <- ncol(cormat)
  ring_rows <- 2  # the colour ring is drawn as two stacked rows of rectangles

  # Cluster on 1 - correlation.
  dend <- as.dendrogram(hclust(as.dist(1 - cormat)))

  # Family annotation of every column, then colours in dendrogram leaf order.
  fam_idx <- match(colnames(cormat), fams[, annot])
  ring_cols <- as.character(fams$color[fam_idx])[order.dendrogram(dend)]

  circos.par(cell.padding = c(0, 0, 0, 0))
  # Make sure circlize state is reset even if one of the tracks fails, so the
  # next call can initialise a fresh plot.
  on.exit(circos.clear(), add = TRUE)
  circos.initialize("a", xlim = c(0, n_leaves))

  # Outer track: one coloured cell per leaf.
  circos.track(ylim = c(0, 2.5), bg.border = NA, panel.fun = function(x, y) {
    for (i in seq_len(ring_rows)) {
      circos.rect(seq_len(n_leaves) - 1, rep(ring_rows - i, n_leaves),
                  seq_len(n_leaves), rep(ring_rows - i + 1, n_leaves),
                  border = ring_cols, col = ring_cols)
    }
  })

  # Inner track: the dendrogram itself.
  max_height <- attr(dend, "height")
  circos.track(ylim = c(0, max_height), bg.border = NA, track.height = 0.5,
               panel.fun = function(x, y) {
                 circos.dendrogram(dend, max_height = max_height)
               })
  circos.clear()

  text(0, 0, nrow(cormat), cex = 1.5)

  if (isTRUE(as.logical(drawlegend))) {
    # Pick label and colour from the same row (first occurrence of each
    # family) so they cannot get out of step when two families share a colour.
    first_of_family <- !duplicated(fams$Family[fam_idx])
    lab <- fams$Family[fam_idx][first_of_family]
    coll <- as.character(fams$color[fam_idx])[first_of_family]
    legend("right", pch = 22, pt.bg = coll, legend = lab, cex = 0.7,
           pt.cex = 1.6, bty = "n", y.intersp = 0.7, inset = c(-0.5, 0),
           xpd = TRUE)
  }
}

In [79]:
# Reshape SELEX from long to wide: one row per SNP, one column per protein,
# cells hold PBSb (NA where a SNP/protein combination is absent).
data <- selex[, c("snp_name", "prot", "PBSb")]
data <- data[complete.cases(data), ]
mat <- spread(data, key = prot, value = PBSb, fill = NA)

In [83]:
# Move the first column into the row names and drop it from the matrix.
rownames(mat) <- mat[, 1]
mat <- mat[, -1]
# Keep rows with values for more than one protein ...
mat <- mat[rowSums(!is.na(mat)) > 1, ]
# ... and proteins whose family has more than two members.
mat <- mat[, colnames(mat) %in% fams$prot[fams$Freq > 2]]

In [103]:
# Rows of `mat` for SNPs that have PBSb_pv < 0.05 in at least one SELEX row.
mat_sig <- mat[rownames(mat) %in% selex$snp_name[selex$PBSb_pv < 0.05], ]

In [91]:
# Pairwise correlation matrix between the columns of a SNP x protein table.
#
# Arguments:
#   mat1    matrix or data frame, one column per protein.
#   filter  a column is kept only if it has more than `filter` non-NA values.
#   method  correlation method passed on to cor().
#
# Rows that are NA in every retained column are dropped. The dimensions of the
# filtered table are printed on their own line (rows, then columns).
# Returns the correlation matrix computed on pairwise complete observations.
cormatfun <- function(mat1, filter = 100, method = "pearson") {
  # drop = FALSE keeps the two-dimensional shape when only one column or row
  # survives the filters; otherwise rowSums()/cor() fail on a bare vector.
  testcor <- mat1[, colSums(!is.na(mat1)) > filter, drop = FALSE]
  testcor <- testcor[rowSums(is.na(testcor)) < ncol(testcor), , drop = FALSE]
  # Terminate the line so successive calls do not run their output together.
  cat(paste(dim(testcor), collapse = " "), "\n", sep = "")
  cor(testcor, use = "pairwise.complete.obs", method = method)
}

In [109]:
# Circos dendrograms (2x2 layout) of the PBSb_pv < 0.05 SNP matrix, one panel
# per minimum number of non-NA SNPs required for a protein.
pdf("../cytokine_figures/CircosPlot_selex_pearson.pdf")
par(mfrow = c(2, 2), mar = c(1, 1, 3.5, 3.5), xpd = TRUE)

for (fi in c(400, 200, 100, 50)) {
  corr1 <- cormatfun(mat_sig, filter = fi)
  circosfun(corr1, annot = "prot")
}
dev.off()

27627 37427650 41127654 44227655 457

In [115]:
# Same circos panels, computed on the full SNP matrix `mat`.
pdf("../cytokine_figures/CircosPlot_selex_pearson_all.pdf")
par(mfrow = c(2, 2), mar = c(1, 1, 3.5, 3.5), xpd = TRUE)

for (fi in c(400, 200, 100, 50)) {
  corr1 <- cormatfun(mat, filter = fi)
  circosfun(corr1, annot = "prot")
}
dev.off()

107351 413107356 441107363 454107364 462

In [172]:
# Correlation matrix of the significant-SNP table for proteins with more than
# 50 non-NA SNPs; used for the same-family comparison below.
corr1 <- cormatfun(mat_sig, filter = 50)

27655 457

In [173]:
# Size of the full correlation matrix.
dim(corr1)

In [174]:
# Keep only the rows/columns that complete.cases() flags as complete.
cormat = corr1[complete.cases(corr1),complete.cases(corr1)]

In [175]:
dim(cormat)

In [181]:
# Long format: one row per (protein, protein, correlation) triple.
# NOTE(review): this melts `corr1`, not the filtered `cormat` computed above;
# confirm that is intended.
tab_corr = melt(corr1)

In [183]:
dim(tab_corr)

In [188]:
# Build an order-independent key for every protein pair (the two names sorted
# and joined with "_"), keep one row per (value, pair) combination and drop
# the self-pairs.
first_prot <- as.character(tab_corr[, 1])
second_prot <- as.character(tab_corr[, 2])
tab_corr$pair <- paste(
  pmin(first_prot, second_prot),
  pmax(first_prot, second_prot),
  sep = "_"
)
tab_corr <- tab_corr[!duplicated(tab_corr[, 3:4]), ]
tab_corr <- tab_corr[tab_corr[, 1] != tab_corr[, 2], ]

In [196]:
# Attach the family of each protein of the pair. `fams[, 1:2]` holds Family
# and prot; the first merge keys on column 1 of `tab_corr`, the second on
# column 2 of the intermediate result, both against the prot column.
tc <- merge(tab_corr, fams[, 1:2], by.x = 1, by.y = 2)
tc <- merge(tc, fams[, 1:2], by.x = 2, by.y = 2)

In [200]:
# Columns 5 and 6 are the two family columns added by the two merges above;
# the flag is TRUE when both proteins of a pair belong to the same family.
tc$samefam = tc[,5]==tc[,6]

In [203]:
# Mean pairwise correlation for different-family vs same-family pairs.
aggregate(value~samefam, tc, mean)

samefam,value
<lgl>,<dbl>
False,0.1476119
True,0.2621796


## PWM correlation

In [114]:
# Load the motif table and join it to the SELEX rows.
tab <- read.table(
  "motifbreak/Summary_significant_motifs_hocomocov10_long.tsv",
  header = TRUE, sep = "\t"
)

# Only SELEX rows with a PBSb value are used from here on (overwrites `selex`).
selex <- subset(selex, !is.na(PBSb))

# Derive snpID from snp_name: "_" becomes ":", the "T1D:" / "T2D:" tags are
# removed and "chr" is prepended.
coord <- str_replace_all(selex$snp_name, "\\_", ":")
coord <- gsub("T1D:", "", coord)
coord <- gsub("T2D:", "", coord)
selex$snpID <- paste0("chr", coord)

# Left join on the first two columns of each side (snpID/prot against
# SNP_id/geneSymbol); SELEX rows without a motif entry keep alleleDiff = NA.
comp <- merge(
  selex[, c("snpID", "prot", "Family", "PBSb", "PBSb_pv")],
  tab[, c("SNP_id", "geneSymbol", "alleleDiff")],
  by = 1:2, all.x = TRUE
)

In [137]:
# Rows with PBSb_pv < 0.05; kept for the significant-only analysis further down.
sig <- subset(comp, PBSb_pv < 0.05)

# Per-protein correlation between PBSb and the sign-flipped alleleDiff over
# all SNPs (not only `sig`), for proteins with more than 10 non-NA alleleDiff
# values.
sp <- split(comp, comp$prot)
sp1 <- sp[sapply(sp, function(grp) sum(!is.na(grp$alleleDiff)) > 10)]
rho1 <- sapply(
  sp1,
  function(grp) cor.test(grp$PBSb, -grp$alleleDiff, exact = FALSE)$estimate
)

In [138]:
# Per protein: share of the PBSb_pv < 0.05 rows that also have a non-NA
# alleleDiff. The ratio is NaN when a protein has no row with PBSb_pv < 0.05.
frac = sapply( sp1, function(x) sum(x$PBSb_pv<0.05 & !is.na(x[, "alleleDiff"]))/ sum(x$PBSb_pv<0.05))

In [139]:
summary(frac)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
0.00000 0.08835 0.27132 0.29165 0.45977 0.85752       1 

In [205]:
# Number of proteins with a PWM correlation over all SNPs.
length(rho1)

In [140]:
# Distribution of those correlations.
summary(rho1)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.5510  0.3795  0.6410  0.5507  0.7826  0.9532 

In [141]:
# One row per protein: PWM correlation (rho) and fraction of significant SNPs
# with a PWM value (frac).
ta = data.frame(rho = rho1, frac= frac, prot = names(sp1))

In [142]:
# Add the columns of `fams` (family, family size and colour) for each protein.
ta = merge(ta, fams, by="prot")

In [144]:
# Remove factor levels that no longer occur after the merge.
ta = droplevels(ta)

In [150]:
# Figure (PDF, 3 stacked panels): per-family box plots with overlaid beeswarm
# points of (1) the PWM correlation and (2) the fraction of PBSb_pv < 0.05
# SNPs that have a PWM value. Families are ordered by decreasing median.
pdf("../cytokine_figures/MotifBreak_compare.pdf")
par(mar = c(4, 8, 6, 8), mfrow = c(3, 1))

# Panel 1: correlation with the PWM prediction.
bymedian <- with(ta, reorder(as.factor(Family), -rho, median))
# Look the colour up per factor level so every family gets its own colour
# even when several families share the same median (a separately sorted
# colour table only lines up with the level order when there are no ties).
family_cols <- as.character(
  ta$color[match(levels(bymedian), as.character(ta$Family))]
)
boxplot(rho ~ bymedian, ta, xlab = NA, ylab = "correlation with PWM", las = 2)
beeswarm(rho ~ bymedian, ta, las = 2, method = "center", corral = "gutter",
         pch = 19, col = alpha(family_cols, 0.8),
         add = TRUE, axes = FALSE)

# Panel 2: fraction of significant SNPs covered by a PWM.
bymedian <- with(ta, reorder(as.factor(Family), -frac, median, na.rm = TRUE))
family_cols <- as.character(
  ta$color[match(levels(bymedian), as.character(ta$Family))]
)
boxplot(frac ~ bymedian, ta, xlab = NA, ylab = "%pbSNPs with PWM", las = 2)
beeswarm(frac ~ bymedian, ta, las = 2, method = "center", corral = "gutter",
         pch = 19, col = alpha(family_cols, 0.8),
         add = TRUE, axes = FALSE)
dev.off()

In [151]:
# Same per-protein PWM correlation, restricted to the PBSb_pv < 0.05 rows.
sp <- split(sig, sig$prot)
sp2 <- sp[sapply(sp, function(grp) sum(!is.na(grp$alleleDiff)) > 10)]
rho2 <- sapply(
  sp2,
  function(grp) cor.test(grp$PBSb, -grp$alleleDiff, exact = FALSE)$estimate
)

In [161]:
# Distribution of the per-protein PWM correlations on PBSb_pv < 0.05 SNPs.
summary(rho2)

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
-0.1231  0.9002  0.9283  0.9088  0.9505  0.9769 

In [153]:
# Rows of `comp` without any NA (in particular with an alleleDiff value).
# NOTE: this overwrites the `mmg` used for the DeepSea figures.
mmg = comp[complete.cases(comp),]

In [155]:
# Flip the sign of alleleDiff, matching the -alleleDiff used in the
# correlations. Running this cell twice flips the sign back.
mmg$alleleDiff = -mmg$alleleDiff

In [159]:
# Figure (PDF, 3x3 panel layout): density of the per-TF PWM correlations,
# followed by the SELEX-vs-PWM panel with the sign-concordance of the
# PBSb_pv < 0.05 SNPs.
pdf("../cytokine_figures/Corr_Selex_pwm.pdf")
par(mfrow = c(3, 3), mar = c(4, 4, 4, 2))
col <- rainbow(4)[3:4]

# Panel 1: correlation densities, dashed vertical lines mark the means.
plot(
  density(rho2),
  col = col[2], main = "Correlation per TF", xlab = "Pearson coefficient",
  lwd = 1.5, xlim = c(-0.5, 1)
)
abline(v = mean(rho2), col = col[2], lty = 2)
lines(density(rho1), col = col[1], lwd = 1.5)
abline(v = mean(rho1), col = col[1], lty = 2)
legend(
  "topleft",
  col = col,
  legend = paste(c("all SNPs", "pbSNPs"), c(length(rho1), length(rho2))),
  pch = 15, bty = "n"
)

# Panel 2: SELEX change against PWM change.
par(mar = c(4, 4, 4, 4))
tb <- mmg[mmg$PBSb_pv < 0.05, c("PBSb", "alleleDiff")]

# Percentage of significant SNPs whose SELEX and PWM changes share a sign.
pc <- with(tb, round(sum(sign(PBSb) == sign(alleleDiff)) / nrow(tb), 4) * 100)

# NOTE(review): pch = "" plots an empty string, so no points are visible in
# this PDF panel; the separate PNG of the same scatter uses pch = 19. Confirm
# that this is intended.
plot(
  mmg[, c("alleleDiff", "PBSb")],
  col = alpha(c("gray", col[2]), 0.2)[(mmg$PBSb_pv < 0.05) + 1],
  pch = "",
  ylab = "Selex change", xlab = "PWM change", las = 1
)

abline(v = 0, h = 0, lty = 2)
legend("topleft", legend = paste(pc, "%\nconcord."), bty = "n")
mtext(side = 3, text = paste("n pbSNPs=", nrow(tb)), cex = 0.8)
mtext(side = 3, text = paste("n all=", nrow(mmg)), cex = 0.8, line = 1)
dev.off()

In [160]:
# SELEX-vs-PWM scatter as a 4x4 inch, 300 dpi PNG with no margins, axes or
# labels.
png(
  "../cytokine_figures/Corr_Selex_pwm.png",
  width = 4, height = 4, units = "in", res = 300
)
par(mar = c(0, 0, 0, 0))
plot(
  mmg[, c("alleleDiff", "PBSb")],
  col = alpha(c("gray", col[2]), 0.2)[(mmg$PBSb_pv < 0.05) + 1],
  pch = 19, ylab = "", xlab = "", axes = FALSE
)

dev.off()