In [1]:
suppressPackageStartupMessages(library(motifbreakR))
suppressPackageStartupMessages(library(MotifDb))
suppressPackageStartupMessages(library(BSgenome))
suppressPackageStartupMessages(library(tidyr))
suppressPackageStartupMessages(library("BSgenome.Hsapiens.UCSC.hg19"))
suppressPackageStartupMessages(library(motifStack))
suppressPackageStartupMessages(library(data.table))
suppressPackageStartupMessages(library(stringr))
suppressPackageStartupMessages(library(RColorBrewer))
suppressPackageStartupMessages(library(gridExtra))
suppressPackageStartupMessages(library(ggplot2))
suppressPackageStartupMessages(library(MotIV))

See system.file("LICENSE", package="MotifDb") for use restrictions.



In [2]:
# Every relative path used below ("../...", "hocomoco_results/", the written
# .tsv files) is resolved against this directory.
# NOTE(review): hard-coded absolute path; the notebook only runs where this
# location exists.
setwd('/nfs/lab/projects/islet_cytok/analysis/selex/motifbreak')

In [3]:
# Headerless variant table; columns 1-3 and V4 (variant name) are used below.
vars <- read.table(file = "../selex_variants_reclassified.bed")

In [5]:
# Build SNP identifiers from the variant names in V4: underscores become
# colons and the "T1D:" / "T2D:" tags are removed.
coord <- str_replace_all(vars$V4, "\\_", ":")
coord <- gsub("T2D:", "", gsub("T1D:", "", coord))

# Six-column table: the first three input columns, the "chr"-prefixed SNP id,
# a constant 0 and a constant "+" strand.
varlist <- vars[, 1:3]
varlist[, 4] <- paste0("chr", coord)
varlist[, 5] <- 0
varlist[, 6] <- "+"

# Drop rows that are exact duplicates of an earlier row.
varlist <- unique(varlist)

In [12]:
# Write the variant table tab-separated, without quotes, row names or header.
write.table(
  x = varlist,
  file = "Input_var_list.tsv",
  sep = "\t",
  quote = FALSE,
  row.names = FALSE,
  col.names = FALSE
)

In [13]:
# Rows and columns of the de-duplicated variant table.
dim(varlist)

## Test analysis

In [14]:
# Restrict MotifDb to entries whose metadata lists HOCOMOCOv10 as the data
# source and Hsapiens as the organism.
motif_meta <- mcols(MotifDb)
is_hocomoco_human <- motif_meta$dataSource == 'HOCOMOCOv10' &
  motif_meta$organism == 'Hsapiens'
hocomoco2 <- MotifDb[is_hocomoco_human]

In [15]:
# Read the variant list written above (BED format) against the hg19 genome.
variant_bed <- "Input_var_list.tsv"
snps <- snps.from.file(
  file = variant_bed,
  search.genome = BSgenome.Hsapiens.UCSC.hg19,
  format = "bed"
)

"User selected reference allele differs from the sequence in BSgenome.Hsapiens.UCSC.hg19 continuing with genome specified reference allels
 there are 402 differences"
"401 user variants are the same as the reference genome hg19 for Human
 These variants were excluded"


In [16]:
# Trial run on the first SNP only, scored against the HOCOMOCO motif set with
# the "ic" method on a serial (non-parallel) backend.
# NOTE(review): with filterp = TRUE the threshold is presumably a p-value
# cutoff -- confirm against the motifbreakR documentation.
serial_backend <- BiocParallel::bpparam("SerialParam")
results <- motifbreakR(
  snpList = snps[1],
  filterp = TRUE,
  pwmList = hocomoco2,
  method = "ic",
  threshold = 5e-4,
  BPPARAM = serial_backend
)


In [17]:
# Number of entries in the SNP object returned by snps.from.file.
length(snps)

In [59]:
# Sanity check: 181540 (the last SNP index used in the batch run) plus the
# 402 and 401 counts reported in the snps.from.file messages.
# NOTE(review): presumably meant to match the number of input variants -- confirm.
181540 + 402 +401

In [21]:
# Size of the final, shorter batch (SNP indices 181401 to 181540).
length(181401:181540)

In [None]:
#### Run using /bin/Rscript Motifbreak.R (~16 hrs) -- otherwise will load anaconda3 R
# for (i in seq(1,181400, by=200)){
#   test=snps[i:(i+199)]
#   results <- mclapply(1:200, function(x) data.frame(motifbreakR(snpList = test[x], filterp = TRUE,
#                        pwmList = hocomoco2,
#                        method="ic",
#                        threshold = 5e-4,
#                        BPPARAM = BiocParallel::bpparam("SerialParam"))), mc.cores=18 )

# rb = rbindlist(results[lapply(results, class) == "data.frame"])

# if(nrow(rb)>0){
#   write.table(rb, paste0("hocomoco_results/Results_Hocomoco_", i, "_", i+199, ".tsv"), quote=F, row.names=F, sep="\t")  
# }

# }

#   test=snps[181401:181540]
#   results <- mclapply(1:140, function(x) data.frame(motifbreakR(snpList = test[x], filterp = TRUE,
#                        pwmList = hocomoco2,
#                        method="ic",
#                        threshold = 5e-4,
#                        BPPARAM = BiocParallel::bpparam("SerialParam"))), mc.cores=18 )

# rb = rbindlist(results[lapply(results, class) == "data.frame"])

# if(nrow(rb)>0){
#   write.table(rb, paste0("hocomoco_results/Results_Hocomoco_", 181401, "_", 181540, ".tsv"), quote=F, row.names=F, sep="\t")  
# }


In [22]:
# Names of the per-batch result files. The directory is listed once and
# filtered through `pattern`; listing it twice (once for the values, once for
# the mask) can misalign the index if files appear between the two calls.
files <- list.files("hocomoco_results", pattern = "Results")

In [23]:
# Number of result files found.
length(files)

In [24]:
# Load every per-batch result table and stack them into a single data.frame.
# All files are read first and bound once: calling rbind() inside the loop
# re-copies the accumulated table on every iteration, which is quadratic in
# the number of files.
result_tables <- lapply(files, function(f) {
  read.table(
    file.path("hocomoco_results", f),
    row.names = NULL,
    header = TRUE,
    sep = "\t",
    stringsAsFactors = FALSE
  )
})

# With no input files the result is an empty data.frame, as before.
RES <- if (length(result_tables) > 0) {
  do.call(rbind, result_tables)
} else {
  data.frame()
}

In [25]:
# Keep column 6 (SNP_id) and columns 10-24 (motifPos through effect) of the
# combined results.
kept_columns <- c(6, 10:24)
tab <- RES[, kept_columns]

# Sort by decreasing absolute allele difference, then keep only the first
# (i.e. largest-|alleleDiff|) row for each SNP / gene-symbol pair.
by_effect_size <- order(-abs(tab$alleleDiff))
tab <- tab[by_effect_size, ]
is_first_hit <- !duplicated(tab[, c("SNP_id", "geneSymbol")])
tab <- tab[is_first_hit, ]

In [32]:
# Spot-check: show the retained motif hits for one variant.
tab[tab$SNP_id=="chr14:68836369:A:G",]

Unnamed: 0_level_0,SNP_id,motifPos,geneSymbol,dataSource,providerName,providerId,seqMatch,pctRef,pctAlt,scoreRef,scoreAlt,Refpvalue,Altpvalue,altPos,alleleDiff,effect
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<int>,<dbl>,<chr>
815444,chr14:68836369:A:G,"c(-13, 7)",CUX2,HOCOMOCOv10,CUX2_HUMAN.H10MO.D,CUX2_HUMAN.H10MO.D,catatctttcactttCcctatcgatacttcatatgttatg,0.92111,0.7757735,11.799061,10.00743,,,1,-1.791634,strong
815445,chr14:68836369:A:G,"c(-2, 11)",CUX1,HOCOMOCOv10,CUX1_HUMAN.H10MO.C,CUX1_HUMAN.H10MO.C,tttcactttCcctatcgatacttcata,0.9487956,0.7349893,7.028517,5.48814,,,1,-1.540377,strong


In [30]:
# Save the de-duplicated hit table tab-separated, with header, without quotes
# or row names.
write.table(
  x = tab,
  file = "Summary_significant_motifs_hocomocov10_long.tsv",
  sep = "\t",
  row.names = FALSE,
  quote = FALSE
)

In [56]:
# Smallest absolute allele difference among the retained hits.
min(abs(tab$alleleDiff))

In [35]:
# Rows and columns of the de-duplicated hit table.
dim(tab)

In [34]:
# Number of distinct SNP ids that have at least one retained hit.
sum(!duplicated(tab$SNP_id))

In [36]:
# SELEX results table (first line holds the column names).
selex <- read.table(file = "../combinedResults.tsv", header = TRUE)

In [37]:
# Keep only SELEX rows that have a PBSb value.
selex <- selex[!is.na(selex$PBSb), ]

# Derive the same "chr..." SNP id used in the motif table: underscores become
# colons and the "T1D:" / "T2D:" tags are removed.
coord <- str_replace_all(selex$snp_name, "\\_", ":")
coord <- gsub("T2D:", "", gsub("T1D:", "", coord))
selex$snpID <- paste0("chr", coord)

# Left-join the motif predictions onto the SELEX rows, matching SNP id to
# SNP_id and protein name to gene symbol; SELEX rows without a motif hit are
# kept with NA alleleDiff.
selex_cols <- c("snpID", "prot", "Family", "PBSb", "PBSb_pv")
motif_cols <- c("SNP_id", "geneSymbol", "alleleDiff")
comp <- merge(
  x = selex[, selex_cols],
  y = tab[, motif_cols],
  by.x = c("snpID", "prot"),
  by.y = c("SNP_id", "geneSymbol"),
  all.x = TRUE
)

In [45]:
# Rows and columns of the merged SELEX / motif table.
dim(comp)

In [60]:
# Number of distinct values in the prot column of the merged table.
sum(!duplicated(comp$prot))

In [62]:
# Number of distinct SNP ids in the merged table.
sum(!duplicated(comp$snpID))

In [57]:
# Print the reference to cite for the motifbreakR package.
citation("motifbreakR")


  Simon G. Coetzee, Gerhard A. Coetzee and Dennis J. Hazelett (2015).
  motifbreakR: an R/Bioconductor package for predicting variant effects
  at transcription factor binding sites. Bioinformatics.
  doi:10.1093/bioinformatics/btv470

A BibTeX entry for LaTeX users is

  @Article{,
    title = {motifbreakR: an R/Bioconductor package for predicting variant effects at transcription factor binding sites},
    author = {Simon G. Coetzee and Gerhard A. Coetzee and Dennis J. Hazelett},
    year = {2015},
    journal = {Bioinformatics},
    doi = {10.1093/bioinformatics/btv470},
    url = {http://bioinformatics.oxfordjournals.org/content/early/2015/08/12/bioinformatics.btv470.abstract},
  }


In [61]:
# Number of motifs in the HOCOMOCOv10 / Hsapiens subset of MotifDb.
length(hocomoco2)