## Compare SV callsets in Venn and UpSet diagrams


### [SV callers](https://doi.org/10.5281/zenodo.1217111)
- Manta
- DELLY
- LUMPY
- GRIDSS 

### WGS samples
- _germline_ calls in single-sample data:
  - NA12878 [sample](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/NIST_NA12878_HG001_HiSeq_300x/RMNISTHS_30xdownsample.bam) (denoted as 'benchmark')
  - NA24385 [sample](ftp://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/NIST_Illumina_2x250bps/novoalign_bams/HG002.hs37d5.2x250.bam)
  - CHM1_CHM13 [sample](https://identifiers.org/ena.embl:ERX1413368)
- _somatic_ calls in paired-sample data:
   - COLO829 tumor [sample](https://identifiers.org/ena.embl:ERX2765496) with matched normal [sample](https://identifiers.org/ena.embl:ERX2765495) (denoted as 'cell lines')

### Exclusion list
- [ENCODE:ENCFF001TDO](http://identifiers.org/encode/ENCFF001TDO)

   
### SV callsets

- compare _callers_ for each sample

```
# file paths
../NA12878/compare/callers 
../NA24385/compare/callers
../CHM1_CHM13/compare/callers
../COLO829/compare/callers
```

- compare _samples_ (copies) for each caller

```
# file paths
../NA12878/compare/samples
../NA24385/compare/samples
../CHM1_CHM13/compare/samples
../COLO829/compare/samples
```

**Note**: The VCF files with SV calls were filtered and merged by SURVIVOR:

```
SURVIVOR filter [VCF] ENCFF001TDO.bed -1 -1 0 -1 [VCF]
SURVIVOR merge [TXT] 100 1 0 0 0 0 [VCF]
```


In [None]:
library(tools)
library(venn)
library(vcfR)
library(UpSetR)

In [None]:
# MIME type used to render the plots in the notebook: SVG here, or
# "image/png" for PNG output.
options(jupyter.plot_mimetypes = "image/svg+xml")

  ### construct file paths

In [None]:
# Root of the data tree, given relative to the working directory.
base.dir <- file.path("..", "..")

# Datasets; each one has its own directory below base.dir.
dts <- c("NA12878", "NA24385", "CHM1_CHM13", "COLO829")
dts.count <- length(dts)

# Cross every dataset (column x) with both comparison kinds (column y).
# x varies fastest, so the first dts.count rows are the "callers"
# directories and the remaining rows are the "samples" directories.
dirs <- merge(x = dts, y = c("callers", "samples"))

# <base.dir>/<dataset>/compare/<callers|samples>
paths <- with(dirs, file.path(base.dir, x, "compare", y))
paths.count <- length(paths)

In [None]:
# Show the generated directories (one per dataset and comparison kind).
print(paths)

In [None]:
# Callset names; each one is the base name of a <callset>.txt / <callset>.vcf
# pair read further below. "all" is combined with the "callers" directories,
# the individual caller names with the "samples" directories.
callers <- c("all", "manta", "delly", "lumpy", "gridss")

### create a table of callsets with file paths

In [None]:
# One row per callset:
#   - the first dts.count paths (".../callers") paired with the "all" callset,
#   - the remaining paths (".../samples") paired with every single caller.
tab <- rbind(
  merge(head(paths, dts.count), callers[1]),
  merge(tail(paths, paths.count - dts.count), callers[-1])
)
tab <- setNames(tab, c("path", "callset"))
print(tab)

### user input

In [None]:
# Row of `tab` to analyse. Rows 1..dts.count are the per-dataset caller
# comparisons (callset "all"); the remaining rows are the per-caller sample
# comparisons.
sel.idx <- 1  # select a callset [1-20]

# Fail right here on an invalid selection: indexing `tab` out of range would
# otherwise return a row of NAs and only break later, when files are read.
stopifnot(
  is.numeric(sel.idx),
  length(sel.idx) == 1L,
  sel.idx %in% seq_len(nrow(tab))
)

row <- tab[sel.idx, ]
print(row)

### load callsets

In [None]:
# Read <callset>.txt from the selected directory; every column is kept as
# character. Column V1 is used below to derive the set labels.
sets <- read.table(
  file.path(row$path, paste0(row$callset, ".txt")),
  colClasses = "character"
)

In [None]:
#sets

In [None]:
# Entries of the first column with their file extension removed.
str <- file_path_sans_ext(sets$V1)

# Trailing run of letters of each entry, de-duplicated and upper-cased.
str.caller <- regmatches(str, regexpr("[[:alpha:]]+$", str))
str.caller <- toupper(unique(str.caller))

# First run of digits of each entry (entries without digits are dropped
# by regmatches()).
str.run <- regmatches(str, regexpr("[[:digit:]]+", str))

if (sel.idx > dts.count) {
  # Sample comparison: label every set "<CALLER>-<run>".
  lb <- merge(str.caller, str.run)
  names(lb) <- c("caller", "run")
  sets <- paste0(lb$caller, "-", lb$run)
} else {
  # Caller comparison: the sets are the callers themselves.
  sets <- str.caller
}

In [None]:
#sets

In [None]:
# Number of sets (callers, or caller-run labels) being compared.
n.sets <- length(sets)

In [None]:
#n.sets

In [None]:
# Load <callset>.vcf from the selected directory without progress output.
vcf <- read.vcfR(
  file.path(row$path, paste0(row$callset, ".vcf")),
  verbose = FALSE
)

In [None]:
# Per-variant support vector: the value of the SUPP_VEC key of the INFO
# column, a string of 0/1 flags that is read as a binary number and split
# into single characters further below.
# The key is looked up by name instead of assuming that it is the second
# INFO field, and a record without exactly one SUPP_VEC entry raises an error
# instead of silently producing NA or the value of some other field.
bin.vec <- vapply(
  strsplit(getFIX(vcf, getINFO = TRUE)[, "INFO"], ";", fixed = TRUE),
  function(fields) {
    hit <- fields[startsWith(fields, "SUPP_VEC=")]
    if (length(hit) != 1L) {
      stop("expected exactly one SUPP_VEC entry in the INFO field",
           call. = FALSE)
    }
    sub("SUPP_VEC=", "", hit, fixed = TRUE)
  },
  character(1),
  USE.NAMES = FALSE
)

In [None]:
#bin.vec

In [None]:
# Empty two-column table with one row per set; the rows are filled in by the
# counting loop below.
sets.sz <- data.frame(matrix(ncol=2, nrow=n.sets))

In [None]:
# Column 1 holds the set label, column 2 the number of variants in that set.
names(sets.sz) <- c("caller", "n")

In [None]:
# Count, for every set, the variants supported by that set.
# Set i is matched with a pattern of n.sets characters that has "1" at
# position i and "." (any character) at every other position.
# seq_len() keeps the loop from running when there are no sets.
for (i in seq_len(n.sets)) {
  s <- replace(rep(".", n.sets), i, "1")
  # Anchor the pattern so that it can only match a complete support string.
  re <- paste0("^", paste(s, collapse = ""), "$")
  # c() coerces the count to character, so both columns hold strings.
  sets.sz[i, ] <- c(sets[i], sum(grepl(re, bin.vec)))
}

In [None]:
#sets.sz

In [None]:
# Number of variants per membership combination. Every support string is read
# as a base-2 number, so the names of `vec` are the decimal codes of the
# combinations that actually occur. strtoi() is vectorised, so no per-element
# apply is needed.
vec <- table(strtoi(bin.vec, base = 2L))

In [None]:
#vec

In [None]:
# Split every support string into its single characters
# (one list element per variant).
lst <- strsplit(bin.vec, "")

In [None]:
#lst

In [None]:
# Decimal codes 0 .. 2^n.sets - 1 of all possible membership combinations,
# as strings, so that they can be used as names when indexing `vec`.
s <- as.character(seq(from = 0, to = 2^n.sets - 1))

In [None]:
#s

In [None]:
# Look up the count of every combination code in `vec`, in the order of `s`.
# Codes that do not occur in `vec` come back as NA and are zeroed in the
# next step.
N <- sapply(s, function(x){vec[x]}, USE.NAMES=FALSE)

In [None]:
# Combinations that never occur get a count of zero.
N[is.na(N)] <- 0

In [None]:
#N

In [None]:
# Set labels of the form "<set> (<number of variants>)".
slabels <- paste0(sets.sz$caller, " (", sets.sz$n, ")")

In [None]:
# The Venn diagram is drawn for the caller comparisons only, i.e. for the
# first dts.count rows of `tab`.
if (sel.idx <= dts.count) {
  venn(
    n.sets,
    counts = N,
    snames = slabels,
    ilabels = TRUE,
    cexil = 1.2,
    cexsn = 1.2,
    zcolor = "style",
    ellipse = TRUE
  )
}

In [None]:
# Number of variants (one split support string per variant).
nr <- length(lst)

In [None]:
#nr

In [None]:
# Number of flags per support string, taken from the first variant.
# NOTE(review): assumes `lst` is non-empty and that all support strings have
# the same length.
nc <- length(lst[[1]])

In [None]:
#nc

In [None]:
# Preallocate the nr x nc membership matrix (filled row by row below).
M <- matrix(0, nrow=nr, ncol=nc)

In [None]:
# Copy the flags of every variant into its row of the membership matrix.
# seq_len() makes the loop a no-op when there are no variants, whereas
# 1:nr would iterate over c(1, 0) and fail.
for (i in seq_len(nr)) {
  M[i, ] <- as.integer(lst[[i]])
}

In [None]:
#M

In [None]:
# Membership table: the variant ID followed by the columns of M
# (one flag column per set).
sv <- data.frame(ID = getFIX(vcf)[, "ID"], M)

In [None]:
#colnames(sv)

In [None]:
# Name the flag columns (everything except ID) with the set labels.
colnames(sv)[-1] <- slabels

In [None]:
#sv

In [None]:
# UpSet plot over all n.sets sets, with the intersections ordered by
# frequency.
upset(
  sv,
  nsets = n.sets,
  sets.bar.color = "#56B4E9",
  order.by = "freq"
)