## Evaluate SV callsets

### [SV callers](https://doi.org/10.5281/zenodo.1217111)
- Manta
- DELLY
- LUMPY
- GRIDSS

### SV truth sets (germline)
- [Personalis/1000 Genomes Project](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/technical/svclassify_Manuscript/Supplementary_Information/Personalis_1000_Genomes_deduplicated_deletions.bed)
  data ([Parikh et al., 2016](https://doi.org/10.1186/s12864-016-2366-2)) for SV calls in the NA12878 [sample](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/NIST_NA12878_HG001_HiSeq_300x/RMNISTHS_30xdownsample.bam)
- [PacBio/Moleculo](https://static-content.springer.com/esm/art%3A10.1186%2Fgb-2014-15-6-r84/MediaObjects/13059_2013_3363_MOESM4_ESM.zip) data ([Layer et al., 2014](https://doi.org/10.1186/gb-2014-15-6-r84)) for SV calls in the NA12878 sample
- [dbVar](https://www.ncbi.nlm.nih.gov/dbvar/studies/nstd167/) [nstd167](https://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/vcf/nstd167.GRCh37.variant_call.vcf.gz) data ([Wenger et al., 2019](https://doi.org/10.1038/s41587-019-0217-9)) for SV calls in the NA24385 [sample](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/NIST_Illumina_2x250bps/novoalign_bams/HG002.hs37d5.2x250.bam)
- [dbVar](https://www.ncbi.nlm.nih.gov/dbvar/studies/nstd137/) [nstd137](https://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/vcf/nstd137.GRCh37.variant_call.vcf.gz) data ([Huddleston et al., 2017](https://dx.doi.org/10.1101%2Fgr.214007.116)) for SV calls in the CHM1_CHM13 [sample](https://identifiers.org/ena.embl:ERX1413368)

### Exclusion lists
- [ENCODE:ENCFF001TDO](http://identifiers.org/encode/ENCFF001TDO)
- [CEPH](https://doi.org/10.1186/gb-2014-15-6-r84#ref-CR28) (Layer et al., 2014)


In [None]:
library(tools)
suppressPackageStartupMessages(require(StructuralVariantAnnotation))

### create a list of data sets

In [None]:
# sample names
samples <- c("NA12878", "NA24385", "CHM1_CHM13")

# truth-set file names, keyed by truth-set name
truth.sets <- list(
    Personalis1kGP = "Personalis_1000_Genomes_deduplicated_deletions.bed",
    PacbioMoleculo = "3717462611446476_add4.bedpe",
    dbVar_nstd167 = "nstd167.GRCh37.variant_call.vcf.gz",
    dbVar_nstd137 = "nstd137.GRCh37.variant_call.vcf.gz"
)

# exclusion-list file names
excl.lists <- c(
    "ENCFF001TDO.bed",
    "ceph18.b37.lumpy.exclude.2014-01-15.bed"
)

# one entry per data set:
# c(index into samples, index into truth.sets, index into excl.lists)
dts <- list(
    c(1, 1, 1),
    c(1, 2, 2),
    c(2, 3, 1),
    c(3, 4, 1)
)

### user input

In [None]:
min.supp <- 3  # min. number of supporting callers required for a call in the merged callset
min.svLen <- -1  # min. SV length; a negative value is replaced by the shortest event of the truth set on import
sel.idx <- 1  # index into `dts` selecting the data set to evaluate [1-4]

In [None]:
# resolve the selected data set: sel = c(sample idx, truth-set idx, excl.-list idx)
sel <- dts[[sel.idx]]
sample <- samples[sel[1]]
ts <- names(truth.sets[sel[2]])  # truth-set name (used in the CSV file name)
base.dir <- file.path('..', '..')
# input files are looked up under <base.dir>/<sample>/in
ts.file <- file.path(base.dir, sample, 'in', truth.sets[[sel[2]]])
excl.file <- file.path(base.dir, sample, 'in', excl.lists[sel[3]])
print(sel)
print(sample)
print(ts)
print(ts.file)
print(excl.file)

### helper functions

In [None]:
# Build the path of a caller's VCF file and return it only if the file exists.
#   dts:    name of the directory directly under `base.dir` (a global)
#   caller: caller name; the file is expected at
#           <base.dir>/<dts>/out/3/S3/<caller>_out/<caller>.vcf
# The candidate path is always printed; returns the path, or NULL if the file
# is missing.
getVcf <- function(dts, caller) {
    caller.dir <- paste0(caller, '_out')
    vcf.name <- paste0(caller, '.vcf')
    vcf.file <- file.path(base.dir, dts, 'out', '3', 'S3', caller.dir, vcf.name)
    print(vcf.file)
    if (!file.exists(vcf.file)) {
        return(NULL)
    }
    vcf.file
}

In [None]:
# Assign a simple SV type to every breakend of a breakpoint GRanges.
# The rules are tried in order and the first one that holds decides the type:
#   'CTX' - the breakend and its partner lie on different sequences
#   'INS' - insLen is at least 70% of |svLen|
#   'INV' - the breakend and its partner have the same strand
#   'DEL' - exactly one of (start < partner start, strand is '-') is true
#   'DUP' - none of the above
# adapted from
# https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R#L9
getSvType <- function(gr) {
    mate <- partner(gr)
    is.ctx <- seqnames(gr) != seqnames(mate)
    is.ins <- gr$insLen >= abs(gr$svLen) * 0.7
    is.inv <- strand(gr) == strand(mate)
    is.del <- xor(start(gr) < start(mate), strand(gr) == '-')
    sv.type <- ifelse(is.ctx, 'CTX',
        ifelse(is.ins, 'INS',
            ifelse(is.inv, 'INV',
                ifelse(is.del, 'DEL', 'DUP'))))
    return(sv.type)
}

In [None]:
# Summarise a callset's performance as a named list.
#   callset: label of the callset
#   hits:    per-call hit counts (one element per call)
#   n.true:  size of the truth set
# Returns list(callset, n, tp, fp, fn, precision, recall); precision and recall
# are percentages rounded to one decimal. Entries that come out as NA (e.g. a
# 0/0 division for an empty callset) are reported as 0.
getPerfMetrics <- function(callset, hits, n.true) {
    n.calls <- length(hits)
    n.tp <- sum(hits)
    percent <- function(part, total) round(part * 100 / total, digits=1)
    metrics <- list(
        callset=callset,
        n=n.calls,
        tp=n.tp,
        fp=n.calls - n.tp,
        fn=n.true - n.tp,
        precision=percent(n.tp, n.calls),
        recall=percent(n.tp, n.true)
    )
    metrics[is.na(metrics)] <- 0
    metrics
}

In [None]:
# Drop breakends that touch excluded regions.
# A breakend of query.gr is kept only if neither it nor its partner breakend
# overlaps any range in subject.gr.
excludeRegions <- function(query.gr, subject.gr) {
    self.clear <- !overlapsAny(query.gr, subject.gr)
    mate.clear <- !overlapsAny(partner(query.gr), subject.gr)
    return(query.gr[self.clear & mate.clear, ])
}

In [None]:
# Count, for every breakend in query.gr, its overlaps with the breakpoints in
# subject.gr. Thin wrapper that fixes the matching parameters passed to
# countBreakpointOverlaps() (see that function's documentation for their exact
# semantics).
getHits <- function(query.gr, subject.gr) {
    n.overlaps <- countBreakpointOverlaps(
        query.gr, subject.gr,
        maxgap=100,
        sizemargin=0.25,
        restrictMarginToSizeMultiple=0.5,
        ignore.strand=TRUE,
        countOnlyBest=TRUE
    )
    n.overlaps
}

In [None]:
# Import a truth set as breakpoint ranges; handles BED, BEDPE and VCF files.
#   ts.file: path to the truth-set file
# Dispatch is by file extension:
#   'vcf' / 'gz' - read as VCF and converted with breakpointRanges()
#   'bed'        - first converted with awk to a BEDPE file written next to the
#                  input (same path with extension '.bedpe'); the first line of
#                  the BED file is skipped and every other line becomes one
#                  record named DEL_<i> (i starting at 0)
#   otherwise    - imported as is and converted with pairs2breakpointgr()
# Stops with an error if the awk conversion exits with a non-zero status.
getRegionsFromTruthSet <- function(ts.file) {
    if (file_ext(ts.file) %in% c('vcf', 'gz')) {
        vcf <- VariantAnnotation::readVcf(ts.file)
        # fix: SVLEN type: CharacterList->IntegerList
        info(vcf)$SVLEN <- IntegerList(info(vcf)$SVLEN)
        return(breakpointRanges(vcf))
    }

    if (file_ext(ts.file) == 'bed') {
        bedpe.file <- paste0(file_path_sans_ext(ts.file), '.bedpe')
        awk.prog <- paste0(
            'BEGIN {a=0; OFS="\\t"} ',
            'NR>1 {print $1,$2,$2+1,$1,$3,$3+1,"DEL_" a,-1,"+","+","DEL"; a+=1}')
        # quote the program and both paths so that spaces or shell
        # metacharacters in file names cannot break the command
        cmd <- paste('awk', shQuote(awk.prog), shQuote(ts.file), '>',
                     shQuote(bedpe.file))
        status <- system(cmd)
        if (status != 0) {
            # do not go on to import a missing or truncated BEDPE file
            stop('BED to BEDPE conversion failed (exit status ', status,
                 ') for ', ts.file, call.=FALSE)
        }
        ts.file <- bedpe.file
    }
    return(pairs2breakpointgr(rtracklayer::import(ts.file)))
}

### import SVs from a truth set

In [None]:
# import the truth set as breakpoint ranges with NCBI-style sequence names
# (chr[X] -> [X])
true.gr <- getRegionsFromTruthSet(ts.file)
seqlevelsStyle(true.gr) <- 'NCBI'
# a negative min.svLen means: use the shortest event of the (still unfiltered)
# truth set, measured from a breakend's start to its partner's end, inclusive
if (min.svLen < 0) {
    min.svLen <- min(1 + abs(end(partner(true.gr)) - start(true.gr)))
}
message('### Truth set ###')
message(paste0('input = ', ts.file))
message(paste0('n = ', length(true.gr)))
message(paste0('min.svLen = ', min.svLen))

### filter SVs by an exclusion list

In [None]:
# load the exclusion list with NCBI-style sequence names (chr[X] -> X)
excl.gr <- rtracklayer::import(excl.file)
seqlevelsStyle(excl.gr) <- 'NCBI'
message('\n### Exclusion list ###')
message(paste0('input = ', excl.file))
print(seqnames(excl.gr))
# drop truth-set breakends where the breakend or its partner overlaps an
# excluded region; n.true is the truth-set size used for the metrics
true.gr <- excludeRegions(true.gr, excl.gr)
n.true <- length(true.gr)
message('\n### Truth set filtered by the exclusion list ###')
message(paste0('n = ', n.true))
message(paste0('min.svLen = ', min.svLen))

### import SV callsets from VCF files

In [None]:
# Evaluate the deletion calls of each caller against the truth set.
# For every caller: read its VCF, keep DELs with a known length of at least
# min.svLen, drop calls touching the exclusion list, count breakpoint overlaps
# with true.gr and append one row of performance metrics to hits.df.
callers <- c('manta', 'delly', 'lumpy', 'gridss')
# same columns as the list returned by getPerfMetrics() (including fn)
hits.df <- data.frame(callset=character(), n=numeric(), tp=numeric(),
                      fp=numeric(), fn=numeric(), precision=numeric(),
                      recall=numeric())
for (caller in callers) {  # not named `c`, which would shadow base::c
  vcf.file <- getVcf(sample, caller)
  if (is.null(vcf.file)) {
    # getVcf() returns NULL when the file does not exist; skip this caller
    # instead of passing NULL on to readVcf()
    warning('no VCF file found for caller ', caller, '; skipping',
            call.=FALSE)
    next
  }
  vcf <- VariantAnnotation::readVcf(vcf.file)
  # select only DELs
  gr <- breakpointRanges(vcf)
  gr$svtype <- getSvType(gr)
  gr <- gr[gr$svtype == 'DEL']
  message('\n### All DELs ###')
  message('# ', caller)
  message('n = ', length(gr))
  print(summary(abs(gr$svLen)))

  # drop calls without a length
  message('\n### DELs svLen != NA ###')
  gr <- gr[!is.na(gr$svLen)]
  message('# ', caller)
  message('n = ', length(gr))
  print(summary(abs(gr$svLen)))

  # drop calls shorter than the minimum SV length
  gr <- gr[abs(gr$svLen) >= min.svLen]
  message('\n### DELs svLen >= min.svLen ###')
  message('# ', caller)
  message('n = ', length(gr))
  print(summary(abs(gr$svLen)))

  # drop calls where either breakend overlaps the exclusion list
  gr <- excludeRegions(gr, excl.gr)
  message('\n### DELs >= min.svLen AND filtered by the exclusion list ###')
  message('# ', caller)
  message('n = ', length(gr))
  print(summary(abs(gr$svLen)))

  hits <- getHits(gr, true.gr)
  pm <- getPerfMetrics(caller, hits, n.true)
  hits.df <- rbind(hits.df, data.frame(pm))
}

### import merged callset from VCF file

In [None]:
# fix: replace ':' by '_' in ID & SAMPLE fields
# The merged callset is read from <base.dir>/<sample>/out/3/S3/all.vcf and the
# fixed copy is written to 'merge.vcf' in the working directory. Header lines
# (starting with '#') are copied unchanged; on data lines ':' is replaced by
# '_' in column 3 (ID) and the original ID is replaced by the new one in
# column 10.
vcf.infile <- file.path(base.dir, sample, 'out', '3', 'S3', 'all.vcf')
vcf.outfile <- 'merge.vcf'
# FS/OFS are set to tab: assigning to a field makes awk rebuild the record with
# OFS, whose default (a single space) would turn the tab-delimited data lines
# into space-delimited ones
awk.prog <- paste0(
    'BEGIN {FS=OFS="\\t"} ',
    '{if ($1 ~ /^#/) {print} else {id=$3; gsub(":","_",$3); ',
    'gsub(id,$3,$10); print}}')
# quote the program and both paths so that spaces or shell metacharacters in
# file names cannot break the command
cmd <- paste('awk', shQuote(awk.prog), shQuote(vcf.infile), '>',
             shQuote(vcf.outfile))
status <- system(cmd)
if (status != 0) {
    # do not go on to read a missing or truncated VCF file
    stop('fixing the merged VCF failed (exit status ', status, ') for ',
         vcf.infile, call.=FALSE)
}

In [None]:
# Read the fixed merged callset and keep only deletions supported by at least
# min.supp callers. Bare expressions (vcf.outfile, header(vcf)) are there to
# display their value in the notebook.
vcf.outfile
vcf <- VariantAnnotation::readVcf(vcf.outfile)
# fix: INFO/CI{END,POS} types: String->Integer
header(vcf)
# NOTE(review): rows 1:2 of the INFO header are addressed by position and are
# presumably CIEND and CIPOS - confirm against the header displayed above
info(header(vcf))$Type[1:2] <- c("Integer", "Integer")
vcf <- vcf[which(info(vcf)$SVTYPE == 'DEL')]  # keep only deletions
# which() drops NA comparisons; its second positional argument is arr.ind
vcf <- vcf[which(as.integer(info(vcf)$SUPP) >= min.supp, TRUE)]  # filter calls by support or
#vcf <- vcf[which(info(vcf)$SUPP_VEC == '1101', TRUE)]           # binary vector (MDLG)
# fix: CI{POS,END} type: CharacterList->IntegerList
info(vcf)$CIPOS <- IntegerList(info(vcf)$CIPOS)
info(vcf)$CIEND <- IntegerList(info(vcf)$CIEND)

### evaluate SV break-points

In [None]:
# breakpoints of the merged callset, minus those where either breakend
# overlaps the exclusion list
gr <- excludeRegions(breakpointRanges(vcf), excl.gr)
hits <- getHits(gr, true.gr)
# the callset label is the output file name without its extension
callset <- file_path_sans_ext(vcf.outfile)
pm <- getPerfMetrics(callset, hits, n.true)
# append the merged callset's metrics as one more row
hits.df <- rbind(hits.df, data.frame(pm))
message('\n### Performance metrics ###')
message(paste0('min.supp = ', min.supp, '\n'))

### write CSV file with performance metrics

In [None]:
# display the performance metrics table (one row per callset)
hits.df

In [None]:
# write the metrics table to metrics_<sample>_<truth set>.csv in the working
# directory: comma-separated, with a header row, no row names, no quoting
csv.file <- sprintf('metrics_%s_%s.csv', sample, ts)
write.table(
    hits.df,
    file=csv.file,
    sep=',',
    quote=FALSE,
    row.names=FALSE,
    col.names=TRUE
)