## Evaluate SV callsets

### [SV callers](https://doi.org/10.5281/zenodo.1217111)
- Manta
- DELLY
- LUMPY
- GRIDSS

### SV truth sets (germline)
- [Personalis/1000 Genomes Project](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/technical/svclassify_Manuscript/Supplementary_Information/Personalis_1000_Genomes_deduplicated_deletions.bed)
  data ([Parikh et al. ,2016](https://doi.org/10.1186/s12864-016-2366-2)) for SV calls in the NA12878 [sample](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/NA12878/NIST_NA12878_HG001_HiSeq_300x/RMNISTHS_30xdownsample.bam)
- [PacBio/Moleculo](https://static-content.springer.com/esm/art%3A10.1186%2Fgb-2014-15-6-r84/MediaObjects/13059_2013_3363_MOESM4_ESM.zip) data ([Layer et al., 2014](https://doi.org/10.1186/gb-2014-15-6-r84)) for SV calls in the NA12878 sample
- [dbVar](https://www.ncbi.nlm.nih.gov/dbvar/studies/nstd167/) [nstd167](https://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/vcf/nstd167.GRCh37.variant_call.vcf.gz) data ([Wenger et al., 2019](https://doi.org/10.1038/s41587-019-0217-9)) for SV calls in the NA24385 [sample](https://ftp-trace.ncbi.nlm.nih.gov/giab/ftp/data/AshkenazimTrio/HG002_NA24385_son/NIST_Illumina_2x250bps/novoalign_bams/HG002.hs37d5.2x250.bam)
- [dbVar](https://www.ncbi.nlm.nih.gov/dbvar/studies/nstd137/) [nstd137](https://ftp.ncbi.nlm.nih.gov/pub/dbVar/data/Homo_sapiens/by_study/vcf/nstd137.GRCh37.variant_call.vcf.gz) data ([Huddlestone et al., 2017](https://dx.doi.org/10.1101%2Fgr.214007.116)) for SV calls in the CHM1_CHM13 [sample](https://identifiers.org/ena.embl:ERX1413368)

### Exclusion lists
- [ENCODE:ENCFF001TDO](http://identifiers.org/encode/ENCFF001TDO)
- [CEPH](https://doi.org/10.1186/gb-2014-15-6-r84#ref-CR28) (Layer et al., 2014)


In [1]:
library(tools)
suppressPackageStartupMessages(require(StructuralVariantAnnotation))

### data sets

In [2]:
samples <- c('NA12878', 'NA24385', 'CHM1_CHM13')
truth.sets <- list(
    Personalis1kGP='Personalis_1000_Genomes_deduplicated_deletions.bed',
    PacbioMoleculo='3717462611446476_add4.bedpe',
    dbVar_nstd167='nstd167.GRCh37.variant_call.vcf.gz',
    dbVar_nstd137='nstd137.GRCh37.variant_call.vcf.gz')
excl.lists <- c('ENCFF001TDO.bed', 'ceph18.b37.lumpy.exclude.2014-01-15.bed')  # exclusion lists
map <- list(c(1,1,1), c(1,2,2), c(2,3,1), c(3,4,1))  # pair samples & truth sets & excl. lists via index

### user input

In [3]:
min.supp <- 3  # min. number of supporting callers

In [4]:
base.dir <- file.path('..', '..')

In [5]:
sel.idx <- 4  # select a data sets in the map [1-4] 
sample <- samples[map[[sel.idx]][1]]
ts <- names(truth.sets[map[[sel.idx]][2]])
ts.file <- file.path(base.dir, sample, 'in', truth.sets[map[[sel.idx]][2][[1]]])
excl.file <- file.path(base.dir, sample, 'in', excl.lists[map[[sel.idx]][3]])
print(map[[sel.idx]])
print(sample)
print(ts)
print(ts.file)
print(excl.file)

[1] 3 4 1
[1] "CHM1_CHM13"
[1] "dbVar_nstd137"
[1] "../../CHM1_CHM13/in/nstd137.GRCh37.variant_call.vcf.gz"
[1] "../../CHM1_CHM13/in/ENCFF001TDO.bed"


### helper functions

In [6]:
# get VCF file path given dataset and caller
getVcf <- function(dts, caller) {
    vcf.file <- file.path(base.dir, dts, 'out', '3', 'S3', paste0(caller, '_out'), paste0(caller, '.vcf'))
    print(vcf.file)
    if (file.exists(vcf.file)) {
        return(vcf.file)
    }
    return(NULL)
}

In [7]:
# assign SV types
# https://github.com/PapenfussLab/gridss/blob/7b1fedfed32af9e03ed5c6863d368a821a4c699f/example/simple-event-annotation.R#L9
getSvType <- function(gr) {
    return(ifelse(seqnames(gr) != seqnames(partner(gr)), 'CTX',
        ifelse(gr$insLen >= abs(gr$svLen) * 0.7, 'INS',
            ifelse(strand(gr) == strand(partner(gr)), 'INV',
                ifelse(xor(start(gr) < start(partner(gr)), strand(gr) == '-'),
                    'DEL', 'DUP')))))
}

In [8]:
# compute performance metrics for a callset
getPerfMetrics <- function(callset, hits, n.true) {
    n <- length(hits)
    tp <- sum(hits)
    fp <- n - tp
    fn <- n.true - tp
    prec <- round(tp * 100 / n, digits=1)
    rec <- round(tp * 100 / n.true, digits=1)
    return(list(callset=callset, n=n, tp=tp, fp=fp, fn=fn, precision=prec,
                recall=rec))
}

In [9]:
# exclude genomic regions
excludeRegions <- function(query.gr, subject.gr) {
    return(query.gr[!(overlapsAny(query.gr, subject.gr) |
                      overlapsAny(partner(query.gr), subject.gr)), ])
}

In [10]:
# count breakpoint overlaps (hits)
getHits <- function(query.gr, subject.gr) {
    return(countBreakpointOverlaps(query.gr, subject.gr, maxgap=100,
                                   sizemargin=0.25, ignore.strand=TRUE,
                                   restrictMarginToSizeMultiple=0.5,
                                   countOnlyBest=TRUE))
}

In [11]:
getRegionsFromTruthSet <- function(ts.file) {  # handle BED(PE) and VCF files
    if(file_ext(ts.file) %in% c('vcf', 'gz')) {
        vcf <- VariantAnnotation::readVcf(ts.file)
        # fix: SVLEN type: CharacterList->IntegerList
        info(vcf)$SVLEN <- IntegerList(info(vcf)$SVLEN)
        return(breakpointRanges(vcf))
    }

    if(file_ext(ts.file) == 'bed') {
        bedpe.file <- paste0(file_path_sans_ext(ts.file), '.bedpe')
        cmd <- paste("awk 'BEGIN {a=0; OFS=\"\t\"} NR>1 {print $1,$2,$2+1,$1,$3,\
                     $3+1,\"DEL_\" a,-1,\"+\",\"+\",\"DEL\"; a+=1}'", ts.file,
                     '>', bedpe.file)
        system(cmd)
        ts.file <- bedpe.file
    }
    return(pairs2breakpointgr(rtracklayer::import(ts.file)))
}

### import SVs from a truth set

In [12]:
true.gr <- getRegionsFromTruthSet(ts.file)
seqlevelsStyle(true.gr) <- 'NCBI'  # chr[X] -> [X]
min.svLen <- min(abs(end(partner(true.gr)) - start(true.gr)) + 1)
message('### Truth set ###')
message('input = ', ts.file)
message('n = ', length(true.gr))
message('min.svLen = ', min.svLen)

### Truth set ###

input = ../../CHM1_CHM13/in/nstd137.GRCh37.variant_call.vcf.gz

n = 68342

min.svLen = 2



### filter SVs by an exclusion list

In [13]:
excl.gr <- rtracklayer::import(excl.file)
seqlevelsStyle(excl.gr) <- 'NCBI'  # chr[X] -> X
message('\n### Exclusion list ###')
message('input = ', excl.file)
print(seqnames(excl.gr))
true.gr <- excludeRegions(true.gr, excl.gr)
min.svlen <- min(abs(end(partner(true.gr)) - start(true.gr)) + 1)
n.true <- length(true.gr)
message('\n### Truth set filtered by the exclusion list ###')
message('n = ', n.true)
message('min.svLen = ', min.svLen)


### Exclusion list ###

input = ../../CHM1_CHM13/in/ENCFF001TDO.bed



factor-Rle of length 411 with 25 runs
  Lengths:  27  12  13   8   2   4   1  10 ...  14  10  18  13  23   1  25 118
  Values :   1  10  11  12  13  14  15  16 ...   5   6   7   8   9   M   X   Y
Levels(25): 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 9 M X Y


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000194.1, GL000208.1, GL000214.1, GL000217.1, GL000219.1, GL000224.1, GL000225.1, GL000233.1, GL383561.2, JH636052.4, JH806585.1, KE332502.1, NT_113885.1, NT_113901.1, NT_113916.2, NT_113923.1, NT_113930.1, NT_113961.1, NT_167212.1, NT_167213.1, NT_167214.1, NT_167225.1, NT_167226.1, NT_167229.1, NT_167230.1, NT_167234.1, NW_003315903.1, NW_003315911.1, NW_003315912.1, NW_003315923.1, NW_003315924.1, NW_003315925.1, NW_003315926.1, NW_003315932.1, NW_003315937.1, NW_003315947.1, NW_003315948.2, NW_003315949.1, NW_003315950.2, NW_003315968.1, NW_003571030.1, NW_003571034.1, NW_003571035.1, NW_003571040.1, NW_003571042.1, NW_003571046.1, NW_003571048.1, NW_003571051.1, NW_003571064.2, NW_003871055.3, NW_003871056.3, NW_003871060.1, NW_003871064.1, NW_003871068.1, NW_003871087.1, NW_003871088.1, NW_003871089.1, NW_003871094.1, NW_003871095.1, NW_003871100.1, NW_003871103.3, NW_004070863.1, NW_004070865.1, 

### import SV callsets from VCF files

In [14]:
callers <- c('manta', 'delly', 'lumpy', 'gridss')
hits.df <- data.frame(callset=character(), n=numeric(), tp=numeric(),
                      fp=numeric(), precision=numeric(), recall=numeric())
for (c in callers) {
  vcf.file <- getVcf(sample, c)
  vcf <- VariantAnnotation::readVcf(vcf.file)
  # select only DELs
  gr <- breakpointRanges(vcf)
  gr$svtype <- getSvType(gr)
  gr <- gr[gr$svtype == 'DEL']
  message('\n### All DELs ###')
  message('# ', c)
  message('n = ', length(gr))
  print(summary(gr$svLen))

  message('\n### DELs svLen != NA ###')
  gr <- gr[!is.na(gr$svLen)]
  message('# ', c)
  message('n = ', length(gr))
  print(summary(gr$svLen))

  gr <- gr[abs(gr$svLen) >= min.svLen]
  message('\n### DELs svLen >= min.svLen ###')
  message('# ', c)
  message('n = ', length(gr))
  print(summary(gr$svLen))

  gr <- excludeRegions(gr, excl.gr)
  message('\n### DELs >= min.svlen AND filtered by the exclusion list ###')
  message('# ', c)
  message('n = ', length(gr))
  print(summary(gr$svLen))

  hits <- getHits(gr, true.gr)
  pm <- getPerfMetrics(c, hits, n.true)
  hits.df <- rbind(hits.df, data.frame(pm))
}

[1] "../../CHM1_CHM13/out/3/S3/manta_out/manta.vcf"



### All DELs ###

# manta

n = 8928



     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-81790539      -343      -202   -104885       -76       -36 



### DELs svLen != NA ###

# manta

n = 8928



     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-81790539      -343      -202   -104885       -76       -36 



### DELs svLen >= min.svLen ###

# manta

n = 8928



     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-81790539      -343      -202   -104885       -76       -36 


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1, GL000229.1, GL000231.1, GL000210.1, GL000239.1, GL000235.1, GL000201.1, GL000247.1, GL000245.1, GL000197.1, GL000203.1, GL000246.1, GL000249.1, GL000196.1, GL000248.1, GL000244.1, GL000238.1, GL000202.1, GL000234.1, GL000232.1, GL000206.1, GL000240.1, GL000236.1, GL000241.1, GL000243.1, GL000242.1, GL000230.1, GL000237.1, GL000233.1, GL000204.1, GL000198.1, GL000208.1, GL000191.1, GL000227.1, GL000228.1, GL000214.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000211.1, GL000199.1, GL000217.1, GL000216.1, GL000215.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000200.1, GL000193.1, GL000194.1, GL000225.1, GL000192.1, NC_007605
  - in 'y': M
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
-81790539      -341      -202   -103433       -75       -36 


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1, GL000229.1, GL000231.1, GL000210.1, GL000239.1, GL000235.1, GL000201.1, GL000247.1, GL000245.1, GL000197.1, GL000203.1, GL000246.1, GL000249.1, GL000196.1, GL000248.1, GL000244.1, GL000238.1, GL000202.1, GL000234.1, GL000232.1, GL000206.1, GL000240.1, GL000236.1, GL000241.1, GL000243.1, GL000242.1, GL000230.1, GL000237.1, GL000204.1, GL000198.1, GL000191.1, GL000227.1, GL000228.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000211.1, GL000199.1, GL000216.1, GL000215.1, GL000205.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000200.1, GL000193.1, GL000192.1, NC_007605
  - in 'y': GL383561.2, JH636052.4, JH806585.1, KE332502.1, NT_113885.1, NT_113901.1, NT_113916.2, NT_113923.1, NT_113930.1, NT_113961.1, NT_167212.1, NT_167213.1, NT_167214.1, NT_167225.1, NT_167226.1, NT_167229.1, NT_167230.1, NT_167234.1, NW_003315903.1, NW_003315911.1, NW_003315912.1, NW

[1] "../../CHM1_CHM13/out/3/S3/delly_out/delly.vcf"


“Removing 1474 unpaired breakend variants BND00000016, BND00000020, BND00000047, BND00000053, BND00000074, BND00000076, BND00000077, BND00000078, BND00000161, BND00000168, BND00000315, BND00000720, BND00000785, BND00000787, BND00000839, BND00000842, BND00000852, BND00000851, BND00000874, BND00000910, BND00000912, BND00001090, BND00001117, BND00001126, BND00001136, BND00001137, BND00001145, BND00001222, BND00001262, BND00001293, BND00001325, BND00001324, BND00001332, BND00001378, BND00001404, BND00001476, BND00001482, BND00001533, BND00001570, BND00001603, BND00001605, BND00001668, BND00001687, BND00001877, BND00002904, BND00003150, BND00003153, BND00003152, BND00003173, BND00003253, BND00003272, BND00003336, BND00003389, BND00003421, BND00003430, BND00003495, BND00003522, BND00003625, BND00003637, BND00003654, BND00003718, BND00003746, BND00003770, BND00003929, BND00003931, BND00003932, BND00003935, BND00003942, BND00003992, BND00004023, BND00004050, BND00004071, BND00004072, BND000041

“GRanges object contains 7 out-of-bound ranges located on sequences MT,
  GL000226.1, and GL000207.1. Note that ranges located on a sequence
  whose length is unknown (NA) or on a circular sequence are not
  considered out-of-bound (use seqlengths() and isCircular() to get the
  lengths and circularity flags of the underlying sequences). You can use
  trim() to trim these ranges. See ?`trim,GenomicRanges-method` for more
  information.”
“GRanges object contains 7 out-of-bound ranges located on sequences MT,
  GL000226.1, and GL000207.1. Note that ranges located on a sequence
  whose length is unknown (NA) or on a circular sequence are not
  considered out-of-bound (use seqlengths() and isCircular() to get the
  lengths and circularity flags of the underlying sequences). You can use
  trim() to trim these ranges. See ?`trim,GenomicRanges-method` for more
  information.”
“GRanges object contains 7 out-of-bound ranges located on sequences MT,
  GL000226.1, and GL000207.1. Note that ranges

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     -570       -68       -28    377563       -18 223472791 



### DELs svLen != NA ###

# delly

n = 27560



     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     -570       -68       -28    377563       -18 223472791 



### DELs svLen >= min.svLen ###

# delly

n = 27560



     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     -570       -68       -28    377563       -18 223472791 


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1, GL000229.1, GL000231.1, GL000210.1, GL000239.1, GL000235.1, GL000201.1, GL000247.1, GL000245.1, GL000197.1, GL000203.1, GL000246.1, GL000249.1, GL000196.1, GL000248.1, GL000244.1, GL000238.1, GL000202.1, GL000234.1, GL000232.1, GL000206.1, GL000240.1, GL000236.1, GL000241.1, GL000243.1, GL000242.1, GL000230.1, GL000237.1, GL000233.1, GL000204.1, GL000198.1, GL000208.1, GL000191.1, GL000227.1, GL000228.1, GL000214.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000211.1, GL000199.1, GL000217.1, GL000216.1, GL000215.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000200.1, GL000193.1, GL000194.1, GL000225.1, GL000192.1, NC_007605
  - in 'y': M
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1

     Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
     -570       -67       -28    358404       -18 205246010 


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1, GL000229.1, GL000231.1, GL000210.1, GL000239.1, GL000235.1, GL000201.1, GL000247.1, GL000245.1, GL000197.1, GL000203.1, GL000246.1, GL000249.1, GL000196.1, GL000248.1, GL000244.1, GL000238.1, GL000202.1, GL000234.1, GL000232.1, GL000206.1, GL000240.1, GL000236.1, GL000241.1, GL000243.1, GL000242.1, GL000230.1, GL000237.1, GL000204.1, GL000198.1, GL000191.1, GL000227.1, GL000228.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000211.1, GL000199.1, GL000216.1, GL000215.1, GL000205.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000200.1, GL000193.1, GL000192.1, NC_007605
  - in 'y': GL383561.2, JH636052.4, JH806585.1, KE332502.1, NT_113885.1, NT_113901.1, NT_113916.2, NT_113923.1, NT_113930.1, NT_113961.1, NT_167212.1, NT_167213.1, NT_167214.1, NT_167225.1, NT_167226.1, NT_167229.1, NT_167230.1, NT_167234.1, NW_003315903.1, NW_003315911.1, NW_003315912.1, NW

[1] "../../CHM1_CHM13/out/3/S3/lumpy_out/lumpy.vcf"



### All DELs ###

# lumpy

n = 0



   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
                                                



### DELs svLen != NA ###

# lumpy

n = 0



   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
                                                



### DELs svLen >= min.svLen ###

# lumpy

n = 0



   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
                                                


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000208.1, GL000214.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000219.1, GL000224.1, GL000195.1, GL000193.1, GL000225.1, GL000235.1, GL000234.1, GL000232.1, GL000233.1, GL000194.1, GL000192.1, GL000229.1, GL000231.1, GL000211.1, GL000212.1, GL000222.1, GL000198.1, GL000237.1, GL000201.1, GL000241.1, GL000203.1, GL000226.1, GL000210.1, GL000217.1, GL000228.1, GL000230.1, GL000218.1
  - in 'y': M
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000208.1, GL000214.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000219.1, GL000224.1, GL000195.1, GL000193.1, GL000225.1, GL000235.1, GL000234.1, GL000232.1, GL000233.1, GL000194.1, GL000192.1, GL000229.1, GL000231.1, GL000211.1, GL000212.1, GL000222.1, GL000198.1, GL000237.1, GL000201.1, GL000241.1, GL000203.

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
                                                


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000220.1, GL000199.1, GL000216.1, GL000205.1, GL000195.1, GL000193.1, GL000235.1, GL000234.1, GL000232.1, GL000192.1, GL000229.1, GL000231.1, GL000211.1, GL000212.1, GL000222.1, GL000198.1, GL000237.1, GL000201.1, GL000241.1, GL000203.1, GL000226.1, GL000210.1, GL000228.1, GL000230.1, GL000218.1
  - in 'y': GL383561.2, JH636052.4, JH806585.1, KE332502.1, NT_113885.1, NT_113901.1, NT_113916.2, NT_113923.1, NT_113930.1, NT_113961.1, NT_167212.1, NT_167213.1, NT_167214.1, NT_167225.1, NT_167226.1, NT_167229.1, NT_167230.1, NT_167234.1, NW_003315903.1, NW_003315911.1, NW_003315912.1, NW_003315923.1, NW_003315924.1, NW_003315925.1, NW_003315926.1, NW_003315932.1, NW_003315937.1, NW_003315947.1, NW_003315948.2, NW_003315949.1, NW_003315950.2, NW_003315968.1, NW_003571030.1, NW_003571034.1, NW_003571035.1, NW_003571040.1, NW_003571042.1, NW_003571046.1, NW_003571048.1, NW_003571051.1, NW_003571

[1] "../../CHM1_CHM13/out/3/S3/gridss_out/gridss.vcf"


“Removing 49 unpaired breakend variants gridss0_9843h, gridss0_10086h, gridss6_9953h, gridss9_20110o, gridss39_11175o, gridss41_8679o, gridss42_16285h, gridss51_7123h, gridss56_7698h, gridss56_14956o, gridss61_3401o, gridss64_7036o, gridss70_9642h, gridss90_2640h, gridss97_7400h, gridss101_10840o, gridss103_5566h, gridss109_14305h, gridss114_8848o, gridss115_4584o, gridss115_11920h, gridss121_6414h, gridss135_9803o, gridss139_7771o, gridss143_11642h, gridss144_6557o, gridss159_7572h, gridss168_7686h, gridss171_7829o, gridss190_8194h, gridss198_6243h, gridss207_10180h, gridss210_9962h, gridss211_5696h, gridss226_3026o, gridss234_6764o, gridss243_9621o, gridss244_9373h, gridss249_6883o, gridss260_1533h, gridss262_9526o, gridss274_867o, gridss275_7185o, gridss278_3489o, gridss296_5764h, gridss301_2590o, gridss305_1185o, gridss310_1083o, gridss317_12426o”

### All DELs ###

# gridss

n = 8502



      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-122193018       -339       -157     -78092        -43        -17 



### DELs svLen != NA ###

# gridss

n = 8502



      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-122193018       -339       -157     -78092        -43        -17 



### DELs svLen >= min.svLen ###

# gridss

n = 8502



      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-122193018       -339       -157     -78092        -43        -17 


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1, GL000229.1, GL000231.1, GL000210.1, GL000239.1, GL000235.1, GL000201.1, GL000247.1, GL000245.1, GL000197.1, GL000203.1, GL000246.1, GL000249.1, GL000196.1, GL000248.1, GL000244.1, GL000238.1, GL000202.1, GL000234.1, GL000232.1, GL000206.1, GL000240.1, GL000236.1, GL000241.1, GL000243.1, GL000242.1, GL000230.1, GL000237.1, GL000233.1, GL000204.1, GL000198.1, GL000208.1, GL000191.1, GL000227.1, GL000228.1, GL000214.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000211.1, GL000199.1, GL000217.1, GL000216.1, GL000215.1, GL000205.1, GL000219.1, GL000224.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000200.1, GL000193.1, GL000194.1, GL000225.1, GL000192.1, NC_007605
  - in 'y': M
  Make sure to always combine/compare objects based on the same reference
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1

      Min.    1st Qu.     Median       Mean    3rd Qu.       Max. 
-122193018       -338       -154     -79180        -42        -17 


“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': MT, GL000207.1, GL000226.1, GL000229.1, GL000231.1, GL000210.1, GL000239.1, GL000235.1, GL000201.1, GL000247.1, GL000245.1, GL000197.1, GL000203.1, GL000246.1, GL000249.1, GL000196.1, GL000248.1, GL000244.1, GL000238.1, GL000202.1, GL000234.1, GL000232.1, GL000206.1, GL000240.1, GL000236.1, GL000241.1, GL000243.1, GL000242.1, GL000230.1, GL000237.1, GL000204.1, GL000198.1, GL000191.1, GL000227.1, GL000228.1, GL000221.1, GL000209.1, GL000218.1, GL000220.1, GL000213.1, GL000211.1, GL000199.1, GL000216.1, GL000215.1, GL000205.1, GL000223.1, GL000195.1, GL000212.1, GL000222.1, GL000200.1, GL000193.1, GL000192.1, NC_007605
  - in 'y': GL383561.2, JH636052.4, JH806585.1, KE332502.1, NT_113885.1, NT_113901.1, NT_113916.2, NT_113923.1, NT_113930.1, NT_113961.1, NT_167212.1, NT_167213.1, NT_167214.1, NT_167225.1, NT_167226.1, NT_167229.1, NT_167230.1, NT_167234.1, NW_003315903.1, NW_003315911.1, NW_003315912.1, NW

### import merged callset from VCF file

In [15]:
# fix: replace ':' by '_' in ID & SAMPLE fields
vcf.infile <- file.path(base.dir, sample, 'out', '3', 'S3', 'all.vcf')
vcf.outfile <- 'merge.vcf'
cmd <- paste("awk '{if ($1 ~ /^#/){print} else {id=$3; gsub(\":\",\"_\",$3);\
             gsub(id,$3,$10); print}}'", vcf.infile, '>', vcf.outfile)
system(cmd)

In [16]:
vcf <- VariantAnnotation::readVcf(vcf.outfile)
# fix: INFO/CI{END,POS} types: String->Integer
info(header(vcf))$Type[1:2] <- c("Integer", "Integer")
vcf <- vcf[which(info(vcf)$SVTYPE == 'DEL')]  # keep only deletions
vcf <- vcf[which(as.integer(info(vcf)$SUPP) >= min.supp, TRUE)]  # filter calls by support or
#vcf <- vcf[which(info(vcf)$SUPP_VEC == '1101', TRUE)]           # binary vector (MDLG)
# fix: CI{POS,END}} type: CharacterList->IntegerList
info(vcf)$CIPOS <- IntegerList(info(vcf)$CIPOS)
info(vcf)$CIEND <- IntegerList(info(vcf)$CIEND)

“record 2 sample CHM1_CHM13_2: fewer FORMAT fields than GENO fields”
“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000196.1, GL000197.1, GL000200.1, GL000202.1, GL000206.1, GL000209.1, GL000213.1, GL000223.1, GL000227.1, GL000236.1, GL000238.1, GL000239.1, GL000240.1, GL000242.1, GL000243.1, GL000244.1, GL000245.1, GL000246.1, GL000247.1, GL000248.1, GL000249.1, NC_007605
  - in 'y': 1 532077 MantaDEL_13_0_0_0_0_0 ACATTCATGCTCACTCATACACACCCAGATCATATATACACTCGTGCACACATTCACACTCATACACACCCAAATCATACTCACATTCATGCACACATGTT A 327 PASS SUPP=1;SUPP_VEC=1000;SVLEN=-100;SVTYPE=DEL;SVMETHOD=SURVIVOR1.0.6;CHR2=1;END=532177;CIPOS=0,0;CIEND=0,0;STRANDS=+- GT:PSV:LN:DR:ST:QV:TY:ID:RAL:AAL:CO 0/1:NA:100:26,15:+-:327:DEL:MantaDEL_13_0_0_0_0_0:ACATTCATGCTCACTCATACACACCCAGATCATATATACACTCGTGCACACATTCACACTCATACACACCCAAATCATACTCACATTCATGCACACATGTT:A:1_532077-1_532177 ./.:NaN:0:0,0:--:NaN:NaN:NaN:NAN:NAN:NAN ./.:NaN:0:0,0:--:NaN:NaN:NaN:NAN:NAN:NAN ./.:NaN:0:0,0:

### evaluate SV break-points

In [17]:
gr <- breakpointRanges(vcf)
gr <- excludeRegions(gr, excl.gr)
hits <- getHits(gr, true.gr)
callset <- file_path_sans_ext(vcf.outfile)
pm <- getPerfMetrics(callset, hits, n.true)
hits.df <- rbind(hits.df, data.frame(pm))
message('\n### Performance metrics ###')
message('min.supp = ', min.supp, '\n')

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000192.1, GL000193.1, GL000194.1, GL000195.1, GL000196.1, GL000197.1, GL000198.1, GL000199.1, GL000200.1, GL000201.1, GL000202.1, GL000203.1, GL000204.1, GL000205.1, GL000206.1, GL000207.1, GL000208.1, GL000209.1, GL000210.1, GL000211.1, GL000212.1, GL000213.1, GL000214.1, GL000215.1, GL000216.1, GL000217.1, GL000218.1, GL000219.1, GL000220.1, GL000221.1, GL000222.1, GL000223.1, GL000224.1, GL000225.1, GL000226.1, GL000227.1, GL000228.1, GL000229.1, GL000230.1, GL000231.1, GL000232.1, GL000233.1, GL000234.1, GL000235.1, GL000236.1, GL000237.1, GL000238.1, GL000239.1, GL000240.1, GL000241.1, GL000242.1, GL000243.1, GL000244.1, GL000245.1, GL000246.1, GL000247.1, GL000248.1, GL000249.1, MT, NC_007605, 1 532077 MantaDEL_13_0_0_0_0_0 ACATTCATGCTCACTCATACACACCCAGATCATATATACACTCGTGCACACATTCACACTCATACACACCCAAATCATACTCACATTCATGCACACATGTT A 327 PASS SUPP=1;SUPP_VEC=1000;SVLEN=-100;SVTYPE=DEL;SVMETHOD

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000192.1, GL000193.1, GL000194.1, GL000195.1, GL000196.1, GL000197.1, GL000198.1, GL000199.1, GL000200.1, GL000201.1, GL000202.1, GL000203.1, GL000204.1, GL000205.1, GL000206.1, GL000207.1, GL000208.1, GL000209.1, GL000210.1, GL000211.1, GL000212.1, GL000213.1, GL000214.1, GL000215.1, GL000216.1, GL000217.1, GL000218.1, GL000219.1, GL000220.1, GL000221.1, GL000222.1, GL000223.1, GL000224.1, GL000225.1, GL000226.1, GL000227.1, GL000228.1, GL000229.1, GL000230.1, GL000231.1, GL000232.1, GL000233.1, GL000234.1, GL000235.1, GL000236.1, GL000237.1, GL000238.1, GL000239.1, GL000240.1, GL000241.1, GL000242.1, GL000243.1, GL000244.1, GL000245.1, GL000246.1, GL000247.1, GL000248.1, GL000249.1, MT, NC_007605, 1 532077 MantaDEL_13_0_0_0_0_0 ACATTCATGCTCACTCATACACACCCAGATCATATATACACTCGTGCACACATTCACACTCATACACACCCAAATCATACTCACATTCATGCACACATGTT A 327 PASS SUPP=1;SUPP_VEC=1000;SVLEN=-100;SVTYPE=DEL;SVMETHOD

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000192.1, GL000193.1, GL000195.1, GL000196.1, GL000197.1, GL000198.1, GL000199.1, GL000200.1, GL000201.1, GL000202.1, GL000203.1, GL000204.1, GL000205.1, GL000206.1, GL000207.1, GL000209.1, GL000210.1, GL000211.1, GL000212.1, GL000213.1, GL000215.1, GL000216.1, GL000218.1, GL000220.1, GL000221.1, GL000222.1, GL000223.1, GL000226.1, GL000227.1, GL000228.1, GL000229.1, GL000230.1, GL000231.1, GL000232.1, GL000234.1, GL000235.1, GL000236.1, GL000237.1, GL000238.1, GL000239.1, GL000240.1, GL000241.1, GL000242.1, GL000243.1, GL000244.1, GL000245.1, GL000246.1, GL000247.1, GL000248.1, GL000249.1, MT, NC_007605, 1 532077 MantaDEL_13_0_0_0_0_0 ACATTCATGCTCACTCATACACACCCAGATCATATATACACTCGTGCACACATTCACACTCATACACACCCAAATCATACTCACATTCATGCACACATGTT A 327 PASS SUPP=1;SUPP_VEC=1000;SVLEN=-100;SVTYPE=DEL;SVMETHOD=SURVIVOR1.0.6;CHR2=1;END=532177;CIPOS=0,0;CIEND=0,0;STRANDS=+- GT:PSV:LN:DR:ST:QV:TY:ID:RAL:AAL

“Each of the 2 combined objects has sequence levels not in the other:
  - in 'x': GL000191.1, GL000192.1, GL000193.1, GL000195.1, GL000196.1, GL000197.1, GL000198.1, GL000199.1, GL000200.1, GL000201.1, GL000202.1, GL000203.1, GL000204.1, GL000205.1, GL000206.1, GL000207.1, GL000209.1, GL000210.1, GL000211.1, GL000212.1, GL000213.1, GL000215.1, GL000216.1, GL000218.1, GL000220.1, GL000221.1, GL000222.1, GL000223.1, GL000226.1, GL000227.1, GL000228.1, GL000229.1, GL000230.1, GL000231.1, GL000232.1, GL000234.1, GL000235.1, GL000236.1, GL000237.1, GL000238.1, GL000239.1, GL000240.1, GL000241.1, GL000242.1, GL000243.1, GL000244.1, GL000245.1, GL000246.1, GL000247.1, GL000248.1, GL000249.1, MT, NC_007605, 1 532077 MantaDEL_13_0_0_0_0_0 ACATTCATGCTCACTCATACACACCCAGATCATATATACACTCGTGCACACATTCACACTCATACACACCCAAATCATACTCACATTCATGCACACATGTT A 327 PASS SUPP=1;SUPP_VEC=1000;SVLEN=-100;SVTYPE=DEL;SVMETHOD=SURVIVOR1.0.6;CHR2=1;END=532177;CIPOS=0,0;CIEND=0,0;STRANDS=+- GT:PSV:LN:DR:ST:QV:TY:ID:RAL:AAL


### Performance metrics ###

min.supp = 3




### write CSV file with performance metrics

In [18]:
hits.df

callset,n,tp,fp,fn,precision,recall
<fct>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
manta,8838,6516,2322,60896,73.7,9.7
delly,26806,6966,19840,60446,26.0,10.3
lumpy,0,0,0,67412,,0.0
gridss,8372,5142,3230,62270,61.4,7.6
merge,3750,3534,216,63878,94.2,5.2


In [19]:
csv.file <- paste0('metrics_', sample, '_', ts, '.csv')
write.table(hits.df, file=csv.file, row.names=FALSE, col.names=TRUE,
            quote=FALSE, sep=',')