In [1]:
using JuliaDB
using IndexedTables
using Dagger
using VCFTool

In [2]:
# Directory that holds every input file referenced in this notebook.
input_dir = "../input/"

# Full 738 variant set: bgzipped VCF and its Dagger export.
vcf_738_file_path, vcf_738_dagger_file_path =
    joinpath.(input_dir, ("738_variants.vcf.gz", "738_variants.vcf.dagger"))

# The "_1000" variant set: bgzipped VCF and its Dagger export.
vcf_738_1000_file_path, vcf_738_1000_dagger_file_path =
    joinpath.(input_dir, ("738_variants_1000.vcf.gz", "738_variants_1000.vcf.dagger"))

# Files actually used by the benchmarks below; switch both together.
vcf_file_path_to_use, vcf_dagger_path_to_use =
    vcf_738_1000_file_path, vcf_738_1000_dagger_file_path;

In [3]:
# Build the three in-memory representations that the benchmarks compare.
# The order is kept as-is so any progress messages appear in the same sequence.

# IndexedTable built from the bgzipped VCF.
vcf_table = vcf_file_path_to_use |> make_vcf_indexedtable

# Table restored from the Dagger file.
vcf_table_dagger = vcf_dagger_path_to_use |> Dagger.load

# NDSparse built from the same bgzipped VCF.
vcf_ndsparse = vcf_file_path_to_use |> make_vcf_ndsparse;

Loading Dagger file...

Dagger file loaded.


In [17]:
# Chromosome under test, as the VCF string label and as its integer form.
benchmark_chrom, benchmark_chrom_int = "chr1", 1

# Positions used for the single-variant lookups.
benchmark_variant, benchmark_variant_2 = 13_868, 19_322

# First query window.
benchmark_region_start, benchmark_region_end = 10_000, 900_000

# Second query window.
benchmark_region_2_start, benchmark_region_2_end = 500_000, 800_000;

# NDSparse vs. IndexedTable vs. tabix

Past tests show tabix takes about 0.01 seconds to complete a query, about 10x faster than either NDSparse or IndexedTable. On the first query, IndexedTable takes a few seconds. On all subsequent queries, both NDSparse and IndexedTable take about 0.1 seconds.

## Variant

In [8]:
# Time a single-position lookup against the NDSparse representation.
@time filter(vcf_ndsparse) do record
    record.CHROM == benchmark_chrom && record.POS == benchmark_variant
end;

  0.113088 seconds (80.85 k allocations: 4.349 MiB)


In [9]:
# Time the same single-position lookup against the IndexedTable.
@time filter(vcf_table) do record
    record.CHROM == benchmark_chrom && record.POS == benchmark_variant
end;

  0.104540 seconds (81.84 k allocations: 4.336 MiB)


In [10]:
# Time the equivalent lookup with the external tabix CLI; the region is
# "<chrom>:<pos>-<pos>", so start and end are the same position.
@time run(Cmd([
    "tabix",
    vcf_file_path_to_use,
    "$benchmark_chrom:$benchmark_variant-$benchmark_variant",
]))

chr1	13868	.	A	G	1	LowGQX;LowDepth;NoPassedVariantGTs	SNVHPOL=3;MQ=4	GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL	0/1:3:0:1:0:0,1:0,0:0,1:0.0:LowGQX;LowDepth:28,3,0
  0.010808 seconds (96 allocations: 4.516 KiB)


Process(`[4mtabix[24m [4m../input/738_variants_1000.vcf.gz[24m [4mchr1:13868-13868[24m`, ProcessExited(0))

## Region

In [None]:
# Time a region query against the IndexedTable.
# The bounds are inclusive so this selects the same interval as the tabix
# query it is compared with: tabix treats "chrom:beg-end" as a closed
# interval, whereas strict `>` / `<` would drop records at either endpoint.
@time filter(vcf_table) do record
    record.CHROM == benchmark_chrom &&
        benchmark_region_2_start <= record.POS <= benchmark_region_2_end
end;

In [None]:
# Time the region query with the external tabix CLI, using a
# "<chrom>:<start>-<end>" region built from the second query window.
@time run(Cmd([
    "tabix",
    vcf_file_path_to_use,
    "$benchmark_chrom:$benchmark_region_2_start-$benchmark_region_2_end",
]))

# IndexedTable: indexed by ints vs. strings

Past tests show using chromosome ints is a few milliseconds faster.

In [None]:
# Convert chromosome strings (e.g. "chr1", "chr10") to chromosome integers.

# First column of the table; assumed to be CHROM — TODO confirm column order.
chromosomes_string = Array(columns(vcf_table)[1])

# Strip the leading "chr" and parse everything that remains. Splitting on ""
# yields single characters, so only the last digit was parsed and "chr10"
# became 0. The typed comprehension also produces a Vector{Int} rather than a
# Vector{Any}, so the integer-keyed table is not penalised by boxed values.
# Non-numeric labels such as "chrX" make `parse` throw an ArgumentError.
chromosomes_int = Int[
    parse(Int, replace(item, r"^chr" => "")) for item in chromosomes_string
]

# Same table with the CHROM column replaced by its integer form.
vcf_table_chrom_int = IndexedTables.transform(vcf_table, :CHROM => chromosomes_int);

## Variant

In [None]:
# Baseline: single-position lookup with CHROM compared as a string.
@time filter(vcf_table) do record
    record.CHROM == benchmark_chrom && record.POS == benchmark_variant
end;

In [None]:
# Same single-position lookup with CHROM compared as an integer.
@time filter(vcf_table_chrom_int) do record
    record.CHROM == benchmark_chrom_int && record.POS == benchmark_variant
end;

## Region

In [None]:
# Baseline: open-interval region query with CHROM compared as a string.
@time filter(vcf_table) do record
    record.CHROM == benchmark_chrom &&
        record.POS > benchmark_region_start &&
        record.POS < benchmark_region_end
end;

In [None]:
# Same open-interval region query with CHROM compared as an integer.
@time filter(vcf_table_chrom_int) do record
    record.CHROM == benchmark_chrom_int &&
        record.POS > benchmark_region_start &&
        record.POS < benchmark_region_end
end;

# IndexedTable: load from VCF vs. load from Dagger file

`Dagger.load` is well documented to be faster than `loadtable()`, which loads a table from a regular text file. Here, however, we test whether the loading method makes a difference to query speed once the data object has been loaded.

Past tests show query speeds are very similar, within 0.01 seconds of each other.

## Variant

In [11]:
# Single-position lookup on the table built directly from the VCF file.
@time filter(vcf_table) do record
    record.CHROM == benchmark_chrom && record.POS == benchmark_variant
end;

  0.112802 seconds (81.84 k allocations: 4.336 MiB)


In [12]:
# Same single-position lookup on the table restored from the Dagger file.
@time filter(vcf_table_dagger) do record
    record.CHROM == benchmark_chrom && record.POS == benchmark_variant
end;

  0.104337 seconds (81.83 k allocations: 4.337 MiB)


## Region

In [20]:
# Open-interval region query on the table built directly from the VCF file.
@time filter(vcf_table) do record
    record.CHROM == benchmark_chrom &&
        record.POS > benchmark_region_start &&
        record.POS < benchmark_region_end
end;

  0.109939 seconds (96.15 k allocations: 5.516 MiB)


In [21]:
# Same open-interval region query on the table restored from the Dagger file.
@time filter(vcf_table_dagger) do record
    record.CHROM == benchmark_chrom &&
        record.POS > benchmark_region_start &&
        record.POS < benchmark_region_end
end;

  0.104267 seconds (96.15 k allocations: 5.516 MiB)
