In [1]:
using JuliaDB
using IndexedTables
using Dagger
using VCFTool

In [2]:
input_dir = "../input/"

vcf_738_1000_file_path = joinpath(input_dir, "738_variants_1000.vcf.gz")

vcf_738_1000_dagger_file_path = joinpath(input_dir, "738_variants_1000.vcf.dagger")

vcf_file_path_to_use = vcf_738_1000_file_path;

In [3]:
vcf_table = make_vcf_indexedtable(vcf_file_path_to_use);

In [4]:
vcf_ndsparse = make_vcf_ndsparse(vcf_file_path_to_use);

In [5]:
Dagger.save(vcf_table, vcf_738_1000_dagger_file_path);

In [6]:
vcf_table_dagger = Dagger.load(vcf_738_1000_dagger_file_path);

# Dagger load v.s. load from file

Here we compare how long it takes to access a particular variant and a chromosomal region in a regular JuliaDB IndexedTable that was loaded directly from a file versus an index file that was created via `Dagger.save` and loaded here via `Dagger.load`. Using Dagger to save and load a VCF adds another step and dependency to data prep, so this speed test will tell if its worth it.

In [7]:
benchmark_chrom = 1

benchmark_variant = 13868

benchmark_region_start = 10000

benchmark_region_end = 100000;

## Variant

In [8]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant), vcf_table);

  4.254988 seconds (6.90 M allocations: 385.777 MiB, 1.91% gc time)


In [9]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant), vcf_table_dagger);

  0.096760 seconds (80.36 k allocations: 4.270 MiB)


## Region

In [16]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS > benchmark_region_start) && (i.POS < benchmark_region_end), vcf_table);

  0.109644 seconds (82.36 k allocations: 4.416 MiB)


In [17]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS > benchmark_region_start) && (i.POS < benchmark_region_end), vcf_table_dagger);

  0.118078 seconds (82.36 k allocations: 4.417 MiB, 10.17% gc time)


# Access data from NDSParse v.s. IndexedTable v.s. tabix

In [23]:
benchmark_variant_2 = 19322

benchmark_region_2_start = 500000

benchmark_region_2_end = 800000

800000

## Variant

In [24]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant_2), vcf_ndsparse);

  0.120605 seconds (80.85 k allocations: 4.349 MiB)


In [25]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant_2), vcf_table);

  0.109018 seconds (80.36 k allocations: 4.270 MiB)


In [26]:
@time run(`tabix ../input/738_variants_1000.vcf.gz $benchmark_chrom:$benchmark_variant_2-$benchmark_variant_2`)

  0.010803 seconds (86 allocations: 4.156 KiB)


Process(`[4mtabix[24m [4m../input/738_variants_1000.vcf.gz[24m [4m1:19322-19322[24m`, ProcessExited(0))

## Region

In [27]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS > benchmark_region_2_start) && (i.POS < benchmark_region_2_end), vcf_ndsparse);

  0.122093 seconds (81.72 k allocations: 4.392 MiB)


In [28]:
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS > benchmark_region_2_start) && (i.POS < benchmark_region_2_end), vcf_table);

  0.107274 seconds (84.41 k allocations: 4.588 MiB)


In [29]:
@time run(`tabix ../input/738_variants_1000.vcf.gz $benchmark_chrom:$benchmark_region_2_start-$benchmark_region_2_end`)

  0.010713 seconds (86 allocations: 4.156 KiB)


Process(`[4mtabix[24m [4m../input/738_variants_1000.vcf.gz[24m [4m1:500000-800000[24m`, ProcessExited(0))