In [1]:
using JuliaDB
using IndexedTables
using Dagger
using VCFTool

┌ Info: Precompiling VCFTool [8f8d80d9-e796-45a3-8ef2-c0461a3110a4]
└ @ Base loading.jl:1278


In [3]:
# Directory holding the input files, relative to the current working directory.
input_dir = "../input/"

# Full call set: the gzip-named VCF and the path used for its Dagger-serialized table.
vcf_738_file_path, vcf_738_dagger_file_path =
    joinpath.(input_dir, ("738_variants.vcf.gz", "738_variants.vcf.dagger"))

# Companion pair with the `_1000` suffix (presumably a 1000-record subset — TODO confirm).
vcf_738_1000_file_path, vcf_738_1000_dagger_file_path =
    joinpath.(input_dir, ("738_variants_1000.vcf.gz", "738_variants_1000.vcf.dagger"))

# File actually loaded and benchmarked by the cells that follow.
vcf_file_path_to_use = vcf_738_file_path;

In [4]:
# Build the table used as the "loaded directly from the VCF" side of the benchmarks.
# `make_vcf_indexedtable` is not defined in this notebook (presumably exported by
# VCFTool — TODO confirm). The trailing `;` suppresses display of the result.
vcf_table = make_vcf_indexedtable(vcf_file_path_to_use);

In [15]:
# Second in-memory representation of the same file, used in the NDSparse vs.
# IndexedTable comparison. `make_vcf_ndsparse` is not defined in this notebook
# (presumably exported by VCFTool and returning an NDSparse — TODO confirm).
vcf_ndsparse = make_vcf_ndsparse(vcf_file_path_to_use);

In [9]:
# Write `vcf_table` to the `.dagger` path so it can be read back with `Dagger.load`.
Dagger.save(vcf_table, vcf_738_dagger_file_path);

In [10]:
# Read the saved table back from the `.dagger` path; this is the "loaded from Dagger"
# side of the first benchmark.
vcf_table_dagger = Dagger.load(vcf_738_dagger_file_path);

In [6]:
# Turn off IndexedTables' compact display (presumably so columns are printed in full
# rather than abbreviated — TODO confirm against the IndexedTables docs).
IndexedTables.set_show_compact!(false);

# Last expression of the cell with no `;`, so the table itself is displayed.
vcf_table

Table with 5217068 rows, 10 columns:
CHROM   POS       ID   REF    ALT      QUAL   FILTER                                INFO                                                   FORMAT                                     GERM
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
"chr1"  10439     "."  "AC"   "A"      "72"   "PASS"                                "CIGAR=1M1D;RU=C;REFREP=4;IDREP=3;MQ=9"                "GT:GQ:GQX:DPI:AD:ADF:ADR:FT:PL"           "0/1:31:5:7:2,5:0,1:2,4:PASS:108,0,28"
"chr1"  13284     "."  "G"    "A"      "60"   "LowGQX;NoPassedVariantGTs"           "SNVHPOL=4;MQ=17"                                      "GT:GQ:GQX:DP:DPF:AD:ADF:ADR:SB:FT:PL"     "0/1:93:1:30:1:19,11:12,8:7,3:-9.2:LowGQX:95,0,157"
"chr1"  13868     "."  "A"    "G"      "1"    "LowGQX;LowD

# Load directly from VCF vs. load from Dagger file

Here we compare how long it takes to access a particular variant and a chromosomal region in a regular JuliaDB IndexedTable that was loaded directly from a VCF file versus the same table after it was saved with `Dagger.save` and loaded back here with `Dagger.load`. Using Dagger to save and load a VCF adds another step and another dependency to data prep, so this speed test will tell us whether it's worth it.

In [7]:
# Chromosome used by every benchmark query in this notebook.
benchmark_chrom = "chr1"

# POS value looked up by the single-variant benchmark.
benchmark_variant = 13_868

# Lower and upper POS bounds for the region benchmark.
benchmark_region_start, benchmark_region_end = 10_000, 700_000;

## Variant

In [8]:
# Single-variant lookup on the table loaded directly from the VCF: keep rows whose
# CHROM and POS both match the benchmark values.
# NOTE(review): this is the first `filter` call timed in the session, so the reported
# time likely includes JIT compilation; re-run the cell before comparing it with the
# Dagger-loaded timing. The predicate also reads non-const globals.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant), vcf_table);

  9.684306 seconds (95.35 M allocations: 5.512 GiB, 9.04% gc time)


In [11]:
# Same single-variant predicate, applied to the table read back with `Dagger.load`.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant), vcf_table_dagger);

  6.444005 seconds (88.75 M allocations: 5.148 GiB, 17.71% gc time)


## Region

In [12]:
# Region lookup on the directly loaded table: rows on `benchmark_chrom` with POS
# strictly between the two bounds (both endpoints are excluded).
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS > benchmark_region_start) && (i.POS < benchmark_region_end), vcf_table);

  5.696883 seconds (88.76 M allocations: 5.149 GiB, 14.29% gc time)


In [13]:
# Same region predicate (endpoints excluded), applied to the Dagger-loaded table.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS > benchmark_region_start) && (i.POS < benchmark_region_end), vcf_table_dagger);

  5.710947 seconds (88.76 M allocations: 5.149 GiB, 14.23% gc time)


# NDSparse vs. IndexedTable vs. tabix

In [14]:
# POS value looked up by the second single-variant benchmark.
benchmark_variant_2 = 19_322

# Lower POS bound of the second region benchmark.
benchmark_region_2_start = 500_000

# Upper POS bound; as the cell's last expression (no `;`) its value is displayed.
benchmark_region_2_end = 800_000

800000

## Variant

In [16]:
# Single-variant lookup on the NDSparse representation, using a row predicate on
# CHROM and POS.
# NOTE(review): `filter` with a predicate is not an indexed lookup; if CHROM/POS are
# the index columns, direct indexing may be the fairer comparison against tabix —
# TODO confirm.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant_2), vcf_ndsparse);

  6.410873 seconds (84.95 M allocations: 5.303 GiB, 20.67% gc time)


In [17]:
# Same single-variant predicate, applied to the IndexedTable loaded from the VCF.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS == benchmark_variant_2), vcf_table);

  6.471574 seconds (88.76 M allocations: 5.148 GiB, 12.02% gc time)


In [None]:
# Baseline for the two filter timings: ask the external `tabix` command for the
# one-position region `chrom:pos-pos` on the same file. Requires `tabix` on PATH;
# its output is not captured here, and the cell has no `;`, so the value returned
# by `run` is displayed.
@time run(`tabix $(vcf_file_path_to_use) $(benchmark_chrom):$(benchmark_variant_2)-$(benchmark_variant_2)`)

## Region

In [None]:
# Region lookup on the NDSparse representation. The bounds are inclusive (>= / <=) so
# that the rows selected match the closed `start-end` interval that tabix uses for the
# same region in the last cell of this section; with strict bounds the three
# benchmarks would not be querying the same set of records.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS >= benchmark_region_2_start) && (i.POS <= benchmark_region_2_end), vcf_ndsparse);

In [None]:
# Same inclusive region predicate, applied to the IndexedTable loaded from the VCF.
@time filter(i -> (i.CHROM == benchmark_chrom) && (i.POS >= benchmark_region_2_start) && (i.POS <= benchmark_region_2_end), vcf_table);

In [None]:
# Baseline: the external `tabix` command on the same file and region. tabix treats
# `chrom:start-end` as an inclusive interval. Requires `tabix` on PATH; its output is
# not captured here.
@time run(`tabix $vcf_file_path_to_use $benchmark_chrom:$benchmark_region_2_start-$benchmark_region_2_end`)