In [26]:
include("../src/ProcessSequence.jl")



Main.ProcessSequence

In [27]:
# Scratch check: `string` concatenates its arguments into a single String.
string("hello", "_", "world")

"hello_world"

In [28]:
# NOTE: this binds the bare name `parse` to JSON's parser for the rest of the session.
using JSON: parse

# Sample handled by this notebook.
sample_name = "test_high_quality"

# Project layout; the project root is the parent of `@__DIR__`.
project_dir = dirname(@__DIR__)
input_dir = joinpath(project_dir, "input")
output_dir = joinpath(project_dir, "output")
reference_dir = joinpath(input_dir, "reference")
sample_dir = joinpath(input_dir, sample_name)

# Absolute path at which the snpEff jar is expected.
snpeff_jar = "/opt/snpeff/snpEff/snpEff.jar"

# Project settings, parsed from project.json in the project root.
project_json = open(joinpath(project_dir, "project.json")) do io
    parse(read(io, String))
end

Dict{String,Any} with 5 entries:
  "gb_memory"           => 40
  "n_job"               => 8
  "dna_is_targeted"     => false
  "germ_dna.2.fastq.gz" => "input/test_high_quality/germ_dna.2.1m.fastq.gz"
  "germ_dna.1.fastq.gz" => "input/test_high_quality/germ_dna.1.1m.fastq.gz"

In [29]:
# Unpack `<reference_dir>.zip` into the input directory when the reference directory
# does not exist yet (`-o` overwrites existing files without prompting).
if !isdir(reference_dir)
    ProcessSequence.print_and_run_cmd(`unzip -o -d $(input_dir) $(reference_dir).zip`)
end

In [30]:
# Stop early when the snpEff jar is not present at the configured path.
if !isfile(snpeff_jar)
    # Interpolate `snpeff_jar` (the variable that was actually checked) so the failure
    # names the missing path; `error` raises an ErrorException carrying this message.
    error("$snpeff_jar is missing.")
end

In [31]:
# Run the package's program check; the recorded output shows one `which` lookup per
# external tool.
ProcessSequence.check_program()

Checking program...
`[4mwhich[24m [4mskewer[24m`
/opt/conda/bin/skewer
`[4mwhich[24m [4mfastqc[24m`
/opt/conda/bin/fastqc
`[4mwhich[24m [4mbgzip[24m`
/opt/conda/bin/bgzip
`[4mwhich[24m [4mtabix[24m`
/opt/conda/bin/tabix
`[4mwhich[24m [4mminimap2[24m`
/opt/conda/bin/minimap2
`[4mwhich[24m [4msamtools[24m`
/opt/conda/bin/samtools
`[4mwhich[24m [4mbcftools[24m`
/opt/conda/bin/bcftools
`[4mwhich[24m [4mkallisto[24m`
/opt/conda/bin/kallisto
`[4mbash[24m [4m-c[24m [4m'source activate py2 && which configManta.py'[24m`
/opt/conda/envs/py2/bin/configManta.py
`[4mbash[24m [4m-c[24m [4m'source activate py2 && which configureStrelkaGermlineWorkflow.py'[24m`
/opt/conda/envs/py2/bin/configureStrelkaGermlineWorkflow.py
`[4mbash[24m [4m-c[24m [4m'source activate py2 && which configureStrelkaSomaticWorkflow.py'[24m`
/opt/conda/envs/py2/bin/configureStrelkaSomaticWorkflow.py


In [32]:
"""
    print_and_run_command(cmd::Base.AbstractCmd)

Print `cmd` to standard output, then execute it with `run` and return whatever `run`
returns.
"""
function print_and_run_command(cmd::Base.AbstractCmd)
    println(stdout, cmd)
    return run(cmd)
end

print_and_run_command (generic function with 1 method)

In [33]:
# Trailing positional arguments that are splatted into the DNA processing calls:
# three reference file paths, then job count, memory (GB), 20, and the snpEff jar path.
# NOTE(review): 20 looks like the end-quality trimming threshold (the skewer command in
# the recorded output shows `--end-quality 20`) — confirm.
process_dna_arguments = (
    (
        joinpath(reference_dir, file_name) for file_name in (
            "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz",
            "chromosome.bed.gz",
            "chrn_n.tsv",
        )
    )...,
    project_json["n_job"],
    project_json["gb_memory"],
    20,
    snpeff_jar,
)

("/home/jovyan/ProcessSequence.jl/input/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz", "/home/jovyan/ProcessSequence.jl/input/reference/chromosome.bed.gz", "/home/jovyan/ProcessSequence.jl/input/reference/chrn_n.tsv", 8, 40, 20, "/opt/snpeff/snpEff/snpEff.jar")

In [34]:
# Process the germline DNA reads only when project.json lists both FASTQ entries.
let fastq_keys = ("germ_dna.1.fastq.gz", "germ_dna.2.fastq.gz")
    if all(haskey(project_json, key) for key in fastq_keys)
        ProcessSequence.process_germ_dna(
            # FASTQ paths in project.json are joined onto the project root.
            (joinpath(project_dir, project_json[key]) for key in fastq_keys)...,
            project_json["dna_is_targeted"],
            joinpath(output_dir, "process_germ_dna_" * sample_name),
            process_dna_arguments...,
        )
    end
end

Started analysis of germ-trimmed-pair1.fastq.gz
Started analysis of germ-trimmed-pair2.fastq.gz
Approx 5% complete for germ-trimmed-pair1.fastq.gz
Approx 5% complete for germ-trimmed-pair2.fastq.gz
Approx 10% complete for germ-trimmed-pair1.fastq.gz
Approx 10% complete for germ-trimmed-pair2.fastq.gz
Approx 15% complete for germ-trimmed-pair1.fastq.gz
Approx 15% complete for germ-trimmed-pair2.fastq.gz
Approx 20% complete for germ-trimmed-pair1.fastq.gz
Approx 20% complete for germ-trimmed-pair2.fastq.gz
Approx 25% complete for germ-trimmed-pair1.fastq.gz
Approx 25% complete for germ-trimmed-pair2.fastq.gz
Approx 30% complete for germ-trimmed-pair1.fastq.gz
Approx 30% complete for germ-trimmed-pair2.fastq.gz
Approx 35% complete for germ-trimmed-pair1.fastq.gz
Approx 35% complete for germ-trimmed-pair2.fastq.gz
Approx 40% complete for germ-trimmed-pair1.fastq.gz
Approx 45% complete for germ-trimmed-pair1.fastq.gz
Approx 40% complete for germ-trimmed-pair2.fastq.gz
Approx 50% complete fo

(2020-12-05T20:11:29.284) Trimming sequence ...
`[4mskewer[24m [4m--threads[24m [4m8[24m [4m-x[24m [4mAGATCGGAAGAGC[24m [4m--end-quality[24m [4m20[24m [4m--mode[24m [4mpe[24m [4m--compress[24m [4m--output[24m [4m/home/jovyan/ProcessSequence.jl/output/process_germ_dna_test_high_quality/trim_sequence/germ[24m [4m--quiet[24m [4m/home/jovyan/ProcessSequence.jl/input/test_high_quality/germ_dna.1.1m.fastq.gz[24m [4m/home/jovyan/ProcessSequence.jl/input/test_high_quality/germ_dna.2.1m.fastq.gz[24m`
.--. .-.
: .--': :.-.
`. `. : `'.' .--. .-..-..-. .--. .--.
_`, :: . `.' '_.': `; `; :' '_.': ..'
`.__.':_;:_;`.__.'`.__.__.'`.__.':_;
skewer v0.2.2 [April 4, 2016]
Parameters used:
-- 3' end adapter sequence (-x):[0;33m	AGATCGGAAGAGC
[0m-- maximum error ratio allowed (-r):	0.100
-- maximum indel error ratio allowed (-d):	0.030
-- end quality threshold (-q):		20
-- minimum read length allowed after trimming (-l):	18
-- file format (-f):		Sanger/Illumina 1.8+ FASTQ (a

[M::worker_pipeline::16.460*1.46] mapped 2 sequences
[M::worker_pipeline::16.463*1.46] mapped 2 sequences
[M::worker_pipeline::16.463*1.46] mapped 2 sequences
[M::worker_pipeline::16.464*1.46] mapped 2 sequences
[M::worker_pipeline::16.465*1.46] mapped 2 sequences
[M::worker_pipeline::16.466*1.46] mapped 2 sequences
[M::worker_pipeline::16.467*1.46] mapped 2 sequences
[M::worker_pipeline::16.467*1.46] mapped 2 sequences
[M::worker_pipeline::16.468*1.46] mapped 2 sequences
[M::worker_pipeline::16.469*1.46] mapped 2 sequences
[M::worker_pipeline::16.469*1.46] mapped 2 sequences
[M::worker_pipeline::16.470*1.46] mapped 2 sequences
[M::worker_pipeline::16.471*1.46] mapped 2 sequences
[M::worker_pipeline::16.472*1.46] mapped 2 sequences
[M::worker_pipeline::16.472*1.46] mapped 2 sequences
[M::worker_pipeline::16.473*1.46] mapped 2 sequences
[M::worker_pipeline::16.474*1.46] mapped 2 sequences
[M::worker_pipeline::16.474*1.46] mapped 2 sequences
[M::worker_pipeline::16.475*1.46] mapped 2 seq

LoadError: failed process: Process(`java -Xmx40g -jar /opt/snpeff/snpEff/snpEff.jar GRCh38.99 -noLog -verbose -csvStats /home/jovyan/ProcessSequence.jl/output/process_germ_dna_test_high_quality/find_variant/snpeff/stats.csv -htmlStats /home/jovyan/ProcessSequence.jl/output/process_germ_dna_test_high_quality/find_variant/snpeff/stats.html /home/jovyan/ProcessSequence.jl/output/process_germ_dna_test_high_quality/find_variant/concat.vcf.gz`, ProcessExited(255)) [255]


In [10]:
# Process germline + somatic DNA only when project.json lists all four FASTQ entries.
# NOTE(review): unlike the germline cell, the output directory here is not suffixed with
# `sample_name` — confirm that is intended.
let fastq_keys = (
        "germ_dna.1.fastq.gz",
        "germ_dna.2.fastq.gz",
        "soma_dna.1.fastq.gz",
        "soma_dna.2.fastq.gz",
    )
    if all(haskey(project_json, key) for key in fastq_keys)
        ProcessSequence.process_soma_dna(
            # FASTQ paths in project.json are joined onto the project root, in key order.
            (joinpath(project_dir, project_json[key]) for key in fastq_keys)...,
            project_json["dna_is_targeted"],
            joinpath(output_dir, "process_soma_dna"),
            process_dna_arguments...,
        )
    end
end

In [None]:
# Process the somatic RNA reads only when project.json lists both FASTQ entries.
# NOTE(review): unlike the germline cell, the output directory here is not suffixed with
# `sample_name` — confirm that is intended.
let fastq_keys = ("soma_rna.1.fastq.gz", "soma_rna.2.fastq.gz")
    if all(haskey(project_json, key) for key in fastq_keys)
        ProcessSequence.process_soma_rna(
            # FASTQ paths in project.json are joined onto the project root.
            (joinpath(project_dir, project_json[key]) for key in fastq_keys)...,
            joinpath(output_dir, "process_soma_rna"),
            joinpath(reference_dir, "Homo_sapiens.GRCh38.cdna.all.fa.gz"),
            project_json["n_job"],
        )
    end
end