In [1]:
# Name of the sample this notebook processes; used below to build `sample_dir`.
sample_name = "HG002_cat"
# Load the project's code from the sibling directory. The recorded cell output
# shows this evaluates to the module `Main.analyze_sequence`.
include("../analyze_sequence/analyze_sequence.jl")

Main.analyze_sequence

In [2]:
# Project root: the parent of the directory that `@__DIR__` reports for this notebook.
project_dir = dirname(@__DIR__)

"/home/jovyan/analyze_sequence"

In [3]:
# Input locations, all rooted at the project directory.
input_dir = joinpath(project_dir, "input")
reference_dir = joinpath(input_dir, "reference")
sample_dir = joinpath(input_dir, sample_name)

# Directory under which the processing steps below place their results.
output_dir = joinpath(project_dir, "output")

# snpEff jar; later passed along with the other DNA-processing arguments.
snpeff_jar = joinpath(project_dir, "tool/snpEff/snpEff.jar")

using JSON: parse

# Read the project configuration (`project.json` in the project root) into a Dict.
project_json = open(joinpath(project_dir, "project.json")) do io
    parse(read(io, String))
end

Dict{String,Any} with 5 entries:
  "gb_memory"           => 50
  "n_job"               => 2
  "dna_is_targeted"     => false
  "germ_dna.2.fastq.gz" => "input/HG002_cat/HG002_R2.fastq.gz"
  "germ_dna.1.fastq.gz" => "input/HG002_cat/HG002_R1.fastq.gz"

In [4]:
# Extract the reference archive only when the reference directory does not exist yet.
if !isdir(reference_dir)

    # `$reference_dir.zip` interpolates `reference_dir` and appends ".zip";
    # `-o` overwrites existing files and `-d` sets the extraction directory.
    analyze_sequence.print_and_run_cmd(`unzip -o -d $input_dir $reference_dir.zip`)
    
end

In [5]:
# Fail early when the snpEff jar is absent; it is passed to the DNA-processing
# steps below as part of `process_dna_arguments`.
if !isfile(snpeff_jar)

    # Interpolate the variable that actually exists (`snpeff_jar`); the previous
    # `$snpeff` was undefined and raised an UndefVarError instead of this message.
    # `error` raises an ErrorException rather than throwing a bare String.
    error("$snpeff_jar is missing.")

end

In [6]:
# Run the package's program check. Per the recorded output below, it runs `which`
# for each external tool (some inside the `py2` conda environment) and prints the path.
analyze_sequence.check_program()

Checking program...
`[4mwhich[24m [4mskewer[24m`
/opt/conda/bin/skewer
`[4mwhich[24m [4mfastqc[24m`
/opt/conda/bin/fastqc
`[4mwhich[24m [4mbgzip[24m`
/opt/conda/bin/bgzip
`[4mwhich[24m [4mtabix[24m`
/opt/conda/bin/tabix
`[4mwhich[24m [4mminimap2[24m`
/opt/conda/bin/minimap2
`[4mwhich[24m [4msamtools[24m`
/opt/conda/bin/samtools
`[4mwhich[24m [4mbcftools[24m`
/opt/conda/bin/bcftools
`[4mwhich[24m [4mkallisto[24m`
/opt/conda/bin/kallisto
`[4mbash[24m [4m-c[24m [4m'source activate py2 && which configManta.py'[24m`
/opt/conda/envs/py2/bin/configManta.py
`[4mbash[24m [4m-c[24m [4m'source activate py2 && which configureStrelkaGermlineWorkflow.py'[24m`
/opt/conda/envs/py2/bin/configureStrelkaGermlineWorkflow.py
`[4mbash[24m [4m-c[24m [4m'source activate py2 && which configureStrelkaSomaticWorkflow.py'[24m`
/opt/conda/envs/py2/bin/configureStrelkaSomaticWorkflow.py


In [7]:
"""
    print_and_run_command(cmd::Base.AbstractCmd)

Print `cmd` to standard output, then execute it with `run` and return whatever
`run` returns. Any exception raised by `run` propagates to the caller.
"""
function print_and_run_command(cmd::Base.AbstractCmd)
    println(stdout, cmd)
    return run(cmd)
end

print_and_run_command (generic function with 1 method)

In [8]:
# Trailing positional arguments shared by the DNA-processing calls below
# (splatted as `process_dna_arguments...`).
process_dna_arguments = (
    # Three files resolved inside the reference directory, in this order.
    (
        joinpath(reference_dir, file_name) for file_name in (
            "GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz",
            "chromosome.bed.gz",
            "chrn_n.tsv",
        )
    )...,
    project_json["n_job"],
    project_json["gb_memory"],
    # NOTE(review): presumably the quality threshold echoed by "trim_pe_q_20" in the
    # output directory name — confirm against the processing function's signature.
    20,
    snpeff_jar,
)

("/home/jovyan/analyze_sequence/input/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.gz", "/home/jovyan/analyze_sequence/input/reference/chromosome.bed.gz", "/home/jovyan/analyze_sequence/input/reference/chrn_n.tsv", 2, 50, 20, "/home/jovyan/analyze_sequence/tool/snpEff/snpEff.jar")

In [9]:
# Create the `py2` conda environment (Python 2.7), then install strelka and manta
# into it from the bioconda channel.
let environment = "py2"

    print_and_run_command(`conda create --name $environment --yes python=2.7`)

    print_and_run_command(
        `conda install --name $environment --channel bioconda --yes strelka manta`,
    )

end

`[4mconda[24m [4mcreate[24m [4m--name[24m [4mpy2[24m [4m--yes[24m [4mpython=2.7[24m`
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... failed with repodata from current_repodata.json, will retry with next repodata source.
Collecting package metadata (repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /opt/conda/envs/py2

  added / updated specs:
    - python=2.7


The following NEW packages will be INSTALLED:

  _libgcc_mutex      conda-forge/linux-64::_libgcc_mutex-0.1-conda_forge
  _openmp_mutex      conda-forge/linux-64::_openmp_mutex-4.5-1_gnu
  ca-certificates    conda-forge/linux-64::ca-certificates-2020.11.8-ha878542_0
  certifi            pkgs/main/noarch::certifi-2020.6.20-pyhd3eb1b0_3
  libffi             pkgs/main/linux-64::libffi-3.3-he6710b0_2
  libgcc-ng          conda-forge/linux-64::libgcc-ng-9.3.0-h5dbcf3e_17
  libgomp            con



  current version: 4.9.0
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base conda




`[4mconda[24m [4minstall[24m [4m--name[24m [4mpy2[24m [4m--channel[24m [4mbioconda[24m [4m--yes[24m [4mstrelka[24m [4mmanta[24m`
Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

## Package Plan ##

  environment location: /opt/conda/envs/py2

  added / updated specs:
    - manta
    - strelka


The following NEW packages will be INSTALLED:

  manta              bioconda/linux-64::manta-1.6.0-py27_0
  strelka            bioconda/linux-64::strelka-2.9.10-0


Preparing transaction: ...working... done
Verifying transaction: ...working... done
Executing transaction: ...working... done




  current version: 4.9.0
  latest version: 4.9.2

Please update conda by running

    $ conda update -n base conda




Process(`[4mconda[24m [4minstall[24m [4m--name[24m [4mpy2[24m [4m--channel[24m [4mbioconda[24m [4m--yes[24m [4mstrelka[24m [4mmanta[24m`, ProcessExited(0))

In [10]:
# Process germline DNA only when both paired-end FASTQ entries are present in
# the project configuration.
let fastq_keys = ("germ_dna.1.fastq.gz", "germ_dna.2.fastq.gz")

    if all(haskey(project_json, key) for key in fastq_keys)

        analyze_sequence.process_germ_dna(
            # FASTQ paths in project.json are joined onto the project root.
            (joinpath(project_dir, project_json[key]) for key in fastq_keys)...,
            project_json["dna_is_targeted"],
            # NOTE(review): the sample name is hard-coded in this directory name
            # rather than taken from `sample_name` — confirm that is intended.
            joinpath(output_dir, "process_germ_dna_HG002_cat__trim_pe_q_20"),
            process_dna_arguments...,
        )

    end

end

(2020-11-19T19:50:20.766) Finding variant ...
`[4mbash[24m [4m-c[24m [4m'source activate py2 && configManta.py --referenceFasta /home/jovyan/analyze_sequence/input/reference/GCA_000001405.15_GRCh38_no_alt_analysis_set.fna.bgz --callRegions /home/jovyan/analyze_sequence/input/reference/chromosome.bed.gz --bam /home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/align_sequence/germ.bam --outputContig --runDir /home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/manta && /home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/manta/runWorkflow.py --mode local --jobs 2 --memGb 50 --quiet'[24m`

Successfully created workflow run script.
To execute the workflow, run the following script and set appropriate options:

/home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/manta/runWorkflow.py
`[4mbash[24m [4m-c[24m [4m'source activate py2 && configure

00:00:00	SnpEff version SnpEff 5.0 (build 2020-08-09 21:23), by Pablo Cingolani
00:00:00	Command: 'ann'
00:00:00	Reading configuration file 'snpEff.config'. Genome: 'GRCh38.99'
00:00:00	Reading config file: /home/jovyan/analyze_sequence/notebook/snpEff.config
00:00:00	Reading config file: /home/jovyan/analyze_sequence/tool/snpEff/snpEff.config
00:00:00	done
00:00:00	Reading database for genome version 'GRCh38.99' from file '/home/jovyan/analyze_sequence/tool/snpEff/./data/GRCh38.99/snpEffectPredictor.bin' (this might take a while)
00:00:00	Database not installed
	Attempting to download and install database 'GRCh38.99'
00:00:00	Reading configuration file 'snpEff.config'. Genome: 'GRCh38.99'
00:00:00	Reading config file: /home/jovyan/analyze_sequence/notebook/snpEff.config
00:00:00	Reading config file: /home/jovyan/analyze_sequence/tool/snpEff/snpEff.config
00:00:01	done
00:00:01	Downloading database for 'GRCh38.99'
00:00:01	Connecting to https://snpeff.blob.core.windows.net/databases/v5

`[4mtabix[24m [4m/home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/snpeff/snpeff.vcf.gz[24m`
pipeline(pipeline(`[4mbcftools[24m [4mview[24m [4m--threads[24m [4m2[24m [4m--include[24m [4m'FILTER=="PASS"'[24m [4m/home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/snpeff/snpeff.vcf.gz[24m`, stdout=`[4mbgzip[24m [4m--threads[24m [4m2[24m [4m--stdout[24m`), stdout>Base.FileRedirect("/home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/pass.vcf.gz", false))
`[4mtabix[24m [4m/home/jovyan/analyze_sequence/output/process_germ_dna_HG002_cat__trim_pe_q_20/find_variant/pass.vcf.gz[24m`
(2020-11-19T20:34:52.612) Done in 44 minutes, 31 seconds, 846 milliseconds.


In [10]:
# Process somatic DNA only when all four FASTQ entries (germline pair and
# somatic pair) are present in the project configuration.
let fastq_keys = (
        "germ_dna.1.fastq.gz",
        "germ_dna.2.fastq.gz",
        "soma_dna.1.fastq.gz",
        "soma_dna.2.fastq.gz",
    )

    if all(haskey(project_json, key) for key in fastq_keys)

        analyze_sequence.process_soma_dna(
            # Passed in the order listed above: germline 1, germline 2, somatic 1, somatic 2.
            (joinpath(project_dir, project_json[key]) for key in fastq_keys)...,
            project_json["dna_is_targeted"],
            joinpath(output_dir, "process_soma_dna"),
            process_dna_arguments...,
        )

    end

end

In [None]:
# Process somatic RNA only when both paired-end FASTQ entries are present in
# the project configuration.
let fastq_keys = ("soma_rna.1.fastq.gz", "soma_rna.2.fastq.gz")

    if all(haskey(project_json, key) for key in fastq_keys)

        analyze_sequence.process_soma_rna(
            # FASTQ paths in project.json are joined onto the project root.
            (joinpath(project_dir, project_json[key]) for key in fastq_keys)...,
            joinpath(output_dir, "process_soma_rna"),
            joinpath(reference_dir, "Homo_sapiens.GRCh38.cdna.all.fa.gz"),
            project_json["n_job"],
        )

    end

end