In [5]:
# Name of the sample to process; later cells use it to build input and output paths.
sample_name = "HG002"
# Evaluate the analyze_sequence source file from the sibling directory.
include("../analyze_sequence/analyze_sequence.jl")



Main.analyze_sequence

In [9]:
# Project root: the parent of the directory that contains this notebook/file.
project_dir = dirname(@__DIR__)

"/home/jovyan/work"

In [10]:
# Sub-directories of the project root used by the cells below.
output_dir = joinpath(project_dir, "output")
input_dir = joinpath(project_dir, "input")
# Directory of the current sample inside the input directory.
sample_dir = joinpath(input_dir, sample_name)


# NOTE(review): this binds the bare name `parse` to JSON.parse for the session, so a later
# unqualified `parse(Int, s)` would presumably no longer reach Base.parse — confirm intended.
using JSON: parse
# Read project.json from the project root as a String and parse it as JSON.
project_json = parse(read(joinpath(project_dir, "project.json"), String))

[32m[1m  Resolving[22m[39m package versions...
[32m[1mNo Changes[22m[39m to `/opt/julia/environments/v1.5/Project.toml`
[32m[1mNo Changes[22m[39m to `/opt/julia/environments/v1.5/Manifest.toml`


Dict{String,Any} with 5 entries:
  "gb_memory"           => 48
  "n_job"               => 12
  "dna_is_targeted"     => false
  "germ_dna.2.fastq.gz" => "input/HG002_cat/HG002_R2.fastq.gz"
  "germ_dna.1.fastq.gz" => "input/HG002_cat/HG002_R1.fastq.gz"

In [11]:
"""
    print_and_run_command(cmd::Base.AbstractCmd)

Echo `cmd` to standard output, then execute it with `run`.

Returns whatever `run(cmd)` returns. Any exception raised by `run` propagates to the caller.
"""
function print_and_run_command(cmd::Base.AbstractCmd)
    # Show the exact command first so the log still records it if the run fails.
    println(stdout, cmd)
    return run(cmd)
end

using Dates

"""
    check_sequence(fastq_gzs, output_dir, n_job)

Run FastQC on every file in `fastq_gzs` and write the reports to `output_dir`.

`output_dir` is created if it does not exist. FastQC is started with
`min(length(fastq_gzs), n_job)` threads. The start time, the command line, the end time
and the elapsed time are printed to standard output.

# Arguments
- `fastq_gzs::Tuple{Vararg{String}}`: paths of the files to check.
- `output_dir::String`: directory that receives the FastQC reports.
- `n_job::Int`: upper bound on the number of FastQC threads.

# Returns
`nothing`.

# Throws
- `ArgumentError` if `fastq_gzs` is empty or `n_job` is smaller than 1.
"""
function check_sequence(fastq_gzs::Tuple{Vararg{String}}, output_dir::String, n_job::Int)
    # Validate before any side effect. With no input files the thread count would be 0
    # and FastQC would be started without anything to analyze.
    isempty(fastq_gzs) &&
        throw(ArgumentError("fastq_gzs is empty; there is nothing to check"))
    n_job >= 1 || throw(ArgumentError("n_job must be at least 1, got $n_job"))

    start_time = now()
    println("($start_time) Checking sequence ...")

    mkpath(output_dir)

    # More threads than input files would never be used, so cap at the file count.
    n_thread = min(length(fastq_gzs), n_job)
    print_and_run_command(
        `fastqc --quiet --threads $n_thread --outdir $output_dir $fastq_gzs`,
    )

    end_time = now()
    println("($end_time) Done in $(canonicalize(Dates.CompoundPeriod(end_time - start_time))).")

    return nothing
end

check_sequence (generic function with 1 method)

## Gather raw reads

In [12]:
# Walk `sample_dir` recursively and collect the path of every gzipped FASTQ file.
#
# Files are recognized by their suffix rather than by a substring of the name, so that
# by-products such as `*_fastqc.html`, `*_fastqc.zip` or `*.fastq.gz.md5` are neither
# counted nor queued for checking.
number_of_fastq_files = 0

# Typed container: every element is a path string.
fastq_files_to_check = String[]

println("Walking sample directories ...\n")

for (root, _, files) in walkdir(sample_dir)
    println(root)
    for file in files
        is_fastq_gz = endswith(file, ".fastq.gz")
        # A "fastq file" is either an uncompressed or a gzipped FASTQ file.
        (is_fastq_gz || endswith(file, ".fastq")) || continue
        # `global` makes the update work both in a notebook and in a plain script.
        global number_of_fastq_files += 1
        is_fastq_gz && push!(fastq_files_to_check, joinpath(root, file))
    end
end

# Every gzipped FASTQ file found is queued, so the two numbers are the same by construction.
number_of_fastq_gz_files = length(fastq_files_to_check)

println("\nNumber of fastq files found in directories walked: $number_of_fastq_files\n")

println("Number of fastq.gz files found in directories walked: $number_of_fastq_gz_files\n")

println(string("Number of fastq.gz files to be checked: ", length(fastq_files_to_check)))

Walking sample directories ...

/home/jovyan/work/input/HG002
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2B1
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2B2
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2C1
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2C2
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2D1
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2D2
/home/jovyan/work/input/HG002/140528_D00360_0018_AH8

## Run FastQC

In [None]:
using Dates

# Run FastQC on every gathered fastq.gz file. check_sequence takes a tuple of path
# strings, hence the conversion. Reports go to output/check_sequence_<sample_name>, and
# the thread limit comes from the "n_job" entry of project.json.
check_sequence(
    Tuple(fastq_files_to_check),
    joinpath(output_dir, string("check_sequence_", sample_name)),
    project_json["n_job"],
)

(2020-11-13T14:55:27.543) Checking sequence ...
`[4mfastqc[24m [4m--quiet[24m [4m--threads[24m [4m12[24m [4m--outdir[24m [4m/Users/kate/github/analyze_sequence/output/check_sequence_HG002[24m [4m/Users/kate/github/analyze_sequence/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R1_001.fastq.gz[24m [4m/Users/kate/github/analyze_sequence/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R1_002.fastq.gz[24m [4m/Users/kate/github/analyze_sequence/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R2_001.fastq.gz[24m [4m/Users/kate/github/analyze_sequence/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R2_002.fastq.gz[24m [4m/Users/kate/github/analyze_sequence/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R1_001.fastq.gz[24m [4m/Users/kate/github/analyze_seque

## Run MultiQC

In [None]:
# Directory that holds the per-file FastQC reports of this sample.
check_sequence_directory = joinpath(output_dir, string("check_sequence_", sample_name))

# Run MultiQC over that directory and write its output into the same directory.
print_and_run_command(`multiqc --outdir $check_sequence_directory $check_sequence_directory`)

## Concatenate reads of the same direction (R1 / R2)

In [10]:
# Split the gathered fastq.gz files into forward (R1) and reverse (R2) files and
# concatenate each group into a single file under input/<sample_name>_cat.

# Top-level entries of the sample directory. Derived from `sample_dir` instead of a
# machine-specific absolute path so the cell runs wherever the project is checked out.
fastq_file_paths = readdir(sample_dir)

# Match the R1/R2 token in the file name only, delimited by `_`, `.` or the name
# boundary. Matching against the whole path would misclassify every file as soon as
# any parent directory name contains "R1" or "R2".
forward_pattern = r"(^|[_.])R1([_.]|$)"
reverse_pattern = r"(^|[_.])R2([_.]|$)"

forward_read_files =
    String[file for file in fastq_files_to_check if occursin(forward_pattern, basename(file))]
reverse_read_files =
    String[file for file in fastq_files_to_check if occursin(reverse_pattern, basename(file))]

number_of_forward_reads = length(forward_read_files)
number_of_reverse_reads = length(reverse_read_files)

println("Number of Forward Reads = $number_of_forward_reads\n")
println("Number of Reverse Reads = $number_of_reverse_reads\n")

# `cat` without file operands would read standard input instead of any read file.
if isempty(forward_read_files) || isempty(reverse_read_files)
    error("No R1 or no R2 fastq.gz files found for sample $sample_name; nothing to concatenate")
end

# Both lists come from the same ordered list, so equal counts are expected.
if number_of_forward_reads != number_of_reverse_reads
    @warn "Unequal number of R1 and R2 files" number_of_forward_reads number_of_reverse_reads
end

sample_cat_dir = joinpath(input_dir, string(sample_name, "_cat"))

# The redirection below cannot create missing parent directories.
mkpath(sample_cat_dir)

println("\nCombining R1 Reads\n")

run(pipeline(
    `cat $forward_read_files`,
    stdout=joinpath(sample_cat_dir, string(sample_name, "_R1.fastq.gz")),
))

println("\nCombining R2 Reads\n")

run(pipeline(
    `cat $reverse_read_files`,
    stdout=joinpath(sample_cat_dir, string(sample_name, "_R2.fastq.gz")),
))

Number of Forward Reads = 133

Number of Reverse Reads = 133

Combining R1 Reads

pipeline(`[4mcat[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R1_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R1_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R1_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R1_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2/2A2_TGACCA_L001_R1_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM

Process(`[4mcat[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R2_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R2_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R2_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R2_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2/2A2_TGACCA_L001_R2_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2/2A2_TGACCA_L001_R2_002.fastq.gz[24m [4m/Users/kate/github/omics_s

## Run FastQC

In [11]:
# Run FastQC on the concatenated files: every project.json entry whose key ends with
# ".fastq.gz" is taken as a path relative to the project root. Reports go to
# output/check_sequence_<sample_name>_cat.
# NOTE(review): the files are passed in Dict iteration order (the recorded output shows
# R2 before R1) — confirm the order does not matter here.
check_sequence(
    Tuple(
        joinpath(project_dir, value)
        for (key, value) in project_json if endswith(key, ".fastq.gz")
    ),
    joinpath(output_dir, string("check_sequence_", sample_name, "_cat")),
    project_json["n_job"],
)

(2020-10-05T10:56:46.527) Checking sequence ...
`[4mfastqc[24m [4m--quiet[24m [4m--threads[24m [4m2[24m [4m--outdir[24m [4m/Users/kate/github/omics_sample_benchmark/output/check_sequence_HG002_cat[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002_cat/HG002_R2.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002_cat/HG002_R1.fastq.gz[24m`
(2020-10-05T11:50:00.588) Done in 53 minutes, 14 seconds, 61 milliseconds.
