In [2]:
include("../src/ProcessSequence.jl")

Main.ProcessSequence

In [9]:
project_dir = dirname(@__DIR__)

using JSON: parse

project_json = parse(read(joinpath(project_dir, "project.json"), String))

sample_name = project_json["sample_name"]

output_dir = joinpath(project_dir, "output")
input_dir = joinpath(project_dir, "input")
sample_dir = joinpath(input_dir, sample_name)



LoadError: MethodError: no method matching parse(::String)
Closest candidates are:
  parse(!Matched::Type{T}, !Matched::AbstractChar; base) where T<:Integer at parse.jl:40
  parse(!Matched::Type{T}, !Matched::AbstractString; base) where T<:Integer at parse.jl:237
  parse(!Matched::Type{T}, !Matched::AbstractString; kwargs...) where T<:Real at parse.jl:376
  ...

## Gather raw reads

In [36]:
using Dates

function find_reads(sample_dir::String)
    
    start_time = now()
    
    number_of_fastq_files = 0
    number_of_fastq_gz_files = 0
    fastq_files_to_check = []
    
    println("Walking sample directory...\n")

    for (root, dirs, files) in walkdir("$sample_dir")
        
        println("$root\n")
        
        for file in files
            if occursin(".fastq", file)
                number_of_fastq_files += 1
            end
            if occursin("fastq.gz", file)
                number_of_fastq_gz_files += 1
                push!(fastq_files_to_check, joinpath(root, file))
            end
            if occursin("fq.gz", file)
                number_of_fastq_gz_files += 1
                push!(fastq_files_to_check, joinpath(root, file))
            end
        end
    end

    println("\nNumber of fastq files found in directories walked: $number_of_fastq_files\n")

    println("Number of fastq.gz or fq.gz files found in directories walked: $number_of_fastq_gz_files\n")

    println(string("Number of fastq.gz or fq.gz files to be checked: ", length(fastq_files_to_check)))
    
    end_time = now()
    
    println("\nDone at: $end_time\n")
    
    println("Took $(canonicalize(Dates.CompoundPeriod(end_time - start_time))).\n")
    
    return fastq_files_to_check
    
end

find_reads (generic function with 1 method)

In [37]:
reads = find_reads("../input/1004/")

Walking sample directory...

../input/1004/

../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1_fastqc

../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1_fastqc/Icons

../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1_fastqc/Images


Number of fastq files found in directories walked: 0

Number of fastq.gz or fq.gz files found in directories walked: 2

Number of fastq.gz or fq.gz files to be checked: 2

Done at: 2020-12-08T07:51:20.11

Took 3 milliseconds.



2-element Array{Any,1}:
 "../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1.fq.gz"
 "../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_2.fq.gz"

2-element Array{Any,1}:
 "../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1.fq.gz"
 "../input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_2.fq.gz"

In [3]:
number_of_fastq_files = 0
number_of_fastq_gz_files = 0
fastq_files_to_check = []

println("Walking sample directories ...\n")

for (root, dirs, files) in walkdir("$sample_dir")
    println("$root")
    for file in files
        if occursin(".fastq", file)
            number_of_fastq_files += 1
        end
        if occursin("fastq.gz", file)
            number_of_fastq_gz_files += 1
            push!(fastq_files_to_check, joinpath(root, file))
        end
        if occursin("fq.gz", file)
            number_of_fastq_gz_files += 1
            push!(fastq_files_to_check, joinpath(root, file))
        end
    end

println("\nNumber of fastq files found in directories walked: $number_of_fastq_files\n")

println("Number of fastq.gz or fq.gz files found in directories walked: $number_of_fastq_gz_files\n")

println(string("Number of fastq.gz or fq.gz files to be checked: ", length(fastq_files_to_check)))
    

Walking sample directories ...

/home/jovyan/ProcessSequence.jl/input/1004
/home/jovyan/ProcessSequence.jl/input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1_fastqc
/home/jovyan/ProcessSequence.jl/input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1_fastqc/Icons
/home/jovyan/ProcessSequence.jl/input/1004/G1004_CSFP200000023-2a_H52MHDSXY_L4_1_fastqc/Images

Number of fastq files found in directories walked: 0

Number of fastq.gz or fq.gz files found in directories walked: 2

Number of fastq.gz or fq.gz files to be checked: 2


## Run FastQC

In [4]:
using Dates

ProcessSequence.check_sequence(
    Tuple(fastq_files_to_check),
    joinpath(output_dir, string("check_sequence_", sample_name)),
    project_json["n_job"],
)

Skipping check sequence because check sequence directory already exists:
 /home/jovyan/ProcessSequence.jl/output/check_sequence_1004



## Run MultiQC

In [5]:
check_sequence_directory = joinpath(output_dir, string("check_sequence_", sample_name))

ProcessSequence.print_and_run_cmd(`multiqc --outdir $check_sequence_directory $check_sequence_directory`)

`[4mmultiqc[24m [4m--outdir[24m [4m/home/jovyan/ProcessSequence.jl/output/check_sequence_1004[24m [4m/home/jovyan/ProcessSequence.jl/output/check_sequence_1004[24m`


[INFO   ]         multiqc : This is MultiQC v1.9
[INFO   ]         multiqc : Template    : default
[INFO   ]         multiqc : Searching   : /home/jovyan/ProcessSequence.jl/output/check_sequence_1004
[INFO   ]          fastqc : Found 2 reports
[INFO   ]         multiqc : Compressing plot data
[INFO   ]         multiqc : Report      : ../output/check_sequence_1004/multiqc_report_1.html
[INFO   ]         multiqc : Data        : ../output/check_sequence_1004/multiqc_data_1
[INFO   ]         multiqc : MultiQC complete


Process(`[4mmultiqc[24m [4m--outdir[24m [4m/home/jovyan/ProcessSequence.jl/output/check_sequence_1004[24m [4m/home/jovyan/ProcessSequence.jl/output/check_sequence_1004[24m`, ProcessExited(0))

## Concatenate reads of same strand

In [10]:
fastq_file_paths = readdir("/Users/kate/github/omics_sample_benchmark/input/$sample_name", join=false)

forward_read_files = []
reverse_read_files = []

number_of_forward_reads = 0
number_of_reverse_reads = 0

for file in fastq_files_to_check
    if occursin("R1", file)
        push!(forward_read_files, file)
        number_of_forward_reads += 1
    end
end

for file in fastq_files_to_check
    if occursin("R2", file)
        push!(reverse_read_files, file)
        number_of_reverse_reads += 1
    end
end

println("Number of Forward Reads = $number_of_forward_reads\n")
println("Number of Reverse Reads = $number_of_reverse_reads\n")

sample_cat_dir = joinpath(input_dir, string(sample_name, "_cat"))

println("\nCombining R1 Reads\n")

run(pipeline(`cat $forward_read_files`, stdout=joinpath(sample_cat_dir, "HG002_R1.fastq.gz"))) 

println("\nCombining R2 Reads\n")

run(pipeline(`cat $reverse_read_files`, stdout=joinpath(sample_cat_dir, "HG002_R2.fastq.gz"))) 

Number of Forward Reads = 133

Number of Reverse Reads = 133

Combining R1 Reads

pipeline(`[4mcat[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R1_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R1_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R1_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R1_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2/2A2_TGACCA_L001_R1_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM

Process(`[4mcat[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R2_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L001_R2_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R2_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A1/2A1_CGATGT_L002_R2_002.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2/2A2_TGACCA_L001_R2_001.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002/140528_D00360_0018_AH8VC6ADXX/Project_RM8391_RM8392/Sample_2A2/2A2_TGACCA_L001_R2_002.fastq.gz[24m [4m/Users/kate/github/omics_s

## Run FastQC

In [11]:
check_sequence(
    Tuple(
        joinpath(project_dir, value)
        for (key, value) in project_json if endswith(key, ".fastq.gz")
    ),
    joinpath(output_dir, string("check_sequence_", sample_name, "_cat")),
    project_json["n_job"],
)

(2020-10-05T10:56:46.527) Checking sequence ...
`[4mfastqc[24m [4m--quiet[24m [4m--threads[24m [4m2[24m [4m--outdir[24m [4m/Users/kate/github/omics_sample_benchmark/output/check_sequence_HG002_cat[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002_cat/HG002_R2.fastq.gz[24m [4m/Users/kate/github/omics_sample_benchmark/input/HG002_cat/HG002_R1.fastq.gz[24m`
(2020-10-05T11:50:00.588) Done in 53 minutes, 14 seconds, 61 milliseconds.
