In [1]:
include("../src/ProcessSequence.jl")

Main.ProcessSequence

In [2]:
using JSON: parse

# The project root is the parent of the directory this notebook lives in.
project_dir = dirname(@__DIR__)

# Project-wide settings (sample name, job count, ...) are read from project.json.
project_json = parse(read(joinpath(project_dir, "project.json"), String))

sample_name = project_json["sample_name"]

# Inputs live under <project>/input and results are written under <project>/output.
input_dir = joinpath(project_dir, "input")

output_dir = joinpath(project_dir, "output")

# Directory holding the reads of the sample being processed.
sample_dir = joinpath(input_dir, sample_name)

"/home/jovyan/ProcessSequence.jl/input/1005"

## Find raw reads

In [3]:
# Collect the paths of the read files found under the sample directory.
read_file_paths = ProcessSequence.find_reads(sample_dir)

Walking sample directory...

/home/jovyan/ProcessSequence.jl/input/1005

/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H52MHDSXY_L4_R1_fastqc_11.9

/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H52MHDSXY_L4_R1_fastqc_11.9/Icons

/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H52MHDSXY_L4_R1_fastqc_11.9/Images


Number of fastq files found in directories walked: 0

Number of fastq.gz or fq.gz files found in directories walked: 4


Done at: 2020-12-10T07:02:35.279

Took 22 milliseconds.



4-element Array{Any,1}:
 "/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H52MHDSXY_L4_R1.fq.gz"
 "/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H52MHDSXY_L4_R2.fq.gz"
 "/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H5MLJDSXY_L2_R1.fq.gz"
 "/home/jovyan/ProcessSequence.jl/input/1005/G1005_CSFP200000024-2a_H5MLJDSXY_L2_R2.fq.gz"

## Run FastQC

In [4]:
using Dates

# Check the raw read files; the second argument is the result directory
# <output_dir>/check_sequence_<sample_name> and the third is the project's "n_job" setting.
# NOTE(review): per the recorded output, the step is skipped when that directory already exists.
ProcessSequence.check_sequence(
    Tuple(read_file_paths),
    joinpath(output_dir, string("check_sequence_", sample_name)),
    project_json["n_job"],
)

Skipping check sequence because check sequence directory already exists:
 /home/jovyan/ProcessSequence.jl/output/check_sequence_1005



## Run MultiQC

Not necessary if the fastq files came from the same lane and the same sequencing run. The more complex the production of the reads, the more useful MultiQC is for identifying potential lane or batch biases.

In [5]:
# Summarize this sample's sequence-check reports; the command echoed in the output is
# `multiqc --outdir` run on <output_dir>/check_sequence_<sample_name>.
ProcessSequence.check_sequence_bias(sample_name, output_dir)

`[4mmultiqc[24m [4m--outdir[24m [4m/home/jovyan/ProcessSequence.jl/output/check_sequence_1005[24m [4m/home/jovyan/ProcessSequence.jl/output/check_sequence_1005[24m`


[INFO   ]         multiqc : This is MultiQC v1.9
[INFO   ]         multiqc : Template    : default
[INFO   ]         multiqc : Searching   : /home/jovyan/ProcessSequence.jl/output/check_sequence_1005
[INFO   ]          fastqc : Found 4 reports
[INFO   ]         multiqc : Compressing plot data
[INFO   ]         multiqc : Report      : ../output/check_sequence_1005/multiqc_report_1.html
[INFO   ]         multiqc : Data        : ../output/check_sequence_1005/multiqc_data_1
[INFO   ]         multiqc : MultiQC complete



Done at: 2020-12-10T07:02:46.953

Took 4 seconds, 496 milliseconds.



## Concatenate reads of same strand

In [7]:
using Dates


"""
    concatenate_reads(read_file_paths, sample_name::String, input_dir::String)

Concatenate the forward (R1) and reverse (R2) read files of a sample into one file per
strand.

The combined files are written to `<input_dir>/<sample_name>_cat/` as
`<sample_name>_R1.fastq.gz` and `<sample_name>_R2.fastq.gz`. Files are appended byte for
byte in the order they appear in `read_file_paths`, which is what `cat` does.

# Arguments
- `read_file_paths`: iterable of paths to read files; a file belongs to a strand when its
  file name contains `"R1"` or `"R2"`.
- `sample_name`: sample identifier used to name the output directory and files.
- `input_dir`: directory in which the `<sample_name>_cat` directory is created.

# Returns
`nothing`. Progress and timing are printed to standard output.
"""
function concatenate_reads(
        read_file_paths,
        sample_name::String,
        input_dir::String,
    )

    start_time = now()

    # Classify on the file name only, so that "R1"/"R2" occurring in a parent directory
    # name cannot assign a file to the wrong strand.
    forward_read_files =
        [file for file in read_file_paths if occursin("R1", basename(file))]

    reverse_read_files =
        [file for file in read_file_paths if occursin("R2", basename(file))]

    println("Number of Forward (R1) Reads = $(length(forward_read_files))\n")

    println("Number of Reverse (R2) Reads = $(length(reverse_read_files))\n")

    sample_cat_dir = joinpath(input_dir, string(sample_name, "_cat"))

    # mkpath does not fail when the directory already exists, so the cell can be re-run.
    mkpath(sample_cat_dir)

    for (strand, strand_files) in
        (("R1", forward_read_files), ("R2", reverse_read_files))

        println("\nCombining $strand Reads\n")

        # Nothing to combine (e.g. single-end data): say so instead of blocking on an
        # input-less `cat` or leaving behind an empty output file.
        if isempty(strand_files)

            println("No $strand read files found; skipping.\n")

            continue

        end

        # Each strand gets its own output file; the R2 reads must not overwrite R1.
        combined_path =
            joinpath(sample_cat_dir, string(sample_name, "_", strand, ".fastq.gz"))

        open(combined_path, "w") do combined_io

            for file in strand_files

                # Stream the input into the output without loading it into memory.
                open(file, "r") do read_io

                    write(combined_io, read_io)

                end

            end

        end

    end

    end_time = now()

    println("\nDone at: $end_time\n")

    println("Took $(canonicalize(Dates.CompoundPeriod(end_time - start_time))).\n")

    return nothing

end

concatenate_reads (generic function with 1 method)

In [None]:
# Concatenate the R1 and R2 read files found above; output goes under <input_dir>/<sample_name>_cat.
concatenate_reads(read_file_paths, sample_name, input_dir)

Number of Forward (R1) Reads = 2

Number of Reverse (R2) Reads = 2


Combining R1 Reads



## Run FastQC on concatenated reads

In [None]:
# Check the read files listed in project.json: every entry whose key ends with ".fastq.gz"
# is taken as a path relative to the project root. Results go to
# <output_dir>/check_sequence_<sample_name>_cat.
# The call is qualified with the module name because ProcessSequence.jl is only `include`d,
# so `check_sequence` is not available unqualified in Main.
# NOTE(review): presumably these project.json entries point at the concatenated files
# produced above — confirm against project.json.
ProcessSequence.check_sequence(
    Tuple(
        joinpath(project_dir, value)
        for (key, value) in project_json if endswith(key, ".fastq.gz")
    ),
    joinpath(output_dir, string("check_sequence_", sample_name, "_cat")),
    project_json["n_job"],
)