🍏🎨 add/update coverage handling (#223)
* 🍏🐍📝🎨 add/update nxf inputs/outputs
* 🐍 Add typehints to bedtools.py
* 📝🍏🔥 Remove instances of num_splits
* 📝🍏🎨 Change --input to accept samplesheets instead of metagenomes
* 🍏🎨 Update check_samplesheet.py for Autometa-nxf samplesheet validation/formatting
* 🍏🔥 Remove partially-implemented align_reads.nf subworkflow
* 🍏🎨 Add align_reads.nf process
* 🍏🎨 Add when directive to CONTIG_COVERAGE processes
* 🍏🎨 Add when directive to SPADES_KMER_COVERAGE process
* 🍏🎨 Change coverages emit to coverage
* 🍏🎨📝 Update configs to match updated processes
* 🍏 Add three channels to INPUT_CHECK (reads, coverage, metagenome)
* 🍏🎨 Add & mix three different coverage channels in AUTOMETA to coverage_ch
* 🍏🎨 fixes #186
* 🍏🎨 Add functionality to accept pre-computed coverage tables
* 🍏🎨 meta.id is now retrieved from the provided sample column in the input samplesheet
* 🔥 Remove bowtie2/build from modules.json
* 📝 Update param.input docstring for nextflow_schema.json
* 🐛🔥 Remove unused open '(' in bowtie2 version retrieval
* 🎨🐛 Fix bt2 db emit and samtools view input param (add -bS)
* 📝 Add sample sheet information
* 📝 Fix samplesheet table
* 🔥 Remove mentions of num_splits and other references in the advanced section to specifying multiple inputs
* 🐛🔥📝 Fix typos, update the running modules section and revise some of the advanced params
* 📝 Reformat resource allotment
* 🔥 Remove incorrect usage of --input argument
* 📝 Replace parameters.config with nf-params.json that will likely be used...
* 📝 Replace attention with caution and bold some text within caution
* 📝🎨 Change using nf-core tools header to using nf-core
* 📝 Add example sample_sheet.csv section header
* 🐛 Fix python-formatting in Python API examples
* 🎨📝 Reformat ambiguous information for note on coverage_tab input
* 🐛📝 Fix nf-core/nextflow conda install section header
* 🐛 Fix python API headers not being visualized
* 📝🐛 Fix section headers
* 🎨 Fix formatting for SLURM profile configuration section
* 🔥 Remove already mentioned comment with link
* 📝 Update data preparation with pointers to examples in the sample sheet
* 📝 Re-order examples for data preparation
* 🎨🐛 fix resume code formatting in caution
* 📝🎨 Move attention section for data preparation to sample sheet section
evanroyrees committed Jan 24, 2022
1 parent bdbffda commit 5931255
Showing 22 changed files with 645 additions and 459 deletions.
8 changes: 6 additions & 2 deletions autometa/common/external/bedtools.py
@@ -19,14 +19,15 @@
 logger = logging.getLogger(__name__)


-def genomecov(ibam, out, force=False):
+def genomecov(ibam: str, out: str, force: bool = False) -> str:
     """Run bedtools genomecov with input `ibam` to retrieve
     metagenome coverages.

     Parameters
     ----------
     ibam : str
         </path/to/indexed/BAM/file.ibam>. Note: BAM *must* be sorted by position.
     out : str
         </path/to/alignment.bed>
         The bedtools genomecov output is a tab-delimited file with the following columns:
@@ -36,6 +37,7 @@ def genomecov(ibam, out, force=False):
         4. Size of chromosome
         5. Fraction of bases on that chromosome with that coverage

         See also: http://bedtools.readthedocs.org/en/latest/content/tools/genomecov.html

     force : bool
         force overwrite of `out` if it already exists (default is False).
@@ -63,15 +65,17 @@ def genomecov(ibam, out, force=False):
     return out


-def parse(bed, out=None, force=False):
+def parse(bed: str, out: str = None, force: bool = False) -> pd.DataFrame:
     """Calculate coverages from bed file.

     Parameters
     ----------
     bed : str
         </path/to/file.bed>
     out : str
         if provided will write to `out`. I.e. </path/to/coverage.tsv>
     force : bool
         force overwrite of `out` if it already exists (default is False).
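Since `genomecov` returns the `out` path it wrote and `parse` consumes a BED path, the two updated functions compose directly. A minimal usage sketch (paths are hypothetical placeholders; assumes bedtools is installed and the BAM is position-sorted):

    from autometa.common.external import bedtools

    # genomecov runs bedtools on the position-sorted BAM and returns the
    # path of the BED file it wrote (its `out` argument).
    bed_filepath = bedtools.genomecov(ibam="alignment.sorted.bam", out="alignment.bed")

    # parse returns a pandas DataFrame of coverages and, because `out` is
    # provided, also writes the table to disk.
    coverage_df = bedtools.parse(bed=bed_filepath, out="coverage.tsv")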
186 changes: 126 additions & 60 deletions bin/check_samplesheet.py
@@ -1,6 +1,12 @@
 #!/usr/bin/env python

-# TODO nf-core: Update the script to check the samplesheet
+# Check the samplesheet for valid inputs
+# Checks for
+# 1. Whether minimum required files exist
+# 2. Whether provided sample IDs are unique
+# 3. Whether appropriate parameter combinations were provided
+#    - if coverage is to be determined from read alignments, reads must also be provided
+#    - if a path to a coverage table is already provided in the samplesheet
+#
 # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

 import os
@@ -19,7 +25,7 @@ def parse_args(args=None):
     return parser.parse_args(args)


-def make_dir(path):
+def make_dir(path: str) -> None:
     if len(path) > 0:
         try:
             os.makedirs(path)
@@ -28,67 +34,100 @@ def make_dir(path):
             raise exception


-def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
+def print_error(error: str, context: str = "Line", context_str: str = "") -> None:
+    error_str = f"ERROR: Please check samplesheet -> {error}"
     if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
+        error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
     print(error_str)
     sys.exit(1)


-# TODO nf-core: Update the check_samplesheet function
 def check_samplesheet(file_in, file_out):
     """
     This function checks that the samplesheet follows the following structure:
-    sample,fastq_1,fastq_2
-    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
-    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
-    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
+
+    NOTE: 0 and 1 are converted to boolean values (false and true, respectively).
+    Although reads are provided for SAMPLE_2, the input specifies that coverage
+    information should be retrieved from the assembly's contig headers.
+
+    sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers
+    SAMPLE_1,ASSEMBLY_1,fwd_reads.fastq.gz,rev_reads.fastq.gz,,0
+    SAMPLE_2,ASSEMBLY_2,fwd_reads.fastq.gz,rev_reads.fastq.gz,,1
+    SAMPLE_3,ASSEMBLY_3,,,,1
+    SAMPLE_4,ASSEMBLY_4,,,/path/to/coverage.tsv,0

     For an example see:
     https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
     """

-    sample_mapping_dict = {}
-    with open(file_in, "r") as fin:
+    samples = {}
+    min_col_names = ["sample", "assembly", "cov_from_contig_headers"]
+    min_num_cols = len(min_col_names)
+    req_header_cols = [
+        "sample",
+        "assembly",
+        "fastq_1",
+        "fastq_2",
+        "coverage_tab",
+        "cov_from_contig_headers",
+    ]
+    num_required_cols = len(req_header_cols)
+    with open(file_in, "r") as fh:
         ## Check header
-        MIN_COLS = 2
-        # TODO nf-core: Update the column names for the input samplesheet
-        HEADER = ["sample", "fastq_1", "fastq_2"]
-        header = [x.strip('"') for x in fin.readline().strip().split(",")]
-        if header[: len(HEADER)] != HEADER:
-            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+        header = fh.readline().strip()
+        header_cols = [header_col.strip('"') for header_col in header.split(",")]
+        if header_cols[:num_required_cols] != req_header_cols:
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header_cols)} != {','.join(req_header_cols)}"
+            )
             sys.exit(1)

         ## Check sample entries
-        for line in fin:
-            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+        for line in fh:
+            sample_cols = [
+                sample_col.strip().strip('"') for sample_col in line.strip().split(",")
+            ]

             # Check valid number of columns per row
-            if len(lspl) < len(HEADER):
+            num_cols = len([sample_col for sample_col in sample_cols if sample_col])
+            if num_cols < min_num_cols:
                 print_error(
-                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    f"Invalid number of populated columns (minimum = {min_num_cols})!",
                     "Line",
                     line,
                 )
-            num_cols = len([x for x in lspl if x])
-            if num_cols < MIN_COLS:
+
+            if len(sample_cols) != num_required_cols:
                 print_error(
-                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
+                    f"Invalid number of columns (required = {num_required_cols})! Check that you have the correct number of commas and retry",
                     "Line",
                     line,
                 )

             ## Check sample name entries
-            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            (
+                sample,
+                assembly,
+                fastq_1,
+                fastq_2,
+                coverage_tab,
+                cov_from_headers,
+            ) = sample_cols

             sample = sample.replace(" ", "_")
             if not sample:
                 print_error("Sample entry has not been specified!", "Line", line)

+            if not cov_from_headers:
+                print_error(
+                    "Must provide a value ('0' or '1') for cov_from_contig_headers column!",
+                    "Line",
+                    line,
+                )
+            if cov_from_headers not in {"0", "1"}:
+                print_error(
+                    "cov_from_contig_headers column must be '0' or '1'!", "Line", line
+                )

             ## Check FastQ file extension
             for fastq in [fastq_1, fastq_2]:
                 if fastq:
@@ -101,40 +140,67 @@ def check_samplesheet(file_in, file_out):
                         line,
                     )

+            # NOTE: An empty string will fail with the file(...) method in Groovy,
+            # so we pass in "0" here to be checked later with file(...).exists().
+            # The same goes for the fastq_* files.
+            coverage_tab = coverage_tab if coverage_tab else "0"
+
             ## Auto-detect paired-end/single-end
-            sample_info = []  ## [single_end, fastq_1, fastq_2]
-            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2]
-            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2]
+            ## [ assembly, single_end, fastq_1, fastq_2, coverage_tab, cov_from_contig_headers ]
+            sample_info = []
+            if fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = [
+                    assembly,
+                    "0",
+                    fastq_1,
+                    fastq_2,
+                    coverage_tab,
+                    cov_from_headers,
+                ]
+            elif fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = [
+                    assembly,
+                    "1",
+                    fastq_1,
+                    "0",
+                    coverage_tab,
+                    cov_from_headers,
+                ]
             else:
-                print_error("Invalid combination of columns provided!", "Line", line)
+                sample_info = [assembly, "0", "0", "0", coverage_tab, cov_from_headers]

-            ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
-            if sample not in sample_mapping_dict:
-                sample_mapping_dict[sample] = [sample_info]
-            else:
-                if sample_info in sample_mapping_dict[sample]:
-                    print_error("Samplesheet contains duplicate rows!", "Line", line)
-                else:
-                    sample_mapping_dict[sample].append(sample_info)
+            ## Create sample mapping dictionary = { sample: [ assembly, single_end, fastq_1, fastq_2, cov_from_contig_headers ] }
+            if sample in samples:
+                print_error("Samplesheet contains duplicate samples!", "Line", line)
+            samples[sample] = sample_info
+
+    if not samples:
+        print_error("No entries to process!", f"Samplesheet: {file_in}")

     ## Write validated samplesheet with appropriate columns
-    if len(sample_mapping_dict) > 0:
-        out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-
-                ## Check that multiple runs of the same sample are of the same datatype
-                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
-                    print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
-
-                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+    sample_lines = ""
+    for sample, sample_info in samples.items():
+        sample_info_line = ",".join(sample_info)
+        sample_line = f"{sample},{sample_info_line}\n"
+        sample_lines += sample_line
+    out_dir = os.path.dirname(file_out)
+    make_dir(out_dir)
+    header = (
+        ",".join(
+            [
+                "sample",
+                "assembly",
+                "single_end",
+                "fastq_1",
+                "fastq_2",
+                "coverage_tab",
+                "cov_from_contig_headers",
+            ]
+        )
+        + "\n"
+    )
+    with open(file_out, "w") as outfh:
+        outfh.write(header)
+        outfh.write(sample_lines)


def main(args=None):
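The rewritten function validates samplesheet structure only; whether the referenced files exist is checked later in Nextflow via file(...).exists(), which is why empty fields are replaced with "0". A quick way to exercise it with two of the docstring's example rows (a sketch; file names are placeholders and it assumes bin/check_samplesheet.py is importable):

    from check_samplesheet import check_samplesheet

    # Two rows from the updated docstring: SAMPLE_1 derives coverage from read
    # alignments, SAMPLE_4 supplies a pre-computed coverage table.
    rows = [
        "sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers",
        "SAMPLE_1,ASSEMBLY_1,fwd_reads.fastq.gz,rev_reads.fastq.gz,,0",
        "SAMPLE_4,ASSEMBLY_4,,,/path/to/coverage.tsv,0",
    ]
    with open("sample_sheet.csv", "w") as fh:
        fh.write("\n".join(rows) + "\n")

    # Writes samplesheet.valid.csv with the reformatted header:
    # sample,assembly,single_end,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers
    check_samplesheet(file_in="sample_sheet.csv", file_out="samplesheet.valid.csv")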
4 changes: 2 additions & 2 deletions conf/base.config
@@ -27,12 +27,12 @@ process {
     // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
     withLabel:process_low {
         cpus = { check_max( 1 * task.attempt, 'cpus' ) }
-        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
         time = { check_max( 4.h * task.attempt, 'time' ) }
     }
     withLabel:process_medium {
         cpus = { check_max( 8 * task.attempt, 'cpus' ) }
-        memory = { check_max( 8.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 8.GB * task.attempt, 'memory' ) }
         time = { check_max( 8.h * task.attempt, 'time' ) }
     }
     withLabel:process_high {
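These memory directives pass through check_max(), so per-label requests grow with task.attempt on retries but are clamped to the pipeline's maximums. A hypothetical user override (not part of this commit) supplied with -c custom.config, assuming the usual nf-core --max_cpus/--max_memory limits:

    // custom.config: raise resources for medium-weight processes.
    // check_max() still clamps these values to params.max_cpus / params.max_memory.
    process {
        withLabel:process_medium {
            cpus   = { check_max( 16 * task.attempt, 'cpus' ) }
            memory = { check_max( 32.GB * task.attempt, 'memory' ) }
        }
    }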
31 changes: 21 additions & 10 deletions conf/modules.config
@@ -76,23 +76,35 @@ params {
         }
         'diamond_makedb_options' {
             publish_by_meta = ['id']
-            args = ""
+            args = ""
         }
+        'align_reads_options' {
+            args = ""
+            args2 = "-q --phred33 --very-sensitive --no-unal"
+            publish_by_meta = ['id']
+            publish_dir = "align_reads"
+        }
         'samtools_viewsort_options' {
-            args = ""
-            args2 = ""
+            args = ""
+            args2 = ""
             publish_by_meta = ['id']
-            publish_dir = "samtools_sort"
+            publish_dir = "samtools_sort"
         }
+        'bedtools_genomecov_options' {
+            args = ""
+            args2 = ""
+            publish_by_meta = ['id']
+            publish_dir = "genome_coverage"
+        }
         'seqkit_split_options' {
             publish_by_meta = ['id']
-            args = "-p ${params.num_splits}"
-            args2 = "--two-pass"
+            args = ""
+            args2 = "--two-pass"
         }
         'spades_kmer_coverage' {
             publish_by_meta = ['id']
-            publish_files = ['*.coverages.tsv':'']
-            publish_dir = "coverage"
+            publish_files = ['*.coverages.tsv':'']
+            publish_dir = "coverage"
         }
         'split_kingdoms_options' {
             publish_by_meta = ['id']
@@ -102,9 +114,8 @@ params {
         }
         'unclustered_recruitment_options' {
             publish_by_meta = ['id']
-            publish_dir = "binning_results/unclustered_recruitment_results"
-
+            publish_dir = "binning_results/unclustered_recruitment_results"
         }
     }
 }
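The new 'align_reads_options' block wires bowtie2 flags (args2) into the align_reads process this commit adds. A minimal sketch of how such a process could consume them; this is hypothetical, not the commit's actual align_reads.nf, and the channel shapes and file names are assumptions, with the args2 flags inlined and the SAM converted to BAM via samtools view -bS as the commit message notes:

    process ALIGN_READS {
        tag "${meta.id}"

        input:
        tuple val(meta), path(reads)
        path db  // bowtie2 index basename (assumed channel)

        output:
        tuple val(meta), path("${meta.id}.bam"), emit: bam

        script:
        // meta.single_end comes from the validated samplesheet's single_end column
        def input_reads = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}"
        """
        bowtie2 -q --phred33 --very-sensitive --no-unal -x ${db} ${input_reads} -S ${meta.id}.sam
        samtools view -bS ${meta.id}.sam > ${meta.id}.bam
        """
    }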
