Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

🍏🎨 add/update coverage handling #223

Merged
merged 25 commits into from
Jan 24, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
2299308
:green_apple::snake::memo::art: add/update nxf inputs/outputs
evanroyrees Jan 21, 2022
58ce744
:fire: Remove bowtie2/build from modules.json
evanroyrees Jan 21, 2022
f7684fa
:memo: Update param.input docstring for nextflow_schema.json
evanroyrees Jan 21, 2022
e2949b6
:bug::fire: Remove unused open '(' in bowtie2 version retrieval
evanroyrees Jan 22, 2022
30888d4
:art::bug: Fix bt2 db emit and samtools view input param (add -bS)
evanroyrees Jan 23, 2022
f1b99ca
:memo: Add sample sheet information
evanroyrees Jan 24, 2022
9cd9ab9
:memo: Fix samplesheet table
evanroyrees Jan 24, 2022
09e9f1f
:bug::fire::memo: Fix typos update running modules section and next s…
evanroyrees Jan 24, 2022
9890699
:memo: Reformat resource allotment
evanroyrees Jan 24, 2022
50b671c
:fire: Remove incorrect usage of --input argument
evanroyrees Jan 24, 2022
70bcdc8
:memo: Replace parameters.config with nf-params.json that will likely…
evanroyrees Jan 24, 2022
ba29ab9
:memo: Replace attention with caution and bold some texts within caution
evanroyrees Jan 24, 2022
7325f7b
:memo::art: Change using nf-core tools header to using nf-core
evanroyrees Jan 24, 2022
9936121
:memo: Add example sample_sheet.csv section header
evanroyrees Jan 24, 2022
cc840d2
:bug: Fix python-formatting in Python API examples
evanroyrees Jan 24, 2022
ef586ba
:art::memo: Reformat ambiguous information for note on coverage_tab i…
evanroyrees Jan 24, 2022
fbdab4c
:bug::memo: Fix nf-core/nextlow conda install section header
evanroyrees Jan 24, 2022
9c4d74c
:bug: python APIs headers not being visualized
evanroyrees Jan 24, 2022
75d101b
:memo::bug: Fixing section headers
evanroyrees Jan 24, 2022
ba70f78
:art: Fix formatting for SLURM profile configuration section
evanroyrees Jan 24, 2022
a046251
:fire: Remove already mentioned comment with link
evanroyrees Jan 24, 2022
6f7d9c7
:memo: Update data preparation with points to examples in sample sheet
evanroyrees Jan 24, 2022
c166df9
:memo: Re-order examples for data preparation
evanroyrees Jan 24, 2022
350b075
:art::bug: fix resume code formatting in caution
evanroyrees Jan 24, 2022
31c1eec
:memo::art: Move attention section for data preparation to sample she…
evanroyrees Jan 24, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions autometa/common/external/bedtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,15 @@
logger = logging.getLogger(__name__)


def genomecov(ibam, out, force=False):
def genomecov(ibam: str, out: str, force: bool = False) -> str:
"""Run bedtools genomecov with input `ibam` and `lengths` to retrieve
metagenome coverages.

Parameters
----------
ibam : str
</path/to/indexed/BAM/file.ibam>. Note: BAM *must* be sorted by position.

out : str
</path/to/alignment.bed>
The bedtools genomecov output is a tab-delimited file with the following columns:
Expand All @@ -36,6 +37,7 @@ def genomecov(ibam, out, force=False):
4. Size of chromosome
5. Fraction of bases on that chromosome with that coverage
See also: http://bedtools.readthedocs.org/en/latest/content/tools/genomecov.html

force : bool
force overwrite of `out` if it already exists (default is False).

Expand Down Expand Up @@ -63,15 +65,17 @@ def genomecov(ibam, out, force=False):
return out


def parse(bed, out=None, force=False):
def parse(bed: str, out: str = None, force: bool = False) -> pd.DataFrame:
"""Calculate coverages from bed file.

Parameters
----------
bed : str
</path/to/file.bed>

out : str
if provided will write to `out`. I.e. </path/to/coverage.tsv>

force : bool
force overwrite of `out` if it already exists (default is False).

Expand Down
186 changes: 126 additions & 60 deletions bin/check_samplesheet.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,12 @@
#!/usr/bin/env python

# TODO nf-core: Update the script to check the samplesheet
# Check the samplesheet for valid inputs
# Checks for
# 1. Whether minimum required files exist
# 2. Whether provided sample IDs are unique
# 3. Whether appropriate parameter combinations were provided
# - if coverage is to be determined from read alignments, reads must also be provided
# - if a path to a coverage table is already provided in the samplesheet
#
# This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

import os
Expand All @@ -19,7 +25,7 @@ def parse_args(args=None):
return parser.parse_args(args)


def make_dir(path):
def make_dir(path: str) -> None:
if len(path) > 0:
try:
os.makedirs(path)
Expand All @@ -28,67 +34,100 @@ def make_dir(path):
raise exception


def print_error(error, context="Line", context_str=""):
error_str = "ERROR: Please check samplesheet -> {}".format(error)
def print_error(error: str, context: str = "Line", context_str: str = "") -> None:
error_str = f"ERROR: Please check samplesheet -> {error}"
if context != "" and context_str != "":
error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
error, context.strip(), context_str.strip()
)
error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
print(error_str)
sys.exit(1)


# TODO nf-core: Update the check_samplesheet function
def check_samplesheet(file_in, file_out):
"""
This function checks that the samplesheet follows the following structure:

sample,fastq_1,fastq_2
SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
NOTE: 0 and 1 are converted to boolean values (false and true, respectively)
*and* although for sample 2 reads were provided, the input is specifying to
retrieve the coverage information from the assembly's contigs' headers.

sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers
SAMPLE_1,ASSEMBLY_1,fwd_reads.fastq.gz,rev_reads.fastq.gz,,0
SAMPLE_2,ASSEMBLY_2,fwd_reads.fastq.gz,rev_reads.fastq.gz,,1
SAMPLE_3,ASSEMBLY_3,,,,1
SAMPLE_4,ASSEMBLY_4,,,/path/to/coverage.tsv,0

For an example see:
https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
"""

sample_mapping_dict = {}
with open(file_in, "r") as fin:

samples = {}
min_col_names = ["sample", "assembly", "cov_from_contig_headers"]
min_num_cols = len(min_col_names)
req_header_cols = [
"sample",
"assembly",
"fastq_1",
"fastq_2",
"coverage_tab",
"cov_from_contig_headers",
]
num_required_cols = len(req_header_cols)
with open(file_in, "r") as fh:
## Check header
MIN_COLS = 2
# TODO nf-core: Update the column names for the input samplesheet
HEADER = ["sample", "fastq_1", "fastq_2"]
header = [x.strip('"') for x in fin.readline().strip().split(",")]
if header[: len(HEADER)] != HEADER:
print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
header = fh.readline().strip()
header_cols = [header_col.strip('"') for header_col in header.split(",")]
if header_cols[:num_required_cols] != req_header_cols:
print(
f"ERROR: Please check samplesheet header -> {','.join(header_cols)} != {','.join(req_header_cols)}"
)
sys.exit(1)

## Check sample entries
for line in fin:
lspl = [x.strip().strip('"') for x in line.strip().split(",")]
for line in fh:
sample_cols = [
sample_col.strip().strip('"') for sample_col in line.strip().split(",")
]

# Check valid number of columns per row
if len(lspl) < len(HEADER):
num_cols = len([sample_col for sample_col in sample_cols if sample_col])
if num_cols < min_num_cols:
print_error(
"Invalid number of columns (minimum = {})!".format(len(HEADER)),
f"Invalid number of populated columns (minimum = {min_num_cols})!",
"Line",
line,
)
num_cols = len([x for x in lspl if x])
if num_cols < MIN_COLS:

if len(sample_cols) != num_required_cols:
print_error(
"Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
f"Invalid number of columns (required = {num_required_cols})! Check that you have the correct number of commas and retry",
"Line",
line,
)

## Check sample name entries
sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
(
sample,
assembly,
fastq_1,
fastq_2,
coverage_tab,
cov_from_headers,
) = sample_cols

sample = sample.replace(" ", "_")
if not sample:
print_error("Sample entry has not been specified!", "Line", line)

if not cov_from_headers:
print_error(
"Must provide a value ('0' or '1') for cov_from_contig_headers column!",
"Line",
line,
)
if cov_from_headers not in {"0", "1"}:
print_error(
"cov_from_contig_headers column must be '0' or '1'!", "Line", line
)

## Check FastQ file extension
for fastq in [fastq_1, fastq_2]:
if fastq:
Expand All @@ -101,40 +140,67 @@ def check_samplesheet(file_in, file_out):
line,
)

# NOTE: An empty string will fail with the file(...) method for groovy.. So we pass in "0" here
# to be checked later with file(...).exists()
# Same goes for the fastq_* files
coverage_tab = coverage_tab if coverage_tab else "0"
## Auto-detect paired-end/single-end
sample_info = [] ## [single_end, fastq_1, fastq_2]
if sample and fastq_1 and fastq_2: ## Paired-end short reads
sample_info = ["0", fastq_1, fastq_2]
elif sample and fastq_1 and not fastq_2: ## Single-end short reads
sample_info = ["1", fastq_1, fastq_2]
## [ assembly, single_end, fastq_1, fastq_2, coverage_tab, cov_from_contig_headers ]
sample_info = []
if fastq_1 and fastq_2: ## Paired-end short reads
sample_info = [
assembly,
"0",
fastq_1,
fastq_2,
coverage_tab,
cov_from_headers,
]
elif fastq_1 and not fastq_2: ## Single-end short reads
sample_info = [
assembly,
"1",
fastq_1,
"0",
coverage_tab,
cov_from_headers,
]
else:
print_error("Invalid combination of columns provided!", "Line", line)
sample_info = [assembly, "0", "0", "0", coverage_tab, cov_from_headers]

## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
if sample not in sample_mapping_dict:
sample_mapping_dict[sample] = [sample_info]
else:
if sample_info in sample_mapping_dict[sample]:
print_error("Samplesheet contains duplicate rows!", "Line", line)
else:
sample_mapping_dict[sample].append(sample_info)
## Create sample mapping dictionary = { sample: [ assembly, single_end, fastq_1, fastq_2, cov_from_contig_headers ] }
if sample in samples:
print_error("Samplesheet contains duplicate samples!", "Line", line)
samples[sample] = sample_info

if not samples:
print_error("No entries to process!", f"Samplesheet: {file_in}")

## Write validated samplesheet with appropriate columns
if len(sample_mapping_dict) > 0:
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
with open(file_out, "w") as fout:
fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
for sample in sorted(sample_mapping_dict.keys()):

## Check that multiple runs of the same sample are of the same datatype
if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))

for idx, val in enumerate(sample_mapping_dict[sample]):
fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
else:
print_error("No entries to process!", "Samplesheet: {}".format(file_in))
sample_lines = ""
for sample, sample_info in samples.items():
sample_info_line = ",".join(sample_info)
sample_line = f"{sample},{sample_info_line}\n"
sample_lines += sample_line
out_dir = os.path.dirname(file_out)
make_dir(out_dir)
header = (
",".join(
[
"sample",
"assembly",
"single_end",
"fastq_1",
"fastq_2",
"coverage_tab",
"cov_from_contig_headers",
]
)
+ "\n"
)
with open(file_out, "w") as outfh:
outfh.write(header)
outfh.write(sample_lines)


def main(args=None):
Expand Down
4 changes: 2 additions & 2 deletions conf/base.config
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,12 @@ process {
// See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
withLabel:process_low {
cpus = { check_max( 1 * task.attempt, 'cpus' ) }
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
memory = { check_max( 2.GB * task.attempt, 'memory' ) }
time = { check_max( 4.h * task.attempt, 'time' ) }
}
withLabel:process_medium {
cpus = { check_max( 8 * task.attempt, 'cpus' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
memory = { check_max( 8.GB * task.attempt, 'memory' ) }
time = { check_max( 8.h * task.attempt, 'time' ) }
}
withLabel:process_high {
Expand Down
31 changes: 21 additions & 10 deletions conf/modules.config
Original file line number Diff line number Diff line change
Expand Up @@ -76,23 +76,35 @@ params {
}
'diamond_makedb_options' {
publish_by_meta = ['id']
args = ""
args = ""
}
'align_reads_options' {
args = ""
args2 = "-q --phred33 --very-sensitive --no-unal"
publish_by_meta = ['id']
publish_dir = "align_reads"
}
'samtools_viewsort_options' {
args = ""
args2 = ""
args = ""
args2 = ""
publish_by_meta = ['id']
publish_dir = "samtools_sort"
publish_dir = "samtools_sort"
}
'bedtools_genomecov_options' {
args = ""
args2 = ""
publish_by_meta = ['id']
publish_dir = "genome_coverage"
}
'seqkit_split_options' {
publish_by_meta = ['id']
args = "-p ${params.num_splits}"
args2 = "--two-pass"
args = ""
args2 = "--two-pass"
}
'spades_kmer_coverage' {
publish_by_meta = ['id']
publish_files = ['*.coverages.tsv':'']
publish_dir = "coverage"
publish_files = ['*.coverages.tsv':'']
publish_dir = "coverage"
}
'split_kingdoms_options' {
publish_by_meta = ['id']
Expand All @@ -102,9 +114,8 @@ params {
}
'unclustered_recruitment_options' {
publish_by_meta = ['id']
publish_dir = "binning_results/unclustered_recruitment_results"
publish_dir = "binning_results/unclustered_recruitment_results"

}
}
}

Loading