🍏🎨 add/update coverage handling (#223)
* 🍏🐍📝🎨 add/update nxf inputs/outputs
* 🐍 Add typehints to bedtools.py
* 📝🍏🔥 Remove instances of num_splits
* 📝🍏🎨 Change --input to accept samplesheets instead of metagenomes
* 🍏🎨 Update check_samplesheet.py for Autometa-nxf samplesheet validation/formatting
* 🍏🔥 Remove partially-implemented align_reads.nf subworkflow
* 🍏🎨 Add align_reads.nf process
* 🍏🎨 Add when directive to CONTIG_COVERAGE processes
* 🍏🎨 Add when directive to SPADES_KMER_COVERAGE process
* 🍏🎨 Change coverages emit to coverage
* 🍏🎨📝 Update configs to match updated processes
* 🍏 Add three channels to INPUT_CHECK (reads, coverage, metagenome)
* 🍏🎨 Add & mix three different coverage channels in AUTOMETA to coverage_ch
* 🍏🎨 fixes #186
* 🍏🎨 Add functionality to accept pre-computed coverage tables
* 🍏🎨 meta.id is now retrieved from the provided sample column in the input samplesheet
* 🔥 Remove bowtie2/build from modules.json
* 📝 Update param.input docstring for nextflow_schema.json
* 🐛🔥 Remove unused open '(' in bowtie2 version retrieval
* 🎨🐛 Fix bt2 db emit and samtools view input param (add -bS)
* 📝 Add sample sheet information
* 📝 Fix samplesheet table
* 🔥 Remove mentions of num_splits and other references in the advanced section to specifying multiple inputs
* 🐛🔥📝 Fix typos, update the running modules section and revise some of the advanced params
* 📝 Reformat resource allotment
* 🔥 Remove incorrect usage of --input argument
* 📝 Replace parameters.config with nf-params.json that will likely be used...
* 📝 Replace attention with caution and bold some text within caution
* 📝🎨 Change using nf-core tools header to using nf-core
* 📝 Add example sample_sheet.csv section header
* 🐛 Fix python-formatting in Python API examples
* 🎨📝 Reformat ambiguous information for note on coverage_tab input
* 🐛📝 Fix nf-core/nextflow conda install section header
* 🐛 Fix python API headers not being visualized
* 📝🐛 Fix section headers
* 🎨 Fix formatting for SLURM profile configuration section
* 🔥 Remove already mentioned comment with link
* 📝 Update data preparation with pointers to examples in the sample sheet
* 📝 Re-order examples for data preparation
* 🎨🐛 fix resume code formatting in caution
* 📝🎨 Move attention section for data preparation to sample sheet section
evanroyrees committed Jan 24, 2022
1 parent bdbffda commit 5931255
Showing 22 changed files with 645 additions and 459 deletions.
8 changes: 6 additions & 2 deletions autometa/common/external/bedtools.py
@@ -19,14 +19,15 @@
 logger = logging.getLogger(__name__)


-def genomecov(ibam, out, force=False):
+def genomecov(ibam: str, out: str, force: bool = False) -> str:
     """Run bedtools genomecov with input `ibam` to retrieve
     metagenome coverages.

     Parameters
     ----------
     ibam : str
         </path/to/indexed/BAM/file.ibam>. Note: BAM *must* be sorted by position.
     out : str
         </path/to/alignment.bed>
         The bedtools genomecov output is a tab-delimited file with the following columns:
@@ -36,6 +37,7 @@ def genomecov(ibam, out, force=False):
         4. Size of chromosome
         5. Fraction of bases on that chromosome with that coverage

         See also: http://bedtools.readthedocs.org/en/latest/content/tools/genomecov.html

     force : bool
         force overwrite of `out` if it already exists (default is False).
@@ -63,15 +65,17 @@ def genomecov(ibam, out, force=False):
     return out


-def parse(bed, out=None, force=False):
+def parse(bed: str, out: str = None, force: bool = False) -> pd.DataFrame:
     """Calculate coverages from bed file.

     Parameters
     ----------
     bed : str
         </path/to/file.bed>
     out : str
         if provided will write to `out`. I.e. </path/to/coverage.tsv>
     force : bool
         force overwrite of `out` if it already exists (default is False).
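Since `genomecov` returns the `out` path it wrote and `parse` consumes a BED path, the two updated functions compose directly. A minimal usage sketch (paths are hypothetical placeholders; assumes bedtools is installed and the BAM is position-sorted):

    from autometa.common.external import bedtools

    # genomecov runs bedtools on the position-sorted BAM and returns the
    # path of the BED file it wrote (its `out` argument).
    bed_filepath = bedtools.genomecov(ibam="alignment.sorted.bam", out="alignment.bed")

    # parse returns a pandas DataFrame of coverages and, because `out` is
    # provided, also writes the table to disk.
    coverage_df = bedtools.parse(bed=bed_filepath, out="coverage.tsv")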
186 changes: 126 additions & 60 deletions bin/check_samplesheet.py
@@ -1,6 +1,12 @@
 #!/usr/bin/env python

-# TODO nf-core: Update the script to check the samplesheet
+# Check the samplesheet for valid inputs
+# Checks for
+# 1. Whether minimum required files exist
+# 2. Whether provided sample IDs are unique
+# 3. Whether appropriate parameter combinations were provided
+#    - if coverage is to be determined from read alignments, reads must also be provided
+#    - if a path to a coverage table is already provided in the samplesheet
+#
 # This script is based on the example at: https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv

 import os
@@ -19,7 +25,7 @@ def parse_args(args=None):
     return parser.parse_args(args)


-def make_dir(path):
+def make_dir(path: str) -> None:
     if len(path) > 0:
         try:
             os.makedirs(path)
@@ -28,67 +34,100 @@ def make_dir(path):
             raise exception


-def print_error(error, context="Line", context_str=""):
-    error_str = "ERROR: Please check samplesheet -> {}".format(error)
+def print_error(error: str, context: str = "Line", context_str: str = "") -> None:
+    error_str = f"ERROR: Please check samplesheet -> {error}"
     if context != "" and context_str != "":
-        error_str = "ERROR: Please check samplesheet -> {}\n{}: '{}'".format(
-            error, context.strip(), context_str.strip()
-        )
+        error_str = f"ERROR: Please check samplesheet -> {error}\n{context.strip()}: '{context_str.strip()}'"
     print(error_str)
     sys.exit(1)


-# TODO nf-core: Update the check_samplesheet function
 def check_samplesheet(file_in, file_out):
     """
     This function checks that the samplesheet follows the following structure:
-    sample,fastq_1,fastq_2
-    SAMPLE_PE,SAMPLE_PE_RUN1_1.fastq.gz,SAMPLE_PE_RUN1_2.fastq.gz
-    SAMPLE_PE,SAMPLE_PE_RUN2_1.fastq.gz,SAMPLE_PE_RUN2_2.fastq.gz
-    SAMPLE_SE,SAMPLE_SE_RUN1_1.fastq.gz,
+
+    NOTE: 0 and 1 are converted to boolean values (false and true, respectively).
+    Although reads are provided for SAMPLE_2, the input specifies that coverage
+    information should be retrieved from the assembly's contig headers.
+
+    sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers
+    SAMPLE_1,ASSEMBLY_1,fwd_reads.fastq.gz,rev_reads.fastq.gz,,0
+    SAMPLE_2,ASSEMBLY_2,fwd_reads.fastq.gz,rev_reads.fastq.gz,,1
+    SAMPLE_3,ASSEMBLY_3,,,,1
+    SAMPLE_4,ASSEMBLY_4,,,/path/to/coverage.tsv,0

     For an example see:
     https://raw.githubusercontent.com/nf-core/test-datasets/viralrecon/samplesheet/samplesheet_test_illumina_amplicon.csv
     """

-    sample_mapping_dict = {}
-    with open(file_in, "r") as fin:
+    samples = {}
+    min_col_names = ["sample", "assembly", "cov_from_contig_headers"]
+    min_num_cols = len(min_col_names)
+    req_header_cols = [
+        "sample",
+        "assembly",
+        "fastq_1",
+        "fastq_2",
+        "coverage_tab",
+        "cov_from_contig_headers",
+    ]
+    num_required_cols = len(req_header_cols)
+    with open(file_in, "r") as fh:
         ## Check header
-        MIN_COLS = 2
-        # TODO nf-core: Update the column names for the input samplesheet
-        HEADER = ["sample", "fastq_1", "fastq_2"]
-        header = [x.strip('"') for x in fin.readline().strip().split(",")]
-        if header[: len(HEADER)] != HEADER:
-            print("ERROR: Please check samplesheet header -> {} != {}".format(",".join(header), ",".join(HEADER)))
+        header = fh.readline().strip()
+        header_cols = [header_col.strip('"') for header_col in header.split(",")]
+        if header_cols[:num_required_cols] != req_header_cols:
+            print(
+                f"ERROR: Please check samplesheet header -> {','.join(header_cols)} != {','.join(req_header_cols)}"
+            )
             sys.exit(1)

         ## Check sample entries
-        for line in fin:
-            lspl = [x.strip().strip('"') for x in line.strip().split(",")]
+        for line in fh:
+            sample_cols = [
+                sample_col.strip().strip('"') for sample_col in line.strip().split(",")
+            ]

             # Check valid number of columns per row
-            if len(lspl) < len(HEADER):
+            num_cols = len([sample_col for sample_col in sample_cols if sample_col])
+            if num_cols < min_num_cols:
                 print_error(
-                    "Invalid number of columns (minimum = {})!".format(len(HEADER)),
+                    f"Invalid number of populated columns (minimum = {min_num_cols})!",
                     "Line",
                     line,
                 )
-            num_cols = len([x for x in lspl if x])
-            if num_cols < MIN_COLS:
+
+            if len(sample_cols) != num_required_cols:
                 print_error(
-                    "Invalid number of populated columns (minimum = {})!".format(MIN_COLS),
+                    f"Invalid number of columns (required = {num_required_cols})! Check that you have the correct number of commas and retry",
                     "Line",
                     line,
                 )

             ## Check sample name entries
-            sample, fastq_1, fastq_2 = lspl[: len(HEADER)]
+            (
+                sample,
+                assembly,
+                fastq_1,
+                fastq_2,
+                coverage_tab,
+                cov_from_headers,
+            ) = sample_cols

             sample = sample.replace(" ", "_")
             if not sample:
                 print_error("Sample entry has not been specified!", "Line", line)

+            if not cov_from_headers:
+                print_error(
+                    "Must provide a value ('0' or '1') for cov_from_contig_headers column!",
+                    "Line",
+                    line,
+                )
+            if cov_from_headers not in {"0", "1"}:
+                print_error(
+                    "cov_from_contig_headers column must be '0' or '1'!", "Line", line
+                )

             ## Check FastQ file extension
             for fastq in [fastq_1, fastq_2]:
                 if fastq:
@@ -101,40 +140,67 @@ def check_samplesheet(file_in, file_out):
                         line,
                     )

+            # NOTE: An empty string will fail with the file(...) method in Groovy,
+            # so we pass in "0" here to be checked later with file(...).exists().
+            # The same goes for the fastq_* files.
+            coverage_tab = coverage_tab if coverage_tab else "0"
+
             ## Auto-detect paired-end/single-end
-            sample_info = []  ## [single_end, fastq_1, fastq_2]
-            if sample and fastq_1 and fastq_2:  ## Paired-end short reads
-                sample_info = ["0", fastq_1, fastq_2]
-            elif sample and fastq_1 and not fastq_2:  ## Single-end short reads
-                sample_info = ["1", fastq_1, fastq_2]
+            ## [ assembly, single_end, fastq_1, fastq_2, coverage_tab, cov_from_contig_headers ]
+            sample_info = []
+            if fastq_1 and fastq_2:  ## Paired-end short reads
+                sample_info = [
+                    assembly,
+                    "0",
+                    fastq_1,
+                    fastq_2,
+                    coverage_tab,
+                    cov_from_headers,
+                ]
+            elif fastq_1 and not fastq_2:  ## Single-end short reads
+                sample_info = [
+                    assembly,
+                    "1",
+                    fastq_1,
+                    "0",
+                    coverage_tab,
+                    cov_from_headers,
+                ]
             else:
-                print_error("Invalid combination of columns provided!", "Line", line)
+                sample_info = [assembly, "0", "0", "0", coverage_tab, cov_from_headers]

-            ## Create sample mapping dictionary = { sample: [ single_end, fastq_1, fastq_2 ] }
-            if sample not in sample_mapping_dict:
-                sample_mapping_dict[sample] = [sample_info]
-            else:
-                if sample_info in sample_mapping_dict[sample]:
-                    print_error("Samplesheet contains duplicate rows!", "Line", line)
-                else:
-                    sample_mapping_dict[sample].append(sample_info)
+            ## Create sample mapping dictionary = { sample: [ assembly, single_end, fastq_1, fastq_2, cov_from_contig_headers ] }
+            if sample in samples:
+                print_error("Samplesheet contains duplicate samples!", "Line", line)
+            samples[sample] = sample_info
+
+    if not samples:
+        print_error("No entries to process!", f"Samplesheet: {file_in}")

     ## Write validated samplesheet with appropriate columns
-    if len(sample_mapping_dict) > 0:
-        out_dir = os.path.dirname(file_out)
-        make_dir(out_dir)
-        with open(file_out, "w") as fout:
-            fout.write(",".join(["sample", "single_end", "fastq_1", "fastq_2"]) + "\n")
-            for sample in sorted(sample_mapping_dict.keys()):
-
-                ## Check that multiple runs of the same sample are of the same datatype
-                if not all(x[0] == sample_mapping_dict[sample][0][0] for x in sample_mapping_dict[sample]):
-                    print_error("Multiple runs of a sample must be of the same datatype!", "Sample: {}".format(sample))
-
-                for idx, val in enumerate(sample_mapping_dict[sample]):
-                    fout.write(",".join(["{}_T{}".format(sample, idx + 1)] + val) + "\n")
-    else:
-        print_error("No entries to process!", "Samplesheet: {}".format(file_in))
+    sample_lines = ""
+    for sample, sample_info in samples.items():
+        sample_info_line = ",".join(sample_info)
+        sample_line = f"{sample},{sample_info_line}\n"
+        sample_lines += sample_line
+    out_dir = os.path.dirname(file_out)
+    make_dir(out_dir)
+    header = (
+        ",".join(
+            [
+                "sample",
+                "assembly",
+                "single_end",
+                "fastq_1",
+                "fastq_2",
+                "coverage_tab",
+                "cov_from_contig_headers",
+            ]
+        )
+        + "\n"
+    )
+    with open(file_out, "w") as outfh:
+        outfh.write(header)
+        outfh.write(sample_lines)


def main(args=None):
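The rewritten function validates samplesheet structure only; whether the referenced files exist is checked later in Nextflow via file(...).exists(), which is why empty fields are replaced with "0". A quick way to exercise it with two of the docstring's example rows (a sketch; file names are placeholders and it assumes bin/check_samplesheet.py is importable):

    from check_samplesheet import check_samplesheet

    # Two rows from the updated docstring: SAMPLE_1 derives coverage from read
    # alignments, SAMPLE_4 supplies a pre-computed coverage table.
    rows = [
        "sample,assembly,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers",
        "SAMPLE_1,ASSEMBLY_1,fwd_reads.fastq.gz,rev_reads.fastq.gz,,0",
        "SAMPLE_4,ASSEMBLY_4,,,/path/to/coverage.tsv,0",
    ]
    with open("sample_sheet.csv", "w") as fh:
        fh.write("\n".join(rows) + "\n")

    # Writes samplesheet.valid.csv with the reformatted header:
    # sample,assembly,single_end,fastq_1,fastq_2,coverage_tab,cov_from_contig_headers
    check_samplesheet(file_in="sample_sheet.csv", file_out="samplesheet.valid.csv")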
4 changes: 2 additions & 2 deletions conf/base.config
@@ -27,12 +27,12 @@ process {
     // See https://www.nextflow.io/docs/latest/config.html#config-process-selectors
     withLabel:process_low {
         cpus = { check_max( 1 * task.attempt, 'cpus' ) }
-        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 2.GB * task.attempt, 'memory' ) }
         time = { check_max( 4.h * task.attempt, 'time' ) }
     }
     withLabel:process_medium {
         cpus = { check_max( 8 * task.attempt, 'cpus' ) }
-        memory = { check_max( 8.GB * task.attempt, 'memory' ) }
+        memory = { check_max( 8.GB * task.attempt, 'memory' ) }
         time = { check_max( 8.h * task.attempt, 'time' ) }
     }
     withLabel:process_high {
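These memory directives pass through check_max(), so per-label requests grow with task.attempt on retries but are clamped to the pipeline's maximums. A hypothetical user override (not part of this commit) supplied with -c custom.config, assuming the usual nf-core --max_cpus/--max_memory limits:

    // custom.config: raise resources for medium-weight processes.
    // check_max() still clamps these values to params.max_cpus / params.max_memory.
    process {
        withLabel:process_medium {
            cpus   = { check_max( 16 * task.attempt, 'cpus' ) }
            memory = { check_max( 32.GB * task.attempt, 'memory' ) }
        }
    }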
31 changes: 21 additions & 10 deletions conf/modules.config
@@ -76,23 +76,35 @@ params {
         }
         'diamond_makedb_options' {
             publish_by_meta = ['id']
-            args = ""
+            args = ""
         }
+        'align_reads_options' {
+            args = ""
+            args2 = "-q --phred33 --very-sensitive --no-unal"
+            publish_by_meta = ['id']
+            publish_dir = "align_reads"
+        }
         'samtools_viewsort_options' {
-            args = ""
-            args2 = ""
+            args = ""
+            args2 = ""
             publish_by_meta = ['id']
-            publish_dir = "samtools_sort"
+            publish_dir = "samtools_sort"
         }
+        'bedtools_genomecov_options' {
+            args = ""
+            args2 = ""
+            publish_by_meta = ['id']
+            publish_dir = "genome_coverage"
+        }
         'seqkit_split_options' {
             publish_by_meta = ['id']
-            args = "-p ${params.num_splits}"
-            args2 = "--two-pass"
+            args = ""
+            args2 = "--two-pass"
         }
         'spades_kmer_coverage' {
             publish_by_meta = ['id']
-            publish_files = ['*.coverages.tsv':'']
-            publish_dir = "coverage"
+            publish_files = ['*.coverages.tsv':'']
+            publish_dir = "coverage"
         }
         'split_kingdoms_options' {
             publish_by_meta = ['id']
@@ -102,9 +114,8 @@ params {
         }
         'unclustered_recruitment_options' {
             publish_by_meta = ['id']
-            publish_dir = "binning_results/unclustered_recruitment_results"
-
+            publish_dir = "binning_results/unclustered_recruitment_results"
         }
     }
 }
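The new 'align_reads_options' block wires bowtie2 flags (args2) into the align_reads process this commit adds. A minimal sketch of how such a process could consume them; this is hypothetical, not the commit's actual align_reads.nf, and the channel shapes and file names are assumptions, with the args2 flags inlined and the SAM converted to BAM via samtools view -bS as the commit message notes:

    process ALIGN_READS {
        tag "${meta.id}"

        input:
        tuple val(meta), path(reads)
        path db  // bowtie2 index basename (assumed channel)

        output:
        tuple val(meta), path("${meta.id}.bam"), emit: bam

        script:
        // meta.single_end comes from the validated samplesheet's single_end column
        def input_reads = meta.single_end ? "-U ${reads}" : "-1 ${reads[0]} -2 ${reads[1]}"
        """
        bowtie2 -q --phred33 --very-sensitive --no-unal -x ${db} ${input_reads} -S ${meta.id}.sam
        samtools view -bS ${meta.id}.sam > ${meta.id}.bam
        """
    }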
