Samtools coverage submodule (#2356)

* Allow override title per-dataset * Add samtools coverage submodule * [automated] Update CHANGELOG.md * Fix changelog * Update multiqc/modules/samtools/rmdup.py Co-authored-by: Phil Ewels <phil.ewels@seqera.io> * Fix rmdup regex --------- Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io> Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
MultiQC · Feb 23, 2024 · 4630828 · 4630828
1 parent 7ddfa63
commit 4630828
Show file tree

Hide file tree

Showing 9 changed files with 285 additions and 38 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -32,9 +32,10 @@
 - **HTSeq Count**: robust file reading loop, ignore `.parquet` files ([#2364](https://github.com/MultiQC/MultiQC/pull/2364))
 - **Illumina InterOp Statistics**: do not set `'scale': False` as a default ([#2350](https://github.com/MultiQC/MultiQC/pull/2350))
 - **mosdepth**: fix regression in showing general stats ([#2346](https://github.com/MultiQC/MultiQC/pull/2346))
-- **PURPLE**: support v4.0.1 output without `version` column ([#2366](https://github.com/MultiQC/MultiQC/pull/2366))
+- **Samtools**: support `coverage` command ([#2356](https://github.com/MultiQC/MultiQC/pull/2356))
 - **UMI-tools**: support `extract` command ([#2296](https://github.com/MultiQC/MultiQC/pull/2296))
 - **Whatshap**: make robust to stdout appended to TSV ([#2361](https://github.com/MultiQC/MultiQC/pull/2361))
+- **PURPLE**: support v4.0.1 output without `version` column ([#2366](https://github.com/MultiQC/MultiQC/pull/2366))
 
 ## [MultiQC v1.20](https://github.com/ewels/MultiQC/releases/tag/v1.20) - 2024-02-12
 

diff --git a/multiqc/modules/samtools/coverage.py b/multiqc/modules/samtools/coverage.py
@@ -0,0 +1,245 @@
+import copy
+
+from typing import Dict, Union
+
+import logging
+
+from multiqc.plots import linegraph, table
+
+log = logging.getLogger(__name__)
+
+
+class CoverageReportMixin:
+    def parse_samtools_coverage(self):
+        """Find Samtools coverage logs and parse their data"""
+
+        data_by_sample = dict()
+        for f in self.find_log_files("samtools/coverage"):
+            metrics_by_chrom = parse_single_report(f)
+            if len(metrics_by_chrom) > 0:
+                if f["s_name"] in data_by_sample:
+                    log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
+                self.add_data_source(f, section="coverage")
+                data_by_sample[f["s_name"]] = metrics_by_chrom
+
+        # Filter to strip out ignored sample names
+        data_by_sample = self.ignore_samples(data_by_sample)
+        if len(data_by_sample) == 0:
+            return 0
+
+        # Superfluous function call to confirm that it is used in this module
+        # Replace None with actual version if it is available
+        self.add_software_version(None)
+
+        # Write parsed report data to a file (restructure first)
+        self.write_data_file(data_by_sample, "multiqc_samtools_coverage")
+
+        # Make a table/violin summarising the stats across all regions (mean or sum)
+        self.summary_table(data_by_sample)
+
+        # Make a line plot showing coverage stats per region, with a tab switch between stats
+        self.lineplot_per_region(data_by_sample)
+
+        # Return the number of logs that were found
+        return len(data_by_sample)
+
+    def summary_table(self, data_by_sample):
+        table_data = {sname: {} for sname in data_by_sample}
+        for sample, d_by_chrom in data_by_sample.items():
+            vals = list(d_by_chrom.values())
+            table_data[sample]["numreads"] = sum([m["numreads"] for m in vals])
+            table_data[sample]["covbases"] = sum([m["covbases"] for m in vals])
+            for m in vals:
+                m["size"] = m["endpos"] - m["startpos"] + 1
+            total_size = sum([m["size"] for m in vals])
+            # Average weighted by size. Multiplying by individual weight and dividing by total weight
+            table_data[sample]["coverage"] = sum([m["coverage"] * m["size"] for m in vals]) / total_size
+            table_data[sample]["meandepth"] = sum([m["meandepth"] * m["size"] for m in vals]) / total_size
+            table_data[sample]["meanbaseq"] = sum([m["meanbaseq"] * m["size"] for m in vals]) / total_size
+            table_data[sample]["meanmapq"] = sum([m["meanmapq"] * m["size"] for m in vals]) / total_size
+
+        headers = {
+            "numreads": {
+                "title": "Reads",
+                "description": "Total number of mapped reads",
+                "shared_key": "read_count",
+                "scale": "RdYlGn",
+            },
+            "covbases": {
+                "title": "Bases",
+                "description": "Total number of mapped base pairs",
+                "shared_key": "base_count",
+                "scale": "Blues",
+            },
+            "coverage": {
+                "title": "Coverage",
+                "description": "Percentage of region covered with reads",
+                "min": 0,
+                "max": 100,
+                "suffix": "%",
+                "scale": "YlGn",
+            },
+            "meandepth": {
+                "title": "Mean depth",
+                "description": "Mean depth of coverage",
+                "min": 0,
+                "suffix": "x",
+                "scale": "RdYlGn",
+            },
+            "meanbaseq": {
+                "title": "Mean BQ",
+                "description": "Mean base quality",
+                "min": 0,
+                "scale": "Blues",
+            },
+            "meanmapq": {
+                "title": "Mean MQ",
+                "description": "Mean mapping quality",
+                "min": 0,
+                "max": 60,
+                "scale": "RdYlGn",
+            },
+        }
+
+        self.add_section(
+            name="Coverage: global stats",
+            anchor="samtools-coverage-table-section",
+            description=(
+                "Stats parsed from <code>samtools coverage</code> output, and summarized "
+                "(added up or weighted-averaged) across all regions."
+            ),
+            plot=table.plot(
+                table_data,
+                copy.deepcopy(headers),
+                pconfig={
+                    "id": "samtools-coverage-table",
+                    "title": "Samtools Coverage: Summary",
+                },
+            ),
+        )
+
+        for h in headers:
+            headers[h]["hidden"] = True
+        headers["meandepth"]["hidden"] = False
+        self.general_stats_addcols(table_data, headers, namespace="coverage")
+
+    def lineplot_per_region(self, data_by_sample):
+        tabs = {
+            "numreads": {
+                "title": "Mapped reads per region",
+                "label": "Reads",
+                "ylab": "# reads",
+                "tt_suffix": " reads",
+                "tt_decimals": 0,
+            },
+            "covbases": {
+                "title": "Mapped base pairs per region",
+                "label": "Bases",
+                "ylab": "# bases",
+                "tt_suffix": " bp",
+                "tt_decimals": 0,
+            },
+            "coverage": {
+                "title": "Percentage of region covered with reads",
+                "label": "Coverage",
+                "ylab": "Coverage %",
+                "tt_suffix": "%",
+            },
+            "meandepth": {
+                "title": "Mean depth per region",
+                "label": "Mean depth",
+                "ylab": "Depth",
+                "tt_suffix": "x",
+            },
+            "meanbaseq": {
+                "title": "Mean base quality per region",
+                "label": "BQ",
+                "ylab": "Base quality",
+                "tt_suffix": "",
+            },
+            "meanmapq": {
+                "title": "Mean mapping quality per region",
+                "label": "MQ",
+                "ylab": "Mapping quality",
+                "tt_suffix": "",
+            },
+        }
+        datasets = [
+            {
+                sample: {chrom: metrics[metric] for chrom, metrics in data_by_sample[sample].items()}
+                for sample in data_by_sample
+            }
+            for metric in tabs
+        ]
+        data_labels = list(tabs.values())
+        for dconfig in data_labels:
+            dconfig["title"] = "Samtools coverage: " + dconfig["title"]
+
+        self.add_section(
+            name="Coverage: stats per region",
+            description="Per-region stats parsed from <code>samtools coverage</code> output.",
+            anchor="samtools-coverage-section",
+            plot=linegraph.plot(
+                datasets,
+                {
+                    "id": "samtools-coverage",
+                    "title": data_labels[0]["title"],
+                    "xlab": "Region",
+                    "ylab": data_labels[0]["ylab"],
+                    "ymin": 0,
+                    "categories": True,
+                    "smooth_points": 500,
+                    "logswitch": True,
+                    "hide_zero_cats": False,
+                    "data_labels": data_labels,
+                },
+            ),
+        )
+
+
+EXPECTED_COLUMNS = [
+    "rname",
+    "startpos",
+    "endpos",
+    "numreads",
+    "covbases",
+    "coverage",
+    "meandepth",
+    "meanbaseq",
+    "meanmapq",
+]
+
+
+def parse_single_report(f) -> Dict[str, Dict[str, Union[int, float]]]:
+    """
+    Example:
+    #rname	startpos	endpos	numreads	covbases	coverage	meandepth	meanbaseq	meanmapq
+    oligo_1512_adapters	1	156	10	156	100	7.26282	18.4	29.6
+    oligo_741_adapters	1	156	0	0	0	0	0	0
+
+    Returns a dictionary with the contig name (rname) as the key and the rest of the fields as a dictionary
+    """
+    parsed_data = {}
+    lines = f["f"].splitlines()
+    expected_header = "#" + "\t".join(EXPECTED_COLUMNS)
+    if lines[0] != expected_header:
+        logging.warning(f"Expected header for samtools coverage: {expected_header}, got: {lines[0]}")
+        return {}
+
+    for line in lines[1:]:
+        fields = line.strip().split("\t")
+        if len(fields) != len(EXPECTED_COLUMNS):
+            logging.warning(f"Skipping line with {len(fields)} fields, expected {len(EXPECTED_COLUMNS)}: {line}")
+        rname, startpos, endpos, numreads, covbases, coverage, meandepth, meanbaseq, meanmapq = fields
+        parsed_data[rname] = dict(
+            startpos=int(startpos),
+            endpos=int(endpos),
+            numreads=int(numreads),
+            covbases=int(covbases),
+            coverage=float(coverage),
+            meandepth=float(meandepth),
+            meanbaseq=float(meanbaseq),
+            meanmapq=float(meanmapq),
+        )
+
+    return parsed_data
diff --git a/multiqc/modules/samtools/flagstat.py b/multiqc/modules/samtools/flagstat.py
@@ -40,31 +40,27 @@ def parse_samtools_flagstats(self):
         # General Stats Table
         flagstats_headers = {
             "flagstat_total": {
-                "title": f"{config.read_count_prefix} Reads",
+                "title": "Reads",
                 "description": f"Total reads in the bam file ({config.read_count_desc})",
-                "min": 0,
-                "modify": lambda x: x * config.read_count_multiplier,
                 "shared_key": "read_count",
                 "hidden": True,
             },
             "mapped_passed": {
-                "title": f"{config.read_count_prefix} Reads Mapped",
-                "description": f"Reads Mapped in the bam file ({config.read_count_desc})",
-                "min": 0,
-                "modify": lambda x: x * config.read_count_multiplier,
+                "title": "Reads mapped",
+                "description": f"Reads mapped in the bam file ({config.read_count_desc})",
                 "shared_key": "read_count",
             },
             "mapped_passed_pct": {
-                "title": "% Reads Mapped",
-                "description": "% Reads Mapped in the bam file",
+                "title": "% Reads mapped",
+                "description": "% Reads mapped in the bam file",
                 "min": 0,
                 "max": 100,
                 "suffix": "%",
                 "scale": "RdYlGn",
                 "hidden": True,
             },
         }
-        self.general_stats_addcols(self.samtools_flagstat, flagstats_headers)
+        self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, namespace="flagstat")
 
         # Make dot plot of counts
         keys = {}
@@ -100,15 +96,15 @@ def parse_samtools_flagstats(self):
         )
 
         self.add_section(
-            name="Samtools Flagstat",
+            name="Flagstat",
             anchor="samtools-flagstat",
             description="This module parses the output from <code>samtools flagstat</code>. All numbers in millions.",
             plot=beeswarm.plot(
                 self.samtools_flagstat,
                 keys,
                 {
                     "id": "samtools-flagstat-dp",
-                    "title": "Samtools Flagstat: Read Counts",
+                    "title": "Samtools flagstat: Read Counts",
                 },
             ),
         )

diff --git a/multiqc/modules/samtools/idxstats.py b/multiqc/modules/samtools/idxstats.py
@@ -139,7 +139,7 @@ def parse_samtools_idxstats(self):
             "id": "samtools-idxstats-mapped-reads-plot",
             "title": "Samtools idxstats: Mapped reads per contig",
             "ylab": "# mapped reads",
-            "xlab": "Chromosome Name",
+            "xlab": "Chromosome name",
             "logswitch": True,
             "categories": True,
             "tt_label": "<strong>{point.category}:</strong> {point.y:.2f}",

diff --git a/multiqc/modules/samtools/rmdup.py b/multiqc/modules/samtools/rmdup.py
@@ -17,7 +17,7 @@ def parse_samtools_rmdup(self):
         for f in self.find_log_files("samtools/rmdup", filehandles=True):
             # Example below:
             # [bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library '   '
-            dups_regex = "\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'"
+            dups_regex = r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'"
             s_name = f["s_name"]
             for line in f["f"]:
                 match = re.search(dups_regex, line)
@@ -68,14 +68,14 @@ def parse_samtools_rmdup(self):
         # General Stats Table
         stats_headers = {
             "pct_dups": {
-                "title": "% Dups",
+                "title": "Duplicates",
                 "description": "Percent of duplicate alignments",
                 "min": 0,
                 "max": 100,
                 "suffix": "%",
                 "scale": "OrRd",
             }
         }
-        self.general_stats_addcols(self.samtools_rmdup, stats_headers)
+        self.general_stats_addcols(self.samtools_rmdup, stats_headers, namespace="rmdup")
 
         return len(self.samtools_rmdup)
diff --git a/multiqc/modules/samtools/samtools.py b/multiqc/modules/samtools/samtools.py
@@ -8,6 +8,7 @@
 from .flagstat import FlagstatReportMixin
 from .idxstats import IdxstatsReportMixin
 from .rmdup import RmdupReportMixin
+from .coverage import CoverageReportMixin
 
 # Import the Samtools submodules
 from .stats import StatsReportMixin
@@ -16,7 +17,9 @@
 log = logging.getLogger(__name__)
 
 
-class MultiqcModule(BaseMultiqcModule, StatsReportMixin, FlagstatReportMixin, IdxstatsReportMixin, RmdupReportMixin):
+class MultiqcModule(
+    BaseMultiqcModule, StatsReportMixin, FlagstatReportMixin, IdxstatsReportMixin, RmdupReportMixin, CoverageReportMixin
+):
     """Samtools has a number of different commands and outputs.
     This MultiQC module supports some but not all. The code for
     each script is split into its own file and adds a section to
@@ -55,6 +58,10 @@ def __init__(self):
         if n["rmdup"] > 0:
             log.info(f"Found {n['rmdup']} rmdup reports")
 
+        n["coverage"] = self.parse_samtools_coverage()
+        if n["coverage"] > 0:
+            log.info(f"Found {n['coverage']} coverage reports")
+
         # Exit if we didn't find anything
         if sum(n.values()) == 0:
             raise ModuleNoSamplesFound