Skip to content

Commit

Permalink
Samtools coverage submodule (#2356)
Browse files Browse the repository at this point in the history
* Allow override title per-dataset

* Add samtools coverage submodule

* [automated] Update CHANGELOG.md

* Fix changelog

* Update multiqc/modules/samtools/rmdup.py

Co-authored-by: Phil Ewels <phil.ewels@seqera.io>

* Fix rmdup regex

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
  • Loading branch information
3 people committed Feb 23, 2024
1 parent 7ddfa63 commit 4630828
Show file tree
Hide file tree
Showing 9 changed files with 285 additions and 38 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@
- **HTSeq Count**: robust file reading loop, ignore `.parquet` files ([#2364](https://github.com/MultiQC/MultiQC/pull/2364))
- **Illumina InterOp Statistics**: do not set `'scale': False` as a default ([#2350](https://github.com/MultiQC/MultiQC/pull/2350))
- **mosdepth**: fix regression in showing general stats ([#2346](https://github.com/MultiQC/MultiQC/pull/2346))
- **PURPLE**: support v4.0.1 output without `version` column ([#2366](https://github.com/MultiQC/MultiQC/pull/2366))
- **Samtools**: support `coverage` command ([#2356](https://github.com/MultiQC/MultiQC/pull/2356))
- **UMI-tools**: support `extract` command ([#2296](https://github.com/MultiQC/MultiQC/pull/2296))
- **Whatshap**: make robust to stdout appended to TSV ([#2361](https://github.com/MultiQC/MultiQC/pull/2361))
- **PURPLE**: support v4.0.1 output without `version` column ([#2366](https://github.com/MultiQC/MultiQC/pull/2366))

## [MultiQC v1.20](https://github.com/ewels/MultiQC/releases/tag/v1.20) - 2024-02-12

Expand Down
245 changes: 245 additions & 0 deletions multiqc/modules/samtools/coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
import copy

from typing import Dict, Union

import logging

from multiqc.plots import linegraph, table

log = logging.getLogger(__name__)


class CoverageReportMixin:
def parse_samtools_coverage(self):
"""Find Samtools coverage logs and parse their data"""

data_by_sample = dict()
for f in self.find_log_files("samtools/coverage"):
metrics_by_chrom = parse_single_report(f)
if len(metrics_by_chrom) > 0:
if f["s_name"] in data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
self.add_data_source(f, section="coverage")
data_by_sample[f["s_name"]] = metrics_by_chrom

# Filter to strip out ignored sample names
data_by_sample = self.ignore_samples(data_by_sample)
if len(data_by_sample) == 0:
return 0

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None)

# Write parsed report data to a file (restructure first)
self.write_data_file(data_by_sample, "multiqc_samtools_coverage")

# Make a table/violin summarising the stats across all regions (mean or sum)
self.summary_table(data_by_sample)

# Make a line plot showing coverage stats per region, with a tab switch between stats
self.lineplot_per_region(data_by_sample)

# Return the number of logs that were found
return len(data_by_sample)

def summary_table(self, data_by_sample):
table_data = {sname: {} for sname in data_by_sample}
for sample, d_by_chrom in data_by_sample.items():
vals = list(d_by_chrom.values())
table_data[sample]["numreads"] = sum([m["numreads"] for m in vals])
table_data[sample]["covbases"] = sum([m["covbases"] for m in vals])
for m in vals:
m["size"] = m["endpos"] - m["startpos"] + 1
total_size = sum([m["size"] for m in vals])
# Average weighted by size. Multiplying by individual weight and dividing by total weight
table_data[sample]["coverage"] = sum([m["coverage"] * m["size"] for m in vals]) / total_size
table_data[sample]["meandepth"] = sum([m["meandepth"] * m["size"] for m in vals]) / total_size
table_data[sample]["meanbaseq"] = sum([m["meanbaseq"] * m["size"] for m in vals]) / total_size
table_data[sample]["meanmapq"] = sum([m["meanmapq"] * m["size"] for m in vals]) / total_size

headers = {
"numreads": {
"title": "Reads",
"description": "Total number of mapped reads",
"shared_key": "read_count",
"scale": "RdYlGn",
},
"covbases": {
"title": "Bases",
"description": "Total number of mapped base pairs",
"shared_key": "base_count",
"scale": "Blues",
},
"coverage": {
"title": "Coverage",
"description": "Percentage of region covered with reads",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "YlGn",
},
"meandepth": {
"title": "Mean depth",
"description": "Mean depth of coverage",
"min": 0,
"suffix": "x",
"scale": "RdYlGn",
},
"meanbaseq": {
"title": "Mean BQ",
"description": "Mean base quality",
"min": 0,
"scale": "Blues",
},
"meanmapq": {
"title": "Mean MQ",
"description": "Mean mapping quality",
"min": 0,
"max": 60,
"scale": "RdYlGn",
},
}

self.add_section(
name="Coverage: global stats",
anchor="samtools-coverage-table-section",
description=(
"Stats parsed from <code>samtools coverage</code> output, and summarized "
"(added up or weighted-averaged) across all regions."
),
plot=table.plot(
table_data,
copy.deepcopy(headers),
pconfig={
"id": "samtools-coverage-table",
"title": "Samtools Coverage: Summary",
},
),
)

for h in headers:
headers[h]["hidden"] = True
headers["meandepth"]["hidden"] = False
self.general_stats_addcols(table_data, headers, namespace="coverage")

def lineplot_per_region(self, data_by_sample):
tabs = {
"numreads": {
"title": "Mapped reads per region",
"label": "Reads",
"ylab": "# reads",
"tt_suffix": " reads",
"tt_decimals": 0,
},
"covbases": {
"title": "Mapped base pairs per region",
"label": "Bases",
"ylab": "# bases",
"tt_suffix": " bp",
"tt_decimals": 0,
},
"coverage": {
"title": "Percentage of region covered with reads",
"label": "Coverage",
"ylab": "Coverage %",
"tt_suffix": "%",
},
"meandepth": {
"title": "Mean depth per region",
"label": "Mean depth",
"ylab": "Depth",
"tt_suffix": "x",
},
"meanbaseq": {
"title": "Mean base quality per region",
"label": "BQ",
"ylab": "Base quality",
"tt_suffix": "",
},
"meanmapq": {
"title": "Mean mapping quality per region",
"label": "MQ",
"ylab": "Mapping quality",
"tt_suffix": "",
},
}
datasets = [
{
sample: {chrom: metrics[metric] for chrom, metrics in data_by_sample[sample].items()}
for sample in data_by_sample
}
for metric in tabs
]
data_labels = list(tabs.values())
for dconfig in data_labels:
dconfig["title"] = "Samtools coverage: " + dconfig["title"]

self.add_section(
name="Coverage: stats per region",
description="Per-region stats parsed from <code>samtools coverage</code> output.",
anchor="samtools-coverage-section",
plot=linegraph.plot(
datasets,
{
"id": "samtools-coverage",
"title": data_labels[0]["title"],
"xlab": "Region",
"ylab": data_labels[0]["ylab"],
"ymin": 0,
"categories": True,
"smooth_points": 500,
"logswitch": True,
"hide_zero_cats": False,
"data_labels": data_labels,
},
),
)


EXPECTED_COLUMNS = [
"rname",
"startpos",
"endpos",
"numreads",
"covbases",
"coverage",
"meandepth",
"meanbaseq",
"meanmapq",
]


def parse_single_report(f) -> Dict[str, Dict[str, Union[int, float]]]:
"""
Example:
#rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq
oligo_1512_adapters 1 156 10 156 100 7.26282 18.4 29.6
oligo_741_adapters 1 156 0 0 0 0 0 0
Returns a dictionary with the contig name (rname) as the key and the rest of the fields as a dictionary
"""
parsed_data = {}
lines = f["f"].splitlines()
expected_header = "#" + "\t".join(EXPECTED_COLUMNS)
if lines[0] != expected_header:
logging.warning(f"Expected header for samtools coverage: {expected_header}, got: {lines[0]}")
return {}

for line in lines[1:]:
fields = line.strip().split("\t")
if len(fields) != len(EXPECTED_COLUMNS):
logging.warning(f"Skipping line with {len(fields)} fields, expected {len(EXPECTED_COLUMNS)}: {line}")
rname, startpos, endpos, numreads, covbases, coverage, meandepth, meanbaseq, meanmapq = fields
parsed_data[rname] = dict(
startpos=int(startpos),
endpos=int(endpos),
numreads=int(numreads),
covbases=int(covbases),
coverage=float(coverage),
meandepth=float(meandepth),
meanbaseq=float(meanbaseq),
meanmapq=float(meanmapq),
)

return parsed_data
20 changes: 8 additions & 12 deletions multiqc/modules/samtools/flagstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,27 @@ def parse_samtools_flagstats(self):
# General Stats Table
flagstats_headers = {
"flagstat_total": {
"title": f"{config.read_count_prefix} Reads",
"title": "Reads",
"description": f"Total reads in the bam file ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"shared_key": "read_count",
"hidden": True,
},
"mapped_passed": {
"title": f"{config.read_count_prefix} Reads Mapped",
"description": f"Reads Mapped in the bam file ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"title": "Reads mapped",
"description": f"Reads mapped in the bam file ({config.read_count_desc})",
"shared_key": "read_count",
},
"mapped_passed_pct": {
"title": "% Reads Mapped",
"description": "% Reads Mapped in the bam file",
"title": "% Reads mapped",
"description": "% Reads mapped in the bam file",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "RdYlGn",
"hidden": True,
},
}
self.general_stats_addcols(self.samtools_flagstat, flagstats_headers)
self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, namespace="flagstat")

# Make dot plot of counts
keys = {}
Expand Down Expand Up @@ -100,15 +96,15 @@ def parse_samtools_flagstats(self):
)

self.add_section(
name="Samtools Flagstat",
name="Flagstat",
anchor="samtools-flagstat",
description="This module parses the output from <code>samtools flagstat</code>. All numbers in millions.",
plot=beeswarm.plot(
self.samtools_flagstat,
keys,
{
"id": "samtools-flagstat-dp",
"title": "Samtools Flagstat: Read Counts",
"title": "Samtools flagstat: Read Counts",
},
),
)
Expand Down
2 changes: 1 addition & 1 deletion multiqc/modules/samtools/idxstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def parse_samtools_idxstats(self):
"id": "samtools-idxstats-mapped-reads-plot",
"title": "Samtools idxstats: Mapped reads per contig",
"ylab": "# mapped reads",
"xlab": "Chromosome Name",
"xlab": "Chromosome name",
"logswitch": True,
"categories": True,
"tt_label": "<strong>{point.category}:</strong> {point.y:.2f}",
Expand Down
6 changes: 3 additions & 3 deletions multiqc/modules/samtools/rmdup.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ def parse_samtools_rmdup(self):
for f in self.find_log_files("samtools/rmdup", filehandles=True):
# Example below:
# [bam_rmdupse_core] 26602816 / 103563641 = 0.2569 in library ' '
dups_regex = "\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'"
dups_regex = r"\[bam_rmdups?e?_core\] (\d+) / (\d+) = (\d+\.\d+) in library '(.*)'"
s_name = f["s_name"]
for line in f["f"]:
match = re.search(dups_regex, line)
Expand Down Expand Up @@ -68,14 +68,14 @@ def parse_samtools_rmdup(self):
# General Stats Table
stats_headers = {
"pct_dups": {
"title": "% Dups",
"title": "Duplicates",
"description": "Percent of duplicate alignments",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "OrRd",
}
}
self.general_stats_addcols(self.samtools_rmdup, stats_headers)
self.general_stats_addcols(self.samtools_rmdup, stats_headers, namespace="rmdup")

return len(self.samtools_rmdup)
9 changes: 8 additions & 1 deletion multiqc/modules/samtools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .flagstat import FlagstatReportMixin
from .idxstats import IdxstatsReportMixin
from .rmdup import RmdupReportMixin
from .coverage import CoverageReportMixin

# Import the Samtools submodules
from .stats import StatsReportMixin
Expand All @@ -16,7 +17,9 @@
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule, StatsReportMixin, FlagstatReportMixin, IdxstatsReportMixin, RmdupReportMixin):
class MultiqcModule(
BaseMultiqcModule, StatsReportMixin, FlagstatReportMixin, IdxstatsReportMixin, RmdupReportMixin, CoverageReportMixin
):
"""Samtools has a number of different commands and outputs.
This MultiQC module supports some but not all. The code for
each script is split into its own file and adds a section to
Expand Down Expand Up @@ -55,6 +58,10 @@ def __init__(self):
if n["rmdup"] > 0:
log.info(f"Found {n['rmdup']} rmdup reports")

n["coverage"] = self.parse_samtools_coverage()
if n["coverage"] > 0:
log.info(f"Found {n['coverage']} coverage reports")

# Exit if we didn't find anything
if sum(n.values()) == 0:
raise ModuleNoSamplesFound
Expand Down

0 comments on commit 4630828

Please sign in to comment.