Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Samtools coverage submodule #2356

Merged
merged 10 commits into from
Feb 23, 2024
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,10 @@
- **HTSeq Count**: robust file reading loop, ignore `.parquet` files ([#2364](https://github.com/MultiQC/MultiQC/pull/2364))
- **Illumina InterOp Statistics**: do not set `'scale': False` as a default ([#2350](https://github.com/MultiQC/MultiQC/pull/2350))
- **mosdepth**: fix regression in showing general stats ([#2346](https://github.com/MultiQC/MultiQC/pull/2346))
- **PURPLE**: support v4.0.1 output without `version` column ([#2366](https://github.com/MultiQC/MultiQC/pull/2366))
- **Samtools**: support `coverage` command ([#2356](https://github.com/MultiQC/MultiQC/pull/2356))
- **UMI-tools**: support `extract` command ([#2296](https://github.com/MultiQC/MultiQC/pull/2296))
- **Whatshap**: make robust to stdout appended to TSV ([#2361](https://github.com/MultiQC/MultiQC/pull/2361))
- **PURPLE**: support v4.0.1 output without `version` column ([#2366](https://github.com/MultiQC/MultiQC/pull/2366))

## [MultiQC v1.20](https://github.com/ewels/MultiQC/releases/tag/v1.20) - 2024-02-12

Expand Down
245 changes: 245 additions & 0 deletions multiqc/modules/samtools/coverage.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,245 @@
import copy

from typing import Dict, Union

import logging

from multiqc.plots import linegraph, table

log = logging.getLogger(__name__)


class CoverageReportMixin:
def parse_samtools_coverage(self):
"""Find Samtools coverage logs and parse their data"""

data_by_sample = dict()
for f in self.find_log_files("samtools/coverage"):
metrics_by_chrom = parse_single_report(f)
if len(metrics_by_chrom) > 0:
if f["s_name"] in data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
self.add_data_source(f, section="coverage")
data_by_sample[f["s_name"]] = metrics_by_chrom

# Filter to strip out ignored sample names
data_by_sample = self.ignore_samples(data_by_sample)
if len(data_by_sample) == 0:
return 0

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None)

# Write parsed report data to a file (restructure first)
self.write_data_file(data_by_sample, "multiqc_samtools_coverage")

# Make a table/violin summarising the stats across all regions (mean or sum)
self.summary_table(data_by_sample)

# Make a line plot showing coverage stats per region, with a tab switch between stats
self.lineplot_per_region(data_by_sample)

# Return the number of logs that were found
return len(data_by_sample)

def summary_table(self, data_by_sample):
table_data = {sname: {} for sname in data_by_sample}
for sample, d_by_chrom in data_by_sample.items():
vals = list(d_by_chrom.values())
table_data[sample]["numreads"] = sum([m["numreads"] for m in vals])
table_data[sample]["covbases"] = sum([m["covbases"] for m in vals])
for m in vals:
m["size"] = m["endpos"] - m["startpos"] + 1
total_size = sum([m["size"] for m in vals])
# Average weighted by size. Multiplying by individual weight and dividing by total weight
table_data[sample]["coverage"] = sum([m["coverage"] * m["size"] for m in vals]) / total_size
table_data[sample]["meandepth"] = sum([m["meandepth"] * m["size"] for m in vals]) / total_size
table_data[sample]["meanbaseq"] = sum([m["meanbaseq"] * m["size"] for m in vals]) / total_size
table_data[sample]["meanmapq"] = sum([m["meanmapq"] * m["size"] for m in vals]) / total_size

headers = {
"numreads": {
"title": "Reads",
"description": "Total number of mapped reads",
"shared_key": "read_count",
"scale": "RdYlGn",
},
"covbases": {
"title": "Bases",
"description": "Total number of mapped base pairs",
"shared_key": "base_count",
"scale": "Blues",
},
"coverage": {
"title": "Coverage",
"description": "Percentage of region covered with reads",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "YlGn",
},
"meandepth": {
"title": "Mean depth",
"description": "Mean depth of coverage",
"min": 0,
"suffix": "x",
"scale": "RdYlGn",
},
"meanbaseq": {
"title": "Mean BQ",
"description": "Mean base quality",
"min": 0,
"scale": "Blues",
},
"meanmapq": {
"title": "Mean MQ",
"description": "Mean mapping quality",
"min": 0,
"max": 60,
"scale": "RdYlGn",
},
}

self.add_section(
name="Coverage: global stats",
anchor="samtools-coverage-table-section",
description=(
"Stats parsed from <code>samtools coverage</code> output, and summarized "
"(added up or weighted-averaged) across all regions."
),
plot=table.plot(
table_data,
copy.deepcopy(headers),
pconfig={
"id": "samtools-coverage-table",
"title": "Samtools Coverage: Summary",
},
),
)

for h in headers:
headers[h]["hidden"] = True
headers["meandepth"]["hidden"] = False
self.general_stats_addcols(table_data, headers, namespace="coverage")

def lineplot_per_region(self, data_by_sample):
tabs = {
"numreads": {
"title": "Mapped reads per region",
"label": "Reads",
"ylab": "# reads",
"tt_suffix": " reads",
"tt_decimals": 0,
},
"covbases": {
"title": "Mapped base pairs per region",
"label": "Bases",
"ylab": "# bases",
"tt_suffix": " bp",
"tt_decimals": 0,
},
"coverage": {
"title": "Percentage of region covered with reads",
"label": "Coverage",
"ylab": "Coverage %",
"tt_suffix": "%",
},
"meandepth": {
"title": "Mean depth per region",
"label": "Mean depth",
"ylab": "Depth",
"tt_suffix": "x",
},
"meanbaseq": {
"title": "Mean base quality per region",
"label": "BQ",
"ylab": "Base quality",
"tt_suffix": "",
},
"meanmapq": {
"title": "Mean mapping quality per region",
"label": "MQ",
"ylab": "Mapping quality",
"tt_suffix": "",
},
}
datasets = [
{
sample: {chrom: metrics[metric] for chrom, metrics in data_by_sample[sample].items()}
for sample in data_by_sample
}
for metric in tabs
]
data_labels = list(tabs.values())
for dconfig in data_labels:
dconfig["title"] = "Samtools coverage: " + dconfig["title"]

self.add_section(
name="Coverage: stats per region",
description="Per-region stats parsed from <code>samtools coverage</code> output.",
anchor="samtools-coverage-section",
plot=linegraph.plot(
datasets,
{
"id": "samtools-coverage",
"title": data_labels[0]["title"],
"xlab": "Region",
"ylab": data_labels[0]["ylab"],
"ymin": 0,
"categories": True,
"smooth_points": 500,
"logswitch": True,
"hide_zero_cats": False,
"data_labels": data_labels,
},
),
)


EXPECTED_COLUMNS = [
"rname",
"startpos",
"endpos",
"numreads",
"covbases",
"coverage",
"meandepth",
"meanbaseq",
"meanmapq",
]


def parse_single_report(f) -> Dict[str, Dict[str, Union[int, float]]]:
"""
Example:
#rname startpos endpos numreads covbases coverage meandepth meanbaseq meanmapq
oligo_1512_adapters 1 156 10 156 100 7.26282 18.4 29.6
oligo_741_adapters 1 156 0 0 0 0 0 0

Returns a dictionary with the contig name (rname) as the key and the rest of the fields as a dictionary
"""
parsed_data = {}
lines = f["f"].splitlines()
expected_header = "#" + "\t".join(EXPECTED_COLUMNS)
if lines[0] != expected_header:
logging.warning(f"Expected header for samtools coverage: {expected_header}, got: {lines[0]}")
return {}

for line in lines[1:]:
fields = line.strip().split("\t")
if len(fields) != len(EXPECTED_COLUMNS):
logging.warning(f"Skipping line with {len(fields)} fields, expected {len(EXPECTED_COLUMNS)}: {line}")
rname, startpos, endpos, numreads, covbases, coverage, meandepth, meanbaseq, meanmapq = fields
parsed_data[rname] = dict(
startpos=int(startpos),
endpos=int(endpos),
numreads=int(numreads),
covbases=int(covbases),
coverage=float(coverage),
meandepth=float(meandepth),
meanbaseq=float(meanbaseq),
meanmapq=float(meanmapq),
)

return parsed_data
20 changes: 8 additions & 12 deletions multiqc/modules/samtools/flagstat.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,31 +40,27 @@ def parse_samtools_flagstats(self):
# General Stats Table
flagstats_headers = {
"flagstat_total": {
"title": f"{config.read_count_prefix} Reads",
"title": "Reads",
"description": f"Total reads in the bam file ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"shared_key": "read_count",
"hidden": True,
},
"mapped_passed": {
"title": f"{config.read_count_prefix} Reads Mapped",
"description": f"Reads Mapped in the bam file ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"title": "Reads mapped",
"description": f"Reads mapped in the bam file ({config.read_count_desc})",
"shared_key": "read_count",
},
"mapped_passed_pct": {
"title": "% Reads Mapped",
"description": "% Reads Mapped in the bam file",
"title": "% Reads mapped",
"description": "% Reads mapped in the bam file",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "RdYlGn",
"hidden": True,
},
}
self.general_stats_addcols(self.samtools_flagstat, flagstats_headers)
self.general_stats_addcols(self.samtools_flagstat, flagstats_headers, namespace="flagstat")

# Make dot plot of counts
keys = {}
Expand Down Expand Up @@ -100,15 +96,15 @@ def parse_samtools_flagstats(self):
)

self.add_section(
name="Samtools Flagstat",
name="Flagstat",
anchor="samtools-flagstat",
description="This module parses the output from <code>samtools flagstat</code>. All numbers in millions.",
plot=beeswarm.plot(
self.samtools_flagstat,
keys,
{
"id": "samtools-flagstat-dp",
"title": "Samtools Flagstat: Read Counts",
"title": "Samtools flagstat: Read Counts",
},
),
)
Expand Down
2 changes: 1 addition & 1 deletion multiqc/modules/samtools/idxstats.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,7 @@ def parse_samtools_idxstats(self):
"id": "samtools-idxstats-mapped-reads-plot",
"title": "Samtools idxstats: Mapped reads per contig",
"ylab": "# mapped reads",
"xlab": "Chromosome Name",
"xlab": "Chromosome name",
"logswitch": True,
"categories": True,
"tt_label": "<strong>{point.category}:</strong> {point.y:.2f}",
Expand Down
4 changes: 2 additions & 2 deletions multiqc/modules/samtools/rmdup.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,14 +68,14 @@ def parse_samtools_rmdup(self):
# General Stats Table
stats_headers = {
"pct_dups": {
"title": "% Dups",
"title": "Dups",
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved
"description": "Percent of duplicate alignments",
"min": 0,
"max": 100,
"suffix": "%",
"scale": "OrRd",
}
}
self.general_stats_addcols(self.samtools_rmdup, stats_headers)
self.general_stats_addcols(self.samtools_rmdup, stats_headers, namespace="rmdup")

return len(self.samtools_rmdup)
9 changes: 8 additions & 1 deletion multiqc/modules/samtools/samtools.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from .flagstat import FlagstatReportMixin
from .idxstats import IdxstatsReportMixin
from .rmdup import RmdupReportMixin
from .coverage import CoverageReportMixin

# Import the Samtools submodules
from .stats import StatsReportMixin
Expand All @@ -16,7 +17,9 @@
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule, StatsReportMixin, FlagstatReportMixin, IdxstatsReportMixin, RmdupReportMixin):
class MultiqcModule(
BaseMultiqcModule, StatsReportMixin, FlagstatReportMixin, IdxstatsReportMixin, RmdupReportMixin, CoverageReportMixin
):
"""Samtools has a number of different commands and outputs.
This MultiQC module supports some but not all. The code for
each script is split into its own file and adds a section to
Expand Down Expand Up @@ -55,6 +58,10 @@ def __init__(self):
if n["rmdup"] > 0:
log.info(f"Found {n['rmdup']} rmdup reports")

n["coverage"] = self.parse_samtools_coverage()
if n["coverage"] > 0:
log.info(f"Found {n['coverage']} coverage reports")

# Exit if we didn't find anything
if sum(n.values()) == 0:
raise ModuleNoSamplesFound
Expand Down