Skip to content

Commit

Permalink
Mosdepth: fix prioritizing region over global information (#2106)
Browse files Browse the repository at this point in the history
* Mosdepth: when parsing summary, prioritize total_region over total (added in region mode enabled by --by)

* Mosdepth: priotize region reports over global if both available

* Clean up

* [automated] Update CHANGELOG.md

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
  • Loading branch information
3 people committed Oct 17, 2023
1 parent 01303a7 commit 4a76790
Show file tree
Hide file tree
Showing 2 changed files with 70 additions and 52 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@
- **Samtools**: Add MQ0 reads to the Percent Mapped barplot in Stats submodule ([#2123](https://github.com/ewels/MultiQC/pull/2123))
- **Picard**: Adapt WgsMetrics to parabricks bammetrics outputs ([#2127](https://github.com/ewels/MultiQC/pull/2127))
- **WhatsHap**: Process truncated input with no ALL chromosome ([#2095](https://github.com/ewels/MultiQC/pull/2095))
- **mosdepth**: fix prioritizing region over global information ([#2106](https://github.com/ewels/MultiQC/pull/2106))
- **Dragen**: make sure all inputs are recorded in multiqc_sources.txt ([#2128](https://github.com/ewels/MultiQC/pull/2128))
- **HTSeq Count**: allow counts files with more than 2 columns ([#2129](https://github.com/ewels/MultiQC/pull/2129))

Expand Down
121 changes: 69 additions & 52 deletions multiqc/modules/mosdepth/mosdepth.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,14 +121,18 @@ def __init__(self):
)

self.cfg = read_config()
genstats_headers = defaultdict(OrderedDict)
genstats = defaultdict(OrderedDict) # mean coverage
genstats_headers = defaultdict(dict)
genstats = defaultdict(dict) # mean coverage

# Parse mean coverage
for f in self.find_log_files("mosdepth/summary"):
s_name = self.clean_s_name(f["fn"], f)
for line in f["f"].splitlines():
if line.startswith("total\t"):
# The first column can be a contig name, "total", "total_region".
# We want to use "total_region" if available. It is available when
# --by is specified. It always goes after "total", so we can just
# assume it will override the information collected for "total":
if line.startswith("total\t") or line.startswith("total_region\t"):
contig, length, bases, mean, min_cov, max_cov = line.split("\t")
genstats[s_name]["mean_coverage"] = mean
self.add_data_source(f, s_name=s_name, section="summary")
Expand All @@ -139,33 +143,46 @@ def __init__(self):

# Filter out any samples from --ignore-samples
genstats = defaultdict(OrderedDict, self.ignore_samples(genstats))
samples_found = set(genstats.keys())
samples_in_summary = set(genstats.keys())

data_by_scope = {}
for scope in "global", "region":
data_by_scope[scope] = self.parse_cov_dist(scope)
data_by_scope[scope] = [defaultdict(OrderedDict, self.ignore_samples(d)) for d in data_by_scope[scope]]
for d in data_by_scope[scope]:
samples_found.update(set(d.keys()))
data_dicts_global = self.parse_cov_dist("global")
data_dicts_region = self.parse_cov_dist("region")
data_dicts_global = [self.ignore_samples(d) for d in data_dicts_global]
data_dicts_region = [self.ignore_samples(d) for d in data_dicts_region]

# No samples found
if len(samples_found) == 0:
samples_global = set.union(*(set(d.keys()) for d in data_dicts_global))
samples_region = set.union(*(set(d.keys()) for d in data_dicts_region))
samples_found = samples_in_summary | samples_global | samples_region
if not samples_found:
raise ModuleNoSamplesFound
log.info(f"Found {len(samples_found)} reports")

for scope, data in data_by_scope.items():
if scope == "global":
descr_suf = ". Calculated across the entire genome length"
title_suf = ""
id_suf = ""
fn_suf = ""
else:
descr_suf = ". Calculated across the target regions"
title_suf = " (regions only)"
id_suf = "-regions"
fn_suf = "_regions"

log.info(f"Found reports for {len(samples_found)} samples")

descr_suf = ""
if any(data_dicts_global) and any(data_dicts_region):
descr_suf = (
f". Note that for {len(samples_global)} samples, a BED file was "
f"provided, so the data was calculated across those regions. For "
f"{len(samples_region)} samples, it's calculated across "
f"the entire genome length"
)
samples_both = samples_global & samples_region
if samples_both:
descr_suf += (
f". {len(samples_both)} samples have both global and region "
f"reports, and we are showing the data for regions"
)
elif samples_region:
descr_suf = f". Calculated across the target regions"
elif samples_global:
descr_suf = f". Calculated across the entire genome length"

if samples_region or samples_global:
# Prioritizing region reports if found
data = data_dicts_global
for d, d_region in zip(data, data_dicts_region):
d.update(d_region)
cumcov_dist_data, cov_dist_data, perchrom_avg_data, xy_cov = data

if cumcov_dist_data:
xmax = 0
for sample, data in cumcov_dist_data.items():
Expand All @@ -177,11 +194,11 @@ def __init__(self):
cumcov_dist_data_writeable = {
sample: {str(k): v for k, v in sorted(data.items())} for sample, data in cumcov_dist_data.items()
}
self.write_data_file(cumcov_dist_data_writeable, f"mosdepth_cumcov{fn_suf}_dist")
self.write_data_file(cumcov_dist_data_writeable, f"mosdepth_cumcov_dist")

self.add_section(
name=f"Cumulative coverage distribution{title_suf}",
anchor=f"mosdepth-cumcoverage{id_suf}-dist",
name=f"Cumulative coverage distribution",
anchor=f"mosdepth-cumcoverage-dist",
description=(
f"Proportion of bases in the reference genome with, "
f"at least, a given depth of coverage{descr_suf}"
Expand All @@ -190,8 +207,8 @@ def __init__(self):
plot=linegraph.plot(
cumcov_dist_data,
{
"id": f"mosdepth-cumcoverage-dist{id_suf}-id",
"title": f"Mosdepth: Cumulative coverage distribution{title_suf}",
"id": f"mosdepth-cumcoverage-dist-id",
"title": f"Mosdepth: Cumulative coverage distribution",
"xlab": "Cumulative Coverage (X)",
"ylab": "% bases in genome/regions covered by at least X reads",
"ymax": 100,
Expand All @@ -207,7 +224,7 @@ def __init__(self):
cov_dist_data_writeable = {
sample: {str(k): v for k, v in sorted(data.items())} for sample, data in cov_dist_data.items()
}
self.write_data_file(cov_dist_data_writeable, f"mosdepth_cov{fn_suf}_dist")
self.write_data_file(cov_dist_data_writeable, f"mosdepth_cov_dist")

# Set ymax so that zero coverage values are ignored.
ymax = 0
Expand All @@ -217,17 +234,17 @@ def __init__(self):
ymax = max(ymax, max(positive_cov))

self.add_section(
name=f"Coverage distribution{title_suf}",
anchor=f"mosdepth-coverage-dist{id_suf}-cov",
name=f"Coverage distribution",
anchor=f"mosdepth-coverage-dist-cov",
description=(
f"Proportion of bases in the reference genome with a given " f"depth of coverage{descr_suf}"
),
helptext=coverage_histogram_helptext,
plot=linegraph.plot(
cov_dist_data,
{
"id": f"mosdepth-coverage-dist{id_suf}-id",
"title": f"Mosdepth: Coverage distribution{title_suf}",
"id": f"mosdepth-coverage-dist-id",
"title": f"Mosdepth: Coverage distribution",
"xlab": "Coverage (X)",
"ylab": "% bases in genome/regions covered by X reads",
"ymax": ymax * 1.05,
Expand All @@ -240,15 +257,15 @@ def __init__(self):
)
if perchrom_avg_data:
# Write data to file
self.write_data_file(perchrom_avg_data, f"mosdepth_perchrom{fn_suf}")
self.write_data_file(perchrom_avg_data, f"mosdepth_perchrom")

num_contigs = max([len(x.keys()) for x in perchrom_avg_data.values()])
if num_contigs > 1:
perchrom_plot = linegraph.plot(
perchrom_avg_data,
{
"id": f"mosdepth-coverage-per-contig{id_suf}",
"title": f"Mosdepth: Coverage per contig{title_suf}",
"id": f"mosdepth-coverage-per-contig",
"title": f"Mosdepth: Coverage per contig",
"xlab": "Region",
"ylab": "Average Coverage",
"categories": True,
Expand All @@ -263,8 +280,8 @@ def __init__(self):
perchrom_plot = bargraph.plot(
perchrom_avg_data,
pconfig={
"id": f"mosdepth-coverage-per-contig{id_suf}",
"title": f"Mosdepth: Coverage per contig{title_suf}",
"id": f"mosdepth-coverage-per-contig",
"title": f"Mosdepth: Coverage per contig",
"xlab": "Sample",
"ylab": "Average Coverage",
"tt_suffix": "x",
Expand All @@ -273,9 +290,9 @@ def __init__(self):
)

self.add_section(
name=f"Average coverage per contig{title_suf}",
anchor=f"mosdepth-coverage-per-contig{id_suf}-id",
description=f"Average coverage per contig or chromosome{id_suf}",
name=f"Average coverage per contig",
anchor=f"mosdepth-coverage-per-contig-id",
description=f"Average coverage per contig or chromosome",
plot=perchrom_plot,
)

Expand All @@ -284,16 +301,16 @@ def __init__(self):
xy_keys["x"] = {"name": self.cfg.get("xchr", "Chromosome X")}
xy_keys["y"] = {"name": self.cfg.get("xchr", "Chromosome Y")}
pconfig = {
"id": f"mosdepth-xy-coverage-plot{id_suf}",
"title": f"Mosdepth: chrXY coverage{title_suf}",
"id": f"mosdepth-xy-coverage-plot",
"title": f"Mosdepth: chrXY coverage",
"ylab": f"Percent of X+Y coverage",
"cpswitch_counts_label": "Coverage",
"cpswitch_percent_label": "Percent of X+Y coverage",
"cpswitch_c_active": False,
}
self.add_section(
name=f"XY coverage{title_suf}",
anchor=f"mosdepth-xy-coverage{id_suf}",
name=f"XY coverage",
anchor=f"mosdepth-xy-coverage",
plot=bargraph.plot(xy_cov, xy_keys, pconfig),
)

Expand All @@ -315,9 +332,10 @@ def __init__(self):
self.general_stats_addcols(genstats, genstats_headers)

def parse_cov_dist(self, scope):
cumcov_dist_data = defaultdict(OrderedDict) # cumulative distribution
cov_dist_data = defaultdict(OrderedDict) # absolute (non-cumulative) coverage
perchrom_avg_data = defaultdict(OrderedDict) # per chromosome average coverage
cumcov_dist_data = defaultdict(dict) # cumulative distribution
cov_dist_data = defaultdict(dict) # absolute (non-cumulative) coverage
perchrom_avg_data = defaultdict(dict) # per chromosome average coverage
xy_cov = dict()

# Parse coverage distributions
for f in self.find_log_files(f"mosdepth/{scope}_dist"):
Expand Down Expand Up @@ -401,7 +419,6 @@ def parse_cov_dist(self, scope):
)

# Additionally, collect X and Y counts if we have them
xy_cov = dict()
for s_name, perchrom in perchrom_avg_data.items():
x_cov = False
y_cov = False
Expand Down

0 comments on commit 4a76790

Please sign in to comment.