Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mosdepth: fix prioritizing region over global information #2106

Merged
merged 7 commits into from
Oct 17, 2023
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- **HiCPro**: fix parsing scientific notation in hicpro-ashic. Thanks @Just-Roma ([#2126](https://github.com/ewels/MultiQC/pull/2126))
- **Picard**: MarkDuplicates: Fix parsing mixed strings/numbers, account for missing trailing `0` ([#2083](https://github.com/ewels/MultiQC/pull/2083), [#2094](https://github.com/ewels/MultiQC/pull/2094))
- **WhatsHap**: Process truncated input with no ALL chromosome ([#2095](https://github.com/ewels/MultiQC/pull/2095))
- **mosdepth**: fix prioritizing region over global information ([#2106](https://github.com/ewels/MultiQC/pull/2106))

## [MultiQC v1.16](https://github.com/ewels/MultiQC/releases/tag/v1.16) - 2023-09-22

Expand Down
121 changes: 69 additions & 52 deletions multiqc/modules/mosdepth/mosdepth.py
Original file line number Diff line number Diff line change
Expand Up @@ -121,14 +121,18 @@ def __init__(self):
)

self.cfg = read_config()
genstats_headers = defaultdict(OrderedDict)
genstats = defaultdict(OrderedDict) # mean coverage
genstats_headers = defaultdict(dict)
genstats = defaultdict(dict) # mean coverage

# Parse mean coverage
for f in self.find_log_files("mosdepth/summary"):
s_name = self.clean_s_name(f["fn"], f)
for line in f["f"].splitlines():
if line.startswith("total\t"):
# The first column can be a contig name, "total", "total_region".
# We want to use "total_region" if available. It is available when
# --by is specified. It always goes after "total", so we can just
# assume it will override the information collected for "total":
if line.startswith("total\t") or line.startswith("total_region\t"):
contig, length, bases, mean, min_cov, max_cov = line.split("\t")
genstats[s_name]["mean_coverage"] = mean
self.add_data_source(f, s_name=s_name, section="summary")
Expand All @@ -139,33 +143,46 @@ def __init__(self):

# Filter out any samples from --ignore-samples
genstats = defaultdict(OrderedDict, self.ignore_samples(genstats))
samples_found = set(genstats.keys())
samples_in_summary = set(genstats.keys())

data_by_scope = {}
for scope in "global", "region":
data_by_scope[scope] = self.parse_cov_dist(scope)
data_by_scope[scope] = [defaultdict(OrderedDict, self.ignore_samples(d)) for d in data_by_scope[scope]]
for d in data_by_scope[scope]:
samples_found.update(set(d.keys()))
data_dicts_global = self.parse_cov_dist("global")
data_dicts_region = self.parse_cov_dist("region")
data_dicts_global = [self.ignore_samples(d) for d in data_dicts_global]
data_dicts_region = [self.ignore_samples(d) for d in data_dicts_region]

# No samples found
if len(samples_found) == 0:
samples_global = set.union(*(set(d.keys()) for d in data_dicts_global))
samples_region = set.union(*(set(d.keys()) for d in data_dicts_region))
samples_found = samples_in_summary | samples_global | samples_region
if not samples_found:
raise ModuleNoSamplesFound
log.info(f"Found {len(samples_found)} reports")

for scope, data in data_by_scope.items():
if scope == "global":
descr_suf = ". Calculated across the entire genome length"
title_suf = ""
id_suf = ""
fn_suf = ""
else:
descr_suf = ". Calculated across the target regions"
title_suf = " (regions only)"
id_suf = "-regions"
fn_suf = "_regions"

log.info(f"Found reports for {len(samples_found)} samples")

descr_suf = ""
if any(data_dicts_global) and any(data_dicts_region):
descr_suf = (
f". Note that for {len(samples_global)} samples, a BED file was "
f"provided, so the data was calculated across those regions. For "
f"{len(samples_region)} samples, it's calculated across "
f"the entire genome length"
)
samples_both = samples_global & samples_region
if samples_both:
descr_suf += (
f". {len(samples_both)} samples have both global and region "
f"reports, and we are showing the data for regions"
)
elif samples_region:
descr_suf = f". Calculated across the target regions"
elif samples_global:
descr_suf = f". Calculated across the entire genome length"

if samples_region or samples_global:
# Prioritizing region reports if found
data = data_dicts_global
for d, d_region in zip(data, data_dicts_region):
d.update(d_region)
cumcov_dist_data, cov_dist_data, perchrom_avg_data, xy_cov = data

if cumcov_dist_data:
xmax = 0
for sample, data in cumcov_dist_data.items():
Expand All @@ -177,11 +194,11 @@ def __init__(self):
cumcov_dist_data_writeable = {
sample: {str(k): v for k, v in sorted(data.items())} for sample, data in cumcov_dist_data.items()
}
self.write_data_file(cumcov_dist_data_writeable, f"mosdepth_cumcov{fn_suf}_dist")
self.write_data_file(cumcov_dist_data_writeable, f"mosdepth_cumcov_dist")

self.add_section(
name=f"Cumulative coverage distribution{title_suf}",
anchor=f"mosdepth-cumcoverage{id_suf}-dist",
name=f"Cumulative coverage distribution",
anchor=f"mosdepth-cumcoverage-dist",
description=(
f"Proportion of bases in the reference genome with, "
f"at least, a given depth of coverage{descr_suf}"
Expand All @@ -190,8 +207,8 @@ def __init__(self):
plot=linegraph.plot(
cumcov_dist_data,
{
"id": f"mosdepth-cumcoverage-dist{id_suf}-id",
"title": f"Mosdepth: Cumulative coverage distribution{title_suf}",
"id": f"mosdepth-cumcoverage-dist-id",
"title": f"Mosdepth: Cumulative coverage distribution",
"xlab": "Cumulative Coverage (X)",
"ylab": "% bases in genome/regions covered by at least X reads",
"ymax": 100,
Expand All @@ -207,7 +224,7 @@ def __init__(self):
cov_dist_data_writeable = {
sample: {str(k): v for k, v in sorted(data.items())} for sample, data in cov_dist_data.items()
}
self.write_data_file(cov_dist_data_writeable, f"mosdepth_cov{fn_suf}_dist")
self.write_data_file(cov_dist_data_writeable, f"mosdepth_cov_dist")

# Set ymax so that zero coverage values are ignored.
ymax = 0
Expand All @@ -217,17 +234,17 @@ def __init__(self):
ymax = max(ymax, max(positive_cov))

self.add_section(
name=f"Coverage distribution{title_suf}",
anchor=f"mosdepth-coverage-dist{id_suf}-cov",
name=f"Coverage distribution",
anchor=f"mosdepth-coverage-dist-cov",
description=(
f"Proportion of bases in the reference genome with a given " f"depth of coverage{descr_suf}"
),
helptext=coverage_histogram_helptext,
plot=linegraph.plot(
cov_dist_data,
{
"id": f"mosdepth-coverage-dist{id_suf}-id",
"title": f"Mosdepth: Coverage distribution{title_suf}",
"id": f"mosdepth-coverage-dist-id",
"title": f"Mosdepth: Coverage distribution",
"xlab": "Coverage (X)",
"ylab": "% bases in genome/regions covered by X reads",
"ymax": ymax * 1.05,
Expand All @@ -240,15 +257,15 @@ def __init__(self):
)
if perchrom_avg_data:
# Write data to file
self.write_data_file(perchrom_avg_data, f"mosdepth_perchrom{fn_suf}")
self.write_data_file(perchrom_avg_data, f"mosdepth_perchrom")

num_contigs = max([len(x.keys()) for x in perchrom_avg_data.values()])
if num_contigs > 1:
perchrom_plot = linegraph.plot(
perchrom_avg_data,
{
"id": f"mosdepth-coverage-per-contig{id_suf}",
"title": f"Mosdepth: Coverage per contig{title_suf}",
"id": f"mosdepth-coverage-per-contig",
"title": f"Mosdepth: Coverage per contig",
"xlab": "Region",
"ylab": "Average Coverage",
"categories": True,
Expand All @@ -263,8 +280,8 @@ def __init__(self):
perchrom_plot = bargraph.plot(
perchrom_avg_data,
pconfig={
"id": f"mosdepth-coverage-per-contig{id_suf}",
"title": f"Mosdepth: Coverage per contig{title_suf}",
"id": f"mosdepth-coverage-per-contig",
"title": f"Mosdepth: Coverage per contig",
"xlab": "Sample",
"ylab": "Average Coverage",
"tt_suffix": "x",
Expand All @@ -273,9 +290,9 @@ def __init__(self):
)

self.add_section(
name=f"Average coverage per contig{title_suf}",
anchor=f"mosdepth-coverage-per-contig{id_suf}-id",
description=f"Average coverage per contig or chromosome{id_suf}",
name=f"Average coverage per contig",
anchor=f"mosdepth-coverage-per-contig-id",
description=f"Average coverage per contig or chromosome",
plot=perchrom_plot,
)

Expand All @@ -284,16 +301,16 @@ def __init__(self):
xy_keys["x"] = {"name": self.cfg.get("xchr", "Chromosome X")}
xy_keys["y"] = {"name": self.cfg.get("xchr", "Chromosome Y")}
pconfig = {
"id": f"mosdepth-xy-coverage-plot{id_suf}",
"title": f"Mosdepth: chrXY coverage{title_suf}",
"id": f"mosdepth-xy-coverage-plot",
"title": f"Mosdepth: chrXY coverage",
"ylab": f"Percent of X+Y coverage",
"cpswitch_counts_label": "Coverage",
"cpswitch_percent_label": "Percent of X+Y coverage",
"cpswitch_c_active": False,
}
self.add_section(
name=f"XY coverage{title_suf}",
anchor=f"mosdepth-xy-coverage{id_suf}",
name=f"XY coverage",
anchor=f"mosdepth-xy-coverage",
plot=bargraph.plot(xy_cov, xy_keys, pconfig),
)

Expand All @@ -315,9 +332,10 @@ def __init__(self):
self.general_stats_addcols(genstats, genstats_headers)

def parse_cov_dist(self, scope):
cumcov_dist_data = defaultdict(OrderedDict) # cumulative distribution
cov_dist_data = defaultdict(OrderedDict) # absolute (non-cumulative) coverage
perchrom_avg_data = defaultdict(OrderedDict) # per chromosome average coverage
cumcov_dist_data = defaultdict(dict) # cumulative distribution
cov_dist_data = defaultdict(dict) # absolute (non-cumulative) coverage
perchrom_avg_data = defaultdict(dict) # per chromosome average coverage
xy_cov = dict()

# Parse coverage distributions
for f in self.find_log_files(f"mosdepth/{scope}_dist"):
Expand Down Expand Up @@ -401,7 +419,6 @@ def parse_cov_dist(self, scope):
)

# Additionally, collect X and Y counts if we have them
xy_cov = dict()
for s_name, perchrom in perchrom_avg_data.items():
x_cov = False
y_cov = False
Expand Down