Update Qualimap: add additional entries for qualimap when region stats present #1798

Merged
merged 4 commits on Nov 30, 2022
3 changes: 2 additions & 1 deletion CHANGELOG.md
@@ -43,8 +43,9 @@
- **Sambamba Markdup**
- Catch zero division in sambamba markdup ([#1654](https://github.com/ewels/MultiQC/issues/1654))
- **Samtools**
- Added additional (by default hidden) column for `flagstat` that displays percentage of mapped reads in a bam ([#1733](https://github.com/ewels/MultiQC/issues/1733))
- Added additional column for `flagstat` that displays percentage of mapped reads in a bam (hidden by default) ([#1733](https://github.com/ewels/MultiQC/issues/1733))
- **Qualimap**
- Added additional columns in General Stats for BamQC results that display on-target region stats if a regions BED file has been supplied (hidden by default) ([#1798](https://github.com/ewels/MultiQC/pull/1798))
- Bugfix: Remove General Stats rows for filtered samples ([#1780](https://github.com/ewels/MultiQC/issues/1780))

## [MultiQC v1.13](https://github.com/ewels/MultiQC/releases/tag/v1.13) - 2022-09-08
78 changes: 58 additions & 20 deletions multiqc/modules/qualimap/QM_BamQC.py
@@ -81,28 +81,50 @@ def parse_reports(self):
def parse_genome_results(self, f):
"""Parse the contents of the Qualimap BamQC genome_results.txt file"""
regexes = {
"bam_file": r"bam file = (.+)",
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
"general_error_rate": r"general error rate = ([\d,\.]+)",
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
"Input": {
"bam_file": r"bam file = (.+)",
},
"Globals": {
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
},
"Insert size": {
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
},
"Mapping quality": {
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
},
"Mismatches and indels": {
"general_error_rate": r"general error rate = ([\d,\.]+)",
},
"Coverage": {
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
},
"Globals inside": {
"regions_size": r"regions size = ([\d,\.]+)",
"regions_mapped_reads": r"number of mapped reads = ([\d,]+)", # WARNING: Same as in Globals
},
}
d = dict()
for k, r in regexes.items():
r_search = re.search(r, f["f"], re.MULTILINE)
if r_search:
if "\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)
section = None
for line in f["f"].splitlines():
if line.startswith(">>>>>>>"):
section = line[8:]
elif section:
for k, r in regexes.get(section, {}).items():
r_search = re.search(r, line)
if r_search:
if "\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)

# Check we have an input filename
if "bam_file" not in d:
log.debug("Couldn't find an input filename in genome_results file {}".format(f["fn"]))
@@ -119,6 +141,8 @@ def parse_genome_results(self, f):
self.general_stats_data[s_name]["percentage_aligned"] = d["percentage_aligned"]
self.general_stats_data[s_name]["general_error_rate"] = d["general_error_rate"] * 100
self.general_stats_data[s_name]["mean_coverage"] = d["mean_coverage"]
self.general_stats_data[s_name]["regions_size"] = d["regions_size"]
self.general_stats_data[s_name]["regions_mapped_reads"] = d["regions_mapped_reads"]
except KeyError:
pass

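For context, here is a minimal, self-contained sketch of the section-aware parsing introduced in the hunk above. The genome_results.txt excerpt and its values are invented for illustration, and the numeric conversion is simplified to a plain try/except rather than the module's `"\d" in r` check.

```python
import re

# Illustrative genome_results.txt excerpt -- section names follow Qualimap's
# ">>>>>>>" headers, but the values are made up for this sketch.
SAMPLE = """>>>>>>> Input

     bam file = sample1.bam

>>>>>>> Globals

     number of reads = 1,000,000
     number of mapped reads = 950,000

>>>>>>> Globals inside

     regions size = 2,500,000
     number of mapped reads = 900,000
"""

REGEXES = {
    "Input": {"bam_file": r"bam file = (.+)"},
    "Globals": {
        "total_reads": r"number of reads = ([\d,]+)",
        "mapped_reads": r"number of mapped reads = ([\d,]+)",
    },
    "Globals inside": {
        # Same regex as in "Globals" -- only the section header disambiguates it.
        "regions_size": r"regions size = ([\d,\.]+)",
        "regions_mapped_reads": r"number of mapped reads = ([\d,]+)",
    },
}


def parse(text):
    """Switch regex sets whenever a '>>>>>>>' section header is seen."""
    d = {}
    section = None
    for line in text.splitlines():
        if line.startswith(">>>>>>>"):
            section = line[8:]  # drop the '>>>>>>> ' prefix
        elif section:
            for key, rx in REGEXES.get(section, {}).items():
                m = re.search(rx, line)
                if m:
                    try:
                        d[key] = float(m.group(1).replace(",", ""))
                    except ValueError:
                        d[key] = m.group(1)  # non-numeric, e.g. the BAM path
    return d


print(parse(SAMPLE))
# {'bam_file': 'sample1.bam', 'total_reads': 1000000.0, 'mapped_reads': 950000.0,
#  'regions_size': 2500000.0, 'regions_mapped_reads': 900000.0}
```

Keying the regexes by section header is what lets the on-target `number of mapped reads` under `Globals inside` coexist with the whole-genome value under `Globals`.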
@@ -592,6 +616,20 @@ def general_stats_headers(self):
"format": "{0:.2f}",
"hidden": True,
}
self.general_stats_headers["regions_size"] = {
"title": "{} Region size".format(config.read_count_prefix),
"description": "Size of target region",
"suffix": " bp",
"scale": "PuBuGn",
"hidden": True,
}
self.general_stats_headers["regions_mapped_reads"] = {
"title": "{} Aligned".format(config.read_count_prefix),
"description": "Number of mapped reads on target region ({})".format(config.read_count_desc),
"scale": "RdYlGn",
"shared_key": "read_count",
"hidden": True,
}


def _calculate_bases_within_thresholds(bases_by_depth, total_size, depth_thresholds):
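As a quick illustration (not MultiQC's actual table rendering), the column keys added to `general_stats_headers` here have to match the keys written into `general_stats_data` in the hunk further up; `hidden: True` only keeps the columns out of the default view, it does not drop the data. The names and numbers below are placeholders.

```python
# Placeholder header/data dicts mirroring the contract between the two hunks above.
general_stats_headers = {
    "regions_size": {"title": "Region size", "suffix": " bp", "hidden": True},
    "regions_mapped_reads": {"title": "Aligned (regions)", "hidden": True},
}

general_stats_data = {
    # Per-sample values as collected in parse_genome_results() (made-up numbers).
    "sample1": {"regions_size": 2500000.0, "regions_mapped_reads": 900000.0},
}

# A hidden column still carries data; it is only excluded from the default view.
for sample, values in general_stats_data.items():
    row = {key: values.get(key) for key in general_stats_headers}
    print(sample, row)
```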
6 changes: 5 additions & 1 deletion multiqc/modules/snippy/snippy.py
@@ -54,7 +54,9 @@ def __init__(self):
if f["s_name"] in self.snippy_data:
log.debug("Duplicate sample name found for snippy! Overwriting: {}".format(f["s_name"]))
# Add the file data under the key filename
self.snippy_data[f["s_name"]] = self.parse_snippy_txt(f["f"])
data = self.parse_snippy_txt(f["f"])
if data:
self.snippy_data[f["s_name"]] = data

self.add_data_source(f, section="snippy")

@@ -99,6 +101,8 @@ def parse_snippy_txt(self, file):
split_line = line.strip().split("\t")
if split_line[0] in self.snippy_col:
data[split_line[0]] = int(split_line[1])
if len(data) == 0:
return False
for col in self.snippy_col:
if col not in data:
data[col] = 0
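A standalone sketch of the behaviour this snippy change is after, with a made-up column list and inputs: when none of the expected columns are found, the parser now returns a falsy value and the caller skips the sample instead of registering an empty entry.

```python
# Simplified from the module above; SNIPPY_COLS and the inputs are illustrative.
SNIPPY_COLS = ["Variant-SNP", "Variant-INS", "Variant-DEL"]


def parse_snippy_txt(text):
    data = {}
    for line in text.splitlines():
        parts = line.strip().split("\t")
        if parts and parts[0] in SNIPPY_COLS:
            data[parts[0]] = int(parts[1])
    if len(data) == 0:
        return False  # nothing recognised -- caller should not register the sample
    for col in SNIPPY_COLS:
        data.setdefault(col, 0)  # missing columns default to zero
    return data


snippy_data = {}
for s_name, text in {"good": "Variant-SNP\t12", "empty": "unrelated\tstuff"}.items():
    parsed = parse_snippy_txt(text)
    if parsed:
        snippy_data[s_name] = parsed

print(snippy_data)  # only 'good' is kept; 'empty' is dropped
```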