Skip to content

Commit

Permalink
Merge pull request #1798 from jfy133/qualimap-update
Browse files Browse the repository at this point in the history
Update Qualimap: add additional entries for qualimap when region stats present
  • Loading branch information
ewels committed Nov 30, 2022
2 parents 3b74a38 + 8d713ba commit e185c85
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 22 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@
- **Sambamba Markdup**
- Catch zero division in sambamba markdup ([#1654](https://github.com/ewels/MultiQC/issues/1654))
- **Samtools**
- Added additional (by default hidden) column for `flagstat` that displays percentage of mapped reads in a bam ([#1733](https://github.com/ewels/MultiQC/issues/1733))
- Added additional column for `flagstat` that displays the percentage of mapped reads in a BAM file (hidden by default) ([#1733](https://github.com/ewels/MultiQC/issues/1733))
- **Qualimap**
- Added additional columns in general stats for BamQC results that display region on-target stats if a region BED file has been supplied (hidden by default) ([#1798](https://github.com/ewels/MultiQC/pull/1798))
- Bugfix: Remove General Stats rows for filtered samples ([#1780](https://github.com/ewels/MultiQC/issues/1780))

## [MultiQC v1.13](https://github.com/ewels/MultiQC/releases/tag/v1.13) - 2022-09-08
Expand Down
78 changes: 58 additions & 20 deletions multiqc/modules/qualimap/QM_BamQC.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,28 +81,50 @@ def parse_reports(self):
def parse_genome_results(self, f):
"""Parse the contents of the Qualimap BamQC genome_results.txt file"""
regexes = {
"bam_file": r"bam file = (.+)",
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
"general_error_rate": r"general error rate = ([\d,\.]+)",
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
"Input": {
"bam_file": r"bam file = (.+)",
},
"Globals": {
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
},
"Insert size": {
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
},
"Mapping quality": {
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
},
"Mismatches and indels": {
"general_error_rate": r"general error rate = ([\d,\.]+)",
},
"Coverage": {
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
},
"Globals inside": {
"regions_size": r"regions size = ([\d,\.]+)",
"regions_mapped_reads": r"number of mapped reads = ([\d,]+)", # WARNING: Same as in Globals
},
}
d = dict()
for k, r in regexes.items():
r_search = re.search(r, f["f"], re.MULTILINE)
if r_search:
if "\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)
section = None
for line in f["f"].splitlines():
if line.startswith(">>>>>>>"):
section = line[8:]
elif section:
for k, r in regexes.get(section, {}).items():
r_search = re.search(r, line)
if r_search:
if "\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)

# Check we have an input filename
if "bam_file" not in d:
log.debug("Couldn't find an input filename in genome_results file {}".format(f["fn"]))
Expand All @@ -119,6 +141,8 @@ def parse_genome_results(self, f):
self.general_stats_data[s_name]["percentage_aligned"] = d["percentage_aligned"]
self.general_stats_data[s_name]["general_error_rate"] = d["general_error_rate"] * 100
self.general_stats_data[s_name]["mean_coverage"] = d["mean_coverage"]
self.general_stats_data[s_name]["regions_size"] = d["regions_size"]
self.general_stats_data[s_name]["regions_mapped_reads"] = d["regions_mapped_reads"]
except KeyError:
pass

Expand Down Expand Up @@ -592,6 +616,20 @@ def general_stats_headers(self):
"format": "{0:.2f}",
"hidden": True,
}
self.general_stats_headers["regions_size"] = {
"title": "{} Region size".format(config.read_count_prefix),
"description": "Size of target region",
"suffix": " bp",
"scale": "PuBuGn",
"hidden": True,
}
self.general_stats_headers["regions_mapped_reads"] = {
"title": "{} Aligned".format(config.read_count_prefix),
"description": "Number of mapped reads on target region ({})".format(config.read_count_desc),
"scale": "RdYlGn",
"shared_key": "read_count",
"hidden": True,
}


def _calculate_bases_within_thresholds(bases_by_depth, total_size, depth_thresholds):
Expand Down
6 changes: 5 additions & 1 deletion multiqc/modules/snippy/snippy.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def __init__(self):
if f["s_name"] in self.snippy_data:
log.debug("Duplicate sample name found for snippy! Overwriting: {}".format(f["s_name"]))
# Add the file data under the key filename
self.snippy_data[f["s_name"]] = self.parse_snippy_txt(f["f"])
data = self.parse_snippy_txt(f["f"])
if data:
self.snippy_data[f["s_name"]] = data

self.add_data_source(f, section="snippy")

Expand Down Expand Up @@ -99,6 +101,8 @@ def parse_snippy_txt(self, file):
split_line = line.strip().split("\t")
if split_line[0] in self.snippy_col:
data[split_line[0]] = int(split_line[1])
if len(data) == 0:
return False
for col in self.snippy_col:
if col not in data:
data[col] = 0
Expand Down

0 comments on commit e185c85

Please sign in to comment.