Skip to content

Commit

Permalink
Merge pull request #1798 from jfy133/qualimap-update
Browse files Browse the repository at this point in the history
Update Qualimap: add additional entries for qualimap when region stats present
  • Loading branch information
ewels committed Nov 30, 2022
2 parents 3b74a38 + 8d713ba commit e185c85
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 22 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,9 @@
- **Sambamba Markdup**
- Catch zero division in sambamba markdup ([#1654](https://github.com/ewels/MultiQC/issues/1654))
- **Samtools**
- Added additional (by default hidden) column for `flagstat` that displays percentage of mapped reads in a bam ([#1733](https://github.com/ewels/MultiQC/issues/1733))
- Added additional column for `flagstat` that displays the percentage of mapped reads in a BAM file (hidden by default) ([#1733](https://github.com/ewels/MultiQC/issues/1733))
- **Qualimap**
- Added additional columns in general stats for BamQC results that display region on-target stats if a region BED file has been supplied (hidden by default) ([#1798](https://github.com/ewels/MultiQC/pull/1798))
- Bugfix: Remove General Stats rows for filtered samples ([#1780](https://github.com/ewels/MultiQC/issues/1780))

## [MultiQC v1.13](https://github.com/ewels/MultiQC/releases/tag/v1.13) - 2022-09-08
Expand Down
78 changes: 58 additions & 20 deletions multiqc/modules/qualimap/QM_BamQC.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,28 +81,50 @@ def parse_reports(self):
def parse_genome_results(self, f):
"""Parse the contents of the Qualimap BamQC genome_results.txt file"""
regexes = {
"bam_file": r"bam file = (.+)",
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
"general_error_rate": r"general error rate = ([\d,\.]+)",
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
"Input": {
"bam_file": r"bam file = (.+)",
},
"Globals": {
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
},
"Insert size": {
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
},
"Mapping quality": {
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
},
"Mismatches and indels": {
"general_error_rate": r"general error rate = ([\d,\.]+)",
},
"Coverage": {
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
},
"Globals inside": {
"regions_size": r"regions size = ([\d,\.]+)",
"regions_mapped_reads": r"number of mapped reads = ([\d,]+)", # WARNING: Same as in Globals
},
}
d = dict()
for k, r in regexes.items():
r_search = re.search(r, f["f"], re.MULTILINE)
if r_search:
if "\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)
section = None
for line in f["f"].splitlines():
if line.startswith(">>>>>>>"):
section = line[8:]
elif section:
for k, r in regexes.get(section, {}).items():
r_search = re.search(r, line)
if r_search:
if "\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)

# Check we have an input filename
if "bam_file" not in d:
log.debug("Couldn't find an input filename in genome_results file {}".format(f["fn"]))
Expand All @@ -119,6 +141,8 @@ def parse_genome_results(self, f):
self.general_stats_data[s_name]["percentage_aligned"] = d["percentage_aligned"]
self.general_stats_data[s_name]["general_error_rate"] = d["general_error_rate"] * 100
self.general_stats_data[s_name]["mean_coverage"] = d["mean_coverage"]
self.general_stats_data[s_name]["regions_size"] = d["regions_size"]
self.general_stats_data[s_name]["regions_mapped_reads"] = d["regions_mapped_reads"]
except KeyError:
pass

Expand Down Expand Up @@ -592,6 +616,20 @@ def general_stats_headers(self):
"format": "{0:.2f}",
"hidden": True,
}
self.general_stats_headers["regions_size"] = {
"title": "{} Region size".format(config.read_count_prefix),
"description": "Size of target region",
"suffix": " bp",
"scale": "PuBuGn",
"hidden": True,
}
self.general_stats_headers["regions_mapped_reads"] = {
"title": "{} Aligned".format(config.read_count_prefix),
"description": "Number of mapped reads on target region ({})".format(config.read_count_desc),
"scale": "RdYlGn",
"shared_key": "read_count",
"hidden": True,
}


def _calculate_bases_within_thresholds(bases_by_depth, total_size, depth_thresholds):
Expand Down
6 changes: 5 additions & 1 deletion multiqc/modules/snippy/snippy.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def __init__(self):
if f["s_name"] in self.snippy_data:
log.debug("Duplicate sample name found for snippy! Overwriting: {}".format(f["s_name"]))
# Add the file data under the key filename
self.snippy_data[f["s_name"]] = self.parse_snippy_txt(f["f"])
data = self.parse_snippy_txt(f["f"])
if data:
self.snippy_data[f["s_name"]] = data

self.add_data_source(f, section="snippy")

Expand Down Expand Up @@ -99,6 +101,8 @@ def parse_snippy_txt(self, file):
split_line = line.strip().split("\t")
if split_line[0] in self.snippy_col:
data[split_line[0]] = int(split_line[1])
if len(data) == 0:
return False
for col in self.snippy_col:
if col not in data:
data[col] = 0
Expand Down

0 comments on commit e185c85

Please sign in to comment.