Qualimap: address NBSP as thousand separators (#2282)

* Qualimap: address NBSP as thousand separators in rnaseqqc * [automated] Update CHANGELOG.md * Qualimap: address NBSP in BamQC as well * Fix changelog * Fix changelog * Determine decimal format once per file. Reuse function between BamQC and rnaseqqc submodules. More efficient parsing, only call re once per line * Handle error of mixed formats * Update multiqc/modules/qualimap/__init__.py Co-authored-by: Phil Ewels <phil.ewels@seqera.io> --------- Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io> Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
MultiQC · Feb 6, 2024 · 41c0228 · 41c0228
1 parent a33c6eb
commit 41c0228
Show file tree

Hide file tree

Showing 4 changed files with 190 additions and 86 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -41,6 +41,7 @@
 - **HUMID**: Add cluster statistics ([#2265](https://github.com/MultiQC/MultiQC/pull/2265))
 - **mosdepth**: Add additional summaries to general stats #2257 ([#2257](https://github.com/MultiQC/MultiQC/pull/2257))
 - **Picard**: fix using multiple times in report: do not pass `module.anchor` to `self.find_log_files` ([#2255](https://github.com/MultiQC/MultiQC/pull/2255))
+- **QualiMap**: address NBSP as thousand separators ([#2282](https://github.com/MultiQC/MultiQC/pull/2282))
 - **Seqera Platform CLI**: Updates for v0.9.2 ([#2248](https://github.com/MultiQC/MultiQC/pull/2248))
 
 ## [MultiQC v1.19](https://github.com/ewels/MultiQC/releases/tag/v1.19) - 2023-12-18

diff --git a/multiqc/modules/qualimap/QM_BamQC.py b/multiqc/modules/qualimap/QM_BamQC.py
@@ -2,10 +2,13 @@
 
 
 import logging
+import os
+
 import math
 import re
 
 from multiqc import config
+from multiqc.modules.qualimap import parse_numerals
 from multiqc.plots import linegraph
 
 # Initialise the logger
@@ -82,58 +85,62 @@ def parse_reports(self):
 
 def parse_genome_results(self, f):
     """Parse the contents of the Qualimap BamQC genome_results.txt file"""
-    regexes = {
-        "Input": {
-            "bam_file": r"bam file = (.+)",
-        },
-        "Globals": {
-            "total_reads": r"number of reads = ([\d,]+)",
-            "mapped_reads": r"number of mapped reads = ([\d,]+)",
-            "mapped_bases": r"number of mapped bases = ([\d,]+)",
-            "sequenced_bases": r"number of sequenced bases = ([\d,]+)",
-        },
-        "Insert size": {
-            "mean_insert_size": r"mean insert size = ([\d,\.]+)",
-            "median_insert_size": r"median insert size = ([\d,\.]+)",
-        },
-        "Mapping quality": {
-            "mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
-        },
-        "Mismatches and indels": {
-            "general_error_rate": r"general error rate = ([\d,\.]+)",
-        },
-        "Coverage": {
-            "mean_coverage": r"mean coverageData = ([\d,\.]+)",
-        },
-        "Globals inside": {
-            "regions_size": r"regions size = ([\d,\.]+)",
-            "regions_mapped_reads": r"number of mapped reads = ([\d,]+)",  # WARNING: Same as in Globals
-        },
+
+    int_metrics = {
+        "number of reads": "total_reads",
+        "number of mapped reads": "mapped_reads",
+        "number of mapped reads inside": "regions_mapped_reads",
+        "number of mapped bases": "mapped_bases",
+        "number of sequenced bases": "sequenced_bases",
+        "regions size": "regions_size",
     }
-    d = dict()
+    float_metrics = {
+        "mean insert size": "mean_insert_size",
+        "median insert size": "median_insert_size",
+        "mean mapping quality": "mean_mapping_quality",
+        "mean coverageData": "mean_coverage",
+    }
+    rate_metrics = {  # useful to determine decimal separator
+        "general error rate": "general_error_rate",
+        "GC percentage": "gc_percentage",
+        "duplication rate": "duplication_rate",
+        "homopolymer indels": "homopolymer_indels",
+    }
+
+    value_regex = re.compile(r"\s+[\d,\.\xa0]+\s+")
+    preparsed_d = dict()
+    # Keeping track of the section because "number of mapped reads" occurs both
+    # under "Globals" and "Globals inside"
     section = None
     for line in f["f"].splitlines():
         if line.startswith(">>>>>>>"):
             section = line[8:]
-        elif section:
-            for k, r in regexes.get(section, {}).items():
-                r_search = re.search(r, line)
-                if r_search:
-                    if r"\d" in r:
-                        try:
-                            d[k] = float(r_search.group(1).replace(",", ""))
-                        except ValueError:
-                            d[k] = r_search.group(1)
-                    else:
-                        d[k] = r_search.group(1)
+        elif not section:
+            continue
+        if "=" in line:
+            key, val = line.split("=", 1)
+            m = re.search(value_regex, val)
+            if m:
+                val = m.group(0)
+                val = re.sub(r"\xa0", "", val)
+            key = key.strip()
+            if "Globals inside" in section and key == "number of mapped reads":
+                key = "number of mapped reads inside"
+            preparsed_d[key] = val.strip()
 
     # Check we have an input filename
-    if "bam_file" not in d:
+    if "bam file" not in preparsed_d:
         log.debug(f"Couldn't find an input filename in genome_results file {f['fn']}")
         return None
-
-    # Get a nice sample name
-    s_name = self.clean_s_name(d["bam_file"], f)
+    s_name = self.clean_s_name(preparsed_d["bam file"], f)
+
+    d = parse_numerals(
+        preparsed_d,
+        float_metrics=float_metrics,
+        int_metrics=int_metrics,
+        rate_metrics=rate_metrics,
+        fpath=os.path.join(f["root"], f["fn"]),
+    )
 
     if "general_error_rate" in d:
         d["general_error_rate"] = d["general_error_rate"] * 100.0
@@ -174,7 +181,7 @@ def parse_coverage(self, f):
         if line.startswith("#"):
             continue
         coverage, count = line.split(None, 1)
-        coverage = int(round(float(coverage)))
+        coverage = int(round(float(coverage.replace(",", "."))))
         count = float(count)
         d[coverage] = count
 

diff --git a/multiqc/modules/qualimap/QM_RNASeq.py b/multiqc/modules/qualimap/QM_RNASeq.py
@@ -2,9 +2,11 @@
 
 
 import logging
+import os
 import re
 
 from multiqc import config
+from multiqc.modules.qualimap import parse_numerals
 from multiqc.plots import bargraph, linegraph
 
 # Initialise the logger
@@ -13,51 +15,54 @@
 
 def parse_reports(self):
     """Find Qualimap RNASeq reports and parse their data"""
-
     self.qualimap_rnaseq_genome_results = dict()
-    regexes = {
-        "reads_aligned": r"read(?:s| pairs) aligned\s*=\s*([\d,]+)",
-        "total_alignments": r"total alignments\s*=\s*([\d,]+)",
-        "non_unique_alignments": r"non-unique alignments\s*=\s*([\d,]+)",
-        "reads_aligned_genes": r"aligned to genes\s*=\s*([\d,]+)",
-        "ambiguous_alignments": r"ambiguous alignments\s*=\s*([\d,]+)",
-        "not_aligned": r"not aligned\s*=\s*([\d,]+)",
-        "5_3_bias": r"5'-3' bias\s*=\s*([\d,\.]+)$",
-        "reads_aligned_exonic": r"exonic\s*=\s*([\d,]+)",
-        "reads_aligned_intronic": r"intronic\s*=\s*([\d,]+)",
-        "reads_aligned_intergenic": r"intergenic\s*=\s*([\d,]+)",
-        "reads_aligned_overlapping_exon": r"overlapping exon\s*=\s*([\d,]+)",
+
+    int_metrics = {
+        "read pairs aligned": "reads_aligned",
+        "reads aligned": "reads_aligned",
+        "total alignments": "total_alignments",
+        "non-unique alignments": "non_unique_alignments",
+        "aligned to genes": "reads_aligned_genes",
+        "ambiguous alignments": "ambiguous_alignments",
+        "not aligned": "not_aligned",
+        "exonic": "reads_aligned_exonic",
+        "intronic": "reads_aligned_intronic",
+        "intergenic": "reads_aligned_intergenic",
+        "overlapping exon": "reads_aligned_overlapping_exon",
     }
+    rate_metrics = {
+        "5' bias": "5_bias",
+        "3' bias": "3_bias",
+        "SSP estimation (fwd/rev)": "ssp_estimation",
+        "5'-3' bias": "5_3_bias",
+    }
+
+    value_regex = re.compile(r"\s+[\d,\.\xa0]+\s+")
     for f in self.find_log_files("qualimap/rnaseq/rnaseq_results"):
-        d = dict()
+        preparsed_d = dict()
+        for line in f["f"].splitlines():
+            if "=" in line:
+                key, val = line.split("=", 1)
+                m = re.search(value_regex, val)
+                if m:
+                    val = m.group(0)
+                    val = re.sub(r"\xa0", "", val)
+                key = key.strip()
+                preparsed_d[key] = val.strip()
 
-        # Get the sample name
-        s_name_regex = re.search(r"bam file\s*=\s*(.+)", f["f"], re.MULTILINE)
-        if s_name_regex:
-            d["bam_file"] = s_name_regex.group(1)
-            s_name = self.clean_s_name(d["bam_file"], f)
-        else:
-            log.warning(f"Couldn't find an input filename in genome_results file {f['root']}/{f['fn']}")
+        # Check we have an input filename
+        if "bam file" not in preparsed_d:
+            log.debug(f"Couldn't find an input filename in genome_results file {f['fn']}")
             return None
+        s_name = self.clean_s_name(preparsed_d["bam file"], f)
 
-        # Check for and 'fix' European style decimal places / thousand separators
-        comma_regex = re.search(r"exonic\s*=\s*[\d\.]+ \(\d{1,3},\d+%\)", f["f"], re.MULTILINE)
-        if comma_regex:
-            log.debug(f"Trying to fix European comma style syntax in Qualimap report {f['root']}/{f['fn']}")
-            f["f"] = f["f"].replace(".", "")
-            f["f"] = f["f"].replace(",", ".")
-
-        # Go through all numeric regexes
-        for k, r in regexes.items():
-            r_search = re.search(r, f["f"], re.MULTILINE)
-            if r_search:
-                try:
-                    d[k] = float(r_search.group(1).replace(",", ""))
-                except UnicodeEncodeError:
-                    # Qualimap reports infinity (\u221e) when 3' bias denominator is zero
-                    pass
-                except ValueError:
-                    d[k] = r_search.group(1)
+        d = parse_numerals(
+            preparsed_d,
+            float_metrics={},
+            int_metrics=int_metrics,
+            rate_metrics=rate_metrics,
+            fpath=os.path.join(f["root"], f["fn"]),
+        )
 
         # Add to general stats table
         for k in ["5_3_bias", "reads_aligned"]:
@@ -72,7 +77,7 @@ def parse_reports(self):
         self.qualimap_rnaseq_genome_results[s_name] = d
         self.add_data_source(f, s_name=s_name, section="rna_genome_results")
 
-    #### Coverage profile
+    # Coverage profile
     self.qualimap_rnaseq_cov_hist = dict()
     for f in self.find_log_files("qualimap/rnaseq/coverage", filehandles=True):
         s_name = self.get_s_name(f)
@@ -81,7 +86,7 @@ def parse_reports(self):
             if line.startswith("#"):
                 continue
             coverage, count = line.split(None, 1)
-            coverage = int(round(float(coverage)))
+            coverage = int(round(float(coverage.replace(",", "."))))
             count = float(count)
             d[coverage] = count
 
@@ -103,7 +108,7 @@ def parse_reports(self):
     self.qualimap_rnaseq_genome_results = self.ignore_samples(self.qualimap_rnaseq_genome_results)
     self.qualimap_rnaseq_cov_hist = self.ignore_samples(self.qualimap_rnaseq_cov_hist)
 
-    #### Plots
+    # Plots
     # Genomic Origin Bar Graph
     # NB: Ignore 'Overlapping Exon' in report - these make the numbers add up to > 100%
     if len(self.qualimap_rnaseq_genome_results) > 0:

diff --git a/multiqc/modules/qualimap/__init__.py b/multiqc/modules/qualimap/__init__.py
@@ -1,3 +1,94 @@
+import logging
+from typing import Dict, Union
+
 from .qualimap import MultiqcModule
 
 __all__ = ["MultiqcModule"]
+
+log = logging.getLogger(__name__)
+
+
+def parse_numerals(
+    preparsed_d: Dict[str, str],
+    float_metrics: Dict[str, str],
+    int_metrics: Dict[str, str],
+    rate_metrics: Dict[str, str],
+    fpath: str,
+) -> Dict[str, Union[int, float, str]]:
+    """
+    Take pre-parsed Qualimap report (keys to string values), and properly parse
+    numeral values, taking regional formats into account.
+    """
+    # Determine if decimal separator is dot or comma
+    decimalcomma = None
+    for k in rate_metrics:
+        if k in preparsed_d:
+            val = preparsed_d[k]
+            if "," in val and "." in val:
+                log.error(
+                    f"Couldn't determine decimal separator for file {fpath}, as both . and , are "
+                    f"found in a rational value: {val}"
+                )
+                return {}
+            if "," in val:
+                if decimalcomma is False:
+                    log.error(
+                        f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
+                        f"rational values are found"
+                    )
+                    return {}
+                decimalcomma = True
+            if "." in val:
+                if decimalcomma is True:
+                    log.error(
+                        f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
+                        f"rational values are found"
+                    )
+                    return {}
+                decimalcomma = False
+    if decimalcomma is None:
+        # All expected float numbers are integer, so attempt to instead determine the
+        # thousands separator from large int values.
+        for k in int_metrics:
+            if k in preparsed_d:
+                val = preparsed_d[k]
+                if "," in val and "." in val:
+                    log.error(
+                        f"Couldn't determine decimal separator for file {fpath}, as both . and , are "
+                        f"found in a rational value: {val}"
+                    )
+                    return {}
+                if "," in val:
+                    if decimalcomma is True:
+                        log.error(
+                            f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
+                            f"rational values are found"
+                        )
+                        return {}
+                    decimalcomma = False
+                if "." in val:
+                    if decimalcomma is False:
+                        log.error(
+                            f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
+                            f"rational values are found"
+                        )
+                        return {}
+                    decimalcomma = True
+    if decimalcomma is None:
+        log.debug(f"Couldn't determine decimal separator for file {fpath}")
+
+    d = {}
+    for k, v in preparsed_d.items():
+        v = v.strip("X").strip("%")
+        if k in float_metrics or k in rate_metrics or k in int_metrics:
+            if decimalcomma is True:
+                v = v.replace(".", "").replace(",", ".")
+            v = v.replace(",", "")
+            if k in int_metrics:
+                d[int_metrics[k]] = int(v)
+            elif k in float_metrics:
+                d[float_metrics[k]] = float(v)
+            elif k in rate_metrics:
+                d[rate_metrics[k]] = float(v)
+
+    return d