Skip to content

Commit

Permalink
Qualimap: address NBSP as thousand separators (#2282)
Browse files Browse the repository at this point in the history
* Qualimap: address NBSP as thousand separators in rnaseqqc

* [automated] Update CHANGELOG.md

* Qualimap: address NBSP in BamQC as well

* Fix changelog

* Fix changelog

* Determine decimal format once per file. Reuse function between BamQC and rnaseqqc submodules. More efficient parsing, only call re once per line

* Handle error of mixed formats

* Update multiqc/modules/qualimap/__init__.py

Co-authored-by: Phil Ewels <phil.ewels@seqera.io>

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
  • Loading branch information
3 people committed Feb 6, 2024
1 parent a33c6eb commit 41c0228
Show file tree
Hide file tree
Showing 4 changed files with 190 additions and 86 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
- **HUMID**: Add cluster statistics ([#2265](https://github.com/MultiQC/MultiQC/pull/2265))
- **mosdepth**: Add additional summaries to general stats #2257 ([#2257](https://github.com/MultiQC/MultiQC/pull/2257))
- **Picard**: fix using multiple times in report: do not pass `module.anchor` to `self.find_log_files` ([#2255](https://github.com/MultiQC/MultiQC/pull/2255))
- **QualiMap**: address NBSP as thousand separators ([#2282](https://github.com/MultiQC/MultiQC/pull/2282))
- **Seqera Platform CLI**: Updates for v0.9.2 ([#2248](https://github.com/MultiQC/MultiQC/pull/2248))

## [MultiQC v1.19](https://github.com/ewels/MultiQC/releases/tag/v1.19) - 2023-12-18
Expand Down
95 changes: 51 additions & 44 deletions multiqc/modules/qualimap/QM_BamQC.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@


import logging
import os

import math
import re

from multiqc import config
from multiqc.modules.qualimap import parse_numerals
from multiqc.plots import linegraph

# Initialise the logger
Expand Down Expand Up @@ -82,58 +85,62 @@ def parse_reports(self):

def parse_genome_results(self, f):
"""Parse the contents of the Qualimap BamQC genome_results.txt file"""
regexes = {
"Input": {
"bam_file": r"bam file = (.+)",
},
"Globals": {
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
},
"Insert size": {
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
},
"Mapping quality": {
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
},
"Mismatches and indels": {
"general_error_rate": r"general error rate = ([\d,\.]+)",
},
"Coverage": {
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
},
"Globals inside": {
"regions_size": r"regions size = ([\d,\.]+)",
"regions_mapped_reads": r"number of mapped reads = ([\d,]+)", # WARNING: Same as in Globals
},

int_metrics = {
"number of reads": "total_reads",
"number of mapped reads": "mapped_reads",
"number of mapped reads inside": "regions_mapped_reads",
"number of mapped bases": "mapped_bases",
"number of sequenced bases": "sequenced_bases",
"regions size": "regions_size",
}
d = dict()
float_metrics = {
"mean insert size": "mean_insert_size",
"median insert size": "median_insert_size",
"mean mapping quality": "mean_mapping_quality",
"mean coverageData": "mean_coverage",
}
rate_metrics = { # useful to determine decimal separator
"general error rate": "general_error_rate",
"GC percentage": "gc_percentage",
"duplication rate": "duplication_rate",
"homopolymer indels": "homopolymer_indels",
}

value_regex = re.compile(r"\s+[\d,\.\xa0]+\s+")
preparsed_d = dict()
# Keeping track of the section because "number of mapped reads" occurs both
# under "Globals" and "Globals inside"
section = None
for line in f["f"].splitlines():
if line.startswith(">>>>>>>"):
section = line[8:]
elif section:
for k, r in regexes.get(section, {}).items():
r_search = re.search(r, line)
if r_search:
if r"\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)
elif not section:
continue
if "=" in line:
key, val = line.split("=", 1)
m = re.search(value_regex, val)
if m:
val = m.group(0)
val = re.sub(r"\xa0", "", val)
key = key.strip()
if "Globals inside" in section and key == "number of mapped reads":
key = "number of mapped reads inside"
preparsed_d[key] = val.strip()

# Check we have an input filename
if "bam_file" not in d:
if "bam file" not in preparsed_d:
log.debug(f"Couldn't find an input filename in genome_results file {f['fn']}")
return None

# Get a nice sample name
s_name = self.clean_s_name(d["bam_file"], f)
s_name = self.clean_s_name(preparsed_d["bam file"], f)

d = parse_numerals(
preparsed_d,
float_metrics=float_metrics,
int_metrics=int_metrics,
rate_metrics=rate_metrics,
fpath=os.path.join(f["root"], f["fn"]),
)

if "general_error_rate" in d:
d["general_error_rate"] = d["general_error_rate"] * 100.0
Expand Down Expand Up @@ -174,7 +181,7 @@ def parse_coverage(self, f):
if line.startswith("#"):
continue
coverage, count = line.split(None, 1)
coverage = int(round(float(coverage)))
coverage = int(round(float(coverage.replace(",", "."))))
count = float(count)
d[coverage] = count

Expand Down
89 changes: 47 additions & 42 deletions multiqc/modules/qualimap/QM_RNASeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@


import logging
import os
import re

from multiqc import config
from multiqc.modules.qualimap import parse_numerals
from multiqc.plots import bargraph, linegraph

# Initialise the logger
Expand All @@ -13,51 +15,54 @@

def parse_reports(self):
"""Find Qualimap RNASeq reports and parse their data"""

self.qualimap_rnaseq_genome_results = dict()
regexes = {
"reads_aligned": r"read(?:s| pairs) aligned\s*=\s*([\d,]+)",
"total_alignments": r"total alignments\s*=\s*([\d,]+)",
"non_unique_alignments": r"non-unique alignments\s*=\s*([\d,]+)",
"reads_aligned_genes": r"aligned to genes\s*=\s*([\d,]+)",
"ambiguous_alignments": r"ambiguous alignments\s*=\s*([\d,]+)",
"not_aligned": r"not aligned\s*=\s*([\d,]+)",
"5_3_bias": r"5'-3' bias\s*=\s*([\d,\.]+)$",
"reads_aligned_exonic": r"exonic\s*=\s*([\d,]+)",
"reads_aligned_intronic": r"intronic\s*=\s*([\d,]+)",
"reads_aligned_intergenic": r"intergenic\s*=\s*([\d,]+)",
"reads_aligned_overlapping_exon": r"overlapping exon\s*=\s*([\d,]+)",

int_metrics = {
"read pairs aligned": "reads_aligned",
"reads aligned": "reads_aligned",
"total alignments": "total_alignments",
"non-unique alignments": "non_unique_alignments",
"aligned to genes": "reads_aligned_genes",
"ambiguous alignments": "ambiguous_alignments",
"not aligned": "not_aligned",
"exonic": "reads_aligned_exonic",
"intronic": "reads_aligned_intronic",
"intergenic": "reads_aligned_intergenic",
"overlapping exon": "reads_aligned_overlapping_exon",
}
rate_metrics = {
"5' bias": "5_bias",
"3' bias": "3_bias",
"SSP estimation (fwd/rev)": "ssp_estimation",
"5'-3' bias": "5_3_bias",
}

value_regex = re.compile(r"\s+[\d,\.\xa0]+\s+")
for f in self.find_log_files("qualimap/rnaseq/rnaseq_results"):
d = dict()
preparsed_d = dict()
for line in f["f"].splitlines():
if "=" in line:
key, val = line.split("=", 1)
m = re.search(value_regex, val)
if m:
val = m.group(0)
val = re.sub(r"\xa0", "", val)
key = key.strip()
preparsed_d[key] = val.strip()

# Get the sample name
s_name_regex = re.search(r"bam file\s*=\s*(.+)", f["f"], re.MULTILINE)
if s_name_regex:
d["bam_file"] = s_name_regex.group(1)
s_name = self.clean_s_name(d["bam_file"], f)
else:
log.warning(f"Couldn't find an input filename in genome_results file {f['root']}/{f['fn']}")
# Check we have an input filename
if "bam file" not in preparsed_d:
log.debug(f"Couldn't find an input filename in genome_results file {f['fn']}")
return None
s_name = self.clean_s_name(preparsed_d["bam file"], f)

# Check for and 'fix' European style decimal places / thousand separators
comma_regex = re.search(r"exonic\s*=\s*[\d\.]+ \(\d{1,3},\d+%\)", f["f"], re.MULTILINE)
if comma_regex:
log.debug(f"Trying to fix European comma style syntax in Qualimap report {f['root']}/{f['fn']}")
f["f"] = f["f"].replace(".", "")
f["f"] = f["f"].replace(",", ".")

# Go through all numeric regexes
for k, r in regexes.items():
r_search = re.search(r, f["f"], re.MULTILINE)
if r_search:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except UnicodeEncodeError:
# Qualimap reports infinity (\u221e) when 3' bias denominator is zero
pass
except ValueError:
d[k] = r_search.group(1)
d = parse_numerals(
preparsed_d,
float_metrics={},
int_metrics=int_metrics,
rate_metrics=rate_metrics,
fpath=os.path.join(f["root"], f["fn"]),
)

# Add to general stats table
for k in ["5_3_bias", "reads_aligned"]:
Expand All @@ -72,7 +77,7 @@ def parse_reports(self):
self.qualimap_rnaseq_genome_results[s_name] = d
self.add_data_source(f, s_name=s_name, section="rna_genome_results")

#### Coverage profile
# Coverage profile
self.qualimap_rnaseq_cov_hist = dict()
for f in self.find_log_files("qualimap/rnaseq/coverage", filehandles=True):
s_name = self.get_s_name(f)
Expand All @@ -81,7 +86,7 @@ def parse_reports(self):
if line.startswith("#"):
continue
coverage, count = line.split(None, 1)
coverage = int(round(float(coverage)))
coverage = int(round(float(coverage.replace(",", "."))))
count = float(count)
d[coverage] = count

Expand All @@ -103,7 +108,7 @@ def parse_reports(self):
self.qualimap_rnaseq_genome_results = self.ignore_samples(self.qualimap_rnaseq_genome_results)
self.qualimap_rnaseq_cov_hist = self.ignore_samples(self.qualimap_rnaseq_cov_hist)

#### Plots
# Plots
# Genomic Origin Bar Graph
# NB: Ignore 'Overlapping Exon' in report - these make the numbers add up to > 100%
if len(self.qualimap_rnaseq_genome_results) > 0:
Expand Down
91 changes: 91 additions & 0 deletions multiqc/modules/qualimap/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,94 @@
import logging
from typing import Dict, Union

from .qualimap import MultiqcModule

__all__ = ["MultiqcModule"]

log = logging.getLogger(__name__)


def parse_numerals(
preparsed_d: Dict[str, str],
float_metrics: Dict[str, str],
int_metrics: Dict[str, str],
rate_metrics: Dict[str, str],
fpath: str,
) -> Dict[str, Union[int, float, str]]:
"""
Take pre-parsed Qualimap report (keys to string values), and properly parse
numeral values, taking regional formats into account.
"""
# Determine if decimal separator is dot or comma
decimalcomma = None
for k in rate_metrics:
if k in preparsed_d:
val = preparsed_d[k]
if "," in val and "." in val:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as both . and , are "
f"found in a rational value: {val}"
)
return {}
if "," in val:
if decimalcomma is False:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = True
if "." in val:
if decimalcomma is True:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = False
if decimalcomma is None:
# All expected float numbers are integer, so attempt to instead determine the
# thousands separator from large int values.
for k in int_metrics:
if k in preparsed_d:
val = preparsed_d[k]
if "," in val and "." in val:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as both . and , are "
f"found in a rational value: {val}"
)
return {}
if "," in val:
if decimalcomma is True:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = False
if "." in val:
if decimalcomma is False:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = True
if decimalcomma is None:
log.debug(f"Couldn't determine decimal separator for file {fpath}")

d = {}
for k, v in preparsed_d.items():
v = v.strip("X").strip("%")
if k in float_metrics or k in rate_metrics or k in int_metrics:
if decimalcomma is True:
v = v.replace(".", "").replace(",", ".")
v = v.replace(",", "")
if k in int_metrics:
d[int_metrics[k]] = int(v)
elif k in float_metrics:
d[float_metrics[k]] = float(v)
elif k in rate_metrics:
d[rate_metrics[k]] = float(v)

return d

0 comments on commit 41c0228

Please sign in to comment.