Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Qualimap: address NBSP as thousand separators #2282

Merged
merged 10 commits into from
Feb 6, 2024
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- **HiFiasm**: account for lines with no asterisk ([#2268](https://github.com/MultiQC/MultiQC/pull/2268))
- **mosdepth**: Add additional summaries to general stats #2257 ([#2257](https://github.com/MultiQC/MultiQC/pull/2257))
- **Picard**: fix using multiple times in report: do not pass `module.anchor` to `self.find_log_files` ([#2255](https://github.com/MultiQC/MultiQC/pull/2255))
- **QualiMap**: address NBSP as thousand separators ([#2282](https://github.com/MultiQC/MultiQC/pull/2282))
- **Seqera Platform CLI**: Updates for v0.9.2 ([#2248](https://github.com/MultiQC/MultiQC/pull/2248))

## [MultiQC v1.19](https://github.com/ewels/MultiQC/releases/tag/v1.19) - 2023-12-18
Expand Down
95 changes: 51 additions & 44 deletions multiqc/modules/qualimap/QM_BamQC.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@


import logging
import os

import math
import re

from multiqc import config
from multiqc.modules.qualimap import parse_numerals
from multiqc.plots import linegraph

# Initialise the logger
Expand Down Expand Up @@ -82,58 +85,62 @@ def parse_reports(self):

def parse_genome_results(self, f):
"""Parse the contents of the Qualimap BamQC genome_results.txt file"""
regexes = {
"Input": {
"bam_file": r"bam file = (.+)",
},
"Globals": {
"total_reads": r"number of reads = ([\d,]+)",
"mapped_reads": r"number of mapped reads = ([\d,]+)",
"mapped_bases": r"number of mapped bases = ([\d,]+)",
"sequenced_bases": r"number of sequenced bases = ([\d,]+)",
},
"Insert size": {
"mean_insert_size": r"mean insert size = ([\d,\.]+)",
"median_insert_size": r"median insert size = ([\d,\.]+)",
},
"Mapping quality": {
"mean_mapping_quality": r"mean mapping quality = ([\d,\.]+)",
},
"Mismatches and indels": {
"general_error_rate": r"general error rate = ([\d,\.]+)",
},
"Coverage": {
"mean_coverage": r"mean coverageData = ([\d,\.]+)",
},
"Globals inside": {
"regions_size": r"regions size = ([\d,\.]+)",
"regions_mapped_reads": r"number of mapped reads = ([\d,]+)", # WARNING: Same as in Globals
},

int_metrics = {
"number of reads": "total_reads",
"number of mapped reads": "mapped_reads",
"number of mapped reads inside": "regions_mapped_reads",
"number of mapped bases": "mapped_bases",
"number of sequenced bases": "sequenced_bases",
"regions size": "regions_size",
}
d = dict()
float_metrics = {
"mean insert size": "mean_insert_size",
"median insert size": "median_insert_size",
"mean mapping quality": "mean_mapping_quality",
"mean coverageData": "mean_coverage",
}
rate_metrics = { # useful to determine decimal separator
"general error rate": "general_error_rate",
"GC percentage": "gc_percentage",
"duplication rate": "duplication_rate",
"homopolymer indels": "homopolymer_indels",
}

value_regex = re.compile(r"\s+[\d,\.\xa0]+\s+")
preparsed_d = dict()
# Keeping track of the section because "number of mapped reads" occurs both
# under "Globals" and "Globals inside"
section = None
for line in f["f"].splitlines():
if line.startswith(">>>>>>>"):
section = line[8:]
elif section:
for k, r in regexes.get(section, {}).items():
r_search = re.search(r, line)
if r_search:
if r"\d" in r:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except ValueError:
d[k] = r_search.group(1)
else:
d[k] = r_search.group(1)
elif not section:
continue
if "=" in line:
key, val = line.split("=", 1)
m = re.search(value_regex, val)
if m:
val = m.group(0)
val = re.sub(r"\xa0", "", val)
key = key.strip()
if "Globals inside" in section and key == "number of mapped reads":
key = "number of mapped reads inside"
preparsed_d[key] = val.strip()

# Check we have an input filename
if "bam_file" not in d:
if "bam file" not in preparsed_d:
log.debug(f"Couldn't find an input filename in genome_results file {f['fn']}")
return None

# Get a nice sample name
s_name = self.clean_s_name(d["bam_file"], f)
s_name = self.clean_s_name(preparsed_d["bam file"], f)

d = parse_numerals(
preparsed_d,
float_metrics=float_metrics,
int_metrics=int_metrics,
rate_metrics=rate_metrics,
fpath=os.path.join(f["root"], f["fn"]),
)

if "general_error_rate" in d:
d["general_error_rate"] = d["general_error_rate"] * 100.0
Expand Down Expand Up @@ -174,7 +181,7 @@ def parse_coverage(self, f):
if line.startswith("#"):
continue
coverage, count = line.split(None, 1)
coverage = int(round(float(coverage)))
coverage = int(round(float(coverage.replace(",", "."))))
count = float(count)
d[coverage] = count

Expand Down
89 changes: 47 additions & 42 deletions multiqc/modules/qualimap/QM_RNASeq.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@


import logging
import os
import re

from multiqc import config
from multiqc.modules.qualimap import parse_numerals
from multiqc.plots import bargraph, linegraph

# Initialise the logger
Expand All @@ -13,51 +15,54 @@

def parse_reports(self):
"""Find Qualimap RNASeq reports and parse their data"""

self.qualimap_rnaseq_genome_results = dict()
regexes = {
"reads_aligned": r"read(?:s| pairs) aligned\s*=\s*([\d,]+)",
"total_alignments": r"total alignments\s*=\s*([\d,]+)",
"non_unique_alignments": r"non-unique alignments\s*=\s*([\d,]+)",
"reads_aligned_genes": r"aligned to genes\s*=\s*([\d,]+)",
"ambiguous_alignments": r"ambiguous alignments\s*=\s*([\d,]+)",
"not_aligned": r"not aligned\s*=\s*([\d,]+)",
"5_3_bias": r"5'-3' bias\s*=\s*([\d,\.]+)$",
"reads_aligned_exonic": r"exonic\s*=\s*([\d,]+)",
"reads_aligned_intronic": r"intronic\s*=\s*([\d,]+)",
"reads_aligned_intergenic": r"intergenic\s*=\s*([\d,]+)",
"reads_aligned_overlapping_exon": r"overlapping exon\s*=\s*([\d,]+)",

int_metrics = {
"read pairs aligned": "reads_aligned",
"reads aligned": "reads_aligned",
"total alignments": "total_alignments",
"non-unique alignments": "non_unique_alignments",
"aligned to genes": "reads_aligned_genes",
"ambiguous alignments": "ambiguous_alignments",
"not aligned": "not_aligned",
"exonic": "reads_aligned_exonic",
"intronic": "reads_aligned_intronic",
"intergenic": "reads_aligned_intergenic",
"overlapping exon": "reads_aligned_overlapping_exon",
}
rate_metrics = {
"5' bias": "5_bias",
"3' bias": "3_bias",
"SSP estimation (fwd/rev)": "ssp_estimation",
"5'-3' bias": "5_3_bias",
}

value_regex = re.compile(r"\s+[\d,\.\xa0]+\s+")
for f in self.find_log_files("qualimap/rnaseq/rnaseq_results"):
d = dict()
preparsed_d = dict()
for line in f["f"].splitlines():
if "=" in line:
key, val = line.split("=", 1)
m = re.search(value_regex, val)
if m:
val = m.group(0)
val = re.sub(r"\xa0", "", val)
key = key.strip()
preparsed_d[key] = val.strip()

# Get the sample name
s_name_regex = re.search(r"bam file\s*=\s*(.+)", f["f"], re.MULTILINE)
if s_name_regex:
d["bam_file"] = s_name_regex.group(1)
s_name = self.clean_s_name(d["bam_file"], f)
else:
log.warning(f"Couldn't find an input filename in genome_results file {f['root']}/{f['fn']}")
# Check we have an input filename
if "bam file" not in preparsed_d:
log.debug(f"Couldn't find an input filename in genome_results file {f['fn']}")
return None
s_name = self.clean_s_name(preparsed_d["bam file"], f)

# Check for and 'fix' European style decimal places / thousand separators
comma_regex = re.search(r"exonic\s*=\s*[\d\.]+ \(\d{1,3},\d+%\)", f["f"], re.MULTILINE)
if comma_regex:
log.debug(f"Trying to fix European comma style syntax in Qualimap report {f['root']}/{f['fn']}")
f["f"] = f["f"].replace(".", "")
f["f"] = f["f"].replace(",", ".")

# Go through all numeric regexes
for k, r in regexes.items():
r_search = re.search(r, f["f"], re.MULTILINE)
if r_search:
try:
d[k] = float(r_search.group(1).replace(",", ""))
except UnicodeEncodeError:
# Qualimap reports infinity (\u221e) when 3' bias denominator is zero
pass
except ValueError:
d[k] = r_search.group(1)
d = parse_numerals(
preparsed_d,
float_metrics={},
int_metrics=int_metrics,
rate_metrics=rate_metrics,
fpath=os.path.join(f["root"], f["fn"]),
)

# Add to general stats table
for k in ["5_3_bias", "reads_aligned"]:
Expand All @@ -72,7 +77,7 @@ def parse_reports(self):
self.qualimap_rnaseq_genome_results[s_name] = d
self.add_data_source(f, s_name=s_name, section="rna_genome_results")

#### Coverage profile
# Coverage profile
self.qualimap_rnaseq_cov_hist = dict()
for f in self.find_log_files("qualimap/rnaseq/coverage", filehandles=True):
s_name = self.get_s_name(f)
Expand All @@ -81,7 +86,7 @@ def parse_reports(self):
if line.startswith("#"):
continue
coverage, count = line.split(None, 1)
coverage = int(round(float(coverage)))
coverage = int(round(float(coverage.replace(",", "."))))
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved
count = float(count)
d[coverage] = count

Expand All @@ -103,7 +108,7 @@ def parse_reports(self):
self.qualimap_rnaseq_genome_results = self.ignore_samples(self.qualimap_rnaseq_genome_results)
self.qualimap_rnaseq_cov_hist = self.ignore_samples(self.qualimap_rnaseq_cov_hist)

#### Plots
# Plots
# Genomic Origin Bar Graph
# NB: Ignore 'Overlapping Exon' in report - these make the numbers add up to > 100%
if len(self.qualimap_rnaseq_genome_results) > 0:
Expand Down
91 changes: 91 additions & 0 deletions multiqc/modules/qualimap/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,94 @@
import logging
from typing import Dict, Union

from .qualimap import MultiqcModule

__all__ = ["MultiqcModule"]

log = logging.getLogger(__name__)


def parse_numerals(
preparsed_d: Dict[str, str],
float_metrics: Dict[str, str],
int_metrics: Dict[str, str],
rate_metrics: Dict[str, str],
fpath: str,
) -> Dict[str, Union[int, float, str]]:
"""
Take pre-parsed Qualimap report (keys to string values), and properly parse
numeral values, taking regional formats into account.
"""
# Determine if decimal separator is dot or comma
decimalcomma = None
for k in rate_metrics:
if k in preparsed_d:
val = preparsed_d[k]
if "," in val and "." in val:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as both . and , are "
f"found in a rational value: {val}"
)
return {}
if "," in val:
if decimalcomma is False:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = True
if "." in val:
if decimalcomma is True:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = False
if decimalcomma is None:
# All expected float numbers are integer, so attempt to instead determine the
# thousands separator from large int values.
for k in int_metrics:
if k in preparsed_d:
val = preparsed_d[k]
if "," in val and "." in val:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as both . and , are "
f"found in a rational value: {val}"
)
return {}
if "," in val:
if decimalcomma is True:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = False
if "." in val:
if decimalcomma is False:
log.error(
f"Couldn't determine decimal separator for file {fpath}, as differently formatted"
f"rational values are found"
)
return {}
decimalcomma = True
if decimalcomma is None:
log.warning(f"Couldn't determine decimal separator for file {fpath}")
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved

d = {}
for k, v in preparsed_d.items():
v = v.strip("X").strip("%")
if k in float_metrics or k in rate_metrics or k in int_metrics:
if decimalcomma is True:
v = v.replace(".", "").replace(",", ".")
v = v.replace(",", "")
if k in int_metrics:
d[int_metrics[k]] = int(v)
elif k in float_metrics:
d[float_metrics[k]] = float(v)
elif k in rate_metrics:
d[rate_metrics[k]] = float(v)

return d