Conpair concordance: account for different format when run with `--outfile` (#1855)

* Fix Conpair module and search pattern for concordance * Updating CHANGELOG * CHANGELOG update * Linting update --------- Co-authored-by: Edouard Henrion <edouard.henrion@mcgill.ca> Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
MultiQC · Aug 30, 2023 · d783223 · d783223
1 parent baaee71
commit d783223
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -69,6 +69,8 @@ for more information.
 
 ### Module updates
 
+- **Conpair**
+  - Bugfix: find and properly parse the `concordance` output of Conpair, which can be written in two different formats depending on whether it is run with or without `--outfile` ([#1851](https://github.com/ewels/MultiQC/issues/1851))
 - **Cell Ranger**
   - Bugfix: avoid multiple `KeyError` exceptions when parsing Cell Ranger 7.x `web_summary.html` ([#1853](https://github.com/ewels/MultiQC/issues/1853), [#1871](https://github.com/ewels/MultiQC/issues/1871))
 - **DRAGEN**

diff --git a/multiqc/modules/conpair/conpair.py b/multiqc/modules/conpair/conpair.py
@@ -55,21 +55,25 @@ def parse_conpair_logs(self, f):
         One parser to rule them all."""
 
         conpair_regexes = {
-            "concordance_concordance": r"Concordance: ([\d\.]+)%",
-            "concordance_used_markers": r"Based on (\d+)/\d+ markers",
-            "concordance_total_markers": r"Based on \d+/(\d+) markers",
-            "concordance_marker_threshold": r"\(coverage per marker threshold : (\d+) reads\)",
-            "concordance_min_mapping_quality": r"Minimum mappinq quality: (\d+)",
-            "concordance_min_base_quality": r"Minimum base quality: (\d+)",
-            "contamination_normal": r"Normal sample contamination level: ([\d\.]+)%",
-            "contamination_tumor": r"Tumor sample contamination level: ([\d\.]+)%",
+            "concordance_concordance": [r"Concordance: ([\d\.]+)%\nBased.*", r"(\d\.\d+)\nBased.*"],
+            "concordance_used_markers": [r"Based on (\d+)/\d+ markers"],
+            "concordance_total_markers": [r"Based on \d+/(\d+) markers"],
+            "concordance_marker_threshold": [r"\(coverage per marker threshold : (\d+) reads\)"],
+            "concordance_min_mapping_quality": [r"Minimum mappinq quality: (\d+)"],
+            "concordance_min_base_quality": [r"Minimum base quality: (\d+)"],
+            "contamination_normal": [r"Normal sample contamination level: ([\d\.]+)%"],
+            "contamination_tumor": [r"Tumor sample contamination level: ([\d\.]+)%"],
         }
 
         parsed_data = {}
-        for k, r in conpair_regexes.items():
-            match = re.search(r, f["f"])
-            if match:
-                parsed_data[k] = float(match.group(1))
+        for k, r_arr in conpair_regexes.items():
+            for r in r_arr:
+                match = re.search(r, f["f"])
+                if match:
+                    parsed_data[k] = float(match.group(1))
+                    if k == "concordance_concordance" and not "Concordance" in r:
+                        parsed_data[k] = 100 * parsed_data[k]
+                    break
 
         def _cp_type(data):
             if "concordance_concordance" in parsed_data:

diff --git a/multiqc/utils/search_patterns.yaml b/multiqc/utils/search_patterns.yaml
@@ -230,7 +230,7 @@ clusterflow/runfiles:
   contents: "Cluster Flow Run File"
   num_lines: 2
 conpair/concordance:
-  contents: "markers (coverage per marker threshold : "
+  contents_re: '.*markers \(coverage per marker threshold\s?: .*'
   num_lines: 3
 conpair/contamination:
   contents: "Tumor sample contamination level: "