Picard: directly support Sentieon #2110

Merged · 39 commits · Nov 13, 2023
Changes from 34 commits

Commits (39 total)
d3820c2
Reuse picard AlignmentSummary and GcBias funcs for Sentieon
vladsavelyev Oct 11, 2023
6e05dbb
[automated] Update CHANGELOG.md
multiqc-bot Oct 12, 2023
b8a147e
Refactor to reuse all Picard modules for Sentieon
vladsavelyev Oct 12, 2023
5a6607a
Refactor more
vladsavelyev Oct 13, 2023
01eb9aa
WIP
vladsavelyev Oct 17, 2023
63840c8
Merge branch 'master' into sentieon-picard
vladsavelyev Oct 19, 2023
14f9a37
Remover sentieon module, make sure picard module supports both
vladsavelyev Oct 19, 2023
a80612e
Check tool/algo name as well
vladsavelyev Oct 19, 2023
3136dcf
Fix GcBiasMetrics
vladsavelyev Oct 19, 2023
95ea614
Adjust WgsMetrics, MarkDuplicates, InsertSizeMetrics, GcBiasMetrics
vladsavelyev Oct 19, 2023
5ec03b7
Fix hists
vladsavelyev Oct 19, 2023
6bc179a
Adjust Quality* modules
vladsavelyev Oct 19, 2023
aa0880e
Fixes
vladsavelyev Oct 19, 2023
a9d4ac1
Add add_software_version calls
vladsavelyev Oct 19, 2023
3b0db32
Clean up
vladsavelyev Oct 26, 2023
5c7ffef
WIP: remove shared. Remove class methods. Add docs
vladsavelyev Oct 27, 2023
9c02791
Merge branch 'master' into sentieon-picard
vladsavelyev Oct 30, 2023
1c9a17b
WIP
vladsavelyev Oct 30, 2023
67e753b
Finalise Picard refactoring
vladsavelyev Nov 1, 2023
21ba5ac
Remove "shared"
vladsavelyev Nov 1, 2023
101d3da
Update changelog
vladsavelyev Nov 1, 2023
b4e3a37
Fixes
vladsavelyev Nov 2, 2023
7b5697a
Add MEDIAN_TARGET_COVERAGE to HsMetrics. Fix RnaSeqMetrics regex
vladsavelyev Nov 2, 2023
86c360b
Docs
vladsavelyev Nov 2, 2023
26e17c7
Merge branch 'master' into sentieon-picard
vladsavelyev Nov 2, 2023
4a52b2f
Merge branch 'master' into sentieon-picard
vladsavelyev Nov 3, 2023
b88eb1b
Fix sorting tables
vladsavelyev Nov 4, 2023
37e314d
Merge branch 'master' into sentieon-picard
vladsavelyev Nov 4, 2023
64cd486
Update CSP
vladsavelyev Nov 4, 2023
fc0d054
Merge branch 'master' into sentieon-picard
vladsavelyev Nov 6, 2023
421f185
Merge branch 'master' into sentieon-picard
vladsavelyev Nov 8, 2023
e22ac34
Document sample naming
vladsavelyev Nov 8, 2023
2ea75ea
More refactor
vladsavelyev Nov 8, 2023
5070476
Merge reports of dif types in IlluminaLaneMetrics, readd LANE in Extr…
vladsavelyev Nov 8, 2023
8c28442
Add CHANGELOG highlights
vladsavelyev Nov 13, 2023
3065e2f
Remove more sentieon references
vladsavelyev Nov 13, 2023
444d1ce
More highlights
vladsavelyev Nov 13, 2023
a0a4122
More highlights
vladsavelyev Nov 13, 2023
850501a
Merge branch 'master' into sentieon-picard
vladsavelyev Nov 13, 2023
2 changes: 2 additions & 0 deletions CHANGELOG.md
@@ -19,6 +19,8 @@
- **Pangolin**: update for v4: add QC Note, update tool versions columns ([#2157](https://github.com/ewels/MultiQC/pull/2157))
- **fastp**: add version parsing ([#2159](https://github.com/ewels/MultiQC/pull/2159))
- **fastp**: correctly parse sample name from --in1/--in2 command. Prefer file name if not `fastp.json`; fallback to file name when error ([#2139](https://github.com/ewels/MultiQC/pull/2139))
- **Picard**: Generalize to directly support Sentieon and Parabricks outputs ([#2110](https://github.com/ewels/MultiQC/pull/2110))
- **Sentieon**: Removed the module in favour of supporting its outputs directly in the **Picard** module. Note that `-m sentieon` will no longer work, and exported plot and data files will be prefixed `picard` instead of `sentieon`. The Sentieon module used to take sample names from file names by default; it now follows the Picard module's logic and prioritises the commands recorded in the logs. To override, use the `use_filename_as_sample_name` config flag ([#2110](https://github.com/ewels/MultiQC/pull/2110))

## [MultiQC v1.17](https://github.com/ewels/MultiQC/releases/tag/v1.17) - 2023-10-17

2 changes: 1 addition & 1 deletion CSP.txt
@@ -3,7 +3,7 @@

script-src 'self'
# 1.18
'sha256-W6HVWfJ0uI4smLSf8gzVvnrHUJaXGP/HEeBvr0SYfyk=' # multiqc/templates/default/assets/js/multiqc_tables.js
'sha256-aY1YMeLr1IxkwxjBe0x60QzbuT4u5Mh/QC6brcAN9Do=' # multiqc/templates/default/assets/js/multiqc_tables.js

# 1.17
'sha256-krKzkLmKjisEgw0YGKglUFqLmEh6sK08Qw6xPmmo/10=' # ////////////////////////////////////////////////// Base JS for MultiQC Reports//
23 changes: 23 additions & 0 deletions docs/modules/picard.md
@@ -202,3 +202,26 @@ picard_config:
This will omit that section from the report entirely, and also skip parsing the histogram data.
By specifying this option you may speed up the run time for MultiQC with these types of files
significantly.

### Sample names

MultiQC supports outputs from multiple runs of a Picard tool merged together into one
file. To handle data for multiple samples in one file correctly, MultiQC needs to
take the sample name from somewhere other than the file name. For this reason, it
attempts to parse the command line recorded in the output header. For example, an
output from the `GcBias` tool contains a header line like this:

```
# net.sf.picard.analysis.CollectGcBiasMetrics REFERENCE_SEQUENCE=/reference/genome.fa
INPUT=/alignments/P0001_101/P0001_101.bam OUTPUT=P0001_101.collectGcBias.txt ...
```

MultiQC extracts the BAM file name that follows `INPUT=` and takes `P0001_101` as
the sample name. If MultiQC fails to parse the command line for any reason, it
falls back to using the file name. You can also force file names to be used as
sample names by enabling the following config option:

```yaml
picard_config:
s_name_filenames: true
```
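The header parsing described above can be sketched with the same `INPUT=` regular expression that appears in the module's earlier implementation. This is a minimal illustration only; the real module additionally runs the result through `clean_s_name` for fuller name cleaning:

```python
import os
import re


def sample_name_from_header(header_line):
    """Extract a sample name from the INPUT= argument of a recorded
    Picard command line. Returns None if no INPUT argument is found,
    in which case the caller falls back to the file name."""
    m = re.search(r"INPUT(?:=|\s+)(\[?[^\s]+\]?)", header_line, flags=re.IGNORECASE)
    if not m:
        return None
    path = m.group(1).strip("[]")
    # Keep just the base file name, minus its extension
    return os.path.splitext(os.path.basename(path))[0]


line = (
    "# net.sf.picard.analysis.CollectGcBiasMetrics "
    "REFERENCE_SEQUENCE=/reference/genome.fa "
    "INPUT=/alignments/P0001_101/P0001_101.bam "
    "OUTPUT=P0001_101.collectGcBias.txt"
)
print(sample_name_from_header(line))  # P0001_101
```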
2 changes: 1 addition & 1 deletion multiqc/modules/base_module.py
@@ -142,7 +142,7 @@ def find_log_files(self, sp_key, filecontents=True, filehandles=False):

# Filter out files based on exclusion patterns
if path_filters_exclude and len(path_filters_exclude) > 0:
# Try both the given path and also the path prefixed with the analyis dirs
# Try both the given path and also the path prefixed with the analysis dirs
exlusion_hits = itertools.chain(
(fnmatch.fnmatch(report.last_found_file, pfe) for pfe in path_filters_exclude),
*(
10 changes: 1 addition & 9 deletions multiqc/modules/biobambam2/biobambam2.py
@@ -30,15 +30,7 @@ def __init__(self):
self.general_stats_data = dict()
n = dict()

n["bamsormadup"] = MarkDuplicates.parse_reports(
self,
log_key="biobambam2/bamsormadup",
section_name="bamsormadup",
section_anchor="biobambam2-bamsormadup",
plot_title="biobambam2: bamsormadup deduplication stats",
plot_id="biobambam2_bamsormadup_plot",
data_filename="bamsormadup_bamsormadup",
)
n["bamsormadup"] = MarkDuplicates.parse_reports(self, "bamsormadup")
if n["bamsormadup"] > 0:
log.info("Found {} bamsormadup reports".format(n["bamsormadup"]))

Expand Down
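The biobambam2 change above replaces a handful of explicit keyword arguments with defaults derived from the tool name. A hypothetical sketch of that pattern (the function and dict keys here are illustrative, not the module's actual helper):

```python
def derive_section_settings(module_anchor, tool_name):
    """Build the section labels that the old call sites spelled out by
    hand, deriving each one from the module anchor and tool name."""
    return {
        "log_key": f"{module_anchor}/{tool_name}",
        "section_name": tool_name,
        "section_anchor": f"{module_anchor}-{tool_name}",
        "plot_id": f"{module_anchor}_{tool_name}_plot",
        "data_filename": f"{tool_name}_{tool_name}",
    }


# Reproduces the values the old explicit call passed for bamsormadup
settings = derive_section_settings("biobambam2", "bamsormadup")
print(settings["section_anchor"])  # biobambam2-bamsormadup
```

With defaults derived this way, a call site shrinks to `MarkDuplicates.parse_reports(self, "bamsormadup")` as in the diff.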
271 changes: 136 additions & 135 deletions multiqc/modules/picard/AlignmentSummaryMetrics.py
@@ -1,156 +1,157 @@
""" MultiQC submodule to parse output from Picard AlignmentSummaryMetrics """

import logging
import os
import re
from collections import OrderedDict

from multiqc.modules.picard import util
from multiqc.plots import bargraph

# Initialise the logger
log = logging.getLogger(__name__)


def parse_reports(self):
def parse_reports(module):
"""Find Picard AlignmentSummaryMetrics reports and parse their data"""

# Set up vars
self.picard_alignment_metrics = dict()
data_by_sample = dict()

# Go through logs and find Metrics
for f in self.find_log_files("picard/alignment_metrics", filehandles=True):
parsed_data = dict()
s_name = None
for f in module.find_log_files(f"{module.anchor}/alignment_metrics", filehandles=True):
# Sample name from input file name by default.
s_name = f["s_name"]
keys = None
for l in f["f"]:
# New log starting
if "AlignmentSummaryMetrics" in l and "INPUT" in l:
s_name = None

for line in f["f"]:
maybe_s_name = util.extract_sample_name(
module,
line,
f,
picard_tool="CollectAlignmentSummaryMetrics",
sentieon_algo="AlignmentStat",
)
if maybe_s_name:
s_name = maybe_s_name
keys = None
# Pull sample name from input
fn_search = re.search(r"INPUT(?:=|\s+)(\[?[^\s]+\]?)", l, flags=re.IGNORECASE)
if fn_search:
s_name = os.path.basename(fn_search.group(1).strip("[]"))
s_name = self.clean_s_name(s_name, f)
parsed_data[s_name] = dict()

if s_name is not None:
if "AlignmentSummaryMetrics" in l and "## METRICS CLASS" in l:
keys = f["f"].readline().strip("\n").split("\t")
elif keys:
vals = l.strip("\n").split("\t")
if len(vals) == len(keys):
# Ignore the FIRST_OF_PAIR / SECOND_OF_PAIR data to simplify things
if vals[0] == "PAIR" or vals[0] == "UNPAIRED":
for i, k in enumerate(keys):
try:
parsed_data[s_name][k] = float(vals[i])
except ValueError:
parsed_data[s_name][k] = vals[i]
else:
s_name = None
keys = None

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None, s_name)

# Remove empty dictionaries
for s_name in list(parsed_data.keys()):
if len(parsed_data[s_name]) == 0:
parsed_data.pop(s_name, None)

# Manipulate sample names if multiple baits found
for s_name in parsed_data.keys():
if s_name in self.picard_alignment_metrics:
log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f["fn"], s_name))
self.add_data_source(f, s_name, section="AlignmentSummaryMetrics")
self.picard_alignment_metrics[s_name] = parsed_data[s_name]

if s_name is None:
continue

if util.is_line_right_before_table(
line, picard_class="AlignmentSummaryMetrics", sentieon_algo="AlignmentStat"
):
if s_name in data_by_sample:
log.debug(f"Duplicate sample name found in {f['fn']}! Overwriting: " f"{s_name}")
data_by_sample[s_name] = dict()
module.add_data_source(f, s_name, section="AlignmentSummaryMetrics")
keys = f["f"].readline().strip("\n").split("\t")

elif keys:
vals = line.strip("\n").split("\t")
if len(vals) != len(keys):
keys = None
continue

# Ignore the FIRST_OF_PAIR / SECOND_OF_PAIR data to simplify things
if vals[0] == "PAIR" or vals[0] == "UNPAIRED":
for k, v in zip(keys, vals):
try:
v = float(v)
except ValueError:
pass
data_by_sample[s_name][k] = v

# Filter to strip out ignored sample names
self.picard_alignment_metrics = self.ignore_samples(self.picard_alignment_metrics)

if len(self.picard_alignment_metrics) > 0:
# Write parsed data to a file
self.write_data_file(self.picard_alignment_metrics, "multiqc_picard_AlignmentSummaryMetrics")

# Add to general stats table
self.general_stats_headers["PCT_PF_READS_ALIGNED"] = {
"title": "% Aligned",
"description": "Percent of aligned reads",
"max": 100,
"min": 0,
"suffix": "%",
"format": "{:,.0f}",
"scale": "RdYlGn",
"modify": lambda x: self.multiply_hundred(x),
}
for s_name in self.picard_alignment_metrics:
if s_name not in self.general_stats_data:
self.general_stats_data[s_name] = dict()
self.general_stats_data[s_name].update(self.picard_alignment_metrics[s_name])

# Make the bar plot of alignment read count + # aligned bases
pdata = dict()
for s_name in self.picard_alignment_metrics.keys():
pdata[s_name] = dict()
# Picard reports both reads for PE data. Divide it by two as most people will expect # clusters
if self.picard_alignment_metrics[s_name]["CATEGORY"] == "PAIR":
pdata[s_name]["total_reads"] = self.picard_alignment_metrics[s_name]["TOTAL_READS"] / 2
pdata[s_name]["aligned_reads"] = self.picard_alignment_metrics[s_name]["PF_READS_ALIGNED"] / 2
else:
pdata[s_name]["total_reads"] = self.picard_alignment_metrics[s_name]["TOTAL_READS"]
pdata[s_name]["aligned_reads"] = self.picard_alignment_metrics[s_name]["PF_READS_ALIGNED"]
pdata[s_name]["unaligned_reads"] = pdata[s_name]["total_reads"] - pdata[s_name]["aligned_reads"]

keys = [OrderedDict(), OrderedDict()]
keys[0]["aligned_reads"] = {"name": "Aligned Reads"}
keys[0]["unaligned_reads"] = {"name": "Unaligned Reads"}
keys[1]["PF_ALIGNED_BASES"] = {"name": "Aligned Bases"}

# Config for the plot
pconfig = {
"id": "picard_alignment_summary",
"title": "Picard: Alignment Summary",
"ylab": "# Reads",
"data_labels": [
{
"name": "Aligned Reads",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
},
{
"name": "Aligned Bases",
"ylab": "# Bases",
"cpswitch_counts_label": "Number of Bases",
},
],
}

# The different data sets we want to plot
self.add_section(
name="Alignment Summary",
anchor="picard-alignmentsummary",
description="Please note that Picard's read counts are divided by two for paired-end data. Total bases (including unaligned) is not provided.",
plot=bargraph.plot([pdata, self.picard_alignment_metrics], keys, pconfig),
)

# Make a bar plot of mean read length
keys = {"MEAN_READ_LENGTH": {"name": "Mean Read Length"}}
pconfig = {
"id": "picard_alignment_readlength_plot",
"title": "Picard: Mean Read Length",
"ylab": "Base pairs",
"cpswitch": False,
}

# The different data sets we want to plot
self.add_section(
name="Mean read length",
anchor="picard_alignment_readlength",
description="The mean read length of the set of reads examined.",
plot=bargraph.plot(self.picard_alignment_metrics, keys, pconfig),
)
data_by_sample = module.ignore_samples(data_by_sample)
if len(data_by_sample) == 0:
return 0

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
module.add_software_version(None)

# Write parsed data to a file
module.write_data_file(data_by_sample, f"multiqc_{module.anchor}_AlignmentSummaryMetrics")

# Add to general stats table
module.general_stats_headers["PCT_PF_READS_ALIGNED"] = {
"title": "% Aligned",
"description": "Percent of aligned reads",
"max": 100,
"min": 0,
"suffix": "%",
"format": "{:,.0f}",
"scale": "RdYlGn",
"modify": lambda x: util.multiply_hundred(x),
}
for s_name in data_by_sample:
if s_name not in module.general_stats_data:
module.general_stats_data[s_name] = dict()
module.general_stats_data[s_name].update(data_by_sample[s_name])

# Make the bar plot of alignment read count + # aligned bases
pdata = dict()
for s_name in data_by_sample.keys():
pdata[s_name] = dict()
# Picard reports both reads for PE data. Divide it by two as most people will
# expect # clusters
if data_by_sample[s_name]["CATEGORY"] == "PAIR":
pdata[s_name]["total_reads"] = data_by_sample[s_name]["TOTAL_READS"] / 2
pdata[s_name]["aligned_reads"] = data_by_sample[s_name]["PF_READS_ALIGNED"] / 2
else:
pdata[s_name]["total_reads"] = data_by_sample[s_name]["TOTAL_READS"]
pdata[s_name]["aligned_reads"] = data_by_sample[s_name]["PF_READS_ALIGNED"]
pdata[s_name]["unaligned_reads"] = pdata[s_name]["total_reads"] - pdata[s_name]["aligned_reads"]

keys = [OrderedDict(), OrderedDict()]
keys[0]["aligned_reads"] = {"name": "Aligned Reads"}
keys[0]["unaligned_reads"] = {"name": "Unaligned Reads"}
keys[1]["PF_ALIGNED_BASES"] = {"name": "Aligned Bases"}

# Config for the plot
pconfig = {
"id": f"{module.anchor}_alignment_summary",
"title": f"{module.name}: Alignment Summary",
"ylab": "# Reads",
"data_labels": [
{
"name": "Aligned Reads",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
},
{
"name": "Aligned Bases",
"ylab": "# Bases",
"cpswitch_counts_label": "Number of Bases",
},
],
}

# The different data sets we want to plot
module.add_section(
name="Alignment Summary",
anchor=f"{module.anchor}-alignmentsummary",
description=f"Please note that {module.name}'s read counts are divided by two "
f"for paired-end data. Total bases (including unaligned) is not "
f"provided.",
plot=bargraph.plot([pdata, data_by_sample], keys, pconfig),
)

# Make a bar plot of mean read length
keys = {"MEAN_READ_LENGTH": {"name": "Mean Read Length"}}
pconfig = {
"id": f"{module.anchor}_alignment_readlength_plot",
"title": f"{module.name}: Mean Read Length",
"ylab": "Base pairs",
"cpswitch": False,
}

# The different data sets we want to plot
module.add_section(
name="Mean read length",
anchor=f"{module.anchor}_alignment_readlength",
description="The mean read length of the set of reads examined.",
plot=bargraph.plot(data_by_sample, keys, pconfig),
)

# Return the number of detected samples to the parent module
return len(self.picard_alignment_metrics)
return len(data_by_sample)
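The rewritten parser pairs header keys with row values and coerces numerics with a try/except, leaving non-numeric columns (such as `CATEGORY`) as strings. That pattern, extracted into a standalone sketch:

```python
def coerce_row(keys, vals):
    """Pair metric-table header keys with one row of values, converting
    numeric fields to float and leaving everything else as a string."""
    row = {}
    for k, v in zip(keys, vals):
        try:
            v = float(v)
        except ValueError:
            pass  # non-numeric field, e.g. the CATEGORY column
        row[k] = v
    return row


print(coerce_row(["CATEGORY", "TOTAL_READS"], ["PAIR", "2000"]))
# {'CATEGORY': 'PAIR', 'TOTAL_READS': 2000.0}
```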