Skip to content

Commit

Permalink
UMI-tools: support extract command (#2296)
Browse files Browse the repository at this point in the history
* UMI-tools: support extract command

* [automated] Update CHANGELOG.md

* Add barplot

* Barplot with extract stats

* Doc

* Changelog

* Lint

* Fix duplicated IDs in linting

* Merge conflict

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
  • Loading branch information
3 people committed Feb 19, 2024
1 parent 73dbdde commit c5ef275
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 113 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

### Module updates

- **UMI-tools**: support `extract` command ([#2296](https://github.com/MultiQC/MultiQC/pull/2296))
- **bcl2fastq**: fix the top undetermined barcodes plot ([#2340](https://github.com/MultiQC/MultiQC/pull/2340))
- **DRAGEN**: add a few coverage metrics in general stats ([#2341](https://github.com/MultiQC/MultiQC/pull/2341))
- **DRAGEN**: fix showing the number of found samples ([#2347](https://github.com/MultiQC/MultiQC/pull/2347))
Expand Down Expand Up @@ -99,7 +100,6 @@ The v1.20 release is also the first release we've had since we moved the MultiQC
- **BCL Convert**: add index, project names to sample statistics and calculate mean quality for lane statistics. ([#2261](https://github.com/MultiQC/MultiQC/pull/2261))
- **BCL Convert**: fix duplicated `yield` for 3.9.3+ when the yield is provided explicitly in Quality_Metrics ([#2253](https://github.com/MultiQC/MultiQC/pull/2253))
- **BCL Convert**: handle samples with zero yield ([#2297](https://github.com/MultiQC/MultiQC/pull/2297))
- **Bismark**: fix old link in Bismark docs ([#2252](https://github.com/MultiQC/MultiQC/pull/2252))
- **Bismark**: fix old link in docs ([#2252](https://github.com/MultiQC/MultiQC/pull/2252))
- **Cutadapt**: support JSON format ([#2281](https://github.com/MultiQC/MultiQC/pull/2281))
- **HiFiasm**: account for lines with no asterisk ([#2268](https://github.com/MultiQC/MultiQC/pull/2268))
Expand Down
2 changes: 1 addition & 1 deletion docs/modules/umitools.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ description: >

The MultiQC module for UMI-tools parses logs from `umi-tools`.

Currently only the `dedup` command is supported.
Currently, the `dedup` and `extract` commands are supported.
287 changes: 179 additions & 108 deletions multiqc/modules/umitools/umitools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import logging
import re
from typing import Dict, Optional

from multiqc import config
from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
Expand All @@ -27,125 +28,138 @@ def __init__(self):
doi="10.1101/gr.209601.116",
)

# Find and load any umitools log files
self.umitools_data = dict()
for f in self.find_log_files("umitools"):
dedup_data_by_sample = dict()
for f in self.find_log_files("umitools/dedup"):
# Parse the log file for sample name and statistics
input_fname, data = self.parse_logs(f)
if data and len(data) > 1:
# Clean the sample name
f["s_name"] = self.clean_s_name(input_fname, f)
# Log a warning if the log file matches an existing sample name
if f["s_name"] in self.umitools_data:
data = self.parse_dedup_logs(f)
if data:
f["s_name"] = self._parse_s_name(f) or f["s_name"]
if f["s_name"] in dedup_data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
# Store the data in the overall data dictionary
self.umitools_data[f["s_name"]] = data
# Add the log file information to the multiqc_sources.txt
self.add_data_source(f)
# Add version info
self.add_software_version(data["version"], f["s_name"])
dedup_data_by_sample[f["s_name"]] = data
self.add_data_source(f) # Add the log file information to the multiqc_sources.txt
if "version" in data: # Add version info
self.add_software_version(data.pop("version"), f["s_name"])

# Check the log files against the user supplied list of samples to ignore
self.umitools_data = self.ignore_samples(self.umitools_data)
extract_data_by_sample = dict()
for f in self.find_log_files("umitools/extract"):
# Parse the log file for sample name and statistics
data = self.parse_extract_logs(f)
if data:
f["s_name"] = self._parse_s_name(f) or f["s_name"]
if f["s_name"] in extract_data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
extract_data_by_sample[f["s_name"]] = data
self.add_data_source(f) # Add the log file information to the multiqc_sources.txt
if "version" in data: # Add version info
self.add_software_version(data.pop("version"), f["s_name"])

# If no files are found, raise an exception
if len(self.umitools_data) == 0:
# Check the log files against the user supplied list of samples to ignore
dedup_data_by_sample = self.ignore_samples(dedup_data_by_sample)
extract_data_by_sample = self.ignore_samples(extract_data_by_sample)
if len(dedup_data_by_sample) == 0 and len(extract_data_by_sample) == 0:
raise ModuleNoSamplesFound
log.info(f"Found {len(dedup_data_by_sample)} deduplication reports")
log.info(f"Found {len(extract_data_by_sample)} extract reports")

# Log the number of reports found
log.info(f"Found {len(self.umitools_data)} reports")

# Write parsed report data to a file
self.write_data_file(self.umitools_data, "multiqc_umitools")
# Write parsed reports data to a file
self.write_data_file(dedup_data_by_sample, "multiqc_umitools_dedup")
self.write_data_file(extract_data_by_sample, "multiqc_umitools_extract")

# write data to the general statistics table
self.umitools_general_stats_table()

# add a section with a deduplicated reads plot to the report
self.add_section(
name="Deduplicated Reads",
anchor="umitools-dedup-plot",
plot=self.umitools_deduplication_plot(),
)

# add a section with a beeswarm plot of UMI stats to the report
self.add_section(
name="UMI Stats",
anchor="umitools-umi-stats",
description="Statistics from running `umi_tools dedeup`",
helptext="""
- **Positions Dedup**: Total number of positions deduplicated
- **Total UMIs**: Total UMIs found in sample
- **Unique UMIs**: Unique UMIs found in sample
- **Mean #UMI**: Mean number of unique UMIs per position
- **Max #UMI**: Max number of unique UMIs per position
""",
plot=self.umitools_umi_stats_swarm(),
)
if dedup_data_by_sample:
self.dedup_general_stats_table(dedup_data_by_sample)
self.deduplication_plot(dedup_data_by_sample)
self.umi_stats_violin(dedup_data_by_sample)

def parse_logs(self, f):
# Check this is a dedup log
if "# output generated by dedup" not in f["f"]:
log.debug(f"Skipping as not a dedup log: {f['fn']}")
return None, None
if extract_data_by_sample:
self.extract_general_stats_table(extract_data_by_sample)
self.umitools_extract_barplot(extract_data_by_sample)

def _parse_s_name(self, f) -> Optional[str]:
# Get the s_name from the input file if possible
# stdin : <_io.TextIOWrapper name='M18-39155_T1.Aligned.sortedByCoord.out.bam' mode='r' encoding='UTF-8'>
s_name_re = r"stdin\s+:\s+<_io\.TextIOWrapper name='([^\']+)'"
s_name_match = re.search(s_name_re, f["f"])
if s_name_match:
s_name = s_name_match.group(1)
else:
s_name = f["s_name"]
return self.clean_s_name(s_name_match.group(1))
return None

# Initialise a dictionary to hold the data from this log file
data = {}
@staticmethod
def parse_dedup_logs(f) -> Dict:
regexes = [
(int, "total_umis", r"INFO total_umis (\d+)"),
(int, "unique_umis", r"INFO #umis (\d+)"),
(int, "dedup_input_reads", r"INFO Reads: Input Reads: (\d+)"),
(int, "dedup_output_reads", r"INFO Number of reads out: (\d+)"),
(int, "positions_deduplicated", r"INFO Total number of positions deduplicated: (\d+)"),
(float, "mean_umi_per_pos", r"INFO Mean number of unique UMIs per position: ([\d\.]+)"),
(float, "max_umi_per_pos", r"INFO Max. number of unique UMIs per position: (\d+)"),
(str, "version", r"# UMI-tools version: ([\d\.]+)"),
]

data = {}
# Search for values using regular expressions
regexes = {
"total_umis": r"INFO total_umis (\d+)",
"unique_umis": r"INFO #umis (\d+)",
"input_reads": r"INFO Reads: Input Reads: (\d+)",
"output_reads": r"INFO Number of reads out: (\d+)",
"positions_deduplicated": r"INFO Total number of positions deduplicated: (\d+)",
"mean_umi_per_pos": r"INFO Mean number of unique UMIs per position: ([\d\.]+)",
"max_umi_per_pos": r"INFO Max. number of unique UMIs per position: (\d+)",
"version": r"# UMI-tools version: ([\d\.]+)",
}
for key, regex in regexes.items():
for type_, key, regex in regexes:
re_matches = re.search(regex, f["f"])
if re_matches:
if key == "version":
data[key] = re_matches.group(1)
else:
data[key] = float(re_matches.group(1))
data[key] = type_(re_matches.group(1))

# calculate a few simple supplementary stats
try:
data["percent_passing_dedup"] = round(((data["output_reads"] / data["input_reads"]) * 100.0), 2)
except (KeyError, ZeroDivisionError):
pass
try:
data["removed_reads"] = data["input_reads"] - data["output_reads"]
except KeyError:
pass
# Calculate a few simple supplementary stats
try:
data["dedup_percent_passing"] = round(
((data["dedup_output_reads"] / data["dedup_input_reads"]) * 100.0), 2
)
except (KeyError, ZeroDivisionError):
pass
try:
data["dedup_removed_reads"] = data["dedup_input_reads"] - data["dedup_output_reads"]
except KeyError:
pass

return s_name, data
return data

@staticmethod
def parse_extract_logs(f) -> Dict:
regexes = [
(int, "extract_input_reads", r"INFO Input Reads: (\d+)"),
(int, "read1_mismatch", r"INFO regex does not match read1: (\d+)"),
(int, "read1_match", r"INFO regex matches read1: (\d+)"),
(int, "read2_mismatch", r"INFO regex does not match read2: (\d+)"),
(int, "read2_match", r"INFO regex matches read2: (\d+)"),
(int, "extract_output_reads", r"INFO Reads output: (\d+)"),
(str, "version", r"# UMI-tools version: ([\d\.]+)"),
]

data = {}
# Search for values using regular expressions
for type_, key, regex in regexes:
re_matches = re.search(regex, f["f"])
if re_matches:
if key == "version":
data[key] = re_matches.group(1)
else:
data[key] = type_(re_matches.group(1))

def umitools_general_stats_table(self):
"""Take the parsed stats from the umitools report and add it to the
basic stats table at the top of the report"""
return data

def dedup_general_stats_table(self, data_by_sample):
"""
Take the parsed stats from the umitools report and add it to the
basic stats table at the top of the report
"""
headers = {
"output_reads": {
"dedup_output_reads": {
"title": f"{config.read_count_prefix} Unique Reads",
"description": f"Reads remaining after deduplication ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"shared_key": "read_count",
"scale": "PuRd",
},
"percent_passing_dedup": {
"dedup_percent_passing": {
"title": "% Pass Dedup",
"description": "% processed reads that passed deduplication",
"max": 100,
Expand All @@ -154,50 +168,93 @@ def umitools_general_stats_table(self):
"scale": "RdYlGn",
},
}
self.general_stats_addcols(self.umitools_data, headers)
self.general_stats_addcols(data_by_sample, headers)

def extract_general_stats_table(self, data_by_sample):
headers = {
"extract_output_reads": {
"title": f"{config.read_count_prefix} Extracted Reads",
"description": f"Reads remaining after extraction ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"shared_key": "read_count",
"scale": "PuRd",
},
}
self.general_stats_addcols(data_by_sample, headers)

def umitools_deduplication_plot(self):
def deduplication_plot(self, data_by_sample):
"""Generate a plot the read deduplication rates for the main report"""

# Specify the order of the different possible categories
keys = {
"output_reads": {"color": "#7fc97f", "name": "Reads remaining"},
"removed_reads": {"color": "#fdc086", "name": "Reads removed"},
"dedup_output_reads": {"color": "#7fc97f", "name": "Reads remaining"},
"dedup_removed_reads": {"color": "#fdc086", "name": "Reads removed"},
}

# Config for the plot
config = {
"id": "umitools_deduplication",
"title": "UMI-tools: Deduplication Counts",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
return self.add_section(
name="Deduplicated Reads",
anchor="umitools-dedup-plot",
plot=bargraph.plot(
data_by_sample,
keys,
{
"id": "umitools_deduplication_barplot",
"title": "UMI-tools: Deduplication Counts",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
},
),
)

def umitools_extract_barplot(self, data_by_sample):
keys = {
"read1_match": {"color": "#7fc9c7", "name": "Read1 Match"},
"read1_mismatch": {"color": "#fd9286", "name": "Read1 Mismatch"},
"read2_match": {"color": "#7f89c9", "name": "Read2 Match"},
"read2_mismatch": {"color": "#fd86aa", "name": "Read2 Mismatch"},
}

return bargraph.plot(self.umitools_data, keys, config)
# add a section with a beeswarm plot of UMI stats to the report
self.add_section(
name="Extract Stats",
anchor="umitools_extract",
description="Read stats from `umi_tools extract`",
plot=bargraph.plot(
data_by_sample,
keys,
{
"id": "umitools_extract_barplot",
"title": "UMI-tools: Extract Stats",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
},
),
)

def umitools_umi_stats_swarm(self):
"""Generate a swarmplot of umi stats for the main report"""
def umi_stats_violin(self, data_by_sample):
"""Generate a violin of UMI stats for the main report"""

headers = {
"positions_deduplicated": {
"title": "Positions Dedup",
"description": "Total number of positions deduplicated",
"min": 0,
"format": "{:,.0f}",
"format": "{:,d}",
"scale": "Greens",
},
"total_umis": {
"title": "Total UMIs",
"description": "Total UMIs found in sample",
"min": 0,
"format": "{:,.0f}",
"format": "{:,d}",
"scale": "Blues",
},
"unique_umis": {
"title": "Unique UMIs",
"description": "Unique UMIs found in sample",
"min": 0,
"format": "{:,.0f}",
"format": "{:,d}",
"scale": "Purples",
},
"mean_umi_per_pos": {
Expand All @@ -216,10 +273,24 @@ def umitools_umi_stats_swarm(self):
},
}

# Config for the table
config = {
"id": "umitools_stats_swarmplot",
"table_title": "UMI-tools: UMI stats",
}

return beeswarm.plot(self.umitools_data, headers, config)
# add a section with a beeswarm plot of UMI stats to the report
self.add_section(
name="UMI Stats",
anchor="umitools-umi-stats",
description="Statistics from running `umi_tools dedup` or `umi_tools extract`",
helptext="""
- **Positions Dedup**: Total number of positions deduplicated
- **Total UMIs**: Total UMIs found in sample
- **Unique UMIs**: Unique UMIs found in sample
- **Mean #UMI**: Mean number of unique UMIs per position
- **Max #UMI**: Max number of unique UMIs per position
""",
plot=beeswarm.plot(
data_by_sample,
headers,
{
"id": "umitools_stats_violin",
"table_title": "UMI-tools: UMI stats",
},
),
)

0 comments on commit c5ef275

Please sign in to comment.