NanoStat: support new output format (#1997)

* NanoStat: support new output format. Fixes #1995
* Docs: more mention of alternative tool names. This means that anyone who searches the modules listing page for these names will find this module.
* Split legacy parsing into its own function

---------

Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
MultiQC · Aug 28, 2023 · a5a3129 · a5a3129
1 parent bfaeb6b
commit a5a3129
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 17 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,8 @@
   - Fix a bug happening when both `region` and `global` coverage histograms for a sample are available (i.e. when mosdepth was run with `--by`, see [mosdepth docs](https://github.com/brentp/mosdepth#usage)). In this case, data was effectively merged. Instead, summarise it separately and add a separate report section for the region-based coverage data.
 - **RSeQC**
   - Fix "max() arg is an empty sequence" error ([#1985](https://github.com/ewels/MultiQC/issues/1985))
+- **NanoStat**
+  - Support new format ([#1995](https://github.com/ewels/MultiQC/issues/1995)).
 - **DRAGEN**
   - Make DRAGEN module use `fn_clean_exts` instead of hardcoded file names. Fixes working with arbitrary file names ([#1865])
 - **WhatsHap**

diff --git a/docs/modules/nanostat.md b/docs/modules/nanostat.md
@@ -2,10 +2,12 @@
 name: NanoStat
 url: https://github.com/wdecoster/nanostat
 description: >
-  Calculate various statistics from a long read sequencing dataset in fastq, bam or albacore
-  sequencing summary format.
+  Calculate various statistics from a long read sequencing dataset in FastQ, BAM or albacore
+  sequencing summary format (supports NanoPack: NanoPlot, NanoComp).
 ---
 
 The nanostat module parses text output generated by
 [NanoStat](https://github.com/wdecoster/nanostat/), a program for summarising results of sequencing
-on Oxford Nanopore methods (MinION, PromethION etc.)
+on Oxford Nanopore methods (MinION, PromethION etc.).
+
+Note that the program is now deprecated; however, the tools from the [NanoPack family](https://github.com/wdecoster/nanopack/) (NanoPlot, NanoComp) still call the same code indirectly.
diff --git a/multiqc/modules/nanostat/nanostat.py b/multiqc/modules/nanostat/nanostat.py
@@ -16,19 +16,21 @@
 class MultiqcModule(BaseMultiqcModule):
     """NanoStat module"""
 
-    _KEYS_NUM = [
-        "Active channels",
-        "Number of reads",
-        "Total bases",
-        "Total bases aligned",
-        "Read length N50",
-        "Mean read length",
-        "Median read length",
-        "Median read quality",
-        "Mean read quality",
-        "Average percent identity",
-        "Median percent identity",
-    ]
+    _KEYS_MAPPING = {
+        "number_of_reads": "Number of reads",
+        "number_of_bases": "Total bases",
+        "number_of_bases_aligned": "Total bases aligned",
+        "fraction_bases_aligned": "Fraction of bases aligned",
+        "median_read_length": "Median read length",
+        "mean_read_length": "Mean read length",
+        "read_length_stdev": "STDEV read length",
+        "n50": "Read length N50",
+        "average_identity": "Average percent identity",
+        "median_identity": "Median percent identity",
+        "active_channels": "Active channels",
+        "mean_qual": "Mean read quality",
+        "median_qual": "Median read quality",
+    }
 
     _KEYS_READ_Q = [
         ">Q5",
@@ -58,6 +60,8 @@ def __init__(self):
         self.has_fasta = False
         for f in self.find_log_files("nanostat", filehandles=True):
             self.parse_nanostat_log(f)
+        for f in self.find_log_files("nanostat/legacy", filehandles=True):
+            self.parse_legacy_nanostat_log(f)
 
         # Filter to strip out ignored sample names
         self.nanostat_data = self.ignore_samples(self.nanostat_data)
@@ -90,7 +94,28 @@ def parse_nanostat_log(self, f):
         Note: Tool can be run in two different modes, giving two variants to the output.
         To avoid overwriting keys from different modes, keys are given a suffix.
         """
+        nano_stats = {}
+        for line in f["f"]:
+            parts = line.strip().split()
+            if len(parts) == 2 and parts[0] in self._KEYS_MAPPING.keys():
+                key = self._KEYS_MAPPING.get(parts[0])
+                if key:
+                    nano_stats[key] = float(parts[1])
+            else:
+                parts = line.strip().split(":")
+                key = parts[0].replace("Reads ", "")
+                if key in self._KEYS_READ_Q:
+                    # Number of reads above Q score cutoff
+                    val = int(parts[1].strip().split()[0])
+                    nano_stats[key] = val
+        self.save_data(f, nano_stats)
+
+    def parse_legacy_nanostat_log(self, f):
+        """Parse legacy output from NanoStat
 
+        Note: Tool can be run in two different modes, giving two variants to the output.
+        To avoid overwriting keys from different modes, keys are given a suffix.
+        """
         nano_stats = {}
         for line in f["f"]:
             parts = line.strip().split(":")
@@ -99,14 +124,21 @@ def parse_nanostat_log(self, f):
 
             key = parts[0]
 
-            if key in self._KEYS_NUM:
+            if key in self._KEYS_MAPPING.values():
                 val = float(parts[1].replace(",", ""))
                 nano_stats[key] = val
             elif key in self._KEYS_READ_Q:
                 # Number of reads above Q score cutoff
                 val = int(parts[1].strip().split()[0])
                 nano_stats[key] = val
+        self.save_data(f, nano_stats)
+
+    def save_data(self, f, nano_stats):
+        """
+        Normalise fields and save parsed data.
 
+        Used for both legacy and new data formats.
+        """
         if ">Q5" in nano_stats:
             self.has_qscores = True
 

diff --git a/multiqc/utils/search_patterns.yaml b/multiqc/utils/search_patterns.yaml
@@ -432,6 +432,10 @@ kallisto:
   contents: "[quant] finding pseudoalignments for the reads"
   shared: true
 nanostat:
+  max_filesize: 4096
+  contents_re: "Metrics dataset\\s*"
+  num_lines: 1
+nanostat/legacy:
   max_filesize: 4096
   contents_re: "General summary:\\s*"
   num_lines: 1