MultiQC · vladsavelyev · Nov 17, 2023 · Nov 9, 2023 · Nov 9, 2023 · Nov 9, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -26,6 +26,8 @@ Highlights:
   - Seqera Platform CLI reports statistics generated by the Seqera Platform CLI.
 - [**Xenome**](https://github.com/data61/gossamer/blob/master/docs/xenome.md) ([#1860](https://github.com/ewels/MultiQC/pull/1860))
   - A tool for classifying reads from xenograft sources.
+- [**xengsort**](https://gitlab.com/genomeinformatics/xengsort) ([#2168](https://github.com/ewels/MultiQC/pull/2168))
+  - xengsort is a fast xenograft read sorter based on space-efficient k-mer hashing
 
 ### Module updates
 

diff --git a/docs/modules/xengsort.md b/docs/modules/xengsort.md
@@ -0,0 +1,25 @@
+---
+name: xengsort
+url: https://gitlab.com/genomeinformatics/xengsort
+description: >
+  Fast xenograft read sorter based on space-efficient k-mer hashing
+---
+
+The module parses results generated by the `xengsort classify` command.
+
+**Note** - MultiQC parses the standard output of xengsort, so you have to redirect
+the command line output to a file in order to use it with the MultiQC module. Also note
+that the tool does not register any sample name information in the output, so MultiQC
+attempts to fetch the sample name from the file name by default.
+
+For example, if your xengsort command was:
+
+```sh
+xengsort classify --index myindex \
+  --fastq paired.1.fq.gz --pairs paired.2.fq.gz \
+  --prefix myresults \
+  --classification count \
+  > sample.txt
+```
+
+Then the sample name in the report will be `sample`, which is the base name of the file.
diff --git a/multiqc/modules/xengsort/__init__.py b/multiqc/modules/xengsort/__init__.py
@@ -0,0 +1,3 @@
+from .xengsort import MultiqcModule
+
+__all__ = ["MultiqcModule"]
diff --git a/multiqc/modules/xengsort/xengsort.py b/multiqc/modules/xengsort/xengsort.py
@@ -0,0 +1,200 @@
+""" MultiQC module to parse log output from xengsort classify """
+
+from collections import defaultdict
+
+import logging
+from typing import Dict
+
+from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import bargraph, table
+
+# Initialise the logger
+log = logging.getLogger(__name__)
+
+# Read classes reported by xengsort, in the order the table columns are shown
+READ_CLASSES = ["graft", "host", "ambiguous", "both", "neither"]
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """
+    Report the read classification statistics printed by `xengsort classify`.
+    """
+
+    def __init__(self):
+        # Initialise the parent object
+        super().__init__(
+            name="xengsort",
+            anchor="xengsort",
+            href="https://gitlab.com/genomeinformatics/xengsort",
+            info="is a fast xenograft read sorter based on space-efficient k-mer hashing",
+            # DOI identifier only, without a "doi.org/" host prefix
+            doi="10.4230/LIPIcs.WABI.2020.4",
+        )
+
+        # Find and load any xengsort reports
+        self.percents = dict()
+        self.counts = dict()
+        for f in self.find_log_files("xengsort"):
+            self._parse_log(f)
+
+        # Filter to strip out ignored sample names
+        self.percents = self.ignore_samples(self.percents)
+        self.counts = self.ignore_samples(self.counts)
+        if not self.percents:
+            raise ModuleNoSamplesFound
+        log.info(f"Found {len(self.counts)} reports")
+
+        # Superfluous function call to confirm that it is used in this module
+        # Replace None with actual version if it is available
+        self.add_software_version(None)
+
+        # Write parsed report data to a file
+        self.write_data_file(self.percents, f"multiqc_{self.anchor}_percents")
+        self.write_data_file(self.counts, f"multiqc_{self.anchor}_counts")
+
+        self._build_table()
+        self._build_plot()
+
+    def _parse_log(self, f):
+        """
+        Parse one xengsort log and store its read counts and percentages.
+
+        Looks for the tab-separated header row made of `prefix` plus the five
+        read classes and takes the values from the line that follows it. Only
+        the first such table is used; a table with a missing or non-numeric
+        values row is skipped with a warning.
+
+        :param f: log file dict; `contents_lines` is read and `s_name` is
+            overwritten with the value of the `prefix` column
+        """
+        expected_fields = {"prefix", *READ_CLASSES}
+        lines = iter(f["contents_lines"])
+        for line in lines:
+            if "\t" not in line:
+                continue
+            fields = line.strip().split("\t")
+            if set(fields) != expected_fields:
+                continue
+
+            # The values row directly follows the header; it may be absent in a truncated log
+            values = next(lines, "").strip().split("\t")
+            if len(values) != len(fields):
+                log.warning(f"Incomplete xengsort statistics table, skipping: {f.get('s_name')}")
+                return
+            data = dict(zip(fields, values))
+            # NOTE(review): the sample name is taken from the xengsort --prefix value, not
+            # from the log file name - confirm this is the intended source
+            s_name = data.pop("prefix")
+            try:
+                data = {k: int(v) for k, v in data.items()}
+            except ValueError:
+                log.warning(f"Non-numeric read counts in xengsort statistics, skipping: {s_name}")
+                return
+            f["s_name"] = s_name
+
+            # Guard against an all-zero table, which would otherwise divide by zero
+            total = sum(data.values())
+            percents = {k: (v / total * 100 if total else 0.0) for k, v in data.items()}
+
+            if s_name in self.counts:
+                log.debug(f"Duplicate sample name found! Overwriting: {s_name}")
+            self.add_data_source(f, s_name)
+            self.counts[s_name] = data
+            self.percents[s_name] = percents
+            return
+
+    def _build_table(self):
+        """
+        Prepare headers and data for a table. Add a section with a table,
+        and add a few columns into the general stats.
+        """
+        scale_by_cls = {
+            "graft": "Blues",
+            "host": "Reds",
+            "both": "Purples",
+            "ambiguous": "Greys",
+            "neither": "Greys",
+        }
+
+        # Percentage columns: used both in general stats and in the summary table
+        headers: Dict[str, Dict] = {
+            f"{cls}_reads_pct": {
+                "rid": f"{self.anchor}_{cls}_reads_pct",  # to make the ID unique from xenome
+                "title": f"{cls.capitalize()} reads",
+                "description": f"share of {cls} reads in the sample",
+                "min": 0,
+                "suffix": "%",
+                "scale": scale_by_cls[cls],
+                "format": "{:,.2f}",
+                "hidden": cls in ["both", "neither", "ambiguous"],
+            }
+            for cls in READ_CLASSES
+        }
+        pct_data = {
+            sn: {f"{cls}_reads_pct": data.get(cls) for cls in READ_CLASSES}
+            for sn, data in self.percents.items()
+        }
+        self.general_stats_addcols(pct_data, headers)
+
+        # The summary table shows every percentage column. Copy each header dict
+        # before un-hiding it, so that the dicts already handed to general stats
+        # keep their own "hidden" setting.
+        detail_headers = {metric: dict(header, hidden=False) for metric, header in headers.items()}
+        table_data = defaultdict(dict)
+        for sn, row in pct_data.items():
+            table_data[sn].update(row)
+
+        for sn, data in self.counts.items():
+            for cls in READ_CLASSES:
+                table_data[sn][f"{cls}_reads_cnt"] = data.get(cls)
+                detail_headers[f"{cls}_reads_cnt"] = {
+                    "rid": f"{self.anchor}_{cls}_reads_cnt",  # to make the ID unique from xenome
+                    "title": f"{cls.capitalize()} reads",
+                    "description": f"number of {cls} reads in the sample",
+                    "min": 0,
+                    "scale": scale_by_cls.get(cls),
+                    "format": "{:,d}",
+                    "hidden": True,
+                }
+
+        self.add_section(
+            name="Summary table",
+            anchor=f"{self.anchor}-summary-table-section",
+            plot=table.plot(table_data, detail_headers),
+        )
+
+    def _build_plot(self):
+        """
+        Create a bar plot of the per-sample read counts for each class.
+        """
+        cats = {
+            "graft": {"name": "Graft", "color": "#377eb8"},  # blue
+            "host": {"name": "Host", "color": "#e41a1c"},  # red
+            "both": {"name": "Both", "color": "#984ea3"},  # purple
+            "ambiguous": {"name": "Ambiguous", "color": "#616161"},  # grey
+            "neither": {"name": "Neither", "color": "#b3b3b3"},  # light grey
+        }
+        self.add_section(
+            description=f"This plot shows the number of reads classified by {self.name}",
+            helptext="""
+            There are 5 possible categories:  
+            * **Graft**: reads found in graft species, e.g. human
+            * **Host**: reads found in host species, e.g. mouse
+            * **Both**: reads found in either of the species
+            * **Neither**: reads was found in neither of the species
+            * **Ambiguous**: reads origin could not be adequately determined.  
+            """,
+            name="Summary classification",
+            anchor=f"{self.anchor}_summary_bar_plot_section",
+            plot=bargraph.plot(
+                self.counts,
+                cats,
+                {
+                    "id": f"{self.anchor}_summary_bar_plot",
+                    "title": f"{self.name}: summary classification",
+                    "ylab": "# Reads",
+                    "cpswitch_counts_label": "Number of reads",
+                    "cpswitch_c_active": False,
+                },
+            ),
+        )
diff --git a/multiqc/modules/xenome/xenome.py b/multiqc/modules/xenome/xenome.py
@@ -182,6 +182,7 @@ def _build_table(self):
                 table_data[sn][f"{cls}_reads_pct"] = val
                 if cls == "human":
                     headers[f"{cls}_reads_pct"] = {
+                        "rid": f"{self.anchor}_{cls}_reads_pct",  # to make the ID unique from xengsort
                         "title": "Human reads",
                         "description": "share of human reads in the sample",
                         "min": 0,
@@ -191,6 +192,7 @@ def _build_table(self):
                     }
                 else:
                     headers[f"{cls}_reads_pct"] = {
+                        "rid": f"{self.anchor}_{cls}_reads_pct",  # to make the ID unique from xengsort
                         "title": f"{cls.capitalize()} reads",
                         "description": f"share of {cls} reads in the sample",
                         "min": 0,

diff --git a/multiqc/utils/config_defaults.yaml b/multiqc/utils/config_defaults.yaml
@@ -288,8 +288,6 @@ fn_clean_exts:
   - ".fastqc_metrics"
   - ".labels"
   - ".bammetrics.metrics"
-  - "_xenome_stats"
-  - ".xenome_stats"
 
 # These are removed after the above, only if sample names
 # start or end with this string. Again, removed in order.
@@ -888,3 +886,6 @@ module_order:
   - xenome:
       module_tag:
         - DNA
+  - xengsort:
+      module_tag:
+        - DNA
diff --git a/multiqc/utils/search_patterns.yaml b/multiqc/utils/search_patterns.yaml
@@ -874,3 +874,6 @@ whatshap/stats:
 xenome:
   contents: "B	G	H	M	count	percent	class"
   num_lines: 2
+xengsort:
+  contents: "# Xengsort classify"
+  num_lines: 2
diff --git a/setup.py b/setup.py
@@ -208,6 +208,7 @@
             "verifybamid = multiqc.modules.verifybamid:MultiqcModule",
             "whatshap = multiqc.modules.whatshap:MultiqcModule",
             "xenome = multiqc.modules.xenome:MultiqcModule",
+            "xengsort = multiqc.modules.xengsort:MultiqcModule",
         ],
         "multiqc.templates.v1": [
             "default = multiqc.templates.default",