Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

New module: xengsort #2168

Merged
merged 9 commits into from
Nov 17, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ Highlights:
- Seqera Platform CLI reports statistics generated by the Seqera Platform CLI.
- [**Xenome**](https://github.com/data61/gossamer/blob/master/docs/xenome.md) ([#1860](https://github.com/ewels/MultiQC/pull/1860))
- A tool for classifying reads from xenograft sources.
- [**xengsort**](https://gitlab.com/genomeinformatics/xengsort) ([#2168](https://github.com/ewels/MultiQC/pull/2168))
- xengsort is a fast xenograft read sorter based on space-efficient k-mer hashing

### Module updates

Expand Down
25 changes: 25 additions & 0 deletions docs/modules/xengsort.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
---
name: xengsort
url: https://gitlab.com/genomeinformatics/xengsort
description: >
Fast xenograft read sorter based on space-efficient k-mer hashing
---

The module parses results generated by the `xengsort classify` command.

**Note** - MultiQC parses the standard out from xengsort, hence one has to redirect
command line output to a file in order to use it with the MultiQC module. Also note
that the tool does not register any sample name information in the output, so MultiQC
attempts to fetch the sample name from the file name by default.

For example, if your xengsort command was:

```sh
xengsort classify --index myindex \
--fastq paired.1.fq.gz --pairs paired.2.fq.gz \
--prefix myresults \
--classification count \
> sample.txt
```

Then the sample name in the report will be `sample`, which is the base name of the file.
Comment on lines +24 to +25
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This doesn't seem to be the case.

Suggested change
Then the sample name in the report will be `sample`, which is the base name of the file.

3 changes: 3 additions & 0 deletions multiqc/modules/xengsort/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .xengsort import MultiqcModule

__all__ = ["MultiqcModule"]
156 changes: 156 additions & 0 deletions multiqc/modules/xengsort/xengsort.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,156 @@
""" MultiQC module to parse log output from xengsort classify """

from collections import defaultdict

import logging
from typing import Dict

from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import bargraph, table

# Initialise the logger
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule):
def __init__(self):
# Initialise the parent object
super(MultiqcModule, self).__init__(
name="xengsort",
anchor="xengsort",
href="https://gitlab.com/genomeinformatics/xengsort",
info="is a fast xenograft read sorter based on space-efficient k-mer hashing",
doi="doi.org/10.4230/LIPIcs.WABI.2020.4",
)

# Find and load any Xenome reports
self.percents = dict()
self.counts = dict()
for f in self.find_log_files("xengsort"):
self._parse_log(f)

# Filter to strip out ignored sample names
self.percents = self.ignore_samples(self.percents)
self.counts = self.ignore_samples(self.counts)
if len(self.percents) == 0:
raise ModuleNoSamplesFound
log.info(f"Found {len(self.counts)} reports")

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None)

# Write parsed report data to a file
self.write_data_file(self.percents, f"multiqc_{self.anchor}_percents")
self.write_data_file(self.counts, f"multiqc_{self.anchor}_counts")

self._build_table()
self._build_plot()

def _parse_log(self, f):
lines = iter(f["contents_lines"])
for line in iter(lines):
if "\t" in line:
fields = line.strip().split("\t")
if set(fields) == {"prefix", "host", "graft", "ambiguous", "both", "neither"}:
values = next(lines).strip().split("\t")
data = dict(zip(fields, values))
s_name = data.pop("prefix")
f["s_name"] = s_name
data = {k: int(v) for k, v in data.items()}
percents = {k: v / sum(data.values()) * 100 for k, v in data.items()}

if s_name in self.counts:
log.debug(f"Duplicate sample name found! Overwriting: {s_name}")
self.add_data_source(f, s_name)
self.counts[s_name] = data
self.percents[s_name] = percents
break

def _build_table(self):
"""
Prepare headers and data for a table. Add a section with a table,
and add a few columns into the general stats.
"""
headers: Dict[str, Dict] = {}
table_data = defaultdict(dict)

scale_by_cls = {
"graft": "Blues",
"host": "Reds",
"both": "Purples",
"ambiguous": "Greys",
"neither": "Greys",
}
for sn, data in self.percents.items():
for cls in ["graft", "host", "ambiguous", "both", "neither"]:
table_data[sn][f"{cls}_reads_pct"] = data.get(cls)
headers[f"{cls}_reads_pct"] = {
"rid": f"{self.anchor}_{cls}_reads_pct", # to make the ID unique from xenome
"title": f"{cls.capitalize()} reads",
"description": f"share of {cls} reads in the sample",
"min": 0,
"suffix": "%",
"scale": scale_by_cls[cls],
"format": "{:,.2f}",
"hidden": cls in ["both", "neither", "ambiguous"],
}
self.general_stats_addcols(table_data, headers)
detail_headers = headers.copy()
for metric in headers:
detail_headers[metric]["hidden"] = False

for sn, data in self.counts.items():
for cls in ["graft", "host", "ambiguous", "both", "neither"]:
table_data[sn][f"{cls}_reads_cnt"] = data.get(cls)
detail_headers[f"{cls}_reads_cnt"] = {
"rid": f"{self.anchor}_{cls}_reads_cnt", # to make the ID unique from xenome
"title": f"{cls.capitalize()} reads",
"description": f"number of {cls} reads in the sample",
"min": 0,
"scale": scale_by_cls.get(cls),
"format": "{:,d}",
"hidden": True,
}

self.add_section(
name="Summary table",
anchor=f"{self.anchor}-summary-table-section",
plot=table.plot(table_data, detail_headers),
)

def _build_plot(self):
"""
Create two bar plots: based on summary and detail data.
"""
cats = {
"graft": {"name": "Graft", "color": "#377eb8"}, # blue
"host": {"name": "Host", "color": "#e41a1c"}, # red
"both": {"name": "Both", "color": "#984ea3"}, # purple
"ambiguous": {"name": "Ambiguous", "color": "#616161"}, # grey
"neither": {"name": "Neither", "color": "b3b3b3"}, # light grey
}
self.add_section(
description=f"This plot shows the number of reads classified by {self.name}",
helptext="""
There are 5 possible categories:
* **Graft**: reads found in graft species, e.g. human
* **Host**: reads found in host species, e.g. mouse
* **Both**: reads found in either of the species
* **Neither**: reads was found in neither of the species
* **Ambiguous**: reads origin could not be adequately determined.
""",
name="Summary classification",
anchor=f"{self.anchor}_summary_bar_plot_section",
plot=bargraph.plot(
self.counts,
cats,
{
"id": f"{self.anchor}_summary_bar_plot",
"title": f"{self.name}: summary classification",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of reads",
"cpswitch_c_active": False,
},
),
)
2 changes: 2 additions & 0 deletions multiqc/modules/xenome/xenome.py
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,7 @@ def _build_table(self):
table_data[sn][f"{cls}_reads_pct"] = val
if cls == "human":
headers[f"{cls}_reads_pct"] = {
"rid": f"{self.anchor}_{cls}_reads_pct", # to make the ID unique from xengsort
"title": "Human reads",
"description": "share of human reads in the sample",
"min": 0,
Expand All @@ -191,6 +192,7 @@ def _build_table(self):
}
else:
headers[f"{cls}_reads_pct"] = {
"rid": f"{self.anchor}_{cls}_reads_pct", # to make the ID unique from xengsort
"title": f"{cls.capitalize()} reads",
"description": f"share of {cls} reads in the sample",
"min": 0,
Expand Down
5 changes: 3 additions & 2 deletions multiqc/utils/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -288,8 +288,6 @@ fn_clean_exts:
- ".fastqc_metrics"
- ".labels"
- ".bammetrics.metrics"
- "_xenome_stats"
- ".xenome_stats"

# These are removed after the above, only if sample names
# start or end with this string. Again, removed in order.
Expand Down Expand Up @@ -888,3 +886,6 @@ module_order:
- xenome:
module_tag:
- DNA
- xengsort:
module_tag:
- DNA
3 changes: 3 additions & 0 deletions multiqc/utils/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -874,3 +874,6 @@ whatshap/stats:
xenome:
contents: "B G H M count percent class"
num_lines: 2
xengsort:
contents: "# Xengsort classify"
num_lines: 2
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,6 +208,7 @@
"verifybamid = multiqc.modules.verifybamid:MultiqcModule",
"whatshap = multiqc.modules.whatshap:MultiqcModule",
"xenome = multiqc.modules.xenome:MultiqcModule",
"xengsort = multiqc.modules.xengsort:MultiqcModule",
],
"multiqc.templates.v1": [
"default = multiqc.templates.default",
Expand Down