-
Notifications
You must be signed in to change notification settings - Fork 583
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
New module: sourmash compare (#1812)
* add compare mqc mod code * setup.py * doc * doc readme * changelog * config defaults * update mod * add wildcard to search pattern * run black * remove lables from text file * sort imports for failed isort * rip gather * update docstring * change names * fix raise warning, aes titles * Apply suggestions from code review Co-authored-by: Vlad Savelyev <vladislav.sav@gmail.com> * Refactor: avoid intermediate dict and extra abstractions, re-add write_data_file, capitalise labels * Use warning if sourmash result is not found * Update CHANGELOG.md * Show input name in the plot title * Address review * Fix merge artefact * Add comments on sample name * Remove .labels.txt from HTML ID --------- Co-authored-by: Vlad Savelyev <vladislav.sav@gmail.com> Co-authored-by: vladsaveliev <vladislav.savelyev@populationgenomics.org.au> Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
- Loading branch information
1 parent
056d21d
commit e3907d0
Showing
8 changed files
with
137 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
--- | ||
name: sourmash | ||
url: https://github.com/sourmash-bio/sourmash | ||
description: Quickly search, compare, and analyze genomic and metagenomic data sets. | ||
--- | ||
|
||
The sourmash module produces summary statistics from the | ||
[sourmash](https://github.com/sourmash-bio/sourmash) tool. | ||
The module can summarise data from the following sourmash output files | ||
(descriptions from command line help output): | ||
|
||
- `sourmash compare` | ||
- create a similarity matrix comparing many samples. | ||
|
||
Additional information on sourmash and its outputs is available on | ||
the [sourmash documentation website](https://sourmash.readthedocs.io/en/latest/). |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
from .sourmash import MultiqcModule |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,78 @@ | ||
#!/usr/bin/env python | ||
|
||
""" MultiQC module to parse similarity matrix output by sourmash compare """ | ||
|
||
import logging | ||
import os | ||
import re | ||
|
||
import numpy | ||
|
||
from multiqc.plots import heatmap | ||
|
||
# Initialise the logger | ||
log = logging.getLogger(__name__) | ||
|
||
|
||
class CompareMixin:
    """Mixin that parses `sourmash compare` output and adds one heatmap section per result."""

    @staticmethod
    def _matrix_path_for(labels_path):
        """
        Return the path of the matrix file that accompanies a labels file.

        Only a literal ".labels.txt" at the very end of the path is removed.
        The dots are escaped and the pattern is anchored so that look-alike
        substrings elsewhere in the path (for instance in a directory name)
        are left untouched. A path without the suffix is returned unchanged.
        """
        return re.sub(r"\.labels\.txt$", "", labels_path)

    def parse_compare(self):
        """
        Parse `sourmash compare` results and add a similarity heatmap for each.

        Modeled after vcftools relatedness2 module, which also has many samples
        represented in the parsed file.

        Every file found under the "sourmash/compare" search key is read as a
        list of labels (one per line). The matrix itself is loaded with
        `numpy.load` from the file of the same name without the ".labels.txt"
        suffix. Label files without a usable matrix file are skipped with a
        warning.

        Returns:
            The number of compare results that were found and plotted
            (0 when there are none).
        """
        matrices = {}

        for f in self.find_log_files("sourmash/compare", filehandles=True):
            labels = [line.strip() for line in f["f"]]
            if not labels:
                continue

            labels_path = f["f"].name
            matrix_path = self._matrix_path_for(labels_path)
            # If the suffix was not present, matrix_path is the labels file
            # itself; loading that text file with numpy would fail, so treat
            # it the same way as a missing matrix file.
            if matrix_path == labels_path or not os.path.exists(matrix_path):
                log.warning(
                    f"Found a 'labels' file expected by Sourmash: '{f['f'].name}', "
                    f"however, could not find a accompanying matrix binary file "
                    f"'{matrix_path}'. So assuming that wasn't a Sourmash result"
                )
                continue

            with open(matrix_path, "rb") as fh:
                matrix = numpy.load(fh)
            # Note that "s_name" here is not a sample name, but the name of the
            # input file, that contains a comparison matrix across multiple samples.
            matrices[f["s_name"]] = (labels, matrix.tolist())
            self.add_data_source(f, section="compare")

        matrices = self.ignore_samples(matrices)
        if not matrices:
            return 0

        log.info(f"Found {len(matrices)} valid compare results")

        self.write_data_file(matrices, "sourmash_compare")

        helptext = """
        Sourmash `compare` calculates the similarity score between samples. A higher score indicates a higher degree of
        similarity, up to a maximum of 1. Samples are clustered by similarity on each axis, and specific IDs can be
        found in the graph with the Highlight tab.
        """

        for name, (labels, data) in matrices.items():
            # Note that "name" here is not a sample name, but the name of the input file,
            # that contains a comparison matrix across multiple samples.
            # Named "section_id" rather than "id" to avoid shadowing the builtin.
            section_id = name.lower().strip().replace(" ", "-").replace(".labels.txt", "")
            self.add_section(
                name=f"Sample similarity (<code>{name}</code>)",
                anchor=f"sourmash-compare-{section_id}",
                description=f"Heatmap of similarity values from the output of `sourmash compare` run on <code>{name}</code>",
                helptext=helptext,
                plot=heatmap.plot(
                    data,
                    xcats=labels,
                    ycats=labels,
                    pconfig={
                        "id": f"sourmash-compare-heatmap-{section_id}",
                        "title": "Sourmash: Compare",
                        "square": True,
                        "decimalPlaces": 7,
                    },
                ),
            )

        return len(matrices)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/usr/bin/env python | ||
|
||
""" MultiQC module to parse output from sourmash """ | ||
|
||
import logging | ||
|
||
from multiqc.modules.base_module import BaseMultiqcModule | ||
|
||
from .compare import CompareMixin | ||
|
||
# Initialise the logger | ||
log = logging.getLogger(__name__) | ||
|
||
|
||
class MultiqcModule(BaseMultiqcModule, CompareMixin):
    """MultiQC module for sourmash; the only parser it runs is `parse_compare`."""

    def __init__(self):
        # Register the module's name, anchor and reference details with the base class.
        super().__init__(
            name="Sourmash",
            anchor="sourmash",
            href="https://github.com/sourmash-bio/sourmash",
            info="quickly searches, compares, and analyzes genomic and metagenomic data sets.",
            doi="10.21105/joss.00027",
        )

        # Number of results found, keyed by sourmash sub-command.
        found = {"compare": self.parse_compare()}
        if found["compare"] > 0:
            log.info("Found {} compare results".format(found["compare"]))

        # No sub-command produced any result.
        # NOTE(review): UserWarning is presumably how the caller is told to
        # skip this module - confirm against the MultiQC core.
        if not any(found.values()):
            raise UserWarning
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters