Skip to content

Commit

Permalink
New module: sourmash compare (#1812)
Browse files Browse the repository at this point in the history
* add compare mqc mod code

* setup.py

* doc

* doc readme

* changelog

* config defaults

* update mod

* add wildcard to search pattern

* run black

* remove labels from text file

* sort imports for failed isort

* rip gather

* update docstring

* change names

* fix raise warning, aes titles

* Apply suggestions from code review

Co-authored-by: Vlad Savelyev <vladislav.sav@gmail.com>

* Refactor: avoid intermediate dict and extra abstractions, re-add write_data_file, capitalise labels

* Use warning if sourmash result is not found

* Update CHANGELOG.md

* Show input name in the plot title

* Address review

* Fix merge artefact

* Add comments on sample name

* Remove .labels.txt from HTML ID

---------

Co-authored-by: Vlad Savelyev <vladislav.sav@gmail.com>
Co-authored-by: vladsaveliev <vladislav.savelyev@populationgenomics.org.au>
Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
  • Loading branch information
4 people committed Sep 14, 2023
1 parent 056d21d commit e3907d0
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,8 @@ This idea goes way back to [issue #290](https://github.com/ewels/MultiQC/issues/
- Rapid and standardized annotation of bacterial genomes, MAGs & plasmids.
- [mapDamage](https://github.com/ginolhac/mapDamage)
- mapDamage2 is a computational framework written in Python and R, which tracks and quantifies DNA damage patterns among ancient DNA sequencing reads generated by Next-Generation Sequencing platforms.
- [**Sourmash**](https://github.com/sourmash-bio/sourmash)
- Quickly search, compare, and analyze genomic and metagenomic data sets.

### Module updates

Expand Down
16 changes: 16 additions & 0 deletions docs/modules/sourmash.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
---
name: sourmash
url: https://github.com/sourmash-bio/sourmash
description: Quickly search, compare, and analyze genomic and metagenomic data sets.
---

The sourmash module produces summary statistics from the
[sourmash](https://github.com/sourmash-bio/sourmash) tool.
The module can summarise data from the following sourmash output files
(descriptions from command line help output):

- `sourmash compare`
  - create a similarity matrix comparing many samples.

Additional information on sourmash and its outputs is available on
the [sourmash documentation website](https://sourmash.readthedocs.io/en/latest/).
1 change: 1 addition & 0 deletions multiqc/modules/sourmash/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .sourmash import MultiqcModule
78 changes: 78 additions & 0 deletions multiqc/modules/sourmash/compare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
#!/usr/bin/env python

""" MultiQC module to parse similarity matrix output by sourmash compare """

import logging
import os
import re

import numpy

from multiqc.plots import heatmap

# Initialise the logger
log = logging.getLogger(__name__)


class CompareMixin:
    """Mixin adding parsing and plotting of `sourmash compare` output to a MultiQC module."""

    @staticmethod
    def _matrix_path(labels_path):
        """
        Return the path of the matrix file that accompanies a labels file.

        Only a literal, trailing ".labels.txt" is removed. A regex substitution
        with an unescaped pattern would treat each "." as a wildcard and could
        also strip a match from the middle of the path (e.g. from a directory
        name), producing a path to a file that does not exist.

        If the path does not end with ".labels.txt" it is returned unchanged.
        """
        suffix = ".labels.txt"
        if labels_path.endswith(suffix):
            return labels_path[: -len(suffix)]
        return labels_path

    def parse_compare(self):
        """
        Find `sourmash compare` results and add one similarity heatmap section per result.

        Modeled after vcftools relatedness2 module, which also has many samples
        represented in the parsed file.

        Each "*.labels.txt" file found supplies the axis labels; the similarity
        matrix itself is loaded with numpy from the file of the same name without
        the ".labels.txt" suffix. Results whose matrix file is missing or cannot
        be loaded are skipped with a warning.

        Returns the number of results left after `ignore_samples` filtering
        (0 if there are none).
        """
        matrices = {}

        for f in self.find_log_files("sourmash/compare", filehandles=True):
            labels = [x.strip() for x in f["f"]]
            if not labels:
                continue

            matrix_path = self._matrix_path(f["f"].name)
            if not os.path.exists(matrix_path):
                log.warning(
                    f"Found a 'labels' file expected by Sourmash: '{f['f'].name}', "
                    f"however, could not find a accompanying matrix binary file "
                    f"'{matrix_path}'. So assuming that wasn't a Sourmash result"
                )
                continue

            # The search pattern only looks at the labels file name, so the file
            # found next to it is not guaranteed to be a numpy array. Skip it
            # rather than letting one unreadable file abort the whole module.
            try:
                with open(matrix_path, "rb") as fh:
                    matrix = numpy.load(fh)
            except (OSError, ValueError, EOFError) as e:
                log.warning(f"Could not load Sourmash matrix file '{matrix_path}', skipping: {e}")
                continue

            # Note that "s_name" here is not a sample name, but the name of the
            # input file, that contains a comparison matrix across multiple samples.
            matrices[f["s_name"]] = (labels, matrix.tolist())
            self.add_data_source(f, section="compare")

        matrices = self.ignore_samples(matrices)
        if len(matrices) == 0:
            return 0

        log.info(f"Found {len(matrices)} valid compare results")

        self.write_data_file(matrices, "sourmash_compare")

        helptext = """
        Sourmash `compare` calculates the similarity score between samples. A higher score indicates a higher degree of
        similarity, up to a maximum of 1. Samples are clustered by similarity on each axis, and specific IDs can be
        found in the graph with the Highlight tab.
        """

        for name, (labels, data) in matrices.items():
            # Note that "name" here is not a sample name, but the name of the input file,
            # that contains a comparison matrix across multiple samples.
            # It is normalised into a slug used for the section anchor and the plot ID.
            html_id = name.lower().strip().replace(" ", "-").replace(".labels.txt", "")
            self.add_section(
                name=f"Sample similarity (<code>{name}</code>)",
                anchor=f"sourmash-compare-{html_id}",
                description=f"Heatmap of similarity values from the output of `sourmash compare` run on <code>{name}</code>",
                helptext=helptext,
                plot=heatmap.plot(
                    data,
                    xcats=labels,
                    ycats=labels,
                    pconfig={
                        "id": f"sourmash-compare-heatmap-{html_id}",
                        "title": "Sourmash: Compare",
                        "square": True,
                        "decimalPlaces": 7,
                    },
                ),
            )

        return len(matrices)
31 changes: 31 additions & 0 deletions multiqc/modules/sourmash/sourmash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#!/usr/bin/env python

""" MultiQC module to parse output from sourmash """

import logging

from multiqc.modules.base_module import BaseMultiqcModule

from .compare import CompareMixin

# Initialise the logger
log = logging.getLogger(__name__)


class MultiqcModule(BaseMultiqcModule, CompareMixin):
    """Sourmash module: registers the module metadata and runs the sourmash sub-parsers."""

    def __init__(self):
        # Module metadata passed to the base module.
        super().__init__(
            name="Sourmash",
            anchor="sourmash",
            href="https://github.com/sourmash-bio/sourmash",
            info="quickly searches, compares, and analyzes genomic and metagenomic data sets.",
            doi="10.21105/joss.00027",
        )

        # Number of results found by each sub-parser, keyed by sub-command name.
        counts = {"compare": self.parse_compare()}

        found_compare = counts["compare"]
        if found_compare > 0:
            log.info(f"Found {found_compare} compare results")

        # Nothing was found by any sub-parser: signal this by raising UserWarning.
        total = sum(counts.values())
        if total == 0:
            raise UserWarning
6 changes: 6 additions & 0 deletions multiqc/utils/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -284,6 +284,7 @@ fn_clean_exts:
- ".scATAC_metrics"
- ".scATAC.metrics"
- ".fastqc_metrics"
- ".labels"

# These are removed after the above, only if sample names
# start or end with this string. Again, removed in order.
Expand Down Expand Up @@ -731,6 +732,11 @@ module_order:
module_tag:
- RNA
- DNA
- sourmash:
module_tag:
- DNA
- RNA
- metagenomics
- kaiju:
module_tag:
- DNA
Expand Down
2 changes: 2 additions & 0 deletions multiqc/utils/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -561,6 +561,8 @@ somalier/pairs:
fn: "*.pairs.tsv"
contents: "hom_concordance"
num_lines: 5
sourmash/compare:
fn: "*.labels.txt"
pbmarkdup:
contents_re: "LIBRARY +READS +UNIQUE MOLECULES +DUPLICATE READS"
num_lines: 5
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,7 @@
"snpsplit = multiqc.modules.snpsplit:MultiqcModule",
"somalier = multiqc.modules.somalier:MultiqcModule",
"sortmerna = multiqc.modules.sortmerna:MultiqcModule",
"sourmash = multiqc.modules.sourmash:MultiqcModule",
"stacks = multiqc.modules.stacks:MultiqcModule",
"star = multiqc.modules.star:MultiqcModule",
"supernova = multiqc.modules.supernova:MultiqcModule",
Expand Down

0 comments on commit e3907d0

Please sign in to comment.