Skip to content

Commit

Permalink
Seqwho (#1503)
Browse files Browse the repository at this point in the history
* First implementation of seqwho graphs and data.

* Fix markdown lint.

* Fix syntax.

* Update readme.

* Minor clean up

* Rename headers to cats

* Fix

* Search pattern based on content

* Fix CI

* Make species and library labels

---------

Co-authored-by: Phil Ewels <phil.ewels@scilifelab.se>
Co-authored-by: Vlad Savelyev <vladislav.sav@gmail.com>
  • Loading branch information
3 people committed Feb 3, 2024
1 parent 98c5bd3 commit c2de371
Show file tree
Hide file tree
Showing 7 changed files with 290 additions and 0 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -520,6 +520,8 @@ for more information.
- Mark duplicate reads from PacBio sequencing of an amplified library
- [**WhatsHap**](https://whatshap.readthedocs.io)
- WhatsHap is a software for phasing genomic variants using DNA sequencing reads
- [**SeqWho**](https://daehwankimlab.github.io/seqwho/)
- Tool to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin.

### Module feature additions

Expand Down
11 changes: 11 additions & 0 deletions docs/modules/seqwho.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
name: SeqWho
url: https://daehwankimlab.github.io/seqwho/
description: >
SeqWho is a reliable and extremely rapid program designed to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin.
---

The SeqWho module parses results generated by
[SeqWho](https://daehwankimlab.github.io/seqwho/),
a tool that allows you to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin so you can see if the composition of
the library matches with what you expect.
3 changes: 3 additions & 0 deletions multiqc/modules/seqwho/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .seqwho import MultiqcModule

__all__ = ["MultiqcModule"]
266 changes: 266 additions & 0 deletions multiqc/modules/seqwho/seqwho.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
""" MultiQC module to parse output from SeqWho """

import json
import logging
from collections import OrderedDict

from multiqc import config
from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import linegraph, bargraph
from multiqc.utils import mqc_colour

# Initialise the logger
log = logging.getLogger(__name__)


LIBRARIES = ["amplicon", "bisulf", "wxs", "chip", "wgs", "dnase", "rnaseq", "atac", "mirnaseq"]
SPECIES = ["human", "mouse"]


class MultiqcModule(BaseMultiqcModule):
"""SeqWho module"""

def __init__(self):
# Initialise the parent object
super(MultiqcModule, self).__init__(
name="SeqWho",
anchor="seqwho",
href="https://daehwankimlab.github.io/seqwho/",
info="is a tool to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin.",
# doi=""
)

# Find and load any SeqWho reports
self.seqwho_data = dict()
self.seqwho_qualdis = dict()
self.seqwho_qualscore = dict()
self.seqwho_readdist = dict()

for f in self.find_log_files("seqwho", filehandles=True):
self.parse_json(f)

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None)

# Filter to strip out ignored sample names
self.seqwho_data = self.ignore_samples(self.seqwho_data)
self.seqwho_qualdis = self.ignore_samples(self.seqwho_qualdis)
self.seqwho_qualscore = self.ignore_samples(self.seqwho_qualscore)
self.seqwho_readdist = self.ignore_samples(self.seqwho_readdist)

if len(self.seqwho_data) == 0:
raise ModuleNoSamplesFound

log.info(f"Found {len(self.seqwho_data)} reports")

# Write parsed report data to a file
self.write_data_file(self.seqwho_data, "multiqc_seqwho")

# General Stats Table
self.seqwho_general_stats_table()

# SeqWho MLE Species Table
self.seqwho_species_plot()

# SeqWho MLE Sequencing Table
self.seqwho_library_plot()

# Quality Distribution
self.seqwho_qualdist_plot()

# Quality Score
self.seqwho_qualscore_plot()

# Read Distribution
self.seqwho_readdist_plot()

# Parse our nice little JSON file
def parse_json(self, f):
"""Parse the JSON output from SeqWho and save the summary statistics"""
try:
parsed_json = json.load(f["f"])
except json.JSONDecodeError:
log.warning(f"Could not parse SeqWho JSON: '{f['fn']}'")
return

# Get sample name from JSON first
for s_name, data in parsed_json.items():
s_name = self.clean_s_name(s_name, f["root"])
self.add_data_source(f, s_name)
self.seqwho_data[s_name] = {}

call_data = data["Call"]
self.seqwho_data[s_name]["predicted_species"] = call_data["Species"]
self.seqwho_data[s_name]["predicted_library"] = call_data["Library"]
self.seqwho_data[s_name]["mle"] = call_data["Maximum Likelihood Estimate"]
self.seqwho_data[s_name]["est_read_number"] = data["Estimated Read Number"]

# Make MLE Matrix table for subset of predicted species and library
mle_matrix = call_data["MLE Matrix"]
mle_columns = SPECIES + LIBRARIES
for i in mle_columns:
self.seqwho_data[s_name][i] = mle_matrix[i]

# Make a Quality Score vs count version of the data
self.seqwho_qualdis[s_name] = OrderedDict()
qual_dist = data["Quality Dist"]
for i in range(len(qual_dist)):
self.seqwho_qualdis[s_name][i] = qual_dist[i]

# Make a Quality score vs position version of the data
self.seqwho_qualscore[s_name] = OrderedDict()
qual_scores = data["Quality Scores"]
for i in range(len(qual_scores)):
self.seqwho_qualscore[s_name][i] = qual_scores[i]

# Make a Read Length vs count of the data
self.seqwho_readdist[s_name] = OrderedDict()
read_dist = data["Read lengths"]
for i in range(len(read_dist)):
self.seqwho_readdist[s_name][i] = read_dist[i]

def seqwho_general_stats_table(self):
"""Take the parsed stats from the SeqWho report and add it to the
General Statistics table at the top of the report"""
species_scale = mqc_colour.mqc_colour_scale("Dark2")
species_colors = [{v: species_scale.get_colour(i, lighten=0.5)} for i, v in enumerate(SPECIES)]
library_scale = mqc_colour.mqc_colour_scale("Accent")
library_colors = [{v: library_scale.get_colour(i, lighten=0.5)} for i, v in enumerate(LIBRARIES)]

headers = {
"predicted_species": {
"title": "Pred Species",
"description": "Predicted species",
"cond_formatting_colours": species_colors,
"cond_formatting_rules": {v: [{"s_eq": v}] for v in SPECIES},
"scale": False,
},
"predicted_library": {
"title": "Pred Library",
"description": "Predicted library type",
"cond_formatting_colours": library_colors,
"cond_formatting_rules": {v: [{"s_eq": v}] for v in LIBRARIES},
"scale": False,
},
"mle": {
"title": "Max Likelihood Est",
"description": "Overall maximum likelihood estimate (overall)",
"min": 0,
"format": "{:,.3f}",
"scale": "RdYlGn",
},
"est_read_number": {
"title": f"{config.read_count_prefix} Est Reads",
"description": f"Estimated read number ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"scale": "Blues",
"shared_key": "read_count",
},
}
self.general_stats_addcols(self.seqwho_data, headers)

def seqwho_species_plot(self):
"""
Take the Maximum Likelihood Species data from the SeqWho report and make
barchart
"""

config = {
"id": "seqwho_species_plot",
"title": "SeqWho: Species",
"ylab": "Maximum likelihood estimate",
"ymax": 1.0,
"stacking": None,
"cpswitch": False,
"cpswitch_counts_label": "Maximum likelihood",
"tt_decimals": 3,
}

self.add_section(
name="Species",
anchor="seqwho_species",
description="This plot shows the maximum likelihood that a given species matches.",
plot=bargraph.plot(self.seqwho_data, SPECIES, config),
)

def seqwho_library_plot(self):
"""Take the Maximum Likelihood Library data from the SeqWho report and make a
barplot"""

# Config for the plot
config = {
"id": "seqwho_sequencing_plot",
"title": "SeqWho: Library Type",
"ylab": "Maximum likelihood estimate",
"ymax": 1.0,
"stacking": None,
"cpswitch": False,
"cpswitch_counts_label": "Maximum likelihood",
"tt_decimals": 3,
}

self.add_section(
name="Library Type",
anchor="seqwho_library",
description="This plot shows the maximum likelihood that a given library matches.",
plot=bargraph.plot(self.seqwho_data, LIBRARIES, config),
)

def seqwho_qualdist_plot(self):
"""Generate the Average Sequence Quality Distribution plot"""
pconfig = {
"id": "seqwho_qualdis_plot",
"title": "SeqWho: Per Sequence Quality Scores",
"ylab": "Reads",
"xlab": "Sequence Quality (Phred)",
"xDecimals": False,
"ymin": 0,
"tt_label": "<b>Phred {point.x}</b>: {point.y:.0f} reads",
}

self.add_section(
name="Predicted Per Sequence Quality Scores",
anchor="seqwho_qualdis",
description="This plot shows the predicted number of reads with average quality scores.",
plot=linegraph.plot(self.seqwho_qualdis, pconfig),
)

def seqwho_qualscore_plot(self):
"""Generate the Average Quality Score per position plot"""
pconfig = {
"id": "seqwho_qualscore_plot",
"title": "SeqWho: Quality Score",
"ylab": "Sequence Quality (Phred)",
"xlab": "Position (bp)",
"xDecimals": False,
"ymin": 0,
"tt_label": "<b>Base {point.x} </b>: {point.y}",
}

self.add_section(
name="Predicted Mean Quality Scores",
anchor="seqwho_qualscore",
description="This plot shows the predicted mean quality value across each base position in the read.",
plot=linegraph.plot(self.seqwho_qualscore, pconfig),
)

def seqwho_readdist_plot(self):
"""Generate the number of reads with certain length plot"""
pconfig = {
"id": "seqwho_readdist_plot",
"title": "SeqWho: Read Distribution",
"ylab": "Reads",
"xlab": "Length (bp)",
"xDecimals": False,
"ymin": 0,
"tt_label": "<b>Base {point.x} bp</b>: {point.y} reads",
}

self.add_section(
name="Predicted Read Distribution",
anchor="seqwho_readdist",
description="This plot shows predicted the number of reads with certain lengths.",
plot=linegraph.plot(self.seqwho_readdist, pconfig),
)
4 changes: 4 additions & 0 deletions multiqc/utils/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,10 @@ module_order:
module_tag:
- RNA
- DNA
- seqwho:
module_tag:
- RNA
- DNA
- afterqc:
module_tag:
- RNA
Expand Down
3 changes: 3 additions & 0 deletions multiqc/utils/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,9 @@ samtools/rmdup:
shared: true
sargasso:
fn: "overall_filtering_summary.txt"
seqwho:
contents: ' "Per Base Seq": ['
num_lines: 10
seqyclean:
fn: "*_SummaryStatistics.tsv"
sexdeterrmine:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@
"samtools = multiqc.modules.samtools:MultiqcModule",
"sargasso = multiqc.modules.sargasso:MultiqcModule",
"seqyclean = multiqc.modules.seqyclean:MultiqcModule",
"seqwho = multiqc.modules.seqwho:MultiqcModule",
"sexdeterrmine = multiqc.modules.sexdeterrmine:MultiqcModule",
"sickle = multiqc.modules.sickle:MultiqcModule",
"skewer = multiqc.modules.skewer:MultiqcModule",
Expand Down

0 comments on commit c2de371

Please sign in to comment.