Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Seqwho #1503

Merged
merged 14 commits into from
Feb 3, 2024
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -516,6 +516,8 @@ for more information.
- Mark duplicate reads from PacBio sequencing of an amplified library
- [**WhatsHap**](https://whatshap.readthedocs.io)
- WhatsHap is a software for phasing genomic variants using DNA sequencing reads
- [**SeqWho**](https://daehwankimlab.github.io/seqwho/)
- Tool to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin.

### Module feature additions

Expand Down
11 changes: 11 additions & 0 deletions docs/modules/seqwho.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
---
name: SeqWho
url: https://daehwankimlab.github.io/seqwho/
description: >
SeqWho is a reliable and extremely rapid program designed to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin.
---

The SeqWho module parses results generated by
[SeqWho](https://daehwankimlab.github.io/seqwho/),
a tool that allows you to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin so you can see if the composition of
the library matches with what you expect.
3 changes: 3 additions & 0 deletions multiqc/modules/seqwho/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .seqwho import MultiqcModule

__all__ = ["MultiqcModule"]
266 changes: 266 additions & 0 deletions multiqc/modules/seqwho/seqwho.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,266 @@
""" MultiQC module to parse output from SeqWho """

import json
import logging
from collections import OrderedDict

from multiqc import config
from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
from multiqc.plots import linegraph, bargraph
from multiqc.utils import mqc_colour

# Initialise the logger
log = logging.getLogger(__name__)


LIBRARIES = ["amplicon", "bisulf", "wxs", "chip", "wgs", "dnase", "rnaseq", "atac", "mirnaseq"]
SPECIES = ["human", "mouse"]


class MultiqcModule(BaseMultiqcModule):
"""SeqWho module"""

def __init__(self):
# Initialise the parent object
super(MultiqcModule, self).__init__(
name="SeqWho",
anchor="seqwho",
href="https://daehwankimlab.github.io/seqwho/",
info="is a tool to determine a FASTQ(A) sequencing file identity, both source protocol and species of origin.",
# doi=""
)

# Find and load any SeqWho reports
self.seqwho_data = dict()
self.seqwho_qualdis = dict()
self.seqwho_qualscore = dict()
self.seqwho_readdist = dict()

for f in self.find_log_files("seqwho", filehandles=True):
self.parse_json(f)

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None)

# Filter to strip out ignored sample names
self.seqwho_data = self.ignore_samples(self.seqwho_data)
self.seqwho_qualdis = self.ignore_samples(self.seqwho_qualdis)
self.seqwho_qualscore = self.ignore_samples(self.seqwho_qualscore)
self.seqwho_readdist = self.ignore_samples(self.seqwho_readdist)

if len(self.seqwho_data) == 0:
raise ModuleNoSamplesFound

log.info(f"Found {len(self.seqwho_data)} reports")

# Write parsed report data to a file
self.write_data_file(self.seqwho_data, "multiqc_seqwho")

# General Stats Table
self.seqwho_general_stats_table()

# SeqWho MLE Species Table
self.seqwho_species_plot()

# SeqWho MLE Sequencing Table
self.seqwho_library_plot()

# Quality Distribution
self.seqwho_qualdist_plot()

# Quality Score
self.seqwho_qualscore_plot()

# Read Distribution
self.seqwho_readdist_plot()

# Parse our nice little JSON file
def parse_json(self, f):
"""Parse the JSON output from SeqWho and save the summary statistics"""
try:
parsed_json = json.load(f["f"])
except json.JSONDecodeError:
log.warning(f"Could not parse SeqWho JSON: '{f['fn']}'")
return

# Get sample name from JSON first
for s_name, data in parsed_json.items():
s_name = self.clean_s_name(s_name, f["root"])
self.add_data_source(f, s_name)
self.seqwho_data[s_name] = {}

call_data = data["Call"]
self.seqwho_data[s_name]["predicted_species"] = call_data["Species"]
self.seqwho_data[s_name]["predicted_library"] = call_data["Library"]
self.seqwho_data[s_name]["mle"] = call_data["Maximum Likelihood Estimate"]
self.seqwho_data[s_name]["est_read_number"] = data["Estimated Read Number"]

# Make MLE Matrix table for subset of predicted species and library
mle_matrix = call_data["MLE Matrix"]
mle_columns = SPECIES + LIBRARIES
for i in mle_columns:
self.seqwho_data[s_name][i] = mle_matrix[i]

# Make a Quality Score vs count version of the data
self.seqwho_qualdis[s_name] = OrderedDict()
qual_dist = data["Quality Dist"]
for i in range(len(qual_dist)):
self.seqwho_qualdis[s_name][i] = qual_dist[i]

# Make a Quality score vs position version of the data
self.seqwho_qualscore[s_name] = OrderedDict()
qual_scores = data["Quality Scores"]
for i in range(len(qual_scores)):
self.seqwho_qualscore[s_name][i] = qual_scores[i]

# Make a Read Length vs count of the data
self.seqwho_readdist[s_name] = OrderedDict()
read_dist = data["Read lengths"]
for i in range(len(read_dist)):
self.seqwho_readdist[s_name][i] = read_dist[i]

def seqwho_general_stats_table(self):
"""Take the parsed stats from the SeqWho report and add it to the
General Statistics table at the top of the report"""
species_scale = mqc_colour.mqc_colour_scale("Dark2")
species_colors = [{v: species_scale.get_colour(i, lighten=0.5)} for i, v in enumerate(SPECIES)]
library_scale = mqc_colour.mqc_colour_scale("Accent")
library_colors = [{v: library_scale.get_colour(i, lighten=0.5)} for i, v in enumerate(LIBRARIES)]

headers = {
"predicted_species": {
"title": "Pred Species",
"description": "Predicted species",
"cond_formatting_colours": species_colors,
"cond_formatting_rules": {v: [{"s_eq": v}] for v in SPECIES},
"scale": False,
},
"predicted_library": {
"title": "Pred Library",
"description": "Predicted library type",
"cond_formatting_colours": library_colors,
"cond_formatting_rules": {v: [{"s_eq": v}] for v in LIBRARIES},
"scale": False,
},
"mle": {
"title": "Max Likelihood Est",
"description": "Overall maximum likelihood estimate (overall)",
"min": 0,
"format": "{:,.3f}",
"scale": "RdYlGn",
},
"est_read_number": {
"title": f"{config.read_count_prefix} Est Reads",
"description": f"Estimated read number ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"scale": "Blues",
"shared_key": "read_count",
},
}
self.general_stats_addcols(self.seqwho_data, headers)

def seqwho_species_plot(self):
"""
Take the Maximum Likelihood Species data from the SeqWho report and make
barchart
"""

config = {
"id": "seqwho_species_plot",
"title": "SeqWho: Species",
"ylab": "Maximum likelihood estimate",
"ymax": 1.0,
"stacking": None,
"cpswitch": False,
"cpswitch_counts_label": "Maximum likelihood",
"tt_decimals": 3,
}

self.add_section(
name="Species",
anchor="seqwho_species",
description="This plot shows the maximum likelihood that a given species matches.",
plot=bargraph.plot(self.seqwho_data, SPECIES, config),
)

def seqwho_library_plot(self):
"""Take the Maximum Likelihood Library data from the SeqWho report and make a
barplot"""

# Config for the plot
config = {
"id": "seqwho_sequencing_plot",
"title": "SeqWho: Library Type",
"ylab": "Maximum likelihood estimate",
"ymax": 1.0,
"stacking": None,
"cpswitch": False,
"cpswitch_counts_label": "Maximum likelihood",
"tt_decimals": 3,
}

self.add_section(
name="Library Type",
anchor="seqwho_library",
description="This plot shows the maximum likelihood that a given library matches.",
plot=bargraph.plot(self.seqwho_data, LIBRARIES, config),
)

def seqwho_qualdist_plot(self):
"""Generate the Average Sequence Quality Distribution plot"""
pconfig = {
"id": "seqwho_qualdis_plot",
"title": "SeqWho: Per Sequence Quality Scores",
"ylab": "Reads",
"xlab": "Sequence Quality (Phred)",
"xDecimals": False,
"ymin": 0,
"tt_label": "<b>Phred {point.x}</b>: {point.y:.0f} reads",
}

self.add_section(
name="Predicted Per Sequence Quality Scores",
anchor="seqwho_qualdis",
description="This plot shows the predicted number of reads with average quality scores.",
plot=linegraph.plot(self.seqwho_qualdis, pconfig),
)

def seqwho_qualscore_plot(self):
"""Generate the Average Quality Score per position plot"""
pconfig = {
"id": "seqwho_qualscore_plot",
"title": "SeqWho: Quality Score",
"ylab": "Sequence Quality (Phred)",
"xlab": "Position (bp)",
"xDecimals": False,
"ymin": 0,
"tt_label": "<b>Base {point.x} </b>: {point.y}",
}

self.add_section(
name="Predicted Mean Quality Scores",
anchor="seqwho_qualscore",
description="This plot shows the predicted mean quality value across each base position in the read.",
plot=linegraph.plot(self.seqwho_qualscore, pconfig),
)

def seqwho_readdist_plot(self):
"""Generate the number of reads with certain length plot"""
pconfig = {
"id": "seqwho_readdist_plot",
"title": "SeqWho: Read Distribution",
"ylab": "Reads",
"xlab": "Length (bp)",
"xDecimals": False,
"ymin": 0,
"tt_label": "<b>Base {point.x} bp</b>: {point.y} reads",
}

self.add_section(
name="Predicted Read Distribution",
anchor="seqwho_readdist",
description="This plot shows predicted the number of reads with certain lengths.",
plot=linegraph.plot(self.seqwho_readdist, pconfig),
)
4 changes: 4 additions & 0 deletions multiqc/utils/config_defaults.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -793,6 +793,10 @@ module_order:
module_tag:
- RNA
- DNA
- seqwho:
module_tag:
- RNA
- DNA
- afterqc:
module_tag:
- RNA
Expand Down
3 changes: 3 additions & 0 deletions multiqc/utils/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -767,6 +767,9 @@ samtools/rmdup:
shared: true
sargasso:
fn: "overall_filtering_summary.txt"
seqwho:
contents: ' "Per Base Seq": ['
num_lines: 10
seqyclean:
fn: "*_SummaryStatistics.tsv"
sexdeterrmine:
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@
"samtools = multiqc.modules.samtools:MultiqcModule",
"sargasso = multiqc.modules.sargasso:MultiqcModule",
"seqyclean = multiqc.modules.seqyclean:MultiqcModule",
"seqwho = multiqc.modules.seqwho:MultiqcModule",
"sexdeterrmine = multiqc.modules.sexdeterrmine:MultiqcModule",
"sickle = multiqc.modules.sickle:MultiqcModule",
"skewer = multiqc.modules.skewer:MultiqcModule",
Expand Down