Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

BCLConvert: fix mean quality, fix count-per-lane barplot #2197

Merged
merged 6 commits into from
Nov 25, 2023
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
- Upgrade the jQuery tablesorter plugin to v2 ([#1666](https://github.com/ewels/MultiQC/pull/1666))
- Allow specifying default sort columns for tables with `defaultsort` ([#1667](https://github.com/ewels/MultiQC/pull/1667))
- Create CODE_OF_CONDUCT.md ([#2195](https://github.com/ewels/MultiQC/pull/2195))
- BCLConvert: fix mean quality, fix count-per-lane barplot ([#2197](https://github.com/ewels/MultiQC/pull/2197))

### New Modules

Expand Down
132 changes: 77 additions & 55 deletions multiqc/modules/bclconvert/bclconvert.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,9 @@
import os
import xml.etree.ElementTree as ET
from collections import defaultdict
from functools import lru_cache
from itertools import islice
from typing import Tuple, Dict, Optional

from multiqc import config
from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
Expand Down Expand Up @@ -66,18 +68,22 @@ def __init__(self):
self._parse_top_unknown_barcodes(bclconvert_data, last_run_id)

# Collect counts by lane and sample
bclconvert_by_sample_lane = dict()
bclconvert_by_lane, bclconvert_by_sample, source_files = self._split_data_by_lane_and_sample(bclconvert_data)
(
bclconvert_by_lane,
bclconvert_by_sample,
counts_by_sample_by_lane,
source_files,
) = self._split_data_by_lane_and_sample(bclconvert_data)

# Filter to strip out ignored sample names
bclconvert_by_lane = self.ignore_samples(bclconvert_by_lane)
bclconvert_by_sample = self.ignore_samples(bclconvert_by_sample)
bclconvert_by_sample_lane = self.ignore_samples(bclconvert_by_sample_lane)
counts_by_sample_by_lane = self.ignore_samples(counts_by_sample_by_lane)

# Return with Warning if no files are found
if len(bclconvert_by_lane) == 0 and len(bclconvert_by_sample) == 0:
raise ModuleNoSamplesFound
log.info("{} lanes and {} samples found".format(len(bclconvert_by_lane), len(bclconvert_by_sample)))
log.info(f"{len(bclconvert_by_lane)} lanes and {len(bclconvert_by_sample)} samples found")

# Print source files
for s in source_files.keys():
Expand All @@ -88,9 +94,12 @@ def __init__(self):
section="bclconvert-bysample",
)

self.write_data_file(
{str(k): bclconvert_by_lane[k] for k in bclconvert_by_lane.keys()}, "multiqc_bclconvert_bylane"
)
# Calculate mean quality scores
for sample_id, sample in bclconvert_by_sample.items():
if sample["yield"] > 0:
sample["mean_quality"] = sample["_quality_score_sum"] / sample["yield"]
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved

self.write_data_file(bclconvert_by_lane, "multiqc_bclconvert_bylane")
self.write_data_file(bclconvert_by_sample, "multiqc_bclconvert_bysample")

# Superfluous function call to confirm that it is used in this module
Expand Down Expand Up @@ -151,12 +160,6 @@ def __init__(self):
),
)

# Add section for counts by sample
# get cats for per-lane tab
lcats = set()
for s_name in bclconvert_by_sample_lane:
lcats.update(bclconvert_by_sample_lane[s_name].keys())
lcats = sorted(list(lcats))
self.add_section(
name="Clusters by sample",
anchor="bclconvert-bysample",
Expand All @@ -167,9 +170,9 @@ def __init__(self):
plot=bargraph.plot(
[
self.get_bar_data_from_counts(bclconvert_data, bclconvert_by_sample, last_run_id),
bclconvert_by_sample_lane,
counts_by_sample_by_lane,
],
[cats, lcats],
[cats, sorted(bclconvert_by_lane.keys())],
{
"id": "bclconvert_sample_counts",
"title": "bclconvert: Clusters by sample",
Expand Down Expand Up @@ -200,26 +203,24 @@ def __init__(self):
),
)

def _get_genome_size(self):
@staticmethod
@lru_cache()
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved
def _get_genome_size() -> Optional[int]:
gs = getattr(config, "bclconvert", {}).get("genome_size")
if gs:
try:
gs = float(gs)
gs = int(gs)
except ValueError:
presets = {"hg19_genome": 2897310462, "hg38_genome": 3049315783, "mm10_genome": 2652783500}
if gs in presets:
gs = presets[gs]
else:
log.warning(
"The size for genome "
+ gs
+ " is unknown to MultiQC, "
+ "please specify it explicitly or choose one of the following: "
+ ", ".join(presets.keys())
+ "."
f"Genome '{gs}' is unknown to MultiQC, please specify size explicitly or choose one of the "
f"following: {', '.join(presets.keys())}."
)
gs = None
log.debug("Determined genome size as " + str(gs))
log.debug(f"Determined genome size as: {gs}")
return gs

@staticmethod
Expand All @@ -230,7 +231,8 @@ def _reads_dictionary():
"perfect_index_reads": 0,
"one_mismatch_index_reads": 0,
"basesQ30": 0,
"mean_quality": 0,
"depth": 0,
"_quality_score_sum": 0, # used to re-calculate mean_quality
}

@staticmethod
Expand Down Expand Up @@ -354,7 +356,8 @@ def parse_demux_data(self, demux_file, bclconvert_data, num_demux_files):
run_data = bclconvert_data.get(demux_file["run_id"], dict())
bclconvert_data[demux_file["run_id"]] = run_data
with open(filename) as fh:
for row in csv.DictReader(fh, delimiter=","):
reader: csv.DictReader = csv.DictReader(fh, delimiter=",")
for row in reader:
lane_id = "L{}".format(row["Lane"])
lane = run_data.get(lane_id)
if lane is None:
Expand Down Expand Up @@ -384,10 +387,11 @@ def parse_demux_data(self, demux_file, bclconvert_data, num_demux_files):
sample["yield"] += int(row["# Reads"]) * demux_file["cluster_length"]
sample["perfect_index_reads"] += int(row["# Perfect Index Reads"])
sample["one_mismatch_index_reads"] += int(row["# One Mismatch Index Reads"])
# Column only present pre v3.9.3

# columns only present pre v3.9.3, after they moved to quality_metrics
sample["basesQ30"] += int(row.get("# of >= Q30 Bases (PF)", 0))
# Column only present pre v3.9.3
sample["mean_quality"] += float(row.get("Mean Quality Score (PF)", 0))
# Collecting to re-calculate mean_quality:
sample["_quality_score_sum"] += float(row.get("Mean Quality Score (PF)", 0)) * sample["yield"]

if lane_id not in total_reads_in_lane:
total_reads_in_lane[lane_id] = 0
Expand All @@ -407,7 +411,7 @@ def parse_qmetrics_data(self, bclconvert_data, qmetrics_file):
filename = str(os.path.join(qmetrics_file["root"], qmetrics_file["fn"]))
self.total_reads_in_lane_per_file[filename] = dict()

reader = csv.DictReader(open(filename), delimiter=",")
reader: csv.DictReader = csv.DictReader(open(filename), delimiter=",")
for row in reader:
run_data = bclconvert_data[qmetrics_file["run_id"]]
lane_id = "L{}".format(row["Lane"])
Expand All @@ -424,8 +428,10 @@ def parse_qmetrics_data(self, bclconvert_data, qmetrics_file):

# Parse the stats that moved to this file in v3.9.3
lane["basesQ30"] += int(row["YieldQ30"])
lane_sample["yield"] += int(row["Yield"])
lane_sample["basesQ30"] += int(row["YieldQ30"])
lane_sample["mean_quality"] += float(row["Mean Quality Score (PF)"])
# Collecting to re-calculate mean_quality:
lane_sample["_quality_score_sum"] += float(row["QualityScoreSum"])

def _parse_top_unknown_barcodes(self, bclconvert_data, last_run_id):
run_data = bclconvert_data[last_run_id]
Expand All @@ -439,13 +445,15 @@ def _parse_top_unknown_barcodes(self, bclconvert_data, last_run_id):
thisbarcode = str(unknown_barcode_row["index"]) + "-" + str(unknown_barcode_row["index2"])
run_data[thislane]["top_unknown_barcodes"][thisbarcode] = int(unknown_barcode_row["# Reads"])

def _total_reads_for_run(self, bclconvert_data, run_id):
@staticmethod
def _total_reads_for_run(bclconvert_data, run_id):
totalreads = 0
for lane_id, lane in bclconvert_data[run_id].items():
totalreads += lane["clusters"]
return totalreads

def _total_reads_all_runs(self, bclconvert_data):
@staticmethod
def _total_reads_all_runs(bclconvert_data):
totalreads = 0
for key, run_data in bclconvert_data.items():
for lane_id, lane in run_data.items():
Expand All @@ -465,19 +473,18 @@ def _set_lane_percentage_stats(self, data, cluster_length):
data["percent_oneMismatch"] = float(data["one_mismatch_index_reads"]) / float(data["clusters"]) * 100.0
except ZeroDivisionError:
data["percent_oneMismatch"] = "NA"
try:
data["depth"] = float(data["basesQ30"]) / float(self._get_genome_size())
except ZeroDivisionError:
data["depth"] = "NA"
except TypeError:
if self._get_genome_size():
data["depth"] = float(data["basesQ30"]) / self._get_genome_size()
else:
data["depth"] = "NA"

def _split_data_by_lane_and_sample(self, bclconvert_data):
def _split_data_by_lane_and_sample(self, bclconvert_data) -> Tuple[Dict, Dict, Dict, Dict]:
"""
Populate a collection of "stats across all lanes" and "stats across all samples"
"""
bclconvert_by_lane = dict()
bclconvert_by_sample = dict()
count_by_sample_by_lane = defaultdict(lambda: defaultdict(int)) # counts only - used for a stacked bargraph
source_files = dict()
for run_id, r in bclconvert_data.items():
# set stats for each lane (across all samples) in bclconvert_bylane dictionary
Expand All @@ -504,29 +511,27 @@ def _split_data_by_lane_and_sample(self, bclconvert_data):
bclconvert_by_sample[sample_id] = self._reads_dictionary()

s = bclconvert_by_sample[sample_id]

s["clusters"] += int(sample["clusters"])
s["yield"] += int(sample["yield"])
s["perfect_index_reads"] += int(sample["perfect_index_reads"])
s["one_mismatch_index_reads"] += int(sample["one_mismatch_index_reads"])
s["basesQ30"] += int(sample["basesQ30"])
s["mean_quality"] += float(sample["mean_quality"])
s["cluster_length"] = lane["cluster_length"]
s["_quality_score_sum"] += sample["_quality_score_sum"]
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved

try:
if "depth" not in s:
s["depth"] = 0
s["depth"] += float(sample["basesQ30"]) / float(self._get_genome_size())
except ZeroDivisionError:
s["depth"] = "NA"
except TypeError:
if not self._get_genome_size():
s["depth"] = "NA"
else:
s["depth"] += float(sample["basesQ30"]) / self._get_genome_size()

if sample_id not in ["top_unknown_barcodes"]:
if sample_id not in source_files:
source_files[sample_id] = []
source_files[sample_id].append(sample["filename"])
return bclconvert_by_lane, bclconvert_by_sample, source_files

count_by_sample_by_lane[sample_id][lane_key_name] += sample["clusters"]

return bclconvert_by_lane, bclconvert_by_sample, count_by_sample_by_lane, source_files

def sample_stats_table(self, bclconvert_data, bclconvert_by_sample):
sample_stats_data = dict()
Expand Down Expand Up @@ -573,6 +578,7 @@ def sample_stats_table(self, bclconvert_data, bclconvert_by_sample):
# "one_mismatch_index_reads": sample['one_mismatch_index_reads'],
"perfect_pecent": perfect_percent,
"one_mismatch_pecent": one_mismatch_pecent,
"mean_quality": sample["mean_quality"],
}
if sample["depth"] != "NA":
depth_available = True
Expand All @@ -583,7 +589,11 @@ def sample_stats_table(self, bclconvert_data, bclconvert_by_sample):
"title": "Coverage",
"description": (
"Estimated sequencing depth based on the number of bases with quality score greater or equal to Q30, "
"assuming the genome size is {}, as provided in config".format(self._get_genome_size())
+ (
f", assuming the genome size is {self._get_genome_size()} as provided in config"
if self._get_genome_size() is not None
else ""
)
),
"min": 0,
"suffix": "X",
Expand Down Expand Up @@ -656,6 +666,13 @@ def sample_stats_table(self, bclconvert_data, bclconvert_by_sample):
"scale": "RdYlGn",
"suffix": "%",
}
headers["mean_quality"] = {
"title": "Mean Quality Score (PF)",
"description": "Mean quality score of bases passing filter",
"min": 0,
"max": 40,
"scale": "RdYlGn",
}

# Table config
table_config = {
Expand Down Expand Up @@ -683,8 +700,12 @@ def lane_stats_table(self, bclconvert_by_lane):
headers["depth-lane"] = {
"title": "Coverage",
"description": (
"Estimated sequencing depth based on the number of bases with quality score greater or equal to Q30, "
"assuming the genome size is {}, as provided in config".format(self._get_genome_size())
"Estimated sequencing depth based on the number of bases with quality score greater or equal to Q30"
+ (
f", assuming the genome size is {self._get_genome_size()} as provided in config"
if self._get_genome_size() is not None
else ""
)
),
"suffix": "X",
"scale": "BuPu",
Expand Down Expand Up @@ -770,10 +791,11 @@ def lane_stats_table(self, bclconvert_by_lane):

return table.plot(bclconvert_bylane_foroutput, headers, table_config)

def prepend_runid(self, runId, rest):
return str(runId) + " - " + str(rest)
@staticmethod
def prepend_runid(runid, rest):
return str(runid) + " - " + str(rest)

def get_bar_data_from_counts(self, bclconvert_data, counts, last_run_id):
def get_bar_data_from_counts(self, bclconvert_data, counts, last_run_id) -> Dict[str, Dict[str, int]]:
# For per-lane stats we fetch undetermined reads, too.
bar_data = {}
for key, value in counts.items():
Expand Down