Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cellranger: Count submodule updated to parse Antibody Capture summary #2118

Merged
merged 14 commits into from
Oct 17, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- **HiCPro**: fix parsing scientific notation in hicpro-ashic. Thanks @Just-Roma ([#2126](https://github.com/ewels/MultiQC/pull/2126))
- **Picard**: MarkDuplicates: Fix parsing mixed strings/numbers, account for missing trailing `0` ([#2083](https://github.com/ewels/MultiQC/pull/2083), [#2094](https://github.com/ewels/MultiQC/pull/2094))
- **WhatsHap**: Process truncated input with no ALL chromosome ([#2095](https://github.com/ewels/MultiQC/pull/2095))
- **Cellranger**: Count submodule updated to parse Antibody Capture summary ([#2118](https://github.com/ewels/MultiQC/pull/2118))

## [MultiQC v1.16](https://github.com/ewels/MultiQC/releases/tag/v1.16) - 2023-09-22

Expand Down
246 changes: 186 additions & 60 deletions multiqc/modules/cellranger/count.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from collections import OrderedDict

from multiqc import config
from multiqc.plots import linegraph, table
from multiqc.plots import bargraph, linegraph, table

from ._utils import *

Expand All @@ -19,18 +19,22 @@ class CellRangerCountMixin:

def parse_count_html(self):
self.cellrangercount_data = dict()
self.cellrangercount_antibody_data = dict()
self.cellrangercount_general_data = dict()
self.cellrangercount_warnings = dict()
self.cellrangercount_plots_conf = {"bc": dict(), "genes": dict()}
self.cellrangercount_plots_data = {"bc": dict(), "genes": dict()}
self.count_general_data_headers = OrderedDict()
self.count_data_headers = OrderedDict()
self.antibody_data_headers = OrderedDict()
self.count_warnings_headers = OrderedDict()

for f in self.find_log_files("cellranger/count_html", filehandles=True):
self.parse_count_report(f)

self.cellrangercount_data = self.ignore_samples(self.cellrangercount_data)
if self.cellrangercount_antibody_data:
self.cellrangercount_antibody_data = self.ignore_samples(self.cellrangercount_antibody_data)
self.cellrangercount_general_data = self.ignore_samples(self.cellrangercount_general_data)
self.cellrangercount_warnings = self.ignore_samples(self.cellrangercount_warnings)
for k in self.cellrangercount_plots_data.keys():
Expand Down Expand Up @@ -71,66 +75,98 @@ def parse_count_html(self):
],
)

if self.cellrangercount_antibody_data:
self.antibody_data_headers["reads"] = {
"rid": "antibody_data_reads",
"title": "{} Reads".format(config.read_count_prefix),
"description": "Number of reads ({})".format(config.read_count_desc),
"modify": lambda x: x * config.read_count_multiplier,
}
self.antibody_data_headers = set_hidden_cols(
self.antibody_data_headers,
["Q30 bc", "Q30 UMI", "Q30 read", "saturation", "umi per cell", "reads in aggregate bc"],
)

if len(self.cellrangercount_general_data) == 0:
return 0

else:
self.general_stats_addcols(self.cellrangercount_general_data, self.count_general_data_headers)
self.general_stats_addcols(self.cellrangercount_general_data, self.count_general_data_headers)

# Write parsed report data to a file
self.write_data_file(self.cellrangercount_data, "multiqc_cellranger_count")
# Write parsed report data to a file
self.write_data_file(self.cellrangercount_data, "multiqc_cellranger_count")
if self.cellrangercount_antibody_data:
self.write_data_file(self.cellrangercount_antibody_data, "multiqc_cellranger_antibody_count")

# Add sections to the report
if len(self.cellrangercount_warnings) > 0:
self.add_section(
name="Count - Warnings",
anchor="cellranger-count-warnings",
description="Warnings encountered during the analysis",
plot=table.plot(self.cellrangercount_warnings, self.count_warnings_headers, {"namespace": "Count"}),
)

# Add sections to the report
if len(self.cellrangercount_warnings) > 0:
self.add_section(
name="Count - Warnings",
anchor="cellranger-count-warnings",
description="Warnings encountered during the analysis",
plot=table.plot(self.cellrangercount_warnings, self.count_warnings_headers, {"namespace": "Count"}),
)
self.add_section(
name="Count - Summary stats",
anchor="cellranger-count-stats",
description="Summary QC metrics from Cell Ranger count",
plot=table.plot(self.cellrangercount_data, self.count_data_headers, {"namespace": "Count"}),
)

if self.cellrangercount_antibody_data:
self.add_section(
name="Count - Summary stats",
anchor="cellranger-count-stats",
name="Antibody - Summary stats",
anchor="cellranger-antibody-stats",
description="Summary QC metrics from Cell Ranger count",
plot=table.plot(self.cellrangercount_data, self.count_data_headers, {"namespace": "Count"}),
plot=table.plot(
self.cellrangercount_antibody_data, self.antibody_data_headers, {"namespace": "Antibody"}
),
)

self.add_section(
name="Count - BC rank plot",
anchor="cellranger-count-bcrank-plot",
description=self.cellrangercount_plots_conf["bc"]["description"],
helptext=self.cellrangercount_plots_conf["bc"]["helptext"],
plot=linegraph.plot(self.cellrangercount_plots_data["bc"], self.cellrangercount_plots_conf["bc"]["config"]),
)

if "antibody_counts" in self.cellrangercount_plots_conf:
self.add_section(
name="Count - BC rank plot",
anchor="cellranger-count-bcrank-plot",
description=self.cellrangercount_plots_conf["bc"]["description"],
helptext=self.cellrangercount_plots_conf["bc"]["helptext"],
plot=linegraph.plot(
self.cellrangercount_plots_data["bc"], self.cellrangercount_plots_conf["bc"]["config"]
name="Antibody - Counts Distribution Bargraph",
anchor="cellranger-antibody-counts",
description=self.cellrangercount_plots_conf["antibody_counts"]["description"],
helptext=self.cellrangercount_plots_conf["antibody_counts"]["helptext"],
plot=bargraph.plot(
self.cellrangercount_plots_data["antibody_counts"],
self.cellrangercount_plots_conf["antibody_counts"]["keys"],
self.cellrangercount_plots_conf["antibody_counts"]["config"],
),
)

self.add_section(
name="Count - Median genes",
anchor="cellranger-count-genes-plot",
description=self.cellrangercount_plots_conf["genes"]["description"],
helptext=self.cellrangercount_plots_conf["genes"]["helptext"],
plot=linegraph.plot(
self.cellrangercount_plots_data["genes"], self.cellrangercount_plots_conf["genes"]["config"]
),
)

if "saturation" in self.cellrangercount_plots_data:
self.add_section(
name="Count - Median genes",
anchor="cellranger-count-genes-plot",
description=self.cellrangercount_plots_conf["genes"]["description"],
helptext=self.cellrangercount_plots_conf["genes"]["helptext"],
name="Count - Saturation plot",
anchor="cellranger-count-saturation-plot",
description=self.cellrangercount_plots_conf["saturation"]["description"],
helptext=self.cellrangercount_plots_conf["saturation"]["helptext"],
plot=linegraph.plot(
self.cellrangercount_plots_data["genes"], self.cellrangercount_plots_conf["genes"]["config"]
self.cellrangercount_plots_data["saturation"],
self.cellrangercount_plots_conf["saturation"]["config"],
),
)

try:
self.add_section(
name="Count - Saturation plot",
anchor="cellranger-count-saturation-plot",
description=self.cellrangercount_plots_conf["saturation"]["description"],
helptext=self.cellrangercount_plots_conf["saturation"]["helptext"],
plot=linegraph.plot(
self.cellrangercount_plots_data["saturation"],
self.cellrangercount_plots_conf["saturation"]["config"],
),
)
except KeyError:
pass

return len(self.cellrangercount_general_data)
return len(self.cellrangercount_general_data)

def parse_count_report(self, f):
"""Go through the HTML report of Cell Ranger and extract the data into dicts"""
Expand Down Expand Up @@ -202,7 +238,6 @@ def parse_count_report(self, f):
)

# Store full data from cell ranger count report
data = dict()
data_rows = (
summary["summary_tab"]["sequencing"]["table"]["rows"]
+ summary["summary_tab"]["cells"]["table"]["rows"]
Expand Down Expand Up @@ -242,14 +277,16 @@ def parse_count_report(self, f):
"median umi/cell": "YlGn",
"saturation": "YlOrRd",
}
data, self.count_data_headers = update_dict(
table, self.count_data_headers = update_dict(
data_general_stats,
self.count_data_headers,
data_rows,
col_dict,
colours,
"Count",
)
if not table:
return None

# Extract warnings if any
warnings = dict()
Expand Down Expand Up @@ -315,23 +352,112 @@ def parse_count_report(self, f):
"bc": parse_bcknee_data(summary["summary_tab"]["cells"]["barcode_knee_plot"]["data"], s_name),
"genes": {s_name: transform_data(summary["analysis_tab"]["median_gene_plot"]["plot"]["data"][0])},
}
try:
if "seq_saturation_plot" in summary["analysis_tab"]:
plots_data["saturation"] = {
s_name: transform_data(summary["analysis_tab"]["seq_saturation_plot"]["plot"]["data"][0])
}
except KeyError:
pass

if len(data) > 0:
if s_name in self.cellrangercount_general_data:
log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f["fn"], s_name))
self.add_data_source(f, s_name, module="cellranger", section="count")
self.cellrangercount_data[s_name] = data
self.cellrangercount_general_data[s_name] = data_general_stats
if len(warnings) > 0:
self.cellrangercount_warnings[s_name] = warnings
self.cellrangercount_plots_conf = plots
for k in plots_data.keys():
if k not in self.cellrangercount_plots_data.keys():
self.cellrangercount_plots_data[k] = dict()
self.cellrangercount_plots_data[k].update(plots_data[k])
# Store full data for ANTIBODY capture
antibody_data = dict()
if "ANTIBODY_sequencing" in summary["summary_tab"]:
data_rows = (
summary["summary_tab"]["ANTIBODY_sequencing"]["table"]["rows"]
+ summary["summary_tab"]["ANTIBODY_application"]["table"]["rows"]
)
col_dict = {
"Number of Reads": "reads",
"Valid Barcodes": "valid bc",
"Valid UMIs": "valid umi",
"Sequencing Saturation": "saturation",
"Q30 Bases in Barcode": "Q30 bc",
"Q30 Bases in Antibody Read": "Q30 read",
"Q30 Bases in UMI": "Q30 UMI",
"Fraction Antibody Reads": "antibody reads",
"Fraction Antibody Reads Usable": "antibody reads usable",
"Antibody Reads Usable per Cell": "antibody reads usable/cell",
"Fraction Antibody Reads in Aggregate Barcodes": "reads in aggregate bc",
"Fraction Unrecognized Antibody": "unrecognized antibody",
"Antibody Reads in Cells": "antibody reads in cells",
"Median UMIs per Cell (summed over all recognized antibody barcodes)": "umi per cell",
}
colours = {
"reads": "YlGn",
"antibody reads": "RdPu",
"reads in cells": "Blues",
"reads usable": "Greens",
"reads usable per cell": "Purples",
"reads in aggregate bc": "PuBuGn",
"valid bc": "Spectral",
"valid umi": "RdYlGn",
"Q30 bc": "YlGn",
"saturation": "YlOrRd",
}
antibody_data, self.antibody_data_headers = update_dict(
antibody_data,
self.antibody_data_headers,
data_rows,
col_dict,
colours,
"Antibody",
)

# Extract labels and values for the bargraph data
combined_data = {}
for label, value in zip(
summary["antibody_tab"]["antibody_treemap_plot"]["plot"]["data"][0]["labels"],
summary["antibody_tab"]["antibody_treemap_plot"]["plot"]["data"][0]["values"],
):
label_match = re.search(r"<b>(.*?)\s+\((.*?)%\)</b>", label)
if label_match:
label_value = label_match.group(1)
value_ = round(value * 100, 2)
combined_data[label_value] = value_

# Extract labels and number of cells for labelling the bargraph
combined_label = {}
for label, cells in zip(
summary["antibody_tab"]["antibody_treemap_plot"]["plot"]["data"][0]["labels"],
summary["antibody_tab"]["antibody_treemap_plot"]["plot"]["data"][0]["text"],
):
label_match = re.search(r"<b>(.*?)\s+\((.*?)%\)</b>", label)
if label_match:
label_value = label_match.group(1)
combined_label[label_value] = label_value + ": " + cells

# Use the label from `combined_label` for the plot
keys = dict()
for key, value in combined_label.items():
keys[key] = {"name": value}

plots["antibody_counts"] = {
"config": {
"id": "mqc_cellranger_antibody_counts",
"title": "Cell Ranger: Distribution of Antibody Counts",
"ylab": "% Total UMI",
"ymax": 100,
"cpswitch": False,
"use_legend": False,
"tt_decimals": 2,
"tt_suffix": "%",
"tt_percentages": False,
},
"keys": keys,
"description": "Antibody Counts Distribution Plot",
"helptext": "Relative composition of antibody counts for features with at least 1 UMI. Box size represents fraction of total UMIs from cell barcodes that are derived from this antibody. Hover over a box to view more information on a particular antibody, including number of associated barcodes.",
}
plots_data["antibody_counts"] = {s_name: combined_data}

if s_name in self.cellrangercount_general_data:
log.debug("Duplicate sample name found in {}! Overwriting: {}".format(f["fn"], s_name))
self.add_data_source(f, s_name, module="cellranger", section="count")
self.cellrangercount_data[s_name] = table
if "antibody_tab" in summary:
self.cellrangercount_antibody_data[s_name] = antibody_data
self.cellrangercount_general_data[s_name] = data_general_stats
if len(warnings) > 0:
self.cellrangercount_warnings[s_name] = warnings
self.cellrangercount_plots_conf.update(plots)
for k in plots_data.keys():
if k not in self.cellrangercount_plots_data.keys():
self.cellrangercount_plots_data[k] = dict()
self.cellrangercount_plots_data[k].update(plots_data[k])