Skip to content

Commit

Permalink
UMI-tools: support extract command (#2296)
Browse files Browse the repository at this point in the history
* UMI-tools: support extract command

* [automated] Update CHANGELOG.md

* Add barplot

* Barplot with extract stats

* Doc

* Changelog

* Lint

* Fix duplicated IDs in linting

* Merge conflict

---------

Co-authored-by: MultiQC Bot <multiqc-bot@seqera.io>
Co-authored-by: Phil Ewels <phil.ewels@seqera.io>
  • Loading branch information
3 people committed Feb 19, 2024
1 parent 73dbdde commit c5ef275
Show file tree
Hide file tree
Showing 5 changed files with 187 additions and 113 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

### Module updates

- **UMI-tools**: support `extract` command ([#2296](https://github.com/MultiQC/MultiQC/pull/2296))
- **bcl2fastq**: fix the top undetermined barcodes plot ([#2340](https://github.com/MultiQC/MultiQC/pull/2340))
- **DRAGEN**: add a few coverage metrics in general stats ([#2341](https://github.com/MultiQC/MultiQC/pull/2341))
- **DRAGEN**: fix showing the number of found samples ([#2347](https://github.com/MultiQC/MultiQC/pull/2347))
Expand Down Expand Up @@ -99,7 +100,6 @@ The v1.20 release is also the first release we've had since we moved the MultiQC
- **BCL Convert**: add index, project names to sample statistics and calculate mean quality for lane statistics. ([#2261](https://github.com/MultiQC/MultiQC/pull/2261))
- **BCL Convert**: fix duplicated `yield` for 3.9.3+ when the yield is provided explicitly in Quality_Metrics ([#2253](https://github.com/MultiQC/MultiQC/pull/2253))
- **BCL Convert**: handle samples with zero yield ([#2297](https://github.com/MultiQC/MultiQC/pull/2297))
- **Bismark**: fix old link in Bismark docs ([#2252](https://github.com/MultiQC/MultiQC/pull/2252))
- **Bismark**: fix old link in docs ([#2252](https://github.com/MultiQC/MultiQC/pull/2252))
- **Cutadapt**: support JSON format ([#2281](https://github.com/MultiQC/MultiQC/pull/2281))
- **HiFiasm**: account for lines with no asterisk ([#2268](https://github.com/MultiQC/MultiQC/pull/2268))
Expand Down
2 changes: 1 addition & 1 deletion docs/modules/umitools.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,4 +8,4 @@ description: >

The MultiQC module for UMI-tools parses logs from `umi-tools`.

Currently only the `dedup` command is supported.
Currently, the `dedup` and `extract` commands are supported.
287 changes: 179 additions & 108 deletions multiqc/modules/umitools/umitools.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import logging
import re
from typing import Dict, Optional

from multiqc import config
from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
Expand All @@ -27,125 +28,138 @@ def __init__(self):
doi="10.1101/gr.209601.116",
)

# Find and load any umitools log files
self.umitools_data = dict()
for f in self.find_log_files("umitools"):
dedup_data_by_sample = dict()
for f in self.find_log_files("umitools/dedup"):
# Parse the log file for sample name and statistics
input_fname, data = self.parse_logs(f)
if data and len(data) > 1:
# Clean the sample name
f["s_name"] = self.clean_s_name(input_fname, f)
# Log a warning if the log file matches an existing sample name
if f["s_name"] in self.umitools_data:
data = self.parse_dedup_logs(f)
if data:
f["s_name"] = self._parse_s_name(f) or f["s_name"]
if f["s_name"] in dedup_data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
# Store the data in the overall data dictionary
self.umitools_data[f["s_name"]] = data
# Add the log file information to the multiqc_sources.txt
self.add_data_source(f)
# Add version info
self.add_software_version(data["version"], f["s_name"])
dedup_data_by_sample[f["s_name"]] = data
self.add_data_source(f) # Add the log file information to the multiqc_sources.txt
if "version" in data: # Add version info
self.add_software_version(data.pop("version"), f["s_name"])

# Check the log files against the user supplied list of samples to ignore
self.umitools_data = self.ignore_samples(self.umitools_data)
extract_data_by_sample = dict()
for f in self.find_log_files("umitools/extract"):
# Parse the log file for sample name and statistics
data = self.parse_extract_logs(f)
if data:
f["s_name"] = self._parse_s_name(f) or f["s_name"]
if f["s_name"] in extract_data_by_sample:
log.debug(f"Duplicate sample name found! Overwriting: {f['s_name']}")
extract_data_by_sample[f["s_name"]] = data
self.add_data_source(f) # Add the log file information to the multiqc_sources.txt
if "version" in data: # Add version info
self.add_software_version(data.pop("version"), f["s_name"])

# If no files are found, raise an exception
if len(self.umitools_data) == 0:
# Check the log files against the user supplied list of samples to ignore
dedup_data_by_sample = self.ignore_samples(dedup_data_by_sample)
extract_data_by_sample = self.ignore_samples(extract_data_by_sample)
if len(dedup_data_by_sample) == 0 and len(extract_data_by_sample) == 0:
raise ModuleNoSamplesFound
log.info(f"Found {len(dedup_data_by_sample)} deduplication reports")
log.info(f"Found {len(extract_data_by_sample)} extract reports")

# Log the number of reports found
log.info(f"Found {len(self.umitools_data)} reports")

# Write parsed report data to a file
self.write_data_file(self.umitools_data, "multiqc_umitools")
# Write parsed reports data to a file
self.write_data_file(dedup_data_by_sample, "multiqc_umitools_dedup")
self.write_data_file(extract_data_by_sample, "multiqc_umitools_extract")

# write data to the general statistics table
self.umitools_general_stats_table()

# add a section with a deduplicated reads plot to the report
self.add_section(
name="Deduplicated Reads",
anchor="umitools-dedup-plot",
plot=self.umitools_deduplication_plot(),
)

# add a section with a beeswarm plot of UMI stats to the report
self.add_section(
name="UMI Stats",
anchor="umitools-umi-stats",
description="Statistics from running `umi_tools dedeup`",
helptext="""
- **Positions Dedup**: Total number of positions deduplicated
- **Total UMIs**: Total UMIs found in sample
- **Unique UMIs**: Unique UMIs found in sample
- **Mean #UMI**: Mean number of unique UMIs per position
- **Max #UMI**: Max number of unique UMIs per position
""",
plot=self.umitools_umi_stats_swarm(),
)
if dedup_data_by_sample:
self.dedup_general_stats_table(dedup_data_by_sample)
self.deduplication_plot(dedup_data_by_sample)
self.umi_stats_violin(dedup_data_by_sample)

def parse_logs(self, f):
# Check this is a dedup log
if "# output generated by dedup" not in f["f"]:
log.debug(f"Skipping as not a dedup log: {f['fn']}")
return None, None
if extract_data_by_sample:
self.extract_general_stats_table(extract_data_by_sample)
self.umitools_extract_barplot(extract_data_by_sample)

def _parse_s_name(self, f) -> Optional[str]:
# Get the s_name from the input file if possible
# stdin : <_io.TextIOWrapper name='M18-39155_T1.Aligned.sortedByCoord.out.bam' mode='r' encoding='UTF-8'>
s_name_re = r"stdin\s+:\s+<_io\.TextIOWrapper name='([^\']+)'"
s_name_match = re.search(s_name_re, f["f"])
if s_name_match:
s_name = s_name_match.group(1)
else:
s_name = f["s_name"]
return self.clean_s_name(s_name_match.group(1))
return None

# Initialise a dictionary to hold the data from this log file
data = {}
@staticmethod
def parse_dedup_logs(f) -> Dict:
regexes = [
(int, "total_umis", r"INFO total_umis (\d+)"),
(int, "unique_umis", r"INFO #umis (\d+)"),
(int, "dedup_input_reads", r"INFO Reads: Input Reads: (\d+)"),
(int, "dedup_output_reads", r"INFO Number of reads out: (\d+)"),
(int, "positions_deduplicated", r"INFO Total number of positions deduplicated: (\d+)"),
(float, "mean_umi_per_pos", r"INFO Mean number of unique UMIs per position: ([\d\.]+)"),
(float, "max_umi_per_pos", r"INFO Max. number of unique UMIs per position: (\d+)"),
(str, "version", r"# UMI-tools version: ([\d\.]+)"),
]

data = {}
# Search for values using regular expressions
regexes = {
"total_umis": r"INFO total_umis (\d+)",
"unique_umis": r"INFO #umis (\d+)",
"input_reads": r"INFO Reads: Input Reads: (\d+)",
"output_reads": r"INFO Number of reads out: (\d+)",
"positions_deduplicated": r"INFO Total number of positions deduplicated: (\d+)",
"mean_umi_per_pos": r"INFO Mean number of unique UMIs per position: ([\d\.]+)",
"max_umi_per_pos": r"INFO Max. number of unique UMIs per position: (\d+)",
"version": r"# UMI-tools version: ([\d\.]+)",
}
for key, regex in regexes.items():
for type_, key, regex in regexes:
re_matches = re.search(regex, f["f"])
if re_matches:
if key == "version":
data[key] = re_matches.group(1)
else:
data[key] = float(re_matches.group(1))
data[key] = type_(re_matches.group(1))

# calculate a few simple supplementary stats
try:
data["percent_passing_dedup"] = round(((data["output_reads"] / data["input_reads"]) * 100.0), 2)
except (KeyError, ZeroDivisionError):
pass
try:
data["removed_reads"] = data["input_reads"] - data["output_reads"]
except KeyError:
pass
# Calculate a few simple supplementary stats
try:
data["dedup_percent_passing"] = round(
((data["dedup_output_reads"] / data["dedup_input_reads"]) * 100.0), 2
)
except (KeyError, ZeroDivisionError):
pass
try:
data["dedup_removed_reads"] = data["dedup_input_reads"] - data["dedup_output_reads"]
except KeyError:
pass

return s_name, data
return data

@staticmethod
def parse_extract_logs(f) -> Dict:
regexes = [
(int, "extract_input_reads", r"INFO Input Reads: (\d+)"),
(int, "read1_mismatch", r"INFO regex does not match read1: (\d+)"),
(int, "read1_match", r"INFO regex matches read1: (\d+)"),
(int, "read2_mismatch", r"INFO regex does not match read2: (\d+)"),
(int, "read2_match", r"INFO regex matches read2: (\d+)"),
(int, "extract_output_reads", r"INFO Reads output: (\d+)"),
(str, "version", r"# UMI-tools version: ([\d\.]+)"),
]

data = {}
# Search for values using regular expressions
for type_, key, regex in regexes:
re_matches = re.search(regex, f["f"])
if re_matches:
if key == "version":
data[key] = re_matches.group(1)
else:
data[key] = type_(re_matches.group(1))

def umitools_general_stats_table(self):
"""Take the parsed stats from the umitools report and add it to the
basic stats table at the top of the report"""
return data

def dedup_general_stats_table(self, data_by_sample):
"""
Take the parsed stats from the umitools report and add it to the
basic stats table at the top of the report
"""
headers = {
"output_reads": {
"dedup_output_reads": {
"title": f"{config.read_count_prefix} Unique Reads",
"description": f"Reads remaining after deduplication ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"shared_key": "read_count",
"scale": "PuRd",
},
"percent_passing_dedup": {
"dedup_percent_passing": {
"title": "% Pass Dedup",
"description": "% processed reads that passed deduplication",
"max": 100,
Expand All @@ -154,50 +168,93 @@ def umitools_general_stats_table(self):
"scale": "RdYlGn",
},
}
self.general_stats_addcols(self.umitools_data, headers)
self.general_stats_addcols(data_by_sample, headers)

def extract_general_stats_table(self, data_by_sample):
headers = {
"extract_output_reads": {
"title": f"{config.read_count_prefix} Extracted Reads",
"description": f"Reads remaining after extraction ({config.read_count_desc})",
"min": 0,
"modify": lambda x: x * config.read_count_multiplier,
"shared_key": "read_count",
"scale": "PuRd",
},
}
self.general_stats_addcols(data_by_sample, headers)

def umitools_deduplication_plot(self):
def deduplication_plot(self, data_by_sample):
"""Generate a plot the read deduplication rates for the main report"""

# Specify the order of the different possible categories
keys = {
"output_reads": {"color": "#7fc97f", "name": "Reads remaining"},
"removed_reads": {"color": "#fdc086", "name": "Reads removed"},
"dedup_output_reads": {"color": "#7fc97f", "name": "Reads remaining"},
"dedup_removed_reads": {"color": "#fdc086", "name": "Reads removed"},
}

# Config for the plot
config = {
"id": "umitools_deduplication",
"title": "UMI-tools: Deduplication Counts",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
return self.add_section(
name="Deduplicated Reads",
anchor="umitools-dedup-plot",
plot=bargraph.plot(
data_by_sample,
keys,
{
"id": "umitools_deduplication_barplot",
"title": "UMI-tools: Deduplication Counts",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
},
),
)

def umitools_extract_barplot(self, data_by_sample):
keys = {
"read1_match": {"color": "#7fc9c7", "name": "Read1 Match"},
"read1_mismatch": {"color": "#fd9286", "name": "Read1 Mismatch"},
"read2_match": {"color": "#7f89c9", "name": "Read2 Match"},
"read2_mismatch": {"color": "#fd86aa", "name": "Read2 Mismatch"},
}

return bargraph.plot(self.umitools_data, keys, config)
# add a section with a beeswarm plot of UMI stats to the report
self.add_section(
name="Extract Stats",
anchor="umitools_extract",
description="Read stats from `umi_tools extract`",
plot=bargraph.plot(
data_by_sample,
keys,
{
"id": "umitools_extract_barplot",
"title": "UMI-tools: Extract Stats",
"ylab": "# Reads",
"cpswitch_counts_label": "Number of Reads",
},
),
)

def umitools_umi_stats_swarm(self):
"""Generate a swarmplot of umi stats for the main report"""
def umi_stats_violin(self, data_by_sample):
"""Generate a violin of UMI stats for the main report"""

headers = {
"positions_deduplicated": {
"title": "Positions Dedup",
"description": "Total number of positions deduplicated",
"min": 0,
"format": "{:,.0f}",
"format": "{:,d}",
"scale": "Greens",
},
"total_umis": {
"title": "Total UMIs",
"description": "Total UMIs found in sample",
"min": 0,
"format": "{:,.0f}",
"format": "{:,d}",
"scale": "Blues",
},
"unique_umis": {
"title": "Unique UMIs",
"description": "Unique UMIs found in sample",
"min": 0,
"format": "{:,.0f}",
"format": "{:,d}",
"scale": "Purples",
},
"mean_umi_per_pos": {
Expand All @@ -216,10 +273,24 @@ def umitools_umi_stats_swarm(self):
},
}

# Config for the table
config = {
"id": "umitools_stats_swarmplot",
"table_title": "UMI-tools: UMI stats",
}

return beeswarm.plot(self.umitools_data, headers, config)
# add a section with a beeswarm plot of UMI stats to the report
self.add_section(
name="UMI Stats",
anchor="umitools-umi-stats",
description="Statistics from running `umi_tools dedup` or `umi_tools extract`",
helptext="""
- **Positions Dedup**: Total number of positions deduplicated
- **Total UMIs**: Total UMIs found in sample
- **Unique UMIs**: Unique UMIs found in sample
- **Mean #UMI**: Mean number of unique UMIs per position
- **Max #UMI**: Max number of unique UMIs per position
""",
plot=beeswarm.plot(
data_by_sample,
headers,
{
"id": "umitools_stats_violin",
"table_title": "UMI-tools: UMI stats",
},
),
)

0 comments on commit c5ef275

Please sign in to comment.