Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fastp: correctly parse sample name from --in1/--in2 command. Fallback to file name #2139

Merged
merged 7 commits into from
Oct 30, 2023
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

### Module updates

- **fastp**: correctly parse sample name from the `--in1`/`--in2` command-line options. Prefer the parsed name when the file is the generic `fastp.json`; fall back to the file name on parsing error ([#2139](https://github.com/ewels/MultiQC/pull/2139))

## [MultiQC v1.17](https://github.com/ewels/MultiQC/releases/tag/v1.17) - 2023-10-17

### The one with the new logo
Expand Down
9 changes: 9 additions & 0 deletions docs/modules/fastp.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,12 @@ description: >

The Fastp module parses results generated by
[Fastp](https://github.com/OpenGene/fastp). Fastp can simply go through all fastq files in a folder and perform a series of quality control and filtering. Quality control and reporting are displayed both before and after filtering, allowing for a clear depiction of the consequences of the filtering process. Notably, the latter can be conducted on a variety of parameters including quality scores, length, as well as the presence of adapters, polyG, or polyX tailing.

By default, the module generates the sample names based on the input FastQ file names in
the command line used by fastp. If you prefer, you can tell the module to use
the filenames as sample names instead. To do so, use the following config option:

```yaml
fastp:
s_name_filenames: true
```
6 changes: 3 additions & 3 deletions docs/modules/trimmomatic.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ The Trimmomatic module parses standard error generated by
a flexible read trimming tool for Illumina NGS data. StdErr can be captured by
directing it to a file e.g. `trimmomatic command 2> trim_out.log`

By default, the module generates the sample names based on the command line used
by Trimmomatic. If you prefer, you can tell the module to use the filenames as
sample names instead. To do so, use the following config option:
By default, the module generates the sample names based on the input FastQ file names in
the command line used by Trimmomatic. If you prefer, you can tell the module to use
the filenames as sample names instead. To do so, use the following config option:

```yaml
trimmomatic:
Expand Down
117 changes: 73 additions & 44 deletions multiqc/modules/fastp/fastp.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import logging
import re
from collections import OrderedDict
from typing import Dict, Optional, Tuple

from multiqc import config
from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
Expand All @@ -29,6 +30,21 @@ def __init__(self):
doi="10.1093/bioinformatics/bty560",
)

data_by_sample = dict()
for f in self.find_log_files("fastp", filehandles=True):
s_name, parsed_json = self.parse_fastp_log(f)
if not s_name:
continue
if s_name in data_by_sample:
log.debug("Duplicate sample name found! Overwriting: {}".format(s_name))
data_by_sample[s_name] = parsed_json

# Filter to strip out ignored sample names
data_by_sample = self.ignore_samples(data_by_sample)
if len(data_by_sample) == 0:
raise ModuleNoSamplesFound
log.info("Found {} reports".format(len(data_by_sample)))

# Find and load any fastp reports
self.fastp_data = dict()
self.fastp_duplication_plotdata = dict()
Expand All @@ -41,24 +57,10 @@ def __init__(self):
self.fastp_qual_plotdata[k] = dict()
self.fastp_gc_content_data[k] = dict()
self.fastp_n_content_data[k] = dict()
for s_name, parsed_json in data_by_sample.items():
self.process_parsed_data(parsed_json, s_name)

for f in self.find_log_files("fastp", filehandles=True):
self.parse_fastp_log(f)

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None, f["s_name"])

# Filter to strip out ignored sample names
self.fastp_data = self.ignore_samples(self.fastp_data)

if len(self.fastp_data) == 0:
raise ModuleNoSamplesFound

log.info("Found {} reports".format(len(self.fastp_data)))

# Write parsed report data to a file
## Parse whole JSON to save all its content
# Save entire original parsed JSON
self.write_data_file(self.fastp_all_data, "multiqc_fastp")

# General Stats Table
Expand Down Expand Up @@ -151,30 +153,53 @@ def __init__(self):
except RuntimeError:
log.debug("No data found for 'N content' plot")

def parse_fastp_log(self, f):
def parse_fastp_log(self, f) -> Tuple[Optional[str], Dict]:
"""Parse the JSON output from fastp and save the summary statistics"""
try:
parsed_json = json.load(f["f"])
except json.JSONDecodeError as e:
log.warning(f"Could not parse fastp JSON: '{f['fn']}': {e}")
return None
log.warning(f"Could not parse fastp JSON: '{f['fn']}': {e}, skipping sample")
return None, {}
if not isinstance(parsed_json, dict) or "command" not in parsed_json:
log.warning(f"Could not find 'command' field in JSON: '{f['fn']}'")
return None
log.warning(f"Could not find 'command' field in JSON: '{f['fn']}', skipping sample")
return None, {}

s_name = None
if getattr(config, "fastp", {}).get("s_name_filenames", False):
s_name = f["s_name"]

if s_name is None:
# Parse the "command" line usually found in the JSON, and use the first input
# FastQ file name to fetch the sample name.
cmd = parsed_json["command"].strip()
# On caveat is that the command won't have file names escaped properly,
# so we need some special logic to account for names with spaces:
# "fastp -c -g -y -i Sample 1 1.fastq.gz -o ..."
# "fastp -c -g -y --in1 Sample 1 1.fastq.gz --out1 ..."
# "fastp -c -g -y --in1 Sample 1 1.fastq.gz --in2 Sample 1_2.fastq.gz --out1 ..."
#
# Using a regex that extracts everything between "-i " or "--in1 " and " -".
# It still won't work exactly right for file names with dashes following a
# space, but that's a pretty rare case, and will still extract something
# meaningful.
m = re.search(r"(-i|--in1)\s(.+?)(?:\s-|$)", cmd)
if m:
s_name = self.clean_s_name(m.group(2), f)
else:
s_name = f["s_name"]
log.warning(
f"Could not parse sample name from the fastp command:\n{cmd}\n"
f"Falling back to extracting it from the file name: "
f"\"{f['fn']}\" -> \"{s_name}\""
)
return None, {}

cmd = parsed_json["command"].strip()
self.add_data_source(f, s_name)
return s_name, parsed_json

# Fetch a sample name from the command. The command won't have file names with
# spaces escaped properly, so we need to account for that:
# fastp -c -g -y -i Campaign 3 sample 1_1.fastq.gz -o ...
# Using a regex that extracts everything between "-i " and " -":
m = re.search(r"-i\s(.+?)(?:\s-|$)", cmd)
if not m:
log.warning(f"Could not parse sample name from fastp command: {f['fn']}")
return None
s_name = self.clean_s_name(m.group(1), f)
def process_parsed_data(self, parsed_json: Dict, s_name: str):
"""Process the JSON extracted from logs"""

self.add_data_source(f, s_name)
self.fastp_data[s_name] = {}
self.fastp_duplication_plotdata[s_name] = {}
self.fastp_insert_size_data[s_name] = {}
Expand All @@ -189,13 +214,13 @@ def parse_fastp_log(self, f):
for k in parsed_json["filtering_result"]:
self.fastp_data[s_name]["filtering_result_{}".format(k)] = float(parsed_json["filtering_result"][k])
except KeyError:
log.debug("fastp JSON did not have 'filtering_result' key: '{}'".format(f["fn"]))
log.debug(f"fastp JSON did not have 'filtering_result' key: '{s_name}'")

# Parse duplication
try:
self.fastp_data[s_name]["pct_duplication"] = float(parsed_json["duplication"]["rate"] * 100.0)
except KeyError:
log.debug("fastp JSON did not have a 'duplication' key: '{}'".format(f["fn"]))
log.debug(f"fastp JSON did not have a 'duplication' key: '{s_name}'")

# Parse after_filtering
try:
Expand All @@ -204,23 +229,23 @@ def parse_fastp_log(self, f):
parsed_json["summary"]["after_filtering"][k]
)
except KeyError:
log.debug("fastp JSON did not have a 'summary'-'after_filtering' keys: '{}'".format(f["fn"]))
log.debug(f"fastp JSON did not have a 'summary'-'after_filtering' keys: '{s_name}'")

# Parse data required to calculate Pct reads surviving
try:
self.fastp_data[s_name]["before_filtering_total_reads"] = float(
parsed_json["summary"]["before_filtering"]["total_reads"]
)
except KeyError:
log.debug("Could not find pre-filtering # reads: '{}'".format(f["fn"]))
log.debug(f"Could not find pre-filtering # reads: '{s_name}'")

try:
self.fastp_data[s_name]["pct_surviving"] = (
self.fastp_data[s_name]["filtering_result_passed_filter_reads"]
/ self.fastp_data[s_name]["before_filtering_total_reads"]
) * 100.0
except (KeyError, ZeroDivisionError) as e:
log.debug("Could not calculate 'pct_surviving' ({}): {}".format(e.__class__.__name__, f["fn"]))
log.debug(f"Could not calculate 'pct_surviving' ({e.__class__.__name__}): {s_name}")

# Parse adapter_cutting
try:
Expand All @@ -230,15 +255,15 @@ def parse_fastp_log(self, f):
except (ValueError, TypeError):
pass
except KeyError:
log.debug("fastp JSON did not have a 'adapter_cutting' key, skipping: '{}'".format(f["fn"]))
log.debug(f"fastp JSON did not have a 'adapter_cutting' key, skipping: '{s_name}'")

try:
self.fastp_data[s_name]["pct_adapter"] = (
self.fastp_data[s_name]["adapter_cutting_adapter_trimmed_reads"]
/ self.fastp_data[s_name]["before_filtering_total_reads"]
) * 100.0
except (KeyError, ZeroDivisionError) as e:
log.debug("Could not calculate 'pct_adapter' ({}): {}".format(e.__class__.__name__, f["fn"]))
log.debug(f"Could not calculate 'pct_adapter' ({e.__class__.__name__}): {s_name}")

# Duplication rate plot data
try:
Expand All @@ -252,7 +277,7 @@ def parse_fastp_log(self, f):
for i, v in enumerate(parsed_json["duplication"]["histogram"]):
self.fastp_duplication_plotdata[s_name][i + 1] = (float(v) / float(total_reads)) * 100.0
except KeyError:
log.debug("No duplication rate plot data: {}".format(f["fn"]))
log.debug(f"No duplication rate plot data: {s_name}")

# Insert size plot data
try:
Expand All @@ -270,15 +295,15 @@ def parse_fastp_log(self, f):
if i <= max_i:
self.fastp_insert_size_data[s_name][i + 1] = (float(v) / float(total_reads)) * 100.0
except KeyError:
log.debug("No insert size plot data: {}".format(f["fn"]))
log.debug(f"No insert size plot data: {s_name}")

for k in ["read1_before_filtering", "read2_before_filtering", "read1_after_filtering", "read2_after_filtering"]:
# Read quality data
try:
for i, v in enumerate(parsed_json[k]["quality_curves"]["mean"]):
self.fastp_qual_plotdata[k][s_name][i + 1] = float(v)
except KeyError:
log.debug("Read quality {} not found: {}".format(k, f["fn"]))
log.debug(f"Read quality {k} not found: {s_name}")

# GC and N content plots
try:
Expand All @@ -287,7 +312,7 @@ def parse_fastp_log(self, f):
for i, v in enumerate(parsed_json[k]["content_curves"]["N"]):
self.fastp_n_content_data[k][s_name][i + 1] = float(v) * 100.0
except KeyError:
log.debug("Content curve data {} not found: {}".format(k, f["fn"]))
log.debug(f"Content curve data {k} not found: {s_name}")

# Remove empty dicts
if len(self.fastp_data[s_name]) == 0:
Expand All @@ -300,6 +325,10 @@ def parse_fastp_log(self, f):
del self.fastp_all_data[s_name]
# Don't delete dicts with subkeys, messes up multi-panel plots

# Superfluous function call to confirm that it is used in this module
# Replace None with actual version if it is available
self.add_software_version(None)

def fastp_general_stats_table(self):
"""Take the parsed stats from the fastp report and add it to the
General Statistics table at the top of the report"""
Expand Down