Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Salmon reports library info to General Stats table #1485

Merged
merged 11 commits into from
Dec 17, 2023
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
- **Kraken**: fix `UnboundLocalError` ([#2230](https://github.com/ewels/MultiQC/pull/2230))
- **Kraken**: fixed column keys in genstats ([#2205](https://github.com/ewels/MultiQC/pull/2205))
- **QualiMap**: BamQC: fix for global-only stats ([#2207](https://github.com/ewels/MultiQC/pull/2207))
- **Salmon**: add `library_types`, `compatible_fragment_ratio`, `strand_mapping_bias` to the general stats table ([#1485](https://github.com/ewels/MultiQC/pull/1485))

## [MultiQC v1.18](https://github.com/ewels/MultiQC/releases/tag/v1.18) - 2023-11-17

Expand Down
97 changes: 70 additions & 27 deletions multiqc/modules/salmon/salmon.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
""" MultiQC module to parse output from Salmon """


import json
import logging
import os
Expand All @@ -27,10 +26,11 @@ def __init__(self):
self.salmon_meta = dict()
for f in self.find_log_files("salmon/meta"):
# Get the s_name from the parent directory
s_name = os.path.basename(os.path.dirname(f["root"]))
s_name = self.clean_s_name(s_name, f)
self.salmon_meta[s_name] = json.loads(f["f"])
self.add_software_version(self.salmon_meta[s_name]["salmon_version"], s_name)
if os.path.basename(f["root"]) in ["aux_info", "aux"]:
ewels marked this conversation as resolved.
Show resolved Hide resolved
s_name = os.path.basename(os.path.dirname(f["root"]))
s_name = self.clean_s_name(s_name, f)
self.salmon_meta[s_name] = json.loads(f["f"])
self.add_software_version(self.salmon_meta[s_name]["salmon_version"], s_name)

# Parse Fragment Length Distribution logs
self.salmon_fld = dict()
Expand All @@ -48,41 +48,84 @@ def __init__(self):
self.add_data_source(f, s_name)
self.salmon_fld[s_name] = parsed

# Parse Library Format Counts information. JSON file expected
self.salmon_lfc = dict()
for f in self.find_log_files("salmon/lfc"):
s_name = os.path.basename(f["root"]) # lfc file located at root folder
s_name = self.clean_s_name(s_name, f)
self.salmon_lfc[s_name] = json.loads(f["f"])

# Filter to strip out ignored sample names
self.salmon_meta = self.ignore_samples(self.salmon_meta)
self.salmon_fld = self.ignore_samples(self.salmon_fld)
self.salmon_lfc = self.ignore_samples(self.salmon_lfc)

if len(self.salmon_meta) == 0 and len(self.salmon_fld) == 0:
if len(self.salmon_meta) == 0 and len(self.salmon_fld) == 0 and len(self.salmon_lfc) == 0:
raise ModuleNoSamplesFound

if len(self.salmon_meta) > 0:
log.info(f"Found {len(self.salmon_meta)} meta reports")
self.write_data_file(self.salmon_meta, "multiqc_salmon")
if len(self.salmon_fld) > 0:
log.info(f"Found {len(self.salmon_fld)} fragment length distributions")
if len(self.salmon_lfc) > 0:
log.info(f"Found {len(self.salmon_lfc)} library format counts reports")

# Add alignment rate to the general stats table
headers = {
"percent_mapped": {
"title": "% Aligned",
"description": "% Mapped reads",
"max": 100,
"min": 0,
"suffix": "%",
"scale": "YlGn",
},
"num_mapped": {
"title": "M Aligned",
"description": "Mapped reads (millions)",
"min": 0,
"scale": "PuRd",
"modify": lambda x: float(x) / 1000000,
"shared_key": "read_count",
},
}
self.general_stats_addcols(self.salmon_meta, headers)
if self.salmon_meta:
# Add alignment rate to the general stats table
# Convert library types to string:
for d in self.salmon_meta.values():
if "library_types" in d:
d["library_types"] = ", ".join(d["library_types"])

if len(self.salmon_fld) > 0:
headers = {
"percent_mapped": {
"title": "% Aligned",
"description": "% Mapped reads",
"max": 100,
"min": 0,
"suffix": "%",
"scale": "YlGn",
},
"num_mapped": {
"title": "M Aligned",
"description": "Mapped reads (millions)",
"min": 0,
"scale": "PuRd",
"modify": lambda x: float(x) / 1000000,
"shared_key": "read_count",
},
"library_types": {
"title": "Library types",
"description": "Library types",
"scale": False,
# Hide if all samples have the same value
"hidden": len(set(d.get("library_types") for d in self.salmon_meta.values())) == 1,
},
}
self.general_stats_addcols(self.salmon_meta, headers)

if self.salmon_lfc:
# Compatible fragments ratios data
lfc_headers = {
"compatible_fragment_ratio": {
"title": "CFR",
"description": "Compatible fragment ratio",
"min": 0.0,
"max": 1.0,
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved
"scale": "YlGn",
},
"strand_mapping_bias": {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think that we have any test data with this key yet.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(Not a blocker, but would be nice to get some if we can).

"title": "M Bias",
"description": "Strand mapping bias",
"scale": "BuGn",
"max": 1.0,
},
}
# Add compatible fragment ratio and strand mapping bias to the general stats table
self.general_stats_addcols(self.salmon_lfc, lfc_headers)

if self.salmon_fld:
# Fragment length distribution plot
pconfig = {
"smooth_points": 500,
Expand Down
4 changes: 4 additions & 0 deletions multiqc/utils/search_patterns.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,10 @@ rseqc/tin:
salmon/meta:
fn: "meta_info.json"
contents: "salmon_version"
num_lines: 10
max_filesize: 50000
salmon/lfc:
fn: "lib_format_counts.json"
salmon/fld:
fn: "flenDist.txt"
sambamba/markdup:
Expand Down