Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not remove invalid software versions #2166

Merged
merged 6 commits into from
Nov 9, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@

- Fix column sorting in exported TSV files from a matplotlib linegraph plot ([#2143](https://github.com/ewels/MultiQC/pull/2143))
- Nanostat: account for both tab and spaces in v1.41+ search pattern ([#2155](https://github.com/ewels/MultiQC/pull/2155))
- Clean version strings with build IDs ([#2166](https://github.com/ewels/MultiQC/pull/2166))

### New Modules

Expand Down
123 changes: 51 additions & 72 deletions multiqc/utils/software_versions.py
Original file line number Diff line number Diff line change
@@ -1,12 +1,13 @@
#!/usr/bin/env python

""" Super Special-Case MultiQC module to produce report section on software versions """
vladsavelyev marked this conversation as resolved.
Show resolved Hide resolved
""" Utility functions to handle software version reporting """


import logging
import os
import re
from collections import defaultdict
from typing import List
from typing import List, Dict, Union

import packaging.version
import yaml
Expand Down Expand Up @@ -58,20 +59,21 @@ def update_versions_from_config(config, report):
def load_versions_from_config(config):
"""Try to load software versions from config"""
log.debug("Reading software versions from config.software_versions")
software_versions_config = getattr(config, "software_versions", defaultdict(lambda: defaultdict(list)))
software_versions_config, is_valid = validate_software_versions(software_versions_config)

if not is_valid:
log.error("Software versions loaded config.software_versions is not in a valid format, ignoring the section.")

software_versions_file = defaultdict(lambda: defaultdict(list))
versions_config = getattr(config, "software_versions", defaultdict(lambda: defaultdict(list)))
if not isinstance(versions_config, dict):
log.error(f"Expected the `software_versions` config section to be a dictionary")
versions_config = {}
else:
versions_config = validate_software_versions(versions_config)

versions_from_files = defaultdict(lambda: defaultdict(list))
for f in mqc_report.files.get("software_versions", []):
file_name = os.path.join(f["root"], f["fn"])
with open(file_name) as f:
with open(file_name) as fh:
log.debug(f"Reading software versions settings from: {file_name}")
try:
log.debug(f"Reading software versions settings from: {file_name}")
software_versions_file_tmp = yaml.load(
f,
versions_from_one_file = yaml.load(
fh,
# We need to be cautious when loading unquoted version strings from a YAML file.
# For instance, the version `1.10` will be parsed as a float by default, and thus converted
# into `1.1`. Passing yaml.BaseLoader explicitly makes YAML treat all scalar values
Expand All @@ -84,33 +86,31 @@ def load_versions_from_config(config):
except yaml.scanner.ScannerError as e:
log.error(f"Error parsing versions YAML: {e}")

software_versions_file_tmp, is_valid = validate_software_versions(software_versions_file_tmp)

if not is_valid:
log.error(f"Software versions loaded from {file_name} is not in a valid format, ignoring the file.")
if not isinstance(versions_from_one_file, dict):
log.error(
f"Expected the software versions file {file_name} to contain a dictionary structure, "
f"ignoring the file."
)
continue

software_versions_file = merge(software_versions_file, software_versions_file_tmp)
versions_from_one_file = validate_software_versions(versions_from_one_file)
config.update_dict(versions_from_files, versions_from_one_file)

# Aggregate versions listed in config and file
software_versions = merge(software_versions_config, software_versions_file)
config.update_dict(versions_config, versions_from_files)

# Parse the aggregated versions
for group in list(software_versions):
softwares = software_versions[group]
for tool in list(softwares):
versions = softwares[tool]

for group in versions_config:
softwares = versions_config[group]
for tool in softwares:
# Try to convert each version to a packaging.version.Version object and remove duplicates
versions = list(set([parse_version(version) for version in versions]))
versions = list(set([parse_version(version) for version in softwares[tool]]))
versions = sort_versions(versions)

softwares[tool] = versions

return software_versions
return versions_config


def validate_software_versions(input):
def validate_software_versions(versions_config: Dict) -> Dict[str, Dict]:
"""
Validate software versions input from config file

Expand All @@ -136,27 +136,24 @@ def validate_software_versions(input):

It is also possible to mix the two formats, but this is not recommended.

Returns a dict of dicts of list in the format (1) and a boolean indicating if the input is valid.
Returns a dict of dicts of list in the format (1).
"""

def _list_of_versions_is_good(lst: List[str]) -> bool:
good = True
def _filter_list(lst: List[str]) -> List[str]:
"""Remove all non-string version tags"""
fixed_lst = []
for item in lst:
if not isinstance(item, str):
log.error(
f"Version must be a string, got '{type(item).__name__}': '{item}'. Consider wrapping the value in quotes: '\"{item}\"'"
)
good = False
elif not packaging.version.parse(item):
log.error(f"Invalid version: '{item}'")
good = False
return good
else:
fixed_lst.append(item)
return fixed_lst

output = defaultdict(lambda: defaultdict(list))
if not isinstance(input, dict):
return output, False

for level1_key, level1_values in input.items():
for level1_key, level1_values in versions_config.items():
group = level1_key
software = level1_key

Expand All @@ -166,48 +163,30 @@ def _list_of_versions_is_good(lst: List[str]) -> bool:
software = level2_key
if not isinstance(versions, list):
versions = [versions]
if not _list_of_versions_is_good(versions):
return output, False
output[group][software] = versions
versions = _filter_list(versions)
if versions:
output[group][software] = versions

# Check if the input is in format (2)
else:
versions = level1_values
if not isinstance(versions, list):
versions = [versions]
if not _list_of_versions_is_good(versions):
return output, False
output[group][software] = versions

return output, True


def merge(a: dict, b: dict, path=None):
"""Merge two dict of dicts recursively"""
# source: https://stackoverflow.com/a/7205107
if path is None:
path = []
versions = _filter_list(versions)
if versions:
output[group][software] = versions

for key in b:
if key in a:
if isinstance(a[key], dict) and isinstance(b[key], dict):
merge(a[key], b[key], path + [str(key)])
elif isinstance(a[key], list) and isinstance(b[key], list):
a[key].extend(b[key])
else:
raise Exception("Conflict at " + ".".join(path + [str(key)]))
else:
a[key] = b[key]
return a
return output


def sort_versions(versions):
"""Sort list of versions in descending order. Accepts list with both strings and packaging.version.Version objects."""
try:
versions.sort()
except TypeError:
# If there is a mix, sort all as strings
versions.sort(key=str)
"""
Sort list of versions in ascending order. Accepts list with both strings and packaging.version.Version
objects; Version objects are placed before plain strings.
"""
version_objs = [v for v in versions if isinstance(v, packaging.version.Version)]
version_strs = [v for v in versions if not isinstance(v, packaging.version.Version)]
versions = sorted(version_objs) + sorted(version_strs)
return versions


Expand Down