MultiQC · vladsavelyev · Nov 2, 2023 · Oct 25, 2023 · Oct 25, 2023 · Oct 25, 2023
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -6,6 +6,9 @@
 
 ### New Modules
 
+- [**Seqera Platform CLI**](https://github.com/seqeralabs/tower-cli) ([#2151](https://github.com/ewels/MultiQC/pull/2151))
+  - Parses run dumps produced by the Seqera Platform CLI (`tw runs dump`) and reports run statistics.
+
 ### Module updates
 
 - **fastp**: correctly parse sample name from --in1/--in2 command. Prefer file name if not `fastp.json`; fallback to file name when error ([#2139](https://github.com/ewels/MultiQC/pull/2139))

diff --git a/docs/core/development/modules.md b/docs/core/development/modules.md
@@ -793,10 +793,11 @@ headers['name'] = {
     'scale': 'GnBu',                # Colour scale for colour coding. Set to False to disable.
     'suffix': None,                 # Suffix for value (eg. '%')
     'format': '{:,.1f}',            # Output format() string
-    'shared_key': None              # See below for description
+    'shared_key': None,             # See below for description
     'modify': None,                 # Lambda function to modify values
     'hidden': False,                # Set to True to hide the column on page load
     'placement' : 1000.0,           # Alter the default ordering of columns in the table
+    'to_float': None,               # Lambda function to convert a value to a float for colouring and sorting
 }
 ```
 
@@ -823,9 +824,14 @@ headers['name'] = {
     be useful when data could be sometimes useful. For example, some modules
     show "percentage aligned" on page load but hide "number of reads aligned".
 - `placement`
-  - If you feel that the results from your module should appear at the left side
+  - If you feel that the results from your module should appear on the left side
     of the table set this value less than 1000. Or to move the column right, set
     it greater than 1000. This value can be any float.
+- `to_float`
+  - Lambda function to convert a value to a number, to support the quantitative
+    colour scale. By default, MultiQC attempts to convert each value with
+    `float()`; if that fails, the value is left as a string and is not
+    colour-coded.
 
 The typical use for the `modify` string is to divide large numbers such as read counts,
 to make them easier to interpret. If handling read counts, there are three config variables

diff --git a/docs/modules/seqera_cli.md b/docs/modules/seqera_cli.md
@@ -0,0 +1,16 @@
+---
+name: Seqera Platform CLI
+url: https://github.com/seqeralabs/tower-cli
+description: Reports statistics generated by the Seqera Platform CLI.
+---
+
+Parses a tar-gz dump containing logs and stats from a Seqera Platform run, that is,
+the `runs_SmUkr43Nul49G.tar.gz` file generated by the following command:
+
+```sh
+tw runs dump -id=SmUkr43Nul49G --workspace=seqeralabs/benchmarks --output=runs_SmUkr43Nul49G.tar.gz
+```
+
+Expects the dump to contain a `workflow.json` file, optionally accompanied by a `workflow-load.json` file.
+Can also parse an uncompressed version of the dump, that is, a `workflow.json` file
+and a `workflow-load.json` sitting together in a directory.
diff --git a/multiqc/modules/seqera_cli/__init__.py b/multiqc/modules/seqera_cli/__init__.py
@@ -0,0 +1 @@
+from .seqera_cli import MultiqcModule
diff --git a/multiqc/modules/seqera_cli/seqera_cli.py b/multiqc/modules/seqera_cli/seqera_cli.py
@@ -0,0 +1,218 @@
+""" MultiQC module to parse output from the Seqera Platform CLI """
+
+import datetime
+import json
+import logging
+import os
+import tarfile
+from collections import defaultdict
+
+from multiqc.modules.base_module import BaseMultiqcModule, ModuleNoSamplesFound
+from multiqc.plots import bargraph
+
+log = logging.getLogger(__name__)
+
+
+def _read_json_from_tar_gz(tar_file, fname):
+    """
+    Read the member `fname` from an open tar archive and parse it as JSON.
+
+    Returns the parsed JSON data. If the member cannot be extracted or read, or
+    if its contents are not valid JSON, a warning is logged and an empty dict is
+    returned instead.
+    """
+    try:
+        # extractfile() returns None for members that are not regular files or
+        # links; entering `with None` raises, which is reported below like any
+        # other extraction failure. The `with` also closes the member handle.
+        with tar_file.extractfile(tar_file.getmember(fname)) as fh:
+            contents = fh.read()
+    except Exception as e:
+        log.warning(f"Could not extract file {fname} from archive {tar_file}: {e}")
+        return {}
+    try:
+        return json.loads(contents)
+    except Exception as e:
+        log.warning(f"Could not parse JSON from {fname} in {tar_file}: {e}")
+        return {}
+
+
+def _parse_workflow_json(data):
+    """
+    Extract the run-level fields of interest from parsed workflow.json data.
+
+    Returns a dict with the keys "repository", "start", "complete" and
+    "revision" (None where the field is absent from `data`). When both
+    timestamps are present and parseable, a "duration" key holding the
+    `datetime.timedelta` between them is added as well.
+    """
+    keys = ["repository", "start", "complete", "revision"]
+    data = {k: data.get(k) for k in keys}
+    # "start" and "complete" are time stamps like 2023-10-22T14:39:01Z.
+    # Parse them and take the difference "complete" - "start" to get the duration.
+    # The keys always exist after the comprehension above, so test the values:
+    # a run that has not completed has no "complete" time stamp to parse.
+    start, complete = data["start"], data["complete"]
+    if start and complete:
+        timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
+        try:
+            data["duration"] = datetime.datetime.strptime(complete, timestamp_format) - datetime.datetime.strptime(
+                start, timestamp_format
+            )
+        except (TypeError, ValueError) as e:
+            # Unexpected time stamp type or layout: keep the raw values, skip the duration
+            log.warning(f"Could not parse workflow time stamps '{start}' and '{complete}': {e}")
+    return data
+
+
+def _parse_workflow_load_json(data):
+    """
+    Pick the fields this module uses out of parsed workflow-load.json data.
+
+    Every selected field is present in the returned dict; a field that is
+    absent from `data` maps to None. All other fields are discarded.
+    """
+    metric_fields = ("cpuEfficiency", "memoryEfficiency", "cpuTime", "readBytes", "writeBytes", "cost")
+    status_fields = ("pending", "submitted", "running", "succeeded", "failed", "cached")
+    selected = {}
+    for field in metric_fields + status_fields:
+        selected[field] = data.get(field)
+    return selected
+
+
+class MultiqcModule(BaseMultiqcModule):
+    """
+    Seqera Platform CLI module for MultiQC. Processes a run dump, usually written
+    as a tar-gz archive, as well as its uncompressed version, that is, the
+    workflow.json and workflow-load.json files read directly from a directory.
+    """
+
+    def __init__(self):
+        super(MultiqcModule, self).__init__(
+            name="Seqera Platform CLI",
+            anchor="seqera_cli",
+            href="https://github.com/seqeralabs/tower-cli",
+            info="reports statistics generated by the Seqera Platform CLI.",
+            # NOTE(review): no `doi` is passed. The DOI previously given here
+            # (10.1016/j.ajhg.2017.01.017) is an American Journal of Human Genetics
+            # article and looks like a copy-paste from another module - confirm.
+        )
+
+        data_by_run = defaultdict(dict)
+
+        # Parsing the tar-gz dump
+        for f in self.find_log_files("seqera_cli/run_dump", filecontents=False):
+            archive_path = os.path.join(f["root"], f["fn"])
+            workflow_load = None
+            try:
+                with tarfile.open(archive_path) as tar_file:
+                    member_names = tar_file.getnames()
+                    if "workflow.json" not in member_names:
+                        continue
+                    workflow = _read_json_from_tar_gz(tar_file, "workflow.json")
+                    # Check if also workflow-load.json sits next to workflow.json
+                    if "workflow-load.json" in member_names:
+                        workflow_load = _read_json_from_tar_gz(tar_file, "workflow-load.json")
+            except (tarfile.TarError, OSError, EOFError) as e:
+                # A file matching the search pattern that is not a readable archive
+                # should not abort the whole module.
+                log.warning(f"Could not read archive {archive_path}: {e}")
+                continue
+            run_id = self._add_workflow(f, workflow, data_by_run)
+            if run_id and workflow_load is not None:
+                self._add_workflow_load(run_id, workflow_load, data_by_run)
+
+        # Parsing the json files directly
+        for f in self.find_log_files("seqera_cli/workflow"):
+            try:
+                workflow = json.loads(f["f"])
+            except (TypeError, ValueError) as e:
+                log.warning(f"Could not parse JSON from {f['fn']} in {f['root']}: {e}")
+                continue
+            run_id = self._add_workflow(f, workflow, data_by_run)
+            if not run_id:
+                continue
+
+            # Check if also workflow-load.json sits next to workflow.json
+            workflow_load_path = os.path.join(f["root"], "workflow-load.json")
+            if os.path.isfile(workflow_load_path):
+                try:
+                    with open(workflow_load_path) as fh:
+                        workflow_load = json.load(fh)
+                except (OSError, ValueError) as e:
+                    log.warning(f"Could not parse JSON from {workflow_load_path}: {e}")
+                else:
+                    self._add_workflow_load(run_id, workflow_load, data_by_run)
+
+        # Filter to strip out ignored sample names
+        data_by_run = self.ignore_samples(data_by_run)
+
+        if not data_by_run:
+            raise ModuleNoSamplesFound
+        log.info(f"Found {len(data_by_run)} reports")
+
+        # Write parsed report data to a file
+        self.write_data_file(data_by_run, "multiqc_seqera_cli")
+
+        headers = {
+            "repository": {
+                "title": "Repository",
+                "description": "Name of the repository",
+                "scale": False,
+                "modify": lambda x: f'<a href="{x}">{x.replace("https://", "").replace("http://", "").replace("github.com/", "")}</a>',
+            },
+            "start": {
+                "title": "Start",
+                "description": "Start time of the workflow",
+                "scale": False,
+                "hidden": True,
+            },
+            "complete": {
+                "title": "Complete",
+                "description": "End time of the workflow",
+                "scale": False,
+                "hidden": True,
+            },
+            "duration": {
+                "title": "Duration",
+                "description": "Duration of the workflow",
+                "scale": "BuPu",
+                # The value is a datetime.timedelta, so it needs an explicit numeric conversion
+                "to_float": lambda x: x.total_seconds(),
+            },
+            "cpuEfficiency": {
+                "title": "CPU Efficiency",
+                "description": "Percentage of CPU time used by the workflow",
+                "format": "{:,.2f}",
+                "scale": "RdYlGn",
+            },
+            "memoryEfficiency": {
+                "title": "Memory Efficiency",
+                "description": "Percentage of memory used by the workflow",
+                "format": "{:,.2f}",
+                "scale": "YlGn",
+            },
+            "cpuTime": {
+                "title": "CPU Time",
+                "description": "Total CPU time used by the workflow",
+                "format": "{:,.2f}",
+                "scale": "Greys",
+                "suffix": "&nbsp;h",
+                "modify": lambda x: x / 1000 / 3600,
+            },
+            "readBytes": {
+                "title": "Read GB",
+                "description": "Total gigabytes read by the workflow",
+                "format": "{:,.2f}",
+                "scale": "Blues",
+                "suffix": "&nbsp;GB",
+                "modify": lambda x: x / 1024 / 1024 / 1024,
+            },
+            "writeBytes": {
+                "title": "Write GB",
+                "description": "Total gigabytes written by the workflow",
+                "format": "{:,.2f}",
+                "scale": "Greens",
+                "suffix": "&nbsp;GB",
+                "modify": lambda x: x / 1024 / 1024 / 1024,
+            },
+            "cost": {
+                "title": "Cost",
+                "description": "Cost of the workflow",
+                "format": "{:,.2f}",
+                "scale": "Reds",
+            },
+        }
+        self.general_stats_addcols(data_by_run, headers)
+
+        pconfig = {
+            "id": "seqera_cli_process_status",
+            "title": "Seqera Platform CLI: processes statuses",
+        }
+        cats = {
+            "pending": {"name": "Pending", "color": "#8f4199"},
+            "submitted": {"name": "Submitted", "color": "#e68642"},
+            "running": {"name": "Running", "color": "#4256e7"},
+            "cached": {"name": "Cached", "color": "#939598"},
+            "succeeded": {"name": "Succeeded", "color": "#28ae61"},
+            "failed": {"name": "Failed", "color": "#e7363e"},
+        }
+        self.add_section(
+            name="Seqera Platform CLI",
+            anchor="seqera-platform-cli",
+            plot=bargraph.plot(data_by_run, cats, pconfig),
+        )
+
+    def _add_workflow(self, f, workflow, data_by_run):
+        """
+        Register one run from parsed workflow.json data.
+
+        Records `f` as a data source, reports the run's "revision" through
+        add_software_version() and merges the parsed fields into
+        `data_by_run[run_id]`. Returns the run id, or None (registering nothing)
+        when `workflow` is not a dict or has no "id".
+        """
+        run_id = workflow.get("id") if isinstance(workflow, dict) else None
+        if not run_id:
+            return None
+        # Drop fields that are missing from the JSON: the column "modify"
+        # callbacks defined in __init__ cannot handle None.
+        parsed = {k: v for k, v in _parse_workflow_json(workflow).items() if v is not None}
+        self.add_data_source(f)
+        self.add_software_version(parsed.get("revision"))
+        data_by_run[run_id].update(parsed)
+        return run_id
+
+    @staticmethod
+    def _add_workflow_load(run_id, workflow_load, data_by_run):
+        """
+        Merge parsed workflow-load.json data into the entry of an already
+        registered run. Data that is not a dict is ignored, and so are fields
+        whose value is missing.
+        """
+        if not isinstance(workflow_load, dict):
+            return
+        parsed = _parse_workflow_load_json(workflow_load)
+        data_by_run[run_id].update({k: v for k, v in parsed.items() if v is not None})
diff --git a/multiqc/plots/table.py b/multiqc/plots/table.py
@@ -126,7 +126,13 @@ def make_table(dt):
         if header["scale"] is False:
             c_scale = None
         else:
-            c_scale = mqc_colour.mqc_colour_scale(header["scale"], header["dmin"], header["dmax"], id=table_id)
+            c_scale = mqc_colour.mqc_colour_scale(
+                name=header["scale"],
+                minval=header["dmin"],
+                maxval=header["dmax"],
+                to_float_fn=header.get("to_float"),
+                id=table_id,
+            )
 
         # Collect conditional formatting config
         cond_formatting_rules = {}
@@ -153,7 +159,8 @@ def make_table(dt):
                 try:
                     dmin = header["dmin"]
                     dmax = header["dmax"]
-                    percentage = ((float(val) - dmin) / (dmax - dmin)) * 100
+                    to_float = header.get("to_float", float)
+                    percentage = ((to_float(val) - dmin) / (dmax - dmin)) * 100
                     # Treat 0 as 0-width and make bars width of absolute value
                     if header.get("bars_zero_centrepoint"):
                         dmax = max(abs(header["dmin"]), abs(header["dmax"]))

diff --git a/multiqc/plots/table_object.py b/multiqc/plots/table_object.py
@@ -204,30 +204,34 @@ def __init__(self, data, headers=None, pconfig=None):
                 setdmax = False
                 setdmin = False
                 try:
-                    headers[idx][k]["dmax"] = float(headers[idx][k]["max"])
-                except TypeError:
+                    to_float_fn = headers[idx][k].get("to_float", float)
+                    headers[idx][k]["dmax"] = to_float_fn(headers[idx][k]["max"])
+                except Exception:
                     headers[idx][k]["dmax"] = 0
                     setdmax = True
 
                 try:
-                    headers[idx][k]["dmin"] = float(headers[idx][k]["min"])
-                except TypeError:
+                    to_float_fn = headers[idx][k].get("to_float", float)
+                    headers[idx][k]["dmin"] = to_float_fn(headers[idx][k]["min"])
+                except Exception:
                     headers[idx][k]["dmin"] = 0
                     setdmin = True
 
                 # Figure out the min / max if not supplied
                 if setdmax or setdmin:
                     for s_name, samp in data[idx].items():
                         try:
-                            val = float(samp[k])
+                            val = samp[k]
                             if callable(headers[idx][k]["modify"]):
                                 val = float(headers[idx][k]["modify"](val))
+                            to_float_fn = headers[idx][k].get("to_float", float)
+                            val = to_float_fn(val)
                             if setdmax:
                                 headers[idx][k]["dmax"] = max(headers[idx][k]["dmax"], val)
                             if setdmin:
                                 headers[idx][k]["dmin"] = min(headers[idx][k]["dmin"], val)
                         except (ValueError, TypeError):
-                            val = samp[k]  # couldn't convert to float - keep as a string
+                            pass  # couldn't convert to float - keep as a string
                         except KeyError:
                             pass  # missing data - skip
                     # Limit auto-generated scales with floor, ceiling and minRange.

diff --git a/multiqc/utils/config_defaults.yaml b/multiqc/utils/config_defaults.yaml
@@ -315,6 +315,8 @@ fn_clean_trim:
   - ".stats"
   - ".hist"
   - ".phased"
+  - ".tar"
+  - "runs_"
 
 # Files to ignore when indexing files.
 # Grep file match patterns.
@@ -589,6 +591,10 @@ module_order:
   - sexdeterrmine:
       module_tag:
         - DNA
+  - seqera_cli:
+      module_tag:
+        - DNA
+        - RNA
   - eigenstratdatabasetools:
       module_tag:
         - DNA