In [10]:
from pathlib import Path
from typing import List

import altair as alt
import janitor
import pandas as pd

In [6]:
benchmark_dir = Path("../benchmarks/")
assert benchmark_dir.exists()
assert benchmark_dir.is_dir()

## Data Processing

Organization of benchmarks directory:

1. pipeline
2. rules
3. individual runs

> I may want to add more information to the name of the rules to keep them separate and not overwritten.
> For instance, including the date would be useful or metadata such as the data size for SBC or debug status for the fitting pipeline.

In [30]:
def process_benchmark_file(bench_f: Path) -> pd.DataFrame:
    return pd.read_csv(bench_f, sep="\t").assign(
        step=bench_f.name.replace(bench_f.suffix, "")
    )


def get_benchmark_data_for_rule_dir(rule_d: Path, pipeline_name: str) -> pd.DataFrame:
    bench_dfs: List[pd.DataFrame] = [
        process_benchmark_file(b) for b in rule_d.iterdir()
    ]
    return (
        pd.concat(bench_dfs)
        .assign(rule=rule_d.name, pipeline=pipeline_name)
        .clean_names()
    )


benchmark_df_list: List[pd.DataFrame] = []

for pipeline_dir in benchmark_dir.iterdir():
    for rule_dir in pipeline_dir.iterdir():
        benchmark_df_list.append(
            get_benchmark_data_for_rule_dir(rule_dir, pipeline_name=pipeline_dir.name)
        )

benchmark_df = pd.concat(benchmark_df_list).reset_index(drop=True)
benchmark_df.head()

Unnamed: 0,s,h_m_s,max_rss,max_vms,max_uss,max_pss,io_in,io_out,mean_load,cpu_time,step,rule,pipeline
0,160.2274,0:02:40,1295.18,1938.04,1250.59,1271.62,39.25,4.05,16.19,26.33,sp6-default,sample_advi,010_010_run-crc-sampling-snakemake
1,1767.7202,0:29:27,1447.37,2517.04,1444.73,1444.91,803.91,102.04,93.2,1647.72,sp4-default-fullrank,sample_advi,010_010_run-crc-sampling-snakemake
2,131.3041,0:02:11,1285.27,1924.7,1282.27,1282.48,5.78,3.93,17.02,22.32,sp5-default,sample_advi,010_010_run-crc-sampling-snakemake
3,138.4463,0:02:18,1295.54,1948.86,1292.96,1293.05,34.23,4.71,20.34,28.41,sp4-centered-copynum,sample_advi,010_010_run-crc-sampling-snakemake
4,135.3755,0:02:15,1281.48,1930.82,1278.91,1278.98,0.39,4.7,16.47,22.57,sp2-default,sample_advi,010_010_run-crc-sampling-snakemake


## Data dictionary

| colname | type (unit) | description |
|-------- |-------------|-------------|
| s | float (seconds) | Running time in seconds. |
| h:m:s	| string (-) | Running time in hour, minutes, seconds format. |
| max_rss | float (MB) | Maximum "Resident Set Size”, this is the non-swapped physical memory a process has used. |
| max_vms | float (MB) | Maximum “Virtual Memory Size”, this is the total amount of virtual memory used by the process. |
| max_uss | float (MB) | “Unique Set Size”, this is the memory which is unique to a process and which would be freed if the process was terminated right now. |
| max_pss | float (MB) | “Proportional Set Size”, is the amount of memory shared with other processes, accounted in a way that the amount is divided evenly between the processes that share it (Linux only). |
| io_in | float (MB) | The number of MB read (cumulative). |
| io_out | float (MB) | The number of MB written (cumulative). |
| mean_load | float (-) | CPU usage over time, divided by the total running time (first row). |
| cpu_time | float (-) | CPU time summed for user and system. |

## Data analysis and visualization

In [53]:
benchmark_df.groupby(["pipeline", "rule"]).mean().round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,s,max_rss,max_vms,max_uss,max_pss,io_in,io_out,mean_load,cpu_time
pipeline,rule,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
010_010_run-crc-sampling-snakemake,sample_advi,309.92,1304.34,2023.15,1297.18,1299.43,166.95,14.03,24.8,186.06
010_010_run-crc-sampling-snakemake,sample_mcmc,1512.58,1439.86,2333.38,1429.85,1433.68,320.52,47.7,80.13,1267.32


In [51]:
benchmark_df_long = benchmark_df[
    ["pipeline", "rule", "step", "cpu_time", "max_rss", "mean_load", "cpu_time"]
].pivot_longer(["pipeline", "rule", "step"])

(
    alt.Chart(benchmark_df_long)
    .mark_boxplot(size=50)
    .encode(
        x="rule",
        y=alt.Y("value", title=""),
        row=alt.Row("variable", title=""),
        column=alt.Column("pipeline"),
    )
    .properties(width=200, height=100)
    .resolve_scale(y="independent")
)

---

In [None]:
%load_ext watermark
%watermark -d -u -v -iv -b -h -m