## Boxplots by Topology (aggregated over system size)

One figure per metric; each figure has 6 vertically stacked subplots (one per topology).
Data are pooled across all `system_size` values (5, 10, 20). Plot style: a violin with an overlaid box plot, a diamond marking the mean, and an optional reference median line.

In [33]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path

In [34]:
# Resolve the working directory once; every path below is derived from it.
cwd = Path('.').resolve()
# Path of the run table relative to whichever directory contains 5_results_data.
_run_table = Path('5_results_data') / 'run_table.csv'
# Search upward for the run table: check cwd itself and then up to five
# ancestors, stopping at the first directory where the file exists.
_search = cwd
RUN_TABLE_PATH = None
for _ in range(6):
    candidate = _search / _run_table
    if candidate.exists():
        RUN_TABLE_PATH = candidate
        break
    _search = _search.parent
# Nothing found: fall back to a cwd-relative path so RUN_TABLE_PATH is always a
# Path and the print below can report "Exists: False".
if RUN_TABLE_PATH is None:
    RUN_TABLE_PATH = cwd / _run_table
# Choose the directory that will hold 'figures':
#   - cwd is a 'notebooks' directory            -> its parent
#   - cwd is notebooks/.ipynb_checkpoints       -> the parent of 'notebooks'
#   - anything else                             -> cwd itself
# NOTE(review): the parent of 'notebooks' is presumably 5_results_analysis —
# confirm; in the fallback case figures are written under cwd, wherever that is.
if cwd.name == 'notebooks':
    _base = cwd.parent
elif cwd.name == '.ipynb_checkpoints' and cwd.parent.name == 'notebooks':
    _base = cwd.parent.parent
else:
    _base = cwd
FIGURES_DIR = _base / 'figures'
# Side effect: creates the figures directory (and missing parents) when the cell
# runs; an already existing directory is not an error.
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
print("Data:", RUN_TABLE_PATH, "| Exists:", RUN_TABLE_PATH.exists())

Data: /home/irena/Documents/Research Project/topology-scale-mubench-replication/5_results_data/run_table.csv | Exists: True


In [35]:
# Load the run table and keep only the rows whose '__done' column equals 'DONE'.
# The cell starts from the CSV each time, so re-running it is idempotent.
df = pd.read_csv(RUN_TABLE_PATH)
df = df[df['__done'] == 'DONE'].copy()

# Order in which topologies are laid out, and the short labels used for them.
TOPOLOGY_ORDER = [
    'sequential_fanout', 'parallel_fanout', 'chain_with_branching',
    'hierarchical_tree', 'probabilistic_tree', 'complex_mesh',
]
TOPOLOGY_LABELS = {
    'sequential_fanout': 'Seq FO', 'parallel_fanout': 'Par FO',
    'chain_with_branching': 'Chain', 'hierarchical_tree': 'Hierarchical',
    'probabilistic_tree': 'Probabilistic', 'complex_mesh': 'Mesh',
}

# Derived columns: latency in s, energy in kJ
df['avg_latency_s'] = df['avg_latency_ms'] / 1000
df['energy_kj'] = df['energy'] / 1000

# (column name, axis label) pairs, one per metric to plot.
METRIC_BLOCKS = [
    ('throughput_rps', 'Throughput (RPS)'),
    ('avg_latency_s', 'Avg Response Time (s)'),
    ('energy_kj', 'Energy (kJ)'),
    ('cpu_usage_avg', 'CPU utilization (cores)'),
    ('failure_rate', 'Failure rate (%)'),
]

# Keep only the known topologies. The explicit .copy() makes the filtered frame
# independent of its source, so the column assignments below cannot trigger
# SettingWithCopyWarning when the filter actually drops rows.
df = df[df['topology'].isin(TOPOLOGY_ORDER)].copy()
# Ordered categorical so grouping/sorting follows TOPOLOGY_ORDER.
df['topology'] = pd.Categorical(df['topology'], categories=TOPOLOGY_ORDER, ordered=True)

# Heuristic: a column whose maximum is <= 1.5 is treated as a 0-1 fraction and
# rescaled by 100. Both columns are tested before either one is modified.
# NOTE(review): 'cpu_usage_avg' is labelled "(cores)" in METRIC_BLOCKS, yet it is
# multiplied by 100 here when its maximum is <= 1.5 — confirm the intended unit.
_fraction_cols = [
    _col for _col in ('cpu_usage_avg', 'failure_rate')
    if _col in df.columns and df[_col].max() <= 1.5
]
# Kept as a module-level flag: True when at least one column was rescaled.
_scale_pct = bool(_fraction_cols)
if _scale_pct:
    df[_fraction_cols] = df[_fraction_cols] * 100

print("Rows:", len(df), "| Topologies:", df['topology'].nunique(), "| Sizes:", sorted(df['system_size'].unique().tolist()))

Rows: 180 | Topologies: 6 | Sizes: [5, 10, 20]


In [36]:
def remove_outliers(series):
    """Keep only the values inside the 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` where Q1 and Q3 are
    the 25th and 75th percentiles of ``series``; values equal to a fence are kept.
    Missing values never satisfy the bounds check, so they are not retained.

    Parameters
    ----------
    series : pandas.Series or None
        Values to filter.

    Returns
    -------
    pandas.Series or None
        The filtered values with their original index labels. ``None`` or an
        empty input is returned unchanged.
    """
    # Guard clauses: nothing to filter.
    if series is None:
        return series
    if len(series) == 0:
        return series

    first_quartile = series.quantile(0.25)
    third_quartile = series.quantile(0.75)
    spread = third_quartile - first_quartile

    fence_low = first_quartile - 1.5 * spread
    fence_high = third_quartile + 1.5 * spread

    # between() is inclusive on both ends, i.e. (>= fence_low) & (<= fence_high).
    inside = series.between(fence_low, fence_high)
    return series[inside]