<a href="https://colab.research.google.com/github/Genometric/MSPC/blob/dev/Benchmark/PlotBenchmarkings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Apply seaborn's default theme and the "paper" plotting context to the figures below.
sns.set_theme()
sns.set_context("paper")

In [45]:
# Load the tab-separated benchmark results. `df` is grouped by MSPC version
# further down; `df_replicates` feeds the replicate-count box plots.
df = pd.read_csv("benchmarking_results.tsv", sep="\t")
df_replicates = pd.read_csv("benchmarking_results_replicates.tsv", sep="\t")

In [46]:
# Display the loaded benchmarking results.
df

Unnamed: 0,mspc_version,experiment_id,replicate_count,interval_count,runtime_seconds,peak_physical_memory_usage_bytes,peak_paged_memory_usage_bytes,peak_virtual_memory_usage_bytes
0,v2.1,ENCSR000BNU,2,18573,1.008059,44933120,37289984,183336960
1,v2.1,ENCSR000EFR,2,600000,3.501384,72560640,65654784,217939968
2,v2.1,ENCSR000EGD,2,10834,0.794680,43245568,35016704,165838848
3,v2.1,ENCSR000EGJ,2,600000,3.544179,72511488,66162688,217939968
4,v2.1,ENCSR000EGL,2,600000,3.243759,73588736,67121152,217939968
...,...,...,...,...,...,...,...,...
139,v6.0.0,ENCSR908CMW,2,600000,9.920540,215388160,181923840,2204071256064
140,v6.0.0,ENCSR914NEI,2,44805,3.022518,103391232,68603904,2204072173568
141,v6.0.0,ENCSR931HNY,2,600000,9.532760,215715840,181911552,2204071256064
142,v6.0.0,ENCSR987PBI,2,1667,0.652432,65130496,27033600,2204071370752


In [None]:
# Group the results by MSPC version; the last line displays the number of groups.
versions_groups = df.groupby("mspc_version")
len(versions_groups)

In [48]:
# One result table per value of 'mspc_version', keyed by that value.
# The grouping column is dropped from each table because it is constant
# within a group. DataFrame.drop already returns a new frame, so no
# explicit copy of the group is needed.
releases = {
  release: versions_groups.get_group(release).drop(columns="mspc_version")
  for release in versions_groups.groups
}

In [49]:
def get_increase_percentage(cur_val, pre_val):
  """Return the change from `pre_val` to `cur_val` as a percentage of |pre_val|.

  The difference is taken as `pre_val - cur_val`, so the result is positive
  when `cur_val` is smaller than `pre_val` and negative when it is larger.
  """
  difference = pre_val - cur_val
  relative_change = difference / abs(pre_val)
  return relative_change * 100

def get_improvement_foldchange(cur_val, pre_val):
  """Return the ratio `pre_val / cur_val`.

  For positive inputs, a result above 1 means `cur_val` is smaller than
  `pre_val` by that factor.
  """
  fold_change = pre_val / cur_val
  return fold_change

In [None]:
# Output key -> benchmark column it is derived from.
# NOTE: despite the "_change_percent" suffix, each value stored under these
# keys is a fold change (v1.1 value / v6.0.0 value) computed by
# get_improvement_foldchange. The key names are kept because later cells
# read them.
fold_change_sources = {
  "runtime_change_percent": "runtime_seconds",
  "peak_physical_memory_usage_change_percent": "peak_physical_memory_usage_bytes",
  "peak_paged_memory_usage_change_percent": "peak_paged_memory_usage_bytes",
  "peak_virtual_memory_usage_change_percent": "peak_virtual_memory_usage_bytes",
}

changes = []
v6_results = releases["v6.0.0"]
for _, v1_row in releases["v1.1"].iterrows():
  interval_count = v1_row["interval_count"]
  replicate_count = v1_row["replicate_count"]

  # Runs are paired across the two releases on (interval_count, replicate_count).
  candidates = v6_results.loc[
    (v6_results["interval_count"] == interval_count)
    & (v6_results["replicate_count"] == replicate_count)
  ]

  if len(candidates) == 1:
    change = {
      "experiment_id": v1_row["experiment_id"],
      "replicate_count": replicate_count,
      "interval_count": interval_count,
    }
    for change_key, source_col in fold_change_sources.items():
      change[change_key] = get_improvement_foldchange(
        candidates[source_col].to_list()[0], v1_row[source_col])
    changes.append(change)
  elif len(candidates) == 0:
    # No v6.0.0 run with the same key: report it and skip the row.
    print(f"Could not find a match; interval count {interval_count}, and replicate count {replicate_count}")
  else:
    # Ambiguous pairing (several v6.0.0 runs share the key): report it and skip the row.
    print(f"More than one match found; interval count {interval_count}, and replicate count {replicate_count}")

In [None]:
# Number of v1.1 runs that were paired with exactly one v6.0.0 run.
len(changes)

In [52]:
# Tabulate the comparison records: one row per entry of `changes`.
changes_df = pd.DataFrame(changes)

In [None]:
fig, ax = plt.subplots(1,1, figsize=(12,4))
fig.set_dpi(600)

# Sort into a new frame instead of mutating `changes_df` in place, so the
# frame built in the earlier cell is left as it was.
# "runtime_change_percent" holds the v1.1 / v6.0.0 runtime fold change.
runtime_sorted = changes_df.sort_values("runtime_change_percent", ascending=False)

# `order` fixes the bar order to the sorted fold change (largest first).
sns.barplot(
  data=runtime_sorted,
  x="interval_count",
  y="runtime_change_percent",
  order=runtime_sorted["interval_count"],
  facecolor=(0.2, 0.2, 0.4, 1),
  ax=ax)
ax.set(xlabel="Interval count", ylabel="Runtime Improvement fold change\n")
# Render y ticks as whole numbers with thousands separators and an "x" suffix.
ax.yaxis.set_major_formatter(matplotlib.ticker.FuncFormatter(lambda value, _pos: f"{int(value):,}x"))
ax.tick_params(axis="x", labelrotation=90)
ax.set_title("Runtime improvement between\nMSPC v1 and v6")

plt.tight_layout()
plt.savefig('benchmarks.png')

Convert peak physical memory usage from bytes to megabytes (1 MB = 1024 × 1024 bytes).

In [54]:
# Store the converted values in a separate column and leave the raw byte
# column untouched, so re-running this cell does not divide the data again
# and each column name matches the unit it holds.
peak_mem_bytes_col_name = "peak_physical_memory_usage_bytes"
peak_mem_col_name = "peak_physical_memory_usage_megabytes"
# 1024 * 1024 bytes per megabyte.
df_replicates[peak_mem_col_name] = df_replicates[peak_mem_bytes_col_name].div(1024*1024)

In [55]:
def hide_even_num_ticks(axis):
  """Hide every second x-axis tick label of `axis`.

  Counting from one, the 2nd, 4th, ... labels are made invisible while the
  1st, 3rd, ... stay visible.
  """
  tick_labels = list(axis.xaxis.get_ticklabels())
  for label in tick_labels[1::2]:
    label.set_visible(False)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(12, 4))
fig.set_dpi(600)
runtime_ax, memory_ax = ax

# Left panel: runtime distribution for each replicate count.
sns.boxplot(data=df_replicates, x="replicate_count", y="runtime_seconds", ax=runtime_ax)
runtime_ax.set_xlabel("\n\nReplicate count")
runtime_ax.set_ylabel("Runtime (seconds)")
runtime_ax.set_title("Changes in runtime\nw.r.t increase in the replicate count")

# Right panel: peak physical memory distribution for each replicate count.
sns.boxplot(data=df_replicates, x="replicate_count", y=peak_mem_col_name, ax=memory_ax)
memory_ax.set_xlabel("\n\nReplicate count")
memory_ax.set_ylabel("Peak Physical Memory Usage\n(Megabyte)")
memory_ax.set_title("\nChanges in memory requirement\nw.r.t increase in the replicate count")

# Show only every other x tick label on both panels.
for panel in ax:
  hide_even_num_ticks(panel)

plt.tight_layout()
plt.savefig('scalability.png')