In [None]:
import numpy as np
import scipy as sp
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [None]:
# Load the raw measurements.
df = pd.read_csv("data.csv", sep=',')
# Keep only one configuration: script "cc", partition "xeon", victim selection "SEQ".
in_subset = (
    df["script"].eq("cc")
    & df["slurm_partition"].eq("xeon")
    & df["victim_selection"].eq("SEQ")
)
# The filter columns are now constant, and the other two are not used by this analysis.
df = df.loc[in_subset].drop(
    columns=["do_log", "matrix_name", "script", "slurm_partition", "victim_selection"]
)
df

The question we are asking: what is the impact of `scheme`, `layout`, and their interaction on the execution time (`exec_time`)?

In [None]:
# Distribution of execution time for each scheme (tick labels rotated by 90 degrees).
df.boxplot(column="exec_time", by="scheme", rot=90)
# Style the axes the boxplot was just drawn on.
ax = plt.gca()
ax.set_xlabel("")
ax.set_ylabel("Execution time [s]")
ax.set_title("")
# Anchor the y-axis at zero.
ax.set_ylim(bottom=0)

In [None]:
# Distribution of execution time for each layout.
df.boxplot(column="exec_time", by="layout")
# Style the axes the boxplot was just drawn on.
ax = plt.gca()
ax.set_xlabel("")
ax.set_ylabel("Execution time [s]")
ax.set_title("")
# Anchor the y-axis at zero.
ax.set_ylim(bottom=0)

In [None]:
# Linear model of execution time on layout, scheme, and their interaction
# (`a * b` in a formula expands to `a + b + a:b`).
model_formula = 'exec_time ~ layout * scheme'
moore_lm = ols(formula=model_formula, data=df).fit()

In [None]:
# Type-1 (typ=1) ANOVA table for the fitted `moore_lm` model.
table = sm.stats.anova_lm(moore_lm, typ=1)
# Echo the table as the cell output.
table

In [None]:
# Tukey HSD pairwise comparison of execution time between layouts.
results = pairwise_tukeyhsd(endog=df["exec_time"], groups=df["layout"])

In [None]:
# Show the pairwise-comparison table held by `results`.
print(results.summary())

In [None]:
# One execution-time sample per layout, in order of first appearance in `df`.
groups = [
    df.loc[df["layout"] == layout_name, "exec_time"]
    for layout_name in df["layout"].unique()
]
# One-way ANOVA across the layouts.
sp.stats.f_oneway(*groups)

In [None]:
# Same one-way ANOVA, with the three layouts named explicitly.
exec_time_by_layout = {
    layout_name: df.loc[df["layout"] == layout_name, "exec_time"]
    for layout_name in ("CENTRALIZED", "PERGROUP", "PERCPU")
}
fvalue, pvalue = sp.stats.f_oneway(*exec_time_by_layout.values())
pvalue

In [None]:
# SciPy's Tukey HSD on the three layouts, passed in this fixed order.
layout_samples = [
    df.loc[df["layout"] == layout_name, "exec_time"]
    for layout_name in ("CENTRALIZED", "PERGROUP", "PERCPU")
]
layout_tukey = sp.stats.tukey_hsd(*layout_samples)
print(layout_tukey)

In [None]:
# One execution-time sample per scheme, in order of first appearance in `df`.
scheme_names = df["scheme"].unique()
groups = [df.loc[df["scheme"] == scheme, "exec_time"] for scheme in scheme_names]

# One-way ANOVA across the schemes. It is printed explicitly because a notebook
# only echoes a cell's last expression, and more statements follow this one.
print(sp.stats.f_oneway(*groups))

# Tukey HSD pairwise comparison of execution time between schemes.
results = pairwise_tukeyhsd(df["exec_time"], df["scheme"])
print(results)

In [None]:
# SciPy's Tukey HSD on whatever `groups` currently holds; in a top-to-bottom run
# that is the per-scheme list built in the previous cell.
groups_tukey = sp.stats.tukey_hsd(*groups)
print(groups_tukey)

In [None]:
# Mean of every numeric column per (scheme, layout) pair, ordered by layout and
# then by mean execution time.
# numeric_only=True keeps this working on pandas >= 2.0, where mean() raises
# TypeError if a non-numeric column is still present. Older versions dropped
# such columns, so the result is unchanged there.
df_summarized = (
    df.groupby(["scheme", "layout"], as_index=False)
    .mean(numeric_only=True)
    .sort_values(by=["layout", "exec_time"])
)

In [None]:
# Interaction plot: mean execution time per scheme, one colour per layout, with
# a dashed least-squares trend line for each layout.

# Give each scheme one fixed x position (order of first appearance in
# df_summarized). Points, trend lines and tick labels all use this mapping, so a
# layout's trend line is fitted and drawn at the same x coordinates as its points.
scheme_order = list(df_summarized["scheme"].unique())
scheme_pos = {scheme: pos for pos, scheme in enumerate(scheme_order)}

for layout in df_summarized["layout"].unique():
    df_layout = df_summarized[df_summarized["layout"] == layout]
    x_pos = df_layout["scheme"].map(scheme_pos).to_numpy(dtype=float)
    y = df_layout["exec_time"].to_numpy(dtype=float)

    points = plt.plot(x_pos, y, "o")
    if len(x_pos) >= 2:
        # Degree-1 least-squares fit over the plotted positions.
        m, b = np.polyfit(x_pos, y, deg=1)
        x_line = np.array([x_pos.min(), x_pos.max()])
        plt.plot(x_line, m * x_line + b, linestyle="dashed",
                 color=points[0].get_color(), label=layout)
    else:
        # A line fit needs at least two points; keep the layout in the legend
        # through its marker instead.
        points[0].set_label(layout)

plt.xlabel("Scheduling scheme")
plt.ylabel("Execution time [s]")
plt.title("")
plt.ylim(bottom=0)
# Label the numeric positions with the scheme names they stand for.
plt.xticks(range(len(scheme_order)), scheme_order, rotation=45)
plt.legend()
plt.show()