In [None]:
import os
import re
import numpy as np
import pandas as pd
from scipy.stats import shapiro, levene, f_oneway, kruskal
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp

In [3]:
# Similar to the parsefile.py, we parse the results files:

def parse_file(file_path):
    """Parse a system-wide ``perf stat`` log into one counter dict per run.

    The file content is split on the header line
    ``Performance counter stats for 'system wide':``. Everything before the
    first header is ignored; each following block is one measurement run.

    Numbers are read in the format used by the logs: a decimal comma for the
    energy readings, and whitespace (regular, no-break or narrow no-break
    space) as thousands separator. A value must start with a digit and sit on
    the same line as its counter name.

    Args:
        file_path: Path of the UTF-8 encoded log file.

    Returns:
        A list with one dict per run, in file order. Each dict has the keys
        ``energy_pkg`` and ``energy_cores`` (floats) followed by ``cycles``,
        ``instructions``, ``cache_references``, ``cache_misses``, ``cs``,
        ``migrations`` and ``page_faults`` (ints). The list is empty when the
        file contains no header.

    Raises:
        ValueError: If a run has no numeric value for one of the counters.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        content = f.read()

    blocks = re.split(r"Performance counter stats for 'system wide':", content)

    def strip_separators(s):
        # str.split() without arguments splits on every Unicode whitespace
        # character, so all kinds of thousands separators are dropped at once.
        return "".join(s.split())

    def clean_float(s):
        return float(strip_separators(s).replace(',', '.'))

    def clean_int(s):
        return int(strip_separators(s))

    # A number starts with a digit and may continue with digits and horizontal
    # separators. Newlines are excluded so a value never spills over from the
    # previous line; the comma is only allowed in the energy readings.
    digits = r"\d[\d \t\u00A0\u202F]*"
    decimal = r"\d[\d \t\u00A0\u202F,]*"

    # (result key, pattern with the value in group 1, converter) - the order
    # defines the key order of the returned dicts.
    counters = (
        ("energy_pkg", rf"({decimal}) +Joules power/energy-pkg/", clean_float),
        ("energy_cores", rf"({decimal}) +Joules power/energy-cores/", clean_float),
        ("cycles", rf"({digits}) +cycles", clean_int),
        ("instructions", rf"({digits}) +instructions", clean_int),
        ("cache_references", rf"({digits}) +cache-references", clean_int),
        ("cache_misses", rf"({digits}) +cache-misses", clean_int),
        ("cs", rf"({digits}) +cs", clean_int),
        ("migrations", rf"({digits}) +migrations", clean_int),
        ("page_faults", rf"({digits}) +page-faults", clean_int),
    )

    rows = []
    # blocks[0] is whatever precedes the first header, so it is skipped.
    for run, block in enumerate(blocks[1:], 1):
        data = {}
        for key, pattern, convert in counters:
            match = re.search(pattern, block)
            if match is None:
                raise ValueError(
                    f"{file_path}: no value found for counter '{key}' in run {run}"
                )
            data[key] = convert(match.group(1))
        rows.append(data)

    return rows


In [None]:
# Build one DataFrame holding every run of every result file.

# Result files are named test_<app>_skeleton_<variant>_<size>.txt.
# The change+logging variant has no "empty" size.
file_names = [
    f"test_{app}_skeleton_{variant}_{size}.txt"
    for app in ("leo", "mu", "novelWriter")
    for variant, sizes in (
        ("base", ("empty", "long", "medium")),
        ("change", ("empty", "long", "medium")),
        ("change+logging", ("long", "medium")),
    )
    for size in sizes
]

all_rows = []
for file in file_names:
    # The test name is the file name without its extension.
    test_name = os.path.basename(file).replace(".txt", "")
    # Runs are numbered from 1 in the order they appear in the file.
    for idx, row in enumerate(parse_file(file), start=1):
        all_rows.append({**row, "test": test_name, "run": idx})

df = pd.DataFrame(all_rows)

print(df.head(10))

   energy_pkg  energy_cores      cycles  instructions  cache_references  \
0       42.26          4.75  2173328453     775201095          91714222   
1       53.29          5.25  2336570407     885062192          95272255   
2       46.18          5.14  2194307013     791346536          85571384   
3       42.57          5.08  2294232196     828141032          95937502   
4       44.17          4.88  2144127984     785100053          84155930   
5       46.70          4.95  2288881461     857928383          92242934   
6       52.30          4.84  2291043947     756573771          90915013   
7       41.00          4.98  2226272619     789312490          92898751   
8       47.99          5.12  2404529453     791234031          97021764   
9       45.71          4.90  2231443685     798998783          92434134   

   cache_misses     cs  migrations  page_faults                          test  \
0      39074097  29519          91          172  test_leo_skeleton_base_empty   
1      39832

In [16]:
# Show the column labels of the combined DataFrame.
print(df.columns)

Index(['energy_pkg', 'energy_cores', 'cycles', 'instructions',
       'cache_references', 'cache_misses', 'cs', 'migrations', 'page_faults',
       'test', 'run'],
      dtype='object')


In [None]:
# Shapiro-Wilk normality test on energy_pkg, one test per test case.
#
# How to read the result:
# - stat: values closer to 1 indicate normality
# - p: probability of observing a statistic at least this extreme if the data were normal
# - p <= alpha (0.05 most commonly): reject the null hypothesis, the sample deviates
#   significantly from a normal distribution
# - p > alpha: no evidence that the sample is not normal

alpha = 0.05

shapiro_results = []
for test_name, energy in df.groupby("test")["energy_pkg"]:
    w_stat, p_value = shapiro(energy)
    shapiro_results.append(dict(test=test_name, stat=w_stat, p=p_value))
norm_df = pd.DataFrame(shapiro_results)

# Printed before the verdict column is added, so the table shows only test, stat and p.
print(norm_df)

norm_df['is_normal'] = norm_df['p'] > alpha

normal_tests = norm_df.loc[norm_df['is_normal'], 'test'].tolist()
print("Test cases that are normally distributed (p > 0.05):")
for t in normal_tests:
    print(f"- {t}")

                                               test      stat         p
0                      test_leo_skeleton_base_empty  0.911920  0.016640
1                       test_leo_skeleton_base_long  0.874439  0.002106
2                     test_leo_skeleton_base_medium  0.922406  0.030980
3             test_leo_skeleton_change+logging_long  0.896235  0.006802
4           test_leo_skeleton_change+logging_medium  0.946061  0.132495
5                    test_leo_skeleton_change_empty  0.907956  0.013220
6                     test_leo_skeleton_change_long  0.874663  0.002131
7                   test_leo_skeleton_change_medium  0.945490  0.127884
8                       test_mu_skeleton_base_empty  0.971231  0.573386
9                        test_mu_skeleton_base_long  0.947625  0.145974
10                     test_mu_skeleton_base_medium  0.975716  0.703814
11             test_mu_skeleton_change+logging_long  0.962999  0.368681
12           test_mu_skeleton_change+logging_medium  0.954370  0

['base', 'change', 'change+logging']

In [93]:
# Compare energy_pkg between groups: check the ANOVA assumptions (normality and
# equal variances), then run the matching omnibus test and post-hoc test.
#
# repeat this for the two comparison types
# repeat for different measured things (e.g. instructions or other energy cores)

# chatGPT was used as an assistant to find general implementations of the statistical tests, which were then implemented by myself below

alpha = 0.05

# Alternative comparison (variants within one app) - swap in for the dict below.
# The tests further down work on whatever groups are defined here.
#groups_in_app = {
#    "base": df.loc[df["test"]=="test_leo_skeleton_base_long","energy_pkg"].values,
#    "change": df.loc[df["test"]=="test_leo_skeleton_change_long","energy_pkg"].values,
#    "change+logging": df.loc[df["test"]=="test_leo_skeleton_change+logging_long","energy_pkg"].values
#}

groups_in_app = {
    "mu": df.loc[df["test"]=="test_mu_skeleton_change+logging_medium","energy_pkg"].values,
    "leo": df.loc[df["test"]=="test_leo_skeleton_change+logging_medium","energy_pkg"].values,
    "nw": df.loc[df["test"]=="test_novelWriter_skeleton_change+logging_medium","energy_pkg"].values
}

# A misspelled test name selects no rows; fail here with a clear message
# instead of deep inside one of the statistical tests.
empty_groups = [label for label, values in groups_in_app.items() if len(values) == 0]
if empty_groups:
    raise ValueError(f"No measurements found for group(s): {empty_groups}")

print("Normality per scenario:")
# One verdict per group (True = no evidence against normality); reused below
# to choose between the parametric and the non-parametric test.
normality = []
for v, data in groups_in_app.items():
    W, p = shapiro(data)
    normality.append(p > alpha)
    if p>alpha:
        print(v, ": W=", round(W, 3), "p=", round(p, 3), "- normal")
    else:
        print(v, ": W=", round(W, 3), "p=", round(p, 3), "- non-normal")

print("\nVariance:")
stat_levene, p_levene = levene(*groups_in_app.values())
if p_levene>alpha:
    print("Levene’s test: W=", stat_levene, ", p=", p_levene, "- equal variance")
else:
    print("Levene’s test: W=", stat_levene, ", p=", p_levene, "- unequal variance")

# Flatten the groups into one value array plus a parallel label array, the
# layout expected by both post-hoc tests.
all_data = np.concatenate(list(groups_in_app.values()))

# chatGPT generated the line below
all_labels = np.repeat(list(groups_in_app.keys()), [len(g) for g in groups_in_app.values()])

# If all groups are normally distributed (shapiro-wilk), and has equal variances (levene)
if all(normality) and p_levene > alpha:
    # Data is parametric
    # Then we can do one way ANOVA.
    # The groups are passed by unpacking the dict so the call does not depend
    # on which comparison (and therefore which keys) is active.
    F, p_main = f_oneway(*groups_in_app.values())
    if p_main>alpha:
        print("\nANOVA: F=", F, ", p=", p_main, "- no statistically significant difference")
    else:
        print("\nANOVA: F=", F, ", p=", p_main, "- statistically significant difference")

    # Post-hoc test - Tukey HSD
    tukey = pairwise_tukeyhsd(endog=all_data, groups=all_labels, alpha=alpha)
    print("\nTukey HSD results:")
    print(tukey)
else:
    # Data is non-parametric
    # We can do Kruskal-Wallis
    H, p_main = kruskal(*groups_in_app.values())
    if p_main>alpha:
        print("\nKruskal–Wallis: H=", H, ", p=", p_main, "- no statistically significant difference")
    else:
        print("\nKruskal–Wallis: H=", H, ", p=", p_main, "- statistically significant difference")

    # Post-hoc test - Dunn-Bonferroni
    df_ph = pd.DataFrame({
        "value": all_data,
        "group": all_labels
    })

    dunn_res = sp.posthoc_dunn(df_ph, val_col='value', group_col='group', p_adjust='bonferroni')

    print("\nDunn-Bonferroni pairwise p-values:")
    print(dunn_res)



Normality per scenario:
mu : W= 0.954 p= 0.221 - normal
leo : W= 0.946 p= 0.132 - normal
nw : W= 0.885 p= 0.004 - non-normal

Variance:
Levene’s test: W= 2.374705517933333 , p= 0.09904774395896343 - equal variance

Kruskal–Wallis: H= 17.951090422175053 , p= 0.0001264649692480154 - statistically significant difference

Dunn-Bonferroni pairwise p-values:
          leo        mu        nw
leo  1.000000  0.000222  0.003127
mu   0.000222  1.000000  1.000000
nw   0.003127  1.000000  1.000000
