In [1]:
import os
import re
import numpy as np
import pandas as pd
from scipy.stats import shapiro, levene, f_oneway, kruskal
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import scikit_posthocs as sp

In [2]:
# Similar to the parsefile.py, we parse the results files:

def parse_file(file_path):
    """Parse a system-wide `perf stat` log into one dict per measured run.

    The file content is split on the header line
    "Performance counter stats for 'system wide':". Everything before the
    first header is ignored; each following chunk is treated as one run.

    Numbers may be written with a decimal comma or a decimal point and may
    use a space, no-break space (U+00A0), narrow no-break space (U+202F),
    apostrophe, '.' or ',' as thousands separator.

    Parameters
    ----------
    file_path : str or path-like
        UTF-8 text file holding one or more `perf stat` outputs.

    Returns
    -------
    list of dict
        One dict per run, with float values for "energy_pkg" and
        "energy_cores" and int values for "cycles", "instructions",
        "cache_references", "cache_misses", "cs", "migrations" and
        "page_faults" (in that key order).

    Raises
    ------
    ValueError
        If a run does not contain a value for one of the counters, or a
        captured value cannot be converted to a number.
    """
    with open(file_path, 'r', encoding="utf-8") as f:
        content = f.read()

    blocks = re.split(r"Performance counter stats for 'system wide':", content)

    def clean_float(s):
        # Drop grouping whitespace/apostrophes first, then work out which of
        # ',' and '.' is the decimal mark.
        s = re.sub(r"[\s\u00A0\u202F']", "", s)
        if ',' in s and '.' in s:
            # Both present: the right-most one is the decimal mark, the
            # other one is a thousands separator.
            if s.rfind(',') > s.rfind('.'):
                s = s.replace('.', '').replace(',', '.')
            else:
                s = s.replace(',', '')
        else:
            # Only commas (or neither): a comma is a decimal comma.
            s = s.replace(',', '.')
        return float(s)

    def clean_int(s):
        # For integer counters every non-digit is a grouping separator.
        return int(re.sub(r"\D", "", s))

    # A value starts with a digit and allows at most ONE separator character
    # between two digits, so it can neither be empty nor swallow digits that
    # belong to a different column or line.
    number = r"(\d(?:[.,' \t\u00A0\u202F]?\d)*)"
    gap = r"[ \t]+"

    # (result key, text following the value in the perf output, converter).
    # The order defines the key order of every returned dict.
    counters = [
        ("energy_pkg", r"Joules[ \t]+power/energy-pkg/", clean_float),
        ("energy_cores", r"Joules[ \t]+power/energy-cores/", clean_float),
        ("cycles", r"cycles", clean_int),
        ("instructions", r"instructions", clean_int),
        ("cache_references", r"cache-references", clean_int),
        ("cache_misses", r"cache-misses", clean_int),
        ("cs", r"cs", clean_int),
        ("migrations", r"migrations", clean_int),
        ("page_faults", r"page-faults", clean_int),
    ]
    compiled = [
        (key, re.compile(number + gap + tail), convert)
        for key, tail, convert in counters
    ]

    rows = []
    # blocks[0] is whatever precedes the first header, so it is skipped.
    for run_idx, block in enumerate(blocks[1:], 1):
        data = {}
        for key, pattern, convert in compiled:
            match = pattern.search(block)
            if match is None:
                # Fail with a message that says what is missing instead of
                # an AttributeError on None.
                raise ValueError(
                    f"{file_path}: run {run_idx} has no value for counter '{key}'"
                )
            data[key] = convert(match.group(1))
        rows.append(data)

    return rows


In [10]:
all_rows = []

# Create dataframe: one row per perf run, tagged with its test name and a
# 1-based run number within that test.

# Each application has the same nine result files, so the names are generated
# instead of being typed out 27 times. The scenario order below reproduces the
# order in which the files were originally listed.
apps = ["leo", "mu", "novelWriter"]
scenarios = [
    "base_empty",
    "base_long",
    "base_medium",
    "change_empty",
    "change_long",
    "change_medium",
    "change+logging_long",
    "change+logging_medium",
    "change+logging_empty",
]
file_names = [
    f"test_{app}_skeleton_{scenario}.txt"
    for app in apps
    for scenario in scenarios
]

# Report every missing input at once rather than failing on the first one
# halfway through the loop.
missing_files = [name for name in file_names if not os.path.isfile(name)]
if missing_files:
    raise FileNotFoundError(f"Missing perf result files: {missing_files}")

for file in file_names:
    parsed = parse_file(file)
    # splitext only removes the extension; str.replace(".txt", "") would also
    # remove ".txt" occurring in the middle of a name.
    test_name = os.path.splitext(os.path.basename(file))[0]
    for idx, row in enumerate(parsed, 1):
        row["test"] = test_name
        row["run"] = idx
        all_rows.append(row)
df = pd.DataFrame(all_rows)

print(df.head(10))

   energy_pkg  energy_cores      cycles  instructions  cache_references  \
0       58.15          2.62  1205632822     420613620          47533145   
1       45.54          2.41  1172328882     358697847          44816818   
2       53.22          2.33  1145813786     363630064          44353035   
3       56.20          2.78  1288981070     374030561          47821323   
4       61.27          2.63  1296263687     541997216          47250695   
5       62.07          2.42  1166862379     360993520          45594577   
6       55.56          2.36  1164006373     352163462          44268202   
7       58.49          2.41  1240766886     434766567          45593474   
8       55.31          2.80  1303753001     554700304          46004282   
9       56.28          2.50  1195266861     373586452          45852171   

   cache_misses     cs  migrations  page_faults                          test  \
0      23117788  11333          66         1120  test_leo_skeleton_base_empty   
1      22199

In [11]:
# Show the column labels of df.
print(df.columns)

Index(['energy_pkg', 'energy_cores', 'cycles', 'instructions',
       'cache_references', 'cache_misses', 'cs', 'migrations', 'page_faults',
       'test', 'run'],
      dtype='object')


In [12]:
# Shapiro-Wilk normality test on energy_pkg, run once per test case.
#
# How to read the result:
# - stat (W): the closer to 1, the more the sample looks normal
# - p: p-value under the null hypothesis "the sample comes from a normal distribution"
#   - p <= alpha (0.05 is the usual choice): reject the null, the sample deviates
#     significantly from normal
#   - p > alpha: no evidence that the sample is not normal

alpha = 0.05

shapiro_results = []
for test_name, energy in df.groupby("test")["energy_pkg"]:
    w_stat, p_value = shapiro(energy)
    shapiro_results.append({"test": test_name, "stat": w_stat, "p": p_value})
norm_df = pd.DataFrame(shapiro_results)

# Printed before the flag column is added, so only test / stat / p are shown.
print(norm_df)

norm_df['is_normal'] = norm_df['p'].gt(alpha)

normal_tests = norm_df.loc[norm_df['is_normal'], 'test'].tolist()
print("Test cases that are normally distributed (p > 0.05):")
for t in normal_tests:
    print(f"- {t}")

                                               test      stat         p
0                      test_leo_skeleton_base_empty  0.979687  0.817173
1                       test_leo_skeleton_base_long  0.960833  0.325288
2                     test_leo_skeleton_base_medium  0.970981  0.566355
3            test_leo_skeleton_change+logging_empty  0.952592  0.198269
4             test_leo_skeleton_change+logging_long  0.955131  0.231478
5           test_leo_skeleton_change+logging_medium  0.945234  0.125869
6                    test_leo_skeleton_change_empty  0.909404  0.014375
7                     test_leo_skeleton_change_long  0.954016  0.216294
8                   test_leo_skeleton_change_medium  0.926824  0.040461
9                       test_mu_skeleton_base_empty  0.898541  0.007737
10                       test_mu_skeleton_base_long  0.948513  0.154226
11                     test_mu_skeleton_base_medium  0.886869  0.004070
12            test_mu_skeleton_change+logging_empty  0.955932  0

In [13]:
# energy_pkg values of all rows belonging to one test case, as a NumPy array.
df.loc[df["test"].eq("test_novelWriter_skeleton_change+logging_empty"), "energy_pkg"].to_numpy()

array([51.62, 49.02, 61.08, 60.68, 51.12, 47.25, 72.09, 62.32, 55.05,
       44.41, 50.64, 65.06, 48.27, 61.6 , 52.61, 59.55, 47.98, 54.04,
       56.8 , 51.99, 57.7 , 63.92, 54.2 , 49.6 , 51.97, 60.44, 66.91,
       53.78, 54.1 , 51.56])

In [45]:
# Compare energy_pkg between the groups in `groups_in_app`:
#   1. Shapiro-Wilk per group (normality) and Levene across groups (equal variance)
#   2. all groups normal AND equal variance -> one-way ANOVA + Tukey HSD post-hoc
#      otherwise                             -> Kruskal-Wallis + Dunn-Bonferroni post-hoc
#
# repeat this for the two comparison types
# repeat for different measured things (e.g. instructions or other energy cores)

# chatGPT was used as an assistant to find general implementations of the statistical tests, which were then implemented by myself below

alpha = 0.05

groups_in_app = {
    "base": df.loc[df["test"]=="test_mu_skeleton_base_medium","energy_pkg"].values,
    "change": df.loc[df["test"]=="test_mu_skeleton_change_medium","energy_pkg"].values,
    "change+logging": df.loc[df["test"]=="test_mu_skeleton_change+logging_medium","energy_pkg"].values
}

# Second comparison type (same scenario across applications). Swapping the
# dict is the only change needed: nothing below refers to specific keys.
#groups_in_app = {
#    "mu": df.loc[df["test"]=="test_mu_skeleton_change+logging_empty","energy_pkg"].values,
#    "leo": df.loc[df["test"]=="test_leo_skeleton_change+logging_empty","energy_pkg"].values,
#    "nw": df.loc[df["test"]=="test_novelWriter_skeleton_change+logging_empty","energy_pkg"].values
#}

group_labels = list(groups_in_app.keys())
group_samples = list(groups_in_app.values())

print("Normality per scenario:")
# One boolean per group (True = normality not rejected); reused for the
# parametric / non-parametric decision so Shapiro-Wilk is only run once.
normality = []
for v, data in groups_in_app.items():
    W, p = shapiro(data)
    normality.append(bool(p > alpha))
    if p>alpha:
        print(v, ": W=", round(W, 3), "p=", round(p, 3), "- normal")
    else:
        print(v, ": W=", round(W, 3), "p=", round(p, 3), "- non-normal")

print("\nVariance:")
stat_levene, p_levene = levene(*group_samples)
if p_levene>alpha:
    print("Levene’s test: W=", stat_levene, ", p=", p_levene, "- equal variance")
else:
    print("Levene’s test: W=", stat_levene, ", p=", p_levene, "- unequal variance")

# Long format for the post-hoc tests: all values in one array plus a parallel
# array holding the group label of each value.
all_data = np.concatenate(group_samples)

# chatGPT generated the line below
all_labels = np.repeat(group_labels, [len(g) for g in group_samples])

# If all groups are normally distributed (shapiro-wilk), and has equal variances (levene)
if all(normality) and p_levene > alpha:
    # Data is parametric
    # Then we can do one way ANOVA.
    # The samples are unpacked from the dict so this works for any set of
    # group names (hard-coded 'leo'/'mu'/'nw' keys raised KeyError here when
    # the base/change/change+logging dict was active).
    F, p_main = f_oneway(*group_samples)
    if p_main>alpha:
        print("\nANOVA: F=", F, ", p=", p_main, "- no statistically significant difference")
    else:
        print("\nANOVA: F=", F, ", p=", p_main, "- statistically significant difference")

    # Post-hoc test - Tukey HSD
    tukey = pairwise_tukeyhsd(endog=all_data, groups=all_labels, alpha=alpha)
    print("\nTukey HSD results:")
    print(tukey)
else:
    # Data is non-parametric
    # We can do Kruskal-Wallis
    H, p_main = kruskal(*group_samples)
    if p_main>alpha:
        print("\nKruskal–Wallis: H=", H, ", p=", p_main, "- no statistically significant difference")
    else:
        print("\nKruskal–Wallis: H=", H, ", p=", p_main, "- statistically significant difference")

    # Post-hoc test - Dunn-Bonferroni
    df_ph = pd.DataFrame({
        "value": all_data,
        "group": all_labels
    })

    dunn_res = sp.posthoc_dunn(df_ph, val_col='value', group_col='group', p_adjust='bonferroni')

    print("\nDunn-Bonferroni pairwise p-values:")
    print(dunn_res)



Normality per scenario:
base : W= 0.887 p= 0.004 - non-normal
change : W= 0.978 p= 0.78 - normal
change+logging : W= 0.94 p= 0.089 - normal

Variance:
Levene’s test: W= 2.110424534070289 , p= 0.12734845852854015 - equal variance

Kruskal–Wallis: H= 6.475169295270684 , p= 0.039258604037308964 - statistically significant difference

Dunn-Bonferroni pairwise p-values:
                    base    change  change+logging
base            1.000000  0.035678        0.334679
change          0.035678  1.000000        1.000000
change+logging  0.334679  1.000000        1.000000
