In [34]:
import pandas as pd
import numpy as np
from scipy.stats import shapiro, friedmanchisquare, wilcoxon, levene
from statsmodels.stats.anova import AnovaRM
from statsmodels.stats.multitest import multipletests
from statsmodels.stats.multicomp import pairwise_tukeyhsd

In [30]:
accuracy_df  = pd.read_csv("results\\acc_results.csv")
precision_df = pd.read_csv("results\\prec_results.csv")
recall_df    = pd.read_csv("results\\recall_results.csv")
f1_df        = pd.read_csv("results\\f1_results.csv")

metrics = {"Accuracy": accuracy_df,
           "Precision": precision_df,
           "Recall": recall_df,
           "F1": f1_df}

In [35]:
def test_metrics(df, metric_name):
    print(f"\n===== {metric_name} =====")

    cnn = df['wlasny'].values
    resnet = df['resnet'].values
    efficient = df['efficient'].values

    # ===== sprawdzenie normalności rozkładu =====
    print("Shapiro-Wilk test (normalność rozkładu)")
    normal_flags = []
    for name, data in zip(['CNN','ResNet','EfficientNet'], [cnn,resnet,efficient]):
        stat, p = shapiro(data)
        normal = p > 0.05
        normal_flags.append(normal)
        print(f"{name}: stat={stat:.4f}, p={p:.4f} -> {'normalny' if normal else 'nienormalny'}")

    # ===== sprawdzenie homogeniczności wariancji =====
    stat, p = levene(cnn, resnet, efficient)
    homo = p > 0.05
    print(f"\nTest Levene’a: stat={stat:.4f}, p={p:.4f} -> {'jednorodne' if homo else 'niejednorodne'}")

    # ===== wybór testu głównego =====
    if all(normal_flags) and homo:
        print("\n>>> Wszystkie warunki spełnione -> ANOVA (parametryczny)")
        df_long = pd.DataFrame({
            "id": np.tile(np.arange(len(cnn)), 3),
            "model": np.repeat(["CNN","ResNet","EfficientNet"], len(cnn)),
            "score": np.concatenate([cnn, resnet, efficient])
        })
        anova = AnovaRM(df_long, 'score', 'id', within=['model']).fit()
        print(anova)

        # ===== post-hoc Tukey HSD =====
        print("\nPost-hoc Tukey HSD:")
        tukey = pairwise_tukeyhsd(endog=df_long['score'], groups=df_long['model'], alpha=0.05)
        print(tukey)

    else:
        print("\n>>> Warunki niespełnione -> używamy testu Friedmana (nieparametryczny)")
        stat, p = friedmanchisquare(cnn, resnet, efficient)
        print(f"Friedman: stat={stat:.4f}, p={p:.4f}")

        print("\nPost-hoc Wilcoxon (z poprawką Holm-Bonferroni):")
        pairs = [("CNN", "ResNet", cnn, resnet),
                 ("CNN", "EfficientNet", cnn, efficient),
                 ("ResNet", "EfficientNet", resnet, efficient)]

        labels, p_values, stats_list = [], [], []
        for name1, name2, d1, d2 in pairs:
            stat_w, p_val = wilcoxon(d1, d2)
            labels.append(f"{name1} vs {name2}")
            p_values.append(p_val)
            stats_list.append(stat_w)

        rejected, p_corrected, _, _ = multipletests(p_values, method="holm")

        for lbl, stat_w, p_raw, p_corr, rej in zip(labels, stats_list, p_values, p_corrected, rejected):
            signif = "istotna" if rej else "brak istotności"
            print(f"{lbl}: stat={stat_w:.4f}, raw p={p_raw:.4f}, corrected p={p_corr:.4f} -> {signif}")

In [36]:
for metric_name, df in metrics.items():
    test_metrics(df, metric_name)


===== Accuracy =====
Shapiro-Wilk test (normalność rozkładu)
CNN: stat=0.8127, p=0.1024 -> normalny
ResNet: stat=0.9316, p=0.6071 -> normalny
EfficientNet: stat=0.9556, p=0.7773 -> normalny

Test Levene’a: stat=1.0585, p=0.3773 -> jednorodne

>>> Wszystkie warunki spełnione -> ANOVA (parametryczny)
              Anova
      F Value Num DF Den DF Pr > F
----------------------------------
model 21.9205 2.0000 8.0000 0.0006


Post-hoc Tukey HSD:
      Multiple Comparison of Means - Tukey HSD, FWER=0.05      
   group1       group2    meandiff p-adj   lower  upper  reject
---------------------------------------------------------------
         CNN EfficientNet     0.11 0.0029  0.0413 0.1787   True
         CNN       ResNet    0.164 0.0001  0.0953 0.2327   True
EfficientNet       ResNet    0.054  0.132 -0.0147 0.1227  False
---------------------------------------------------------------

===== Precision =====
Shapiro-Wilk test (normalność rozkładu)
CNN: stat=0.9506, p=0.7417 -> normalny
Re