In [None]:
import os
from pathlib import Path

import pandas as pd

# Directory holding the nested-CV result CSVs.
# Defaults to the original location; set the NCV_RESULTS_DIR environment
# variable to run the notebook on another machine without editing the code.
RESULTS_DIR = Path(os.environ.get("NCV_RESULTS_DIR", r"C:\Users\grego\Downloads"))

# Paths to files (kept as plain strings, as before)
path_rf   = str(RESULTS_DIR / "random_forest_nested_cv_detailed_results_roc_auc.csv")
path_cb   = str(RESULTS_DIR / "catboost_ohe_nested_cv_detailed_results_roc_auc.csv")
path_lr   = str(RESULTS_DIR / "logreg_nested_cv_detailed_results_auc.csv")
path_lgbm = str(RESULTS_DIR / "lightgbm_notrust_nested_cv_detailed_results_roc_auc.csv")
path_xgb  = str(RESULTS_DIR / "xgboost_nested_cv_detailed_results_roc_auc.csv")

# Read CSV files
df_rf   = pd.read_csv(path_rf)
df_cb   = pd.read_csv(path_cb)
df_lr   = pd.read_csv(path_lr)
df_lgbm = pd.read_csv(path_lgbm)
df_xgb  = pd.read_csv(path_xgb)

# 3rd column (index 2, assuming this is F1)
# NOTE(review): the files are named "*_roc_auc" / "*_auc" and the column is
# picked by position only — confirm that index 2 really holds F1 in every file.
rf_f1   = df_rf.iloc[:, 2]
cb_f1   = df_cb.iloc[:, 2]
lr_f1   = df_lr.iloc[:, 2]
lgbm_f1 = df_lgbm.iloc[:, 2]
xgb_f1  = df_xgb.iloc[:, 2]

f1_by_model = {
    "RandomForest_F1": rf_f1,
    "CatBoost_F1": cb_f1,
    "LogReg_F1": lr_f1,
    "LightGBM_notrust_F1": lgbm_f1,
    "XGBoost_F1": xgb_f1,
}

# pd.DataFrame aligns the Series on their index and pads shorter ones with NaN,
# so a file with a different number of rows would otherwise go unnoticed.
row_counts = {name: len(scores) for name, scores in f1_by_model.items()}
if len(set(row_counts.values())) != 1:
    raise ValueError(f"Result files have different numbers of rows: {row_counts}")

# Combine into a single DataFrame with F1 names
all_f1 = pd.DataFrame(f1_by_model)

if all_f1.isna().any().any():
    raise ValueError(
        "F1 table contains missing values: "
        f"{all_f1.isna().sum().to_dict()}"
    )


print(all_f1.head(25))


    RandomForest_F1  CatBoost_F1  LogReg_F1  LightGBM_notrust_F1  XGBoost_F1
0          0.636050     0.649547   0.631099             0.648513    0.640470
1          0.622254     0.639759   0.626396             0.635664    0.637207
2          0.641449     0.653588   0.638964             0.655683    0.656866
3          0.617243     0.645950   0.632992             0.643236    0.640862
4          0.647426     0.660197   0.647149             0.657234    0.658800
5          0.626362     0.647746   0.631402             0.646705    0.650218
6          0.623893     0.644642   0.625042             0.645916    0.649718
7          0.639001     0.651365   0.632201             0.651077    0.650134
8          0.643870     0.651822   0.638356             0.651872    0.651320
9          0.637466     0.653814   0.643581             0.653143    0.653479
10         0.645396     0.649541   0.642058             0.651286    0.651950
11         0.634265     0.641794   0.625777             0.642180    0.644623

Friedman test on the nested cross-validation (NCV) results from 5 random seeds, to check statistically whether the models differ


In [2]:
from scipy.stats import friedmanchisquare

# Columns of all_f1 holding each model's F1 scores, in the order they are
# passed to the test below
f1_columns = (
    "RandomForest_F1",
    "CatBoost_F1",
    "LogReg_F1",
    "LightGBM_notrust_F1",
    "XGBoost_F1",
)

# Extract every column as a NumPy array
rf_f1, cb_f1, lr_f1, lgbm_f1, xgb_f1 = (
    all_f1[column].to_numpy() for column in f1_columns
)

# Friedman test across the 5 models (based on F1)
friedman_result = friedmanchisquare(rf_f1, cb_f1, lr_f1, lgbm_f1, xgb_f1)
stat, p = friedman_result

# The p-value is shown with 30 fixed decimals so very small values stay visible
print("Friedman chi2 =", stat)
print("p-value       =", f"{p:.30f}")


Friedman chi2 = 78.11199999999997
p-value       = 0.000000000000000437383396907599


Pairwise Wilcoxon signed-rank tests with Bonferroni correction, for comparing several machine learning models

In [3]:
import numpy as np
import pandas as pd
import scikit_posthocs as sp

# 0. Significance levels
alpha_familywise = 0.05  # only reported in the printed header
alpha_1 = 0.05           # threshold for "*"
alpha_2 = 0.01           # threshold for "**"
alpha_3 = 0.001          # threshold for "***"

# 1. Wide table with F1 scores
# Relabel by explicit name -> name mapping rather than by position, so a change
# in the column order of all_f1 cannot silently attach scores to the wrong model.
short_names = {
    "RandomForest_F1": "RF",
    "CatBoost_F1": "CatBoost",
    "LogReg_F1": "LogReg",
    "LightGBM_notrust_F1": "LightGBM",
    "XGBoost_F1": "XGBoost",
}
# Selecting with a list returns a new frame (all_f1 is left untouched) and
# raises KeyError if any expected column is missing.
df_wide = all_f1[list(short_names)].rename(columns=short_names)

# 2. Long format: model / F1
# melt() stacks the columns one after another, keeping the row order inside
# each model; the paired test below depends on that order.
df_long = df_wide.melt(var_name="model", value_name="F1")

# 3. Pairwise Wilcoxon with Bonferroni correction
# NOTE(review): observations are paired by their position within each group —
# confirm the installed scikit-posthocs version does not sort values internally.
wilc_bonf = sp.posthoc_wilcoxon(
    df_long,
    group_col="model",
    val_col="F1",
    p_adjust="bonferroni",
    correction=False
)

wilc_bonf.index.name = "Model"
wilc_bonf.columns.name = "Model"

# 4. Function for significance stars
def stars(p):
    """Return the significance marker for p-value ``p``.

    Thresholds are checked from strictest to loosest against the
    module-level ``alpha_3``, ``alpha_2`` and ``alpha_1``; the first one that
    ``p`` is strictly below decides the marker. Returns an empty string when
    ``p`` is below none of them.
    """
    markers = (
        (alpha_3, "***"),
        (alpha_2, "**"),
        (alpha_1, "*"),
    )
    for threshold, marker in markers:
        if p < threshold:
            return marker
    return ""

# 5. Pretty table with p-values and stars
# Each cell becomes the adjusted p-value with 6 decimals followed by its marker.
pretty_wilc = wilc_bonf.map(lambda p: f"{p:.6f}{stars(p)}")

# 6. Print results
print(
    f"Pairwise Wilcoxon (Bonferroni-adjusted p-values, "
    f"family-wise α = {alpha_familywise:.2f}) based on F1:\n"
)
print(pretty_wilc.to_string())

print("\nLegend:")
print(f"  *    p < {alpha_1:.2f}")
print(f"  **   p < {alpha_2:.2f}")
print(f"  ***  p < {alpha_3:.3f}")
# Derived from alpha_1 so the legend cannot drift from the threshold stars() uses.
print(f"  no star = non-significant (p ≥ {alpha_1:.2f})")


Pairwise Wilcoxon (Bonferroni-adjusted p-values, family-wise α = 0.05) based on F1:

Model              RF     CatBoost       LogReg     LightGBM      XGBoost
Model                                                                    
RF           1.000000  0.000001***     1.000000  0.000001***  0.000001***
CatBoost  0.000001***     1.000000  0.000001***     1.000000     1.000000
LogReg       1.000000  0.000001***     1.000000  0.000001***  0.000001***
LightGBM  0.000001***     1.000000  0.000001***     1.000000     0.147220
XGBoost   0.000001***     1.000000  0.000001***     0.147220     1.000000

Legend:
  *    p < 0.05
  **   p < 0.01
  ***  p < 0.001
  no star = non-significant (p ≥ 0.05)


Comparing LightGBM without vs. with trust data (paired Wilcoxon signed-rank test)

In [4]:
import os
from pathlib import Path

import pandas as pd
from scipy.stats import wilcoxon

# Directory holding the nested-CV result CSVs.
# Defaults to the original location; set the NCV_RESULTS_DIR environment
# variable to run the notebook on another machine without editing the code.
RESULTS_DIR = Path(os.environ.get("NCV_RESULTS_DIR", r"C:\Users\grego\Downloads"))

# Paths to files (kept as plain strings, as before)
path_lgbm_notrust = str(RESULTS_DIR / "lightgbm_notrust_nested_cv_detailed_results_roc_auc.csv")
path_lgbm_trust   = str(RESULTS_DIR / "lightgbm_withtrust_nested_cv_detailed_results_roc_auc.csv")

# Load both files
df_notrust = pd.read_csv(path_lgbm_notrust)
df_trust   = pd.read_csv(path_lgbm_trust)

# 3rd column = F1 (index 2)
# NOTE(review): the files are named "*_roc_auc" and the column is picked by
# position only — confirm that index 2 really holds F1 in both files.
f1_notrust = df_notrust.iloc[:, 2].to_numpy()
f1_trust   = df_trust.iloc[:, 2].to_numpy()

# The test is paired: row i of one file is compared with row i of the other.
# NOTE(review): assumes both files list the same seeds/folds in the same order.
if len(f1_notrust) != len(f1_trust):
    raise ValueError(
        "Paired test needs the same number of rows in both files: "
        f"{len(f1_notrust)} (no trust) vs {len(f1_trust)} (with trust)"
    )
# Fail early on missing scores instead of letting them flow into the test.
if pd.isna(f1_notrust).any() or pd.isna(f1_trust).any():
    raise ValueError("F1 scores contain missing values; cannot run the paired test")

# Wilcoxon signed-rank test (paired)
# Note: this rebinds `stat` and `p`, which the Friedman cell also assigns.
stat, p = wilcoxon(f1_notrust, f1_trust)

print("Wilcoxon statistic =", stat)
print("p-value            =", p)



Wilcoxon statistic = 1.0
p-value            = 1.1920928955078125e-07
