In [1]:
import os
import pickle
from pathlib import Path
from typing import Generator

import numpy as np
import pandas as pd
import scikit_posthocs as sp
from IPython.display import HTML, display
from scipy import stats
from yaml import safe_load
from jmetal.lab.statistical_test.functions import (
    friedman_aligned_rank_test,
    friedman_aligned_ph_test,
)


pd.set_option("display.max_columns", 100)


def friedmann_nemenyi_test(
    unravelled_detailed_results: dict[str : dict[tuple[str, str], list[float]]],
) -> None:
    for metric in unravelled_detailed_results.keys():
        metric_df = pd.DataFrame(unravelled_detailed_results[metric])
        metric_vals = [metric_df[col].values for col in metric_df.columns]
        out = stats.friedmanchisquare(*metric_vals)
        print(
            f"Friedman Test {metric}: statistic={out.statistic:.3f}, pvalue={out.pvalue:.3f}"
        )
        metric_vals = np.array(metric_vals).T
        out = sp.posthoc_nemenyi_friedman(metric_vals)
        print("Nemenyi post-hoc test")
        feature_names = metric_df.columns
        out.index = feature_names
        out.columns = feature_names
        out = out.sort_index(axis=0).sort_index(axis=1)
        display(out)


def aligned_friedmann_holm_test(
    unravelled_detailed_results: dict[str : dict[tuple[str, str], list[float]]],
) -> None:
    for metric in unravelled_detailed_results.keys():
        metric_df = pd.DataFrame(unravelled_detailed_results[metric])
        # metric_vals = np.asarray([metric_df[col].values for col in metric_df.columns])
        metric_vals = metric_df.values

        out = friedman_aligned_rank_test(metric_vals)
        chi2_stat = out.loc["Aligned Rank stat"].iloc[0]
        p_value = out.loc["p-value"].iloc[0]
        print(f"Aligned-rank Friedman χ² {metric} = {chi2_stat:.3f}, p = {p_value:.3f}")

        z_vals, out, _ = friedman_aligned_ph_test(
            metric_vals, apv_procedure="Holm"  # Holm step-down correction
        )
        print("Holm post-hoc test")
        feature_names = metric_df.columns
        out.index = feature_names
        out.columns = feature_names
        out = out.sort_index(axis=0).sort_index(axis=1)
        display(out)


def present_results(
    paths: Generator,
    val_method: str = "lopo",
    remove_xgboost: bool = False,
    remove_chronos_small_from_test: bool = False,
    which_test: str = "friedmann-nemenyi",
    test_args: dict = {},
) -> None:
    results = []
    for reports_path in paths:
        report = pd.read_csv(reports_path, index_col=0)
        conf = safe_load(open(reports_path.parent / ".hydra/config.yaml"))
        if conf["validation_method"]["_target_"].split(".")[-1].lower() != val_method:
            continue
        model_name: str = conf["model"]["model"]["_target_"].split(".")[-1]
        features_name = (
            conf["feature_extractor"]["_target_"].split(".")[-1]
            if "model_name" not in conf["feature_extractor"]
            else conf["feature_extractor"]["model_name"]
        )
        validation_method = conf["validation_method"]["_target_"].split(".")[-1]
        if "aggregator" not in conf:
            aggregator = "MeanTimeAggregator"
        else:
            aggregator = (
                conf["aggregator"]["_target_"].split(".")[-1]
                if "_target_" in conf["aggregator"]
                else None
            )
        report_results = {}
        for col in report.columns:
            report_results[f"{col} avg"] = report[col].mean()
            report_results[f"{col} sem"] = report[col].sem() * 1.98  # 95% CI

        dataset = conf["dataset"]["dataset"]
        side = conf["dataset"]["side"]

        resampling = (
            conf["resampling"]["_target_"].split(".")[-1]
            if "resampling" in conf
            else "None"
        )

        resampling = resampling if resampling != "NoUnderSampler" else "None"
        # Collect results in a list of dicts
        if remove_xgboost and model_name == "XGBClassifier":
            continue
        results.append(
            {
                "Dataset": dataset,
                "Side": side,
                "Model": model_name,
                "Resampling": resampling,
                "Features": features_name,
                "Aggregator": aggregator,
                "Validation": validation_method,
                "Detailed Report": report,
                **report_results,
            }
        )

    # After the loop, display as a table
    df_results = pd.DataFrame(results)
    for (dataset, side, resampling), group in df_results.groupby(
        ["Dataset", "Side", "Resampling"]
    ):
        try:
            display(
                HTML(
                    f"""
                <div style='background-color:#ffe6e6; padding:18px; margin:10px 0; border-radius:8px;'>
                    <h2 style='color:#b30000; margin:0; font-size:2em;'>
                    Results for Dataset: <i>{dataset}</i>, Side: <i>{side}</i>, resampling: <i>{resampling}</i>
                    </h2>
                </div>
                """
                )
            )
            grouped_data = (
                group.sort_values(by=["Model", "Features", "Aggregator"])
                .drop(columns=["Detailed Report"])
                .drop_duplicates()
            )
            # display(grouped_data)

            def test_fn(x):
                if len(x) > 1:
                    return pd.Series(
                        [
                            "%.2f ± %.2f" % (np.round(x.values[0][i], 2), np.round(x.values[1][i], 2))
                            for i in range(x.shape[1])
                        ],
                        index=x.columns,
                    )
                else:
                    return pd.Series(
                        [f"{x.values[0][i]}" for i in range(x.shape[1])],
                        index=x.columns,
                    )

            grouped_data.columns = pd.MultiIndex.from_tuples(
                [
                    (
                        tuple(col.split(" "))
                        if "avg" in col or "sem" in col
                        else tuple([col])
                    )
                    for col in grouped_data.columns
                ]
            )
            grouped_data = grouped_data.T.groupby(level=0).apply(test_fn).T
            display(grouped_data)
            grouped_data = grouped_data.drop(columns=['accuracy_score', 'Dataset', "Side", "Resampling", "Validation"])
            grouped_data = grouped_data.rename(
                columns={
                    "balanced_accuracy_score": "Balanced Accuracy",
                    "f1_score": "F1",
                    "matthews_corrcoef": "MCC",
                    "roc_auc_score": "ROC AUC",
                    "precision_score": "Precision",
                    "recall_score": "Recall",
                }
            )


            # save to latex
            latex_path = Path("../tables_latex") / f"results_{dataset}_{side}_{resampling}_{validation_method}.tex"
            with open(latex_path, "w") as f:
                f.write(grouped_data.to_latex())  

            if "aggregator" in test_args.keys() and group["Aggregator"].nunique() > 2:
                group = group[
                    (group["Aggregator"] == test_args["aggregator"])
                    | (group["Aggregator"].isnull())
                ]

            unravelled_detailed_results = {
                metric: {} for metric in group["Detailed Report"].iloc[0].columns
            }
            for i, model_results in group.iterrows():
                for metric in model_results["Detailed Report"].columns:
                    cv_results = model_results["Detailed Report"][metric]
                    unravelled_detailed_results[metric][
                        (model_results["Model"], model_results["Features"])
                    ] = cv_results

            if which_test == "friedmann-nemenyi":
                friedmann_nemenyi_test(unravelled_detailed_results)
            elif which_test == "alignedfriedmann-holm":
                aligned_friedmann_holm_test(unravelled_detailed_results)
            else:
                raise ValueError(f"Unknown test: {which_test}")
        except Exception as e:
            print(f"Error processing group {dataset}, {side}, {resampling}: {e}")
            continue

    return results

In [2]:
results_path: str = "../outputs/"
all_results = list(Path(results_path).glob("*/*/*/reports.csv"))
results_path_adula: str = "../outputs_adula/"
all_results += list(Path(results_path_adula).glob("*/*/*/reports.csv"))

results_lopo = present_results(
    all_results,
    val_method="lopo",
    remove_xgboost=True,
    remove_chronos_small_from_test=True,
    which_test="friedmann-nemenyi",
    test_args={
        "aggregator": "none"}
)

Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
1,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,left,LOPO,0.61 ± 0.10,0.49 ± 0.00,0.25 ± 0.15,-0.01 ± 0.01,0.24 ± 0.15,0.29 ± 0.16,0.49 ± 0.00
6,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,left,LOPO,0.58 ± 0.07,0.50 ± 0.00,0.24 ± 0.17,-0.00 ± 0.01,0.25 ± 0.18,0.25 ± 0.17,0.50 ± 0.00
0,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,left,LOPO,0.70 ± 0.05,0.61 ± 0.04,0.46 ± 0.13,0.22 ± 0.09,0.45 ± 0.13,0.51 ± 0.16,0.61 ± 0.04
5,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,left,LOPO,0.60 ± 0.03,0.56 ± 0.03,0.52 ± 0.09,0.12 ± 0.06,0.49 ± 0.09,0.56 ± 0.10,0.56 ± 0.03


Error processing group BiHeartS, left, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_BiHeartS_left_GroupUnderSampler_LOPO.tex'


Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
12,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,right,LOPO,0.48 ± 0.06,0.49 ± 0.01,0.30 ± 0.15,-0.02 ± 0.02,0.29 ± 0.15,0.33 ± 0.16,0.49 ± 0.01
14,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,right,LOPO,0.54 ± 0.08,0.49 ± 0.01,0.22 ± 0.15,-0.01 ± 0.01,0.20 ± 0.14,0.27 ± 0.17,0.49 ± 0.01
11,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,right,LOPO,0.54 ± 0.03,0.54 ± 0.02,0.47 ± 0.07,0.08 ± 0.05,0.52 ± 0.10,0.46 ± 0.11,0.54 ± 0.02
13,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,right,LOPO,0.61 ± 0.07,0.57 ± 0.07,0.40 ± 0.14,0.14 ± 0.14,0.45 ± 0.13,0.42 ± 0.18,0.57 ± 0.07


Error processing group BiHeartS, right, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_BiHeartS_right_GroupUnderSampler_LOPO.tex'


Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
8,,HeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,unknown,LOPO,0.75 ± 0.19,0.51 ± 0.02,0.07 ± 0.10,0.01 ± 0.03,0.04 ± 0.07,0.21 ± 0.25,0.51 ± 0.02
7,,HeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,unknown,LOPO,0.79 ± 0.11,0.72 ± 0.09,0.36 ± 0.08,0.34 ± 0.12,0.37 ± 0.27,0.62 ± 0.23,0.72 ± 0.09


Error processing group HeartS, unknown, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_HeartS_unknown_GroupUnderSampler_LOPO.tex'


Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
10,,Workplace,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,unknown,LOPO,0.48 ± 0.03,0.50 ± 0.01,0.40 ± 0.15,-0.00 ± 0.01,0.49 ± 0.20,0.34 ± 0.13,0.50 ± 0.01
9,,Workplace,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,unknown,LOPO,0.49 ± 0.05,0.49 ± 0.05,0.55 ± 0.09,-0.01 ± 0.06,0.65 ± 0.08,0.55 ± 0.13,0.49 ± 0.05


Error processing group Workplace, unknown, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_Workplace_unknown_GroupUnderSampler_LOPO.tex'


In [3]:
results_lopo = present_results(
    all_results,
    val_method="tacv",
    remove_xgboost=True,
    remove_chronos_small_from_test=True,
)

Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
1,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,left,TACV,0.61 ± 0.12,0.50 ± 0.02,0.29 ± 0.24,0.01 ± 0.03,0.29 ± 0.26,0.31 ± 0.25,0.50 ± 0.02
7,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,left,TACV,0.54 ± 0.07,0.49 ± 0.01,0.29 ± 0.24,-0.02 ± 0.02,0.30 ± 0.27,0.28 ± 0.23,0.49 ± 0.01
0,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,left,TACV,0.68 ± 0.12,0.63 ± 0.09,0.58 ± 0.19,0.28 ± 0.17,0.52 ± 0.18,0.67 ± 0.20,0.63 ± 0.09
6,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,left,TACV,0.61 ± 0.05,0.58 ± 0.03,0.59 ± 0.13,0.17 ± 0.07,0.54 ± 0.13,0.65 ± 0.15,0.58 ± 0.03


Error processing group BiHeartS, left, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_BiHeartS_left_GroupUnderSampler_TACV.tex'


Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
11,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,right,TACV,0.48 ± 0.02,0.49 ± 0.01,0.33 ± 0.17,-0.02 ± 0.02,0.31 ± 0.16,0.37 ± 0.18,0.49 ± 0.01
13,,BiHeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,right,TACV,0.54 ± 0.05,0.50 ± 0.01,0.22 ± 0.18,0.00 ± 0.02,0.17 ± 0.15,0.30 ± 0.24,0.50 ± 0.01
10,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,right,TACV,0.57 ± 0.06,0.57 ± 0.05,0.51 ± 0.08,0.14 ± 0.11,0.50 ± 0.11,0.55 ± 0.11,0.57 ± 0.05
12,,BiHeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,right,TACV,0.64 ± 0.10,0.63 ± 0.08,0.50 ± 0.10,0.25 ± 0.14,0.50 ± 0.13,0.56 ± 0.14,0.63 ± 0.08


Error processing group BiHeartS, right, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_BiHeartS_right_GroupUnderSampler_TACV.tex'


Unnamed: 0,Aggregator,Dataset,Features,Model,Resampling,Side,Validation,accuracy_score,balanced_accuracy_score,f1_score,matthews_corrcoef,precision_score,recall_score,roc_auc_score
9,,HeartS,HandcraftedFeatureExtractor,DummyClassifier,GroupUnderSampler,unknown,TACV,0.65 ± 0.23,0.46 ± 0.05,0.08 ± 0.07,-0.05 ± 0.07,0.05 ± 0.04,0.24 ± 0.20,0.46 ± 0.05
8,,HeartS,HandcraftedFeatureExtractor,LogisticRegression,GroupUnderSampler,unknown,TACV,0.81 ± 0.12,0.75 ± 0.17,0.40 ± 0.17,0.36 ± 0.25,0.36 ± 0.26,0.66 ± 0.29,0.75 ± 0.17


Error processing group HeartS, unknown, GroupUnderSampler: [Errno 2] No such file or directory: '../tables_latex/results_HeartS_unknown_GroupUnderSampler_TACV.tex'
