From f28c1ff1a99cfb27c2762a2baf23c21b18177bce Mon Sep 17 00:00:00 2001
From: Paulo Augusto <paullo.augusto@hotmail.com>
Date: Tue, 6 Jun 2023 21:37:09 -0300
Subject: [PATCH] #32 multiple improvements to plots

---
 ica_benchmark/visualization/multiple_runs.py | 472 +++++++++++++------
 1 file changed, 329 insertions(+), 143 deletions(-)

diff --git a/ica_benchmark/visualization/multiple_runs.py b/ica_benchmark/visualization/multiple_runs.py
index d01c86d..56c4b23 100644
--- a/ica_benchmark/visualization/multiple_runs.py
+++ b/ica_benchmark/visualization/multiple_runs.py
@@ -10,51 +10,84 @@
 
 
 def annotate_bars(ax, labels):
-    for i, (bar, err_line, label) in enumerate(zip(ax.patches, ax.get_lines(), labels)):
+    bottom_lim, top_lim = ax.get_ylim()
+    mid = (top_lim - bottom_lim) / 2
+    for bar, err_line, label in zip(ax.patches, ax.get_lines(), labels):
         _x = bar.get_x() + bar.get_width() / 2
-        # _y = (p.get_y() + p.get_height() - len(label) / 50) / 2
-        _y = bar.get_y() + err_line.get_ydata()[1] + (err_line.get_ydata()[1] - bar.get_height())
+        bar_height = bar.get_height()
+        if bar_height > mid:
+            _y = (bar.get_y() + err_line.get_ydata()[0]) / 2
+        else:
+            _y = (top_lim + err_line.get_ydata()[1]) / 2
+        # err_line.get_ydata() -> (bottom error, top error)
         ax.text(_x, _y, label, ha="center", rotation=90)
 
 
-def ranked_barplot(results_df, grouping_col="algorithm", x_col="uid", val_col="kappa", figsize=None, x_label=None, save_filepath=None):
+def best_per_group_barplot(
+    results_df,
+    grouping_cols="algorithm",
+    x_col="uid",
+    val_col="kappa",
+    figsize=None,
+    x_label=None,
+    save_filepath=None,
+    ylim=None,
+    title=None,
+):
     x_label = x_label if x_label is not None else x_col
-    
+    ylim = ylim if ylim is not None else (0, 1)
+    title = title if title is not None else f"Best {val_col} per {x_label}"
+    grouping_cols = (
+        grouping_cols if isinstance(grouping_cols, list) else [grouping_cols]
+    )
     # Select best algorithm per subject
+    merge_cols = [x_col] + grouping_cols
     highest_df = (
-        results_df
-        .groupby([x_col, grouping_col], as_index=False)
+        results_df.groupby(merge_cols, as_index=False)
         .mean()
         .sort_values(by=val_col, ascending=False)
         .drop_duplicates(subset=x_col)
     )
 
-    # Filter original results_df to only include best algorithm per subject 
-    df = (
-        results_df
-        .merge(highest_df[[x_col, grouping_col]], on=[x_col, grouping_col], how="inner")
-        .sort_values(by=x_col, ascending=True)
+    # Filter original results_df to only include best algorithm per subject
+    df = results_df.merge(highest_df[merge_cols], how="inner").sort_values(
+        by=x_col, ascending=True
     )
-    label_df = highest_df[[x_col, grouping_col]].sort_values(by=x_col, ascending=True)
+    label_df = highest_df[merge_cols].sort_values(by=x_col, ascending=True)
 
     fig = plt.figure(figsize=figsize, dpi=150)
     ax = plt.gca()
     ax.grid()
-    sns.barplot(x=x_col, y=val_col, data=df, ax=ax, n_boot=N_BOOT, order=label_df[x_col])
+    sns.barplot(
+        x=x_col, y=val_col, data=df, ax=ax, n_boot=N_BOOT, order=label_df[x_col]
+    )
 
-    annotate_bars(ax, label_df[grouping_col])
-    ax.tick_params(labelrotation=45)
-    ax.set_ylim((0, 1))
+    bar_labels = label_df[grouping_cols].apply(lambda x: "\n".join(map(str, x)), axis=1)
+    annotate_bars(ax, bar_labels)
+    ax.tick_params(labelrotation=60)
+    ax.set_ylim(ylim)
     ax.set_xlabel(x_label)
-    ax.set_title(f"Best {val_col} per {x_label}", fontsize=12)
+    ax.set_title(title, fontsize=12)
 
+    fig.tight_layout()
     if save_filepath is not None:
         fig.savefig(save_filepath)
-    
 
-def detailed_barplot(results_df, x_col="uid", hue_col="algorithm", val_col="Kappa", order_col="run", save_filepath=None, w=5, cmap="nipy_spectral", x_label=None):
-    if x_label is None:
-        x_label = x_col
+
+def detailed_barplot(
+    results_df,
+    x_col="uid",
+    hue_col="algorithm",
+    val_col="Kappa",
+    key_cols="run",
+    save_filepath=None,
+    w=5,
+    cmap="nipy_spectral",
+    x_label=None,
+    title=None,
+):
+    x_label = x_label if x_label is not None else x_col
+    title = title if title is not None else f"{val_col} per {x_label}, per {hue_col}"
 
     fig = plt.figure(figsize=(20, 6), dpi=120)
     ax = plt.gca()
@@ -64,18 +97,9 @@ def detailed_barplot(results_df, x_col="uid", hue_col="algorithm", val_col="Kapp
 
     hues = results_df[hue_col].unique()
     n_hues = len(hues)
-    hue_color_dict = {
-        hue: cmap(i / n_hues)
-        for i, hue
-        in enumerate(hues)
-    }
+    hue_color_dict = {hue: cmap(i / n_hues) for i, hue in enumerate(hues)}
     legends = [
-        Patch(
-            facecolor=hue_color_dict[hue],
-            edgecolor=None,
-            label=hue
-        )
-        for hue in hues
+        Patch(facecolor=hue_color_dict[hue], edgecolor=None, label=hue) for hue in hues
     ]
 
     x_values = results_df[x_col].unique()
@@ -85,8 +109,7 @@ def detailed_barplot(results_df, x_col="uid", hue_col="algorithm", val_col="Kapp
         x_list = list()
         # Sort algorithms by mean Kappa value
         ordered_hue = (
-            x_df
-            .groupby(hue_col, as_index=False)
+            x_df.groupby(hue_col, as_index=False)
             .mean()
             .sort_values(by=val_col)[hue_col]
             .to_numpy()
@@ -97,7 +120,7 @@ def detailed_barplot(results_df, x_col="uid", hue_col="algorithm", val_col="Kapp
         for hue in ordered_hue:
             hue_df = x_df[x_df[hue_col] == hue]
             if hue_df[val_col].nunique() > 1:
-                res = bootstrap((hue_df[val_col], ), np.mean, n_resamples=100)
+                res = bootstrap((hue_df[val_col],), np.mean, n_resamples=100)
                 low = res.confidence_interval.low
                 high = res.confidence_interval.high
             else:
@@ -109,27 +132,47 @@ def detailed_barplot(results_df, x_col="uid", hue_col="algorithm", val_col="Kapp
             x_list.append(x_c)
             ax.bar(x_c, avg, width=w, color=hue_color_dict[hue], yerr=([low], [high]))
 
-            if (hue != best_hue):
-
-                pvalue = wilcoxon(hue_df[val_col], best_hue_df[val_col], alternative="less", zero_method="zsplit").pvalue
-                if (pvalue < 0.05):
-                    ax.text(x_c, -0.03 if avg > 0 else 0.03, "*", ha="center", va="center", fontsize=20, color="r")
+            if hue != best_hue:
+                pvalue = wilcoxon(
+                    hue_df[val_col],
+                    best_hue_df[val_col],
+                    alternative="less",
+                    zero_method="zsplit",
+                ).pvalue
+                if pvalue < 0.05:
+                    ax.text(
+                        x_c,
+                        -0.03 if avg > 0 else 0.03,
+                        "*",
+                        ha="center",
+                        va="center",
+                        fontsize=20,
+                        color="r",
+                    )
             x_c += w
         x_c += w * 2
 
         mid = np.mean(x_list)
-        ax.text(mid, -0.1, x, horizontalalignment="center", va="center_baseline", fontsize=15, rotation=-45)
+        ax.text(
+            mid,
+            -0.1,
+            x,
+            horizontalalignment="center",
+            verticalalignment="top",
+            fontsize=15,
+            rotation=-60,
+        )
     ax.set_xlabel(x_label, fontsize=20)
 
     for loc in ["right", "left", "top", "bottom"]:
         ax.spines[loc].set_visible(False)
     ax.set_xticks([])
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=60, fontsize=12)
     ax.set_yticks(np.arange(0, 1.01, 0.1))
     ax.grid()
-    ax.legend(handles=legends, loc=(1, .2), fontsize=15)
+    ax.legend(handles=legends, loc=(1, 0.2), fontsize=15)
     ax.set_ylabel(val_col, fontsize=20)
-    ax.set_title(f"{val_col} per {x_col}, per {hue_col}", fontsize=20)
+    ax.set_title(title, fontsize=20)
 
     # Ensure figure doesnt get cropped during save
     fig.tight_layout()
@@ -137,86 +180,139 @@ def detailed_barplot(results_df, x_col="uid", hue_col="algorithm", val_col="Kapp
         fig.savefig(save_filepath)
 
 
-def average_barplot(results_df, x_col="algorithm", grouping_col="uid", val_col="Kappa", order_col="run", save_filepath=None, w=5, cmap="nipy_spectral", x_label=None, n_boots=N_BOOT):
-    if x_label is None:
-        x_label = x_col
+def average_barplot(
+    results_df,
+    x_col="algorithm",
+    grouping_cols="uid",
+    val_col="Kappa",
+    key_cols="run",
+    save_filepath=None,
+    w=5,
+    cmap="nipy_spectral",
+    x_label=None,
+    n_boots=N_BOOT,
+    title=None,
+):
+    x_label = x_col if x_label is None else x_label
+    key_cols = [key_cols] if isinstance(key_cols, str) else key_cols
+    title = title if title is not None else f"Average {val_col} for each {x_col}"
+    grouping_cols = [grouping_cols] if isinstance(grouping_cols, str) else grouping_cols
 
     fig = plt.figure(figsize=(20, 6), dpi=120)
     ax = plt.gca()
 
     x_c = w / 2
     cmap = plt.get_cmap(cmap)
-    
+
     hues = results_df[x_col].unique()
     n_hues = len(hues)
-    hue_color_dict = {
-        hue: cmap(i / n_hues)
-        for i, hue
-        in enumerate(hues)
-    }
+    hue_color_dict = {hue: cmap(i / n_hues) for i, hue in enumerate(hues)}
     legends = [
-        Patch(
-            facecolor=hue_color_dict[hue],
-            edgecolor=None,
-            label=hue
-        )
-        for hue in hues
+        Patch(facecolor=hue_color_dict[hue], edgecolor=None, label=hue) for hue in hues
     ]
 
-    ordered_x_df = results_df.groupby([x_col, grouping_col]).mean().groupby(x_col).mean().sort_values(by=val_col).reset_index()
+    ordered_x_df = (
+        results_df.groupby([x_col, *grouping_cols])
+        .mean()
+        .groupby(x_col)
+        .mean()
+        .sort_values(by=val_col)
+        .reset_index()
+    )
     best_x = ordered_x_df[x_col].to_numpy()[-1]
     best_x_df = results_df[results_df[x_col] == best_x]
     x_c = 0
     for x in ordered_x_df[x_col]:
         x_c += 1.5 * w
-        
+
         x_df = results_df[results_df[x_col] == x]
-        res = bootstrap((x_df[val_col], ), np.mean, n_resamples=n_boots)
+        res = bootstrap((x_df[val_col],), np.mean, n_resamples=n_boots)
         avg = x_df[val_col].mean()
 
-        ax.bar(x_c, avg, w, yerr=([avg - res.confidence_interval.low], [res.confidence_interval.high - avg]), color=hue_color_dict[x])
+        ax.bar(
+            x_c,
+            avg,
+            w,
+            yerr=(
+                [avg - res.confidence_interval.low],
+                [res.confidence_interval.high - avg],
+            ),
+            color=hue_color_dict[x],
+        )
         ax.set_xticks([])
-        ax.text(x_c, -0.025, x, horizontalalignment="center", fontsize=15)
-        ax.text(x_c, -0.05, r"$\bar\rho={:.3f}$".format(avg), horizontalalignment="center", fontsize=15, usetex=True)
-        if (x != best_x):
-            pvalue = wilcoxon(
-                x_df.sort_values(by=[grouping_col])[val_col],
-                best_x_df.sort_values(by=[grouping_col])[val_col],
-                alternative="less",
-                zero_method="zsplit"
-            ).pvalue
-            
-            if (pvalue < 0.05):
-                ax.text(x_c, res.confidence_interval.high, "*", horizontalalignment="center", fontsize=25, color="r")
-    #ax.legend(handles=legends, loc=(1, .2), fontsize=15)
+        ax.text(
+            x_c,
+            -0.025,
+            "{}\nrho={:.3f}".format(x, avg),
+            horizontalalignment="center",
+            verticalalignment="top",
+            fontsize=15,
+            rotation=90,
+        )
+        # ax.text(x_c, -0.05, r"$\bar\rho={:.3f}$".format(avg), horizontalalignment="center", fontsize=15, usetex=True, rotation=60)
+        if x != best_x:
+            merge_cols = [*grouping_cols, *key_cols]
+            best_metric_arr = best_x_df.merge(x_df[merge_cols], on=merge_cols)[val_col]
+            try:
+                pvalue = wilcoxon(
+                    x_df[val_col],
+                    best_metric_arr,
+                    alternative="less",
+                    zero_method="zsplit",
+                ).pvalue
+            except ValueError as value_exception:
+                raise ValueError(
+                    f"Could not calculate pvalue for by mering on {merge_cols}. Most likely you have not specified all key_cols.\n"
+                    + str(value_exception)
+                ) from value_exception
+
+            if pvalue < 0.05:
+                ax.text(
+                    x_c,
+                    res.confidence_interval.high,
+                    "*",
+                    horizontalalignment="center",
+                    fontsize=25,
+                    color="r",
+                )
+
+    ax.set_xlabel(x_label, fontsize=20)
     ax.set_ylabel(val_col, fontsize=20)
     ax.grid()
-    ax.set_title(f"Average {val_col} for each {x_col}", fontsize=20)
+    ax.set_title(title, fontsize=20)
+    fig.tight_layout()
 
     if save_filepath is not None:
         fig.savefig(save_filepath)
 
 
 def boxplot_algorithms(results_df, metric="kappa", save_filepath=None):
-    algorithms = results_df.groupby(["uid", "algorithm"]).mean().groupby("algorithm").mean().sort_values(by=metric).reset_index().algorithm
+    algorithms = (
+        results_df.groupby(["uid", "algorithm"])
+        .mean()
+        .groupby("algorithm")
+        .mean()
+        .sort_values(by=metric)
+        .reset_index()
+        .algorithm
+    )
     fig = plt.figure(figsize=(12, 6))
     ax = plt.gca()
     sns.boxplot(
         x="Algorithm",
         y=metric,
-        data=results_df.rename(
-            columns=dict(algorithm="Algorithm")
-        ),
+        data=results_df.rename(columns=dict(algorithm="Algorithm")),
         order=algorithms,
-        ax=ax
+        ax=ax,
     )
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=60, fontsize=12)
     for loc in ["right", "left", "top", "bottom"]:
         ax.spines[loc].set_visible(False)
 
     ax.set_xlabel(ax.xaxis.get_label().get_text(), fontsize=15)
     ax.set_ylabel(ax.yaxis.get_label().get_text(), fontsize=15)
 
+    fig.tight_layout()
     if save_filepath is not None:
         fig.savefig(save_filepath)
 
@@ -228,12 +324,10 @@ def boxplot_subjects(results_df, metric="kappa", save_filepath=None):
         x="uid",
         y=metric,
         hue="Algorithm",
-        data=results_df.rename(
-            columns=dict(algorithm="Algorithm")
-        ),
-        ax=ax
+        data=results_df.rename(columns=dict(algorithm="Algorithm")),
+        ax=ax,
     )
-    ax.set_xticklabels(ax.get_xticklabels(), rotation=45, fontsize=12)
+    ax.set_xticklabels(ax.get_xticklabels(), rotation=60, fontsize=12)
     for loc in ["right", "left", "top", "bottom"]:
         ax.spines[loc].set_visible(False)
 
@@ -246,6 +340,7 @@ def boxplot_subjects(results_df, metric="kappa", save_filepath=None):
 
 if __name__ == "__main__":
     import argparse
+
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "-path",
@@ -253,7 +348,7 @@ def boxplot_subjects(results_df, metric="kappa", save_filepath=None):
         dest="path",
         default="/home/paulo/Documents/GIT/BCI_MsC/experiments/issue_12/results.csv",
         type=Path,
-        help="Path to results.csv"
+        help="Path to results.csv",
     )
     parser.add_argument(
         "-save_folder",
@@ -261,7 +356,7 @@ def boxplot_subjects(results_df, metric="kappa", save_filepath=None):
         dest="save_folder",
         default="./plots",
         type=Path,
-        help="folder_to_save"
+        help="folder_to_save",
     )
     args = parser.parse_args()
 
@@ -272,59 +367,150 @@ def boxplot_subjects(results_df, metric="kappa", save_filepath=None):
     results_folder = args.save_folder
     results_folder.mkdir(exist_ok=True)
 
-    for classifier in classifiers:
-
-        ranked_barplot(
-            results_df.query("classifier == @classifier"),
-            x_col="uid",
-            val_col="Kappa",
-            grouping_col="algorithm",
-            save_filepath=results_folder / f"{classifier}.png"
-        )
+    # for classifier in classifiers:
+    #     classifier_df = results_df[results_df.classifier == classifier]
+    #     best_per_group_barplot(
+    #         classifier_df,
+    #         x_col="uid",
+    #         val_col="Kappa",
+    #         grouping_cols="algorithm",
+    #         save_filepath=results_folder
+    #         / f"best_algorithm_per_subject_for_{classifier}.png",
+    #         x_label="Subject",
+    #     )
+
+    #     detailed_barplot(
+    #         classifier_df,
+    #         x_col="uid",
+    #         hue_col="algorithm",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder / f"detailed_{classifier}.png",
+    #         x_label="Subject",
+    #     )
+    #     detailed_barplot(
+    #         classifier_df,
+    #         x_col="algorithm",
+    #         hue_col="uid",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder / f"algorithm_comparison_for_{classifier}.png",
+    #         x_label="Subject",
+    #     )
+    #     average_barplot(
+    #         classifier_df,
+    #         x_col="algorithm",
+    #         grouping_cols="uid",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder
+    #         / f"average_per_algorithm_for_{classifier}.png",
+    #         n_boots=N_BOOT,
+    #     )
+    #     average_barplot(
+    #         classifier_df,
+    #         x_col="uid",
+    #         grouping_cols="algorithm",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder / f"average_per_subject_for_{classifier}.png",
+    #         n_boots=N_BOOT,
+    #     )
+    # for algorithm in algorithms:
+    #     algorithm_df = results_df[results_df.algorithm == algorithm]
+    #     best_per_group_barplot(
+    #         algorithm_df,
+    #         x_col="uid",
+    #         val_col="Kappa",
+    #         grouping_cols="classifier",
+    #         save_filepath=results_folder
+    #         / f"best_classifer_per_subject_for_{algorithm}.png",
+    #         x_label="Subject",
+    #     )
+    #     detailed_barplot(
+    #         algorithm_df,
+    #         x_col="uid",
+    #         hue_col="classifier",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder / f"detailed_{algorithm}.png",
+    #         x_label="Subject",
+    #     )
+    #     detailed_barplot(
+    #         algorithm_df,
+    #         x_col="classifier",
+    #         hue_col="uid",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder / f"classifier_comparison_for_{algorithm}.png",
+    #         x_label="Subject",
+    #     )
+    #     average_barplot(
+    #         algorithm_df,
+    #         x_col="classifier",
+    #         grouping_cols="uid",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder
+    #         / f"average_per_classifier_for_{algorithm}.png",
+    #         n_boots=N_BOOT,
+    #     )
+    #     average_barplot(
+    #         algorithm_df,
+    #         x_col="uid",
+    #         grouping_cols="classifier",
+    #         val_col="Kappa",
+    #         key_cols="run",
+    #         save_filepath=results_folder / f"average_per_subject_for_{algorithm}.png",
+    #         n_boots=N_BOOT,
+    #     )
+
+    best_per_group_barplot(
+        results_df,
+        grouping_cols=["algorithm"],
+        x_col="classifier",
+        val_col="Kappa",
+        figsize=None,
+        x_label="Classifier",
+        save_filepath=results_folder / "classifier_scores.png",
+    )
 
-        detailed_barplot(
-            results_df.query("classifier == @classifier"),
-            x_col="uid",
-            hue_col="algorithm",
-            val_col="Kappa",
-            order_col="run",
-            save_filepath=results_folder / f"detailed_{classifier}.png",
-            w=5,
-            cmap="nipy_spectral",
-            x_label=None
-        )
+    best_per_group_barplot(
+        results_df,
+        grouping_cols=["classifier"],
+        x_col="algorithm",
+        val_col="Kappa",
+        figsize=None,
+        x_label="Algorithm",
+        save_filepath=results_folder / "algorithm_scores.png",
+    )
 
-    for algorithm in algorithms:
-        ranked_barplot(
-            results_df.query("algorithm == @algorithm"),
-            x_col="uid",
-            val_col="Kappa",
-            grouping_col="classifier",
-            save_filepath=results_folder / f"{algorithm}.png"
-        )
+    average_barplot(
+        results_df,
+        x_col="uid",
+        grouping_cols=["algorithm", "classifier"],
+        val_col="Kappa",
+        key_cols="run",
+        save_filepath=results_folder / "average_per_subject.png",
+        n_boots=N_BOOT,
+    )
 
-        detailed_barplot(
-            results_df.query("algorithm == @algorithm"),
-            x_col="uid",
-            hue_col="classifier",
-            val_col="Kappa",
-            order_col="run",
-            save_filepath=results_folder / f"detailed_{algorithm}.png",
-            w=5,
-            cmap="nipy_spectral",
-            x_label=None
-        )
+    average_barplot(
+        results_df,
+        x_col="algorithm",
+        grouping_cols=["uid", "classifier"],
+        val_col="Kappa",
+        key_cols="run",
+        save_filepath=results_folder / "average_per_algorithm.png",
+        n_boots=N_BOOT,
+    )
 
-    for algorithm in algorithms:
-        
-        detailed_barplot(
-            results_df.query("algorithm == @algorithm"),
-            x_col="classifier",
-            hue_col="uid",
-            val_col="Kappa",
-            order_col="run",
-            save_filepath=results_folder / f"classifier_comparison_{algorithm}.png",
-            w=5,
-            cmap="nipy_spectral",
-            x_label=None
-        )
+    average_barplot(
+        results_df,
+        x_col="classifier",
+        grouping_cols=["uid", "algorithm"],
+        val_col="Kappa",
+        key_cols="run",
+        save_filepath=results_folder / "average_per_classifier.png",
+        n_boots=N_BOOT,
+    )