In [1]:
import os
import pandas as pd
import numpy as np
from datetime import date

import torch
from torchmetrics import Accuracy, BootStrapper, F1Score
from matplotlib import pyplot as plt


from eval_helper_functions import load_df, get_task_path_dict, get_pred_label_files, get_pred_label_files_MT
from sklearn.metrics import f1_score, accuracy_score

In [2]:
# input dir: the collected model outputs, located one level above the
# working directory
result_collection = os.path.join("..", "model_output")

# Show whether the input directory can be found, so that a wrong working
# directory is noticed before any results are read.
input_dir_found = os.path.exists(result_collection)
print(input_dir_found)


True


In [3]:
# output dir: one folder per run date, so results from different days are
# kept apart
output_ensemble = os.path.join(".", f"{date.today()}_summary_results+bootstrap")

# exist_ok=True makes the cell safe to re-run and avoids the race between
# "check that the directory is missing" and "create it".
os.makedirs(output_ensemble, exist_ok=True)

In [4]:
# Model families to evaluate.
# note: KB-BERT was abbreviated to BERT in the filenames
model_types = ["LOGR", "CNN", "HISAN", "BERT", "MTCNN", "MTHISAN", "MTBERT"]

# Two-letter keys that are embedded in the summary row labels so that sorting
# the index groups each architecture with its multi-task variants.
model_order = dict(
    LOGR="aa",
    CNN="ba", MTCNN="bb", MTCNN_2="bc", MTCNN_3="bd",
    HISAN="ca", MTHISAN="cb", MTHISAN_2="cc", MTHISAN_3="cd",
    BERT="da", MTBERT="db", MTBERT_2="dc", MTBERT_3="dd",
)

# Task names looked up in the prediction files, and the task combinations
# used when locating the multi-task model outputs.
tasks = "sit2 sit3 mor his beh sit".split()
mt_tasks = ["morsit", "behhissit"]

# One-letter keys that fix the order of the tasks in the summary tables;
# all "sit" variants share the same key.
task_sorter = dict(mor="a", his="b", beh="c", sit="d", sit2="d", sit3="d")

In [5]:
def prediction_min_n_models_agree(row, min_count, num=False):
    """Return the label that at least ``min_count`` of the five fold models agree on.

    Parameters
    ----------
    row : pandas.Series
        One row of a predictions frame. Position 0 holds the true label and
        positions 1-5 hold the predictions of the five fold models; anything
        after position 5 is ignored.
    min_count : int
        Minimum number of identical fold predictions required for agreement.
    num : bool, default False
        If True, disagreement is reported as the integer ``101`` instead of
        the string ``"disagreement"``. It is forced to True whenever the
        value at position 0 is not a string.

    Returns
    -------
    The most frequent fold prediction if it occurs at least ``min_count``
    times, otherwise the disagreement marker. The marker is also returned
    when all five fold predictions are missing.

    Notes
    -----
    The numeric marker ``101`` cannot be told apart from a real class id 101.
    """
    # isinstance (instead of an exact type comparison) also treats str
    # subclasses as string labels.
    if not isinstance(row.iloc[0], str):
        num = True
    disagreement = 101 if num else "disagreement"

    # Explicit positional slice: the five fold predictions.
    model_predictions = row.iloc[1:6]
    value_counts = model_predictions.value_counts()

    # value_counts() drops missing values; with no prediction at all there is
    # nothing the models could agree on.
    if value_counts.empty:
        return disagreement

    # value_counts() is sorted by descending frequency, so the first entry
    # belongs to the majority label returned by idxmax().
    maj_label = value_counts.idxmax()
    count = value_counts.iloc[0]
    if count >= min_count:
        return maj_label
    return disagreement


def simple_accuracy(y_true, y_pred):
    """Return the fraction of predictions that equal the true label.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted labels of equal length (lists, tuples, numpy
        arrays or pandas Series). Elements are compared by position.

    Returns
    -------
    float
        Number of matching positions divided by the number of predictions.

    Raises
    ------
    ValueError
        If the two inputs do not have the same shape.
    ZeroDivisionError
        If the inputs are empty.
    """
    # Converting to arrays guarantees an elementwise comparison; comparing
    # two plain lists with == would yield a single bool instead.
    true_arr = np.asarray(y_true)
    pred_arr = np.asarray(y_pred)
    if true_arr.shape != pred_arr.shape:
        raise ValueError(
            f"y_true and y_pred must have the same shape, got {true_arr.shape} and {pred_arr.shape}"
        )

    correct_pred = int(np.count_nonzero(true_arr == pred_arr))
    all_pred = len(pred_arr)
    return correct_pred / all_pred

def bootstrap_metric_pytorch(y_true, y_pred, n_reps, quantile__lower=0.025, quantile__upper=0.975, bootstrap=True):
    """Bootstrap multiclass accuracy and macro F1 with torchmetrics.

    Parameters
    ----------
    y_true, y_pred : array-like of int
        True and predicted class ids; both are converted to int32 arrays.
        Class ids are expected to be non-negative integers.
    n_reps : int
        Number of bootstrap replicates (ignored when ``bootstrap`` is False).
    quantile__lower, quantile__upper : float
        Quantiles of the bootstrap distribution to report.
    bootstrap : bool, default True
        If False, no metric is computed and every statistic is the dummy
        value -1.

    Returns
    -------
    dict
        ``{"accuracy": ..., "fmacro": ...}``. Each value is a dict with the
        keys ``"mean"``, ``"std"`` and ``"quantile"`` (lower and upper
        quantile, in that order). With ``bootstrap=True`` the accuracy entry
        additionally holds the per-replicate scores under ``"raw"``.

    Notes
    -----
    ``torch.manual_seed(42)`` is called on every invocation, which resets
    torch's global random state as a side effect (presumably to make the
    resampling reproducible).
    """
    y_true = np.array(y_true, dtype=np.int32)
    y_pred = np.array(y_pred, dtype=np.int32)

    torch.manual_seed(42)
    if bootstrap:
        # Size the metrics by the largest class id seen in either array.
        # Counting the distinct true labels is too small as soon as the ids
        # have gaps or a prediction lies outside the set of true labels.
        num_classes = int(max(y_true.max(), y_pred.max())) + 1

        quantiles = torch.tensor([quantile__lower, quantile__upper])
        preds = torch.from_numpy(y_pred)
        target = torch.from_numpy(y_true)

        # Accuracy is bootstrapped first, then macro F1, so the random
        # draws are consumed in a fixed order.
        metric_acc = Accuracy(task="multiclass", num_classes=num_classes)
        # raw=True keeps the individual replicate scores, e.g. for plotting
        # the bootstrap distribution.
        bootstrap_acc = BootStrapper(base_metric=metric_acc, num_bootstraps=n_reps, quantile=quantiles, raw=True)
        bootstrap_acc.update(preds=preds, target=target)
        acc_output = bootstrap_acc.compute()
        acc_output = {key: val.numpy() for key, val in acc_output.items()}

        metric_fmac = F1Score(task="multiclass", num_classes=num_classes, average="macro")
        bootstrap_fmac = BootStrapper(base_metric=metric_fmac, num_bootstraps=n_reps, quantile=quantiles)
        bootstrap_fmac.update(preds=preds, target=target)
        fmac_output = bootstrap_fmac.compute()
        fmac_output = {key: val.numpy() for key, val in fmac_output.items()}
    else:
        # insert -1 as a dummy value
        fmac_output = {'mean': -1, 'std': -1, 'quantile': np.array([-1, -1])}
        acc_output = {'mean': -1, 'std': -1, 'quantile': np.array([-1, -1])}
    test_metrics = {"accuracy": acc_output,
                    "fmacro": fmac_output}

    return test_metrics


def get_ensemble_scores(df_predictions, true_col="true"):
    """Score every column of a predictions frame against the true labels.

    For each column, accuracy and macro F1 are computed with scikit-learn.
    For the true-label column and the fold columns "1"-"5", bootstrapped
    statistics are computed as well. For the agreement columns "min3",
    "min4" and "min5" the bootstrap is skipped and -1 placeholders are
    stored, because those columns contain the disagreement label.

    Parameters
    ----------
    df_predictions : pandas.DataFrame
        Numeric class ids with the true labels in ``true_col`` and the
        predictions in the columns "1"-"5", "min3", "min4" and "min5".
    true_col : str, default "true"
        Name of the column holding the true labels.

    Returns
    -------
    dict of str -> list
        One list per statistic with one entry per column of
        ``df_predictions``, in column order. All values are multiplied by
        100 (so the -1 placeholders appear as -100).

    Raises
    ------
    ValueError
        If ``df_predictions`` contains a column other than the ones listed
        above.
    """
    fold_cols = ("1", "2", "3", "4", "5")
    agreement_cols = ("min3", "min4", "min5")

    # The key order defines the row order of the per-experiment result table.
    collected_scores = {"accuracy": [],
                        "fmacro": [],

                        "boot_accuracy_mean": [],
                        "boot_accuracy_quantile_lower": [],
                        "boot_accuracy_quantile_upper": [],
                        "boot_accuracy_std": [],

                        "boot_fmacro_mean": [],
                        "boot_fmacro_quantile_lower": [],
                        "boot_fmacro_quantile_upper": [],
                        "boot_fmacro_std": []}

    y_true = df_predictions[true_col]
    # Macro F1 is averaged over the true labels only, so the disagreement
    # label of the agreement columns does not count as a class of its own.
    labels = list(y_true.unique())
    for col in df_predictions.columns:
        print("**", col, "**")
        y_pred = df_predictions[col]

        acc = simple_accuracy(y_true=y_true, y_pred=y_pred)
        acc_skl = accuracy_score(y_true=y_true, y_pred=y_pred)

        # sanity check; compared with a tolerance because both are floats
        assert np.isclose(acc, acc_skl), f"accuracy mismatch for column {col!r}: {acc} vs {acc_skl}"

        fmacro = f1_score(y_true=y_true,
                          y_pred=y_pred,
                          average="macro",
                          labels=labels,
                          zero_division=np.nan)

        if col in agreement_cols:
            # not calculated because of disagreement label
            bootstrap_results = bootstrap_metric_pytorch(y_true=y_true,
                                                         y_pred=y_pred,
                                                         n_reps=500,
                                                         bootstrap=False)
        elif col == true_col or col in fold_cols:
            # honour the true_col argument instead of the literal "true"
            bootstrap_results = bootstrap_metric_pytorch(y_true=y_true,
                                                         y_pred=y_pred,
                                                         n_reps=300,
                                                         bootstrap=True)
        else:
            raise ValueError(f"unexpected column in df_predictions: {col!r}")

        # already save as percentage * 100 for tables
        collected_scores["accuracy"].append(100 * acc_skl)
        collected_scores["fmacro"].append(100 * fmacro)

        for metric in ("accuracy", "fmacro"):
            boot = bootstrap_results[metric]
            collected_scores[f"boot_{metric}_mean"].append(100 * float(boot["mean"]))
            collected_scores[f"boot_{metric}_quantile_lower"].append(100 * boot["quantile"][0])
            collected_scores[f"boot_{metric}_quantile_upper"].append(100 * boot["quantile"][1])
            collected_scores[f"boot_{metric}_std"].append(100 * float(boot["std"]))

    return collected_scores

In [6]:
# Per-column scores shared by both summary tables: the true labels, the five
# folds and the three agreement ensembles.
_score_columns = ["true", "1", "2", "3", "4", "5", "min3", "min4", "min5"]

# Aggregates over the five folds, appended after the per-column scores.
_acc_fold_columns = [
    "accuracy_mean_over_folds",
    "accuracy_std_over_folds",
    "boot_acc_mean_over_folds",
    "boot_mean_accs_of_folds",
    "boot_quantile_lower_acc_of_folds",
    "boot_quantile_upper_acc_of_folds",
    "boot_accuracy_std_of_folds",
]
_fmac_fold_columns = [
    "fmacro_mean_over_folds",
    "fmacro_std_over_folds",
    "boot_fmac_mean_over_folds",
    "boot_mean_fmacs_of_folds",
    "boot_quantile_lower_fmac_of_folds",
    "boot_quantile_upper_fmac_of_folds",
    "boot_fmac_std_of_folds",
    "n_true_classes_in_folds",
    "n_unique_classes_in_folds",
]

# Empty summary tables; one row per experiment is added later.
summary_df_acc = pd.DataFrame(columns=_score_columns + _acc_fold_columns)
summary_df_fmac = pd.DataFrame(columns=_score_columns + _fmac_fold_columns)

# Main evaluation loop: for every model family and both values of cw_flag,
# collect the predictions of the five folds per task, derive the agreement
# ensembles, score everything, write per-experiment CSV files and add one row
# to each of the two summary tables.
for model in model_types:
    for cw_flag in [True, False]:
        # Models without "MT" in their name are looked up with the single
        # tasks; "MT" models use the task combinations in mt_tasks and a
        # separate helper.
        if "MT" not in model:
            task_dict = get_task_path_dict(result_collection, model_type=model, tasks=tasks)
            pred_true_label_files = get_pred_label_files(task_path_dict=task_dict,
                                                    pred_true_labels_filename="pred_true_labels.csv",
                                                    cw_flag=cw_flag)
        else:
            task_dict = get_task_path_dict(result_collection, model_type=model, tasks=mt_tasks)
            pred_true_label_files = get_pred_label_files_MT(task_path_dict=task_dict,
                                                            pred_true_labels_filename="pred_true_labels",
                                                            cw_flag=cw_flag)

        
            
            
        # pred_true_label_files is indexed as [task][fold] below; the values are
        # passed to load_df (presumably file paths -- confirm in eval_helper_functions).
        for task in tasks:
            print(task, pred_true_label_files.keys(), task not in pred_true_label_files.keys())
            # The third printed value is True when the task is missing from the
            # files; such a task is skipped below.
            if task not in pred_true_label_files.keys():
                # to account for MT/ST differences, and keep the filenames consistent
                print("skip")
                continue
            
            # Two parallel frames: alphabetic labels (saved to disk, used for
            # class counts) and numeric labels (used for scoring).
            predictions_df = pd.DataFrame(columns=["true", "1", "2", "3", "4", "5", 
                                                   "min3", "min4", "min5"])
            predictions_df_num = pd.DataFrame(columns=["true", "1", "2", "3", "4", "5", 
                                                       "min3", "min4", "min5"])

            
            for fold in range(1, 6):
                df = load_df(pred_true_label_files[task][str(fold)])
                df_num = load_df(pred_true_label_files[task][str(fold)], dtype="int32")
                    
                # "true" is overwritten on every fold; this assumes all folds were
                # evaluated on the same examples in the same order -- TODO confirm.
                predictions_df["true"] = list(df["labels_true_alph"])
                predictions_df[str(fold)] = list(df["labels_pred_alph"])
                
                predictions_df_num["true"] = df_num["labels_true"]
                predictions_df_num[str(fold)] = df_num["labels_pred"]
                
            # add model agreement predictions to df
            # (label shared by at least 3, at least 4, or all 5 fold models)
            predictions_df["min3"] = predictions_df.apply(func=prediction_min_n_models_agree, min_count=3, axis=1)
            predictions_df["min4"] = predictions_df.apply(func=prediction_min_n_models_agree, min_count=4, axis=1)
            predictions_df["min5"] = predictions_df.apply(func=prediction_min_n_models_agree, min_count=5, axis=1)


            predictions_df_num["min3"] = predictions_df_num.apply(func=prediction_min_n_models_agree, min_count=3, axis=1)
            predictions_df_num["min4"] = predictions_df_num.apply(func=prediction_min_n_models_agree, min_count=4, axis=1)
            predictions_df_num["min5"] = predictions_df_num.apply(func=prediction_min_n_models_agree, min_count=5, axis=1)

            # calculate scores (on the numeric labels)
            ensemble_stats = get_ensemble_scores(predictions_df_num)
            
            # create df to save concise summary: one row per statistic, one
            # column per prediction column
            results_one_experiment = pd.DataFrame(columns=["true", "1", "2", "3", "4", "5", 
                                                           "min3", "min4", "min5"])
            results_one_experiment.loc["n_unique_classes"] = predictions_df.nunique()

    

            # the alphabetic and the numeric frame must contain the same number of true classes
            assert len(predictions_df["true"].unique()) == len(predictions_df_num["true"].unique())
            # n classes can include the disagreement label for min3/4/5

            # for each model, task, cw setting: save all values in a separate file
            for stat, value_list in ensemble_stats.items():
                results_one_experiment.loc[stat] = value_list
            
            # "_CW_" is sliced below: [:-1] gives the "_CW" file-name infix,
            # [1:] gives the "CW_" prefix of the summary row label.
            if cw_flag is True:
                cw_suffix = "_CW_"
            else:
                cw_suffix = ""

            predictions_df.to_csv(os.path.join(output_ensemble, f"predictions{cw_suffix[:-1]}_{model}_{task}.csv"), encoding="utf-8")
            results_one_experiment.to_csv(os.path.join(output_ensemble, f"results{cw_suffix[:-1]}_{model}_{task}.csv"), encoding="utf-8")
            
            # for sorting correctly in df: "sit2"/"sit3" are renamed to "sit" (after
            # the files above were written with the original task name) and the
            # number is appended to the end of the row label instead
            if task == "sit2":
                task  = "sit"
                sortsuff = "_2"
            elif task  == "sit3":
                task = "sit"
                sortsuff = "_3"
            else:
                sortsuff = ""

            # [1:6] selects positions 1-5, i.e. the fold columns "1"-"5"
            # (position 0 is "true", positions 6-8 are min3/min4/min5).
            to_add_acc = [results_one_experiment.loc["accuracy"][1:6].mean(),
                      results_one_experiment.loc["accuracy"][1:6].std(),
                      results_one_experiment.loc["boot_accuracy_mean"][1:6].mean(),

                      list(results_one_experiment.loc["boot_accuracy_mean"][1:6]),
                      list(results_one_experiment.loc["boot_accuracy_quantile_lower"][1:6]),
                      list(results_one_experiment.loc["boot_accuracy_quantile_upper"][1:6]),
                      list(results_one_experiment.loc["boot_accuracy_std"][1:6]),
                      ]
            
            to_add_fmac = [results_one_experiment.loc["fmacro"][1:6].mean(),
                      results_one_experiment.loc["fmacro"][1:6].std(),
                      results_one_experiment.loc["boot_fmacro_mean"][1:6].mean(),

                      list(results_one_experiment.loc["boot_fmacro_mean"][1:6]),
                      list(results_one_experiment.loc["boot_fmacro_quantile_lower"][1:6]),
                      list(results_one_experiment.loc["boot_fmacro_quantile_upper"][1:6]),
                      list(results_one_experiment.loc["boot_fmacro_std"][1:6]),
                      results_one_experiment.loc["n_unique_classes"]["true"],
                      list(results_one_experiment.loc["n_unique_classes"][1:6])]

            # Row label: [CW_]<task sort key>_<task>_<model sort key>_<model>[_2|_3];
            # the nine per-column scores come first, followed by the fold aggregates.
            summary_df_acc.loc[f"{cw_suffix[1:]}{task_sorter[task]}_{task}_{model_order[model]}_{model}{sortsuff}"] = ensemble_stats["accuracy"] + to_add_acc
            summary_df_fmac.loc[f"{cw_suffix[1:]}{task_sorter[task]}_{task}_{model_order[model]}_{model}{sortsuff}"] = ensemble_stats["fmacro"] + to_add_fmac



def _export_summary(frame, file_name):
    """Write one summary table as a UTF-8 CSV file into the output directory."""
    frame.to_csv(os.path.join(output_ensemble, file_name), encoding="utf-8")


# Order the rows by their index labels, which embed the sort keys.
summary_df_acc = summary_df_acc.sort_index()
summary_df_fmac = summary_df_fmac.sort_index()

# full-precision tables
_export_summary(summary_df_acc, "SUMMARY_results_accuracy.csv")
_export_summary(summary_df_fmac, "SUMMARY_results_fmacro.csv")

# save rounded for tables
_export_summary(summary_df_acc.round(decimals=2), "SUMMARY_ROUNDED_results_accuracy.csv")
_export_summary(summary_df_fmac.round(decimals=2), "SUMMARY_ROUNDED_results_fmacro.csv")


sit2 dict_keys(['mor', 'his', 'beh', 'sit']) True
skip
sit3 dict_keys(['mor', 'his', 'beh', 'sit']) True
skip
mor dict_keys(['mor', 'his', 'beh', 'sit']) False
** true **
** 1 **
** 2 **
** 3 **
** 4 **
** 5 **
** min3 **
** min4 **
** min5 **
his dict_keys(['mor', 'his', 'beh', 'sit']) False
** true **
** 1 **
** 2 **
** 3 **
** 4 **
** 5 **
** min3 **
** min4 **
** min5 **
beh dict_keys(['mor', 'his', 'beh', 'sit']) False
** true **
** 1 **
** 2 **
** 3 **
** 4 **
** 5 **
** min3 **
** min4 **
** min5 **
sit dict_keys(['mor', 'his', 'beh', 'sit']) False
** true **
** 1 **
** 2 **
** 3 **
** 4 **
** 5 **
** min3 **
** min4 **
** min5 **
sit2 dict_keys(['mor', 'his', 'beh', 'sit']) True
skip
sit3 dict_keys(['mor', 'his', 'beh', 'sit']) True
skip
mor dict_keys(['mor', 'his', 'beh', 'sit']) False
** true **
** 1 **
** 2 **
** 3 **
** 4 **
** 5 **
** min3 **
** min4 **
** min5 **
his dict_keys(['mor', 'his', 'beh', 'sit']) False
** true **
** 1 **
** 2 **
** 3 **
** 4 **
** 5 **
** min3 *