In [None]:
!python --version

In [1]:
import os, json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import matplotlib.patches as mpatches

from src.results_loaders import sort_models_order, load_reasoning, load_helm_lite
from src.classification_utils import predictive_method_list
from src.utils import load_with_conditions, save_dataframe
from src.reference_benchmark import SampleSelector, AssessorFromReference

# enable reloading of modules
%load_ext autoreload
%autoreload 2

##  Experiment design

- 2 sets of datasets (reasoning and HELM-Lite) -> 6 splits total; the reasoning one is subsampled to have similar training size as HELM-Lite
- 9 selector methods (random, random_best_of, clustering_embeddings, clustering_LLM_success, clustering_IRT_values, factor_analysis_embeddings,  factor_analysis_IRT_values, factor_analysis_LLM_success_samples, factor_analysis_LLM_success_features)
- predictive frameworks: reference_only, calibrate_general_classifier, baseline_all_train_llms and concatenate_ref_success (the latter with/without similarity measures as well as full/partial cross products with the similarity measures) -> 7 total
- all possible base classifiers (LogReg, XGBoost, ...) -> n classifiers

Total is 441 * n experiments.

I will also truncate the openAI embeddings to the first 1024, as there does not seem to be any improvement beyond that.

Tuning experiments:
- how many reference points: to determine this, take the best selector and predictive method, I vary the number of reference points and see how the performance changes (other notebook)


**Comment on diversity of labels in the reference df**: The reference df should have different labels for all test_llms, as otherwise the reference_only baseline and the recalibrator method would not work. The source code raises an exception to signal when that is not the case.

**Note**: the IRT values are computed by running an external notebook (`2_run_irt.ipynb`), which must be run before the present one is executed.

In [None]:
n_embeddings_truncate = 1024

In [None]:
def _check_skip(res_df, feature_name, split, selector_name, assessor_name, pred_method_name):
    """pred_method_name is the name of the base classifier, while assessor_name is the name of the method that builds on the base classifier using the reference dataset

    This checks if the experiment has already been done and should be skipped (notice that it does not check for each llm indepenendently as they are all done together"""
    if len(res_df) > 0 and len(res_df[(
                                              res_df["features"] == feature_name) & (res_df["split"] == split) &
                                      (res_df["selector"] == selector_name) & (
                                              res_df["assessor"] == assessor_name) & (
                                              res_df["predictive_method"] == pred_method_name)]) > 0:
        print(f"Skipping {feature_name}, {split},  {selector_name}, {assessor_name}, {pred_method_name}")
        return True
    else:
        print(f"Doing {feature_name}, {split},  {selector_name}, {assessor_name}, {pred_method_name}")
        return False


def _concat_and_save(res_df, prediction_evaluations_validation, prediction_evaluations_test, feature_name, split, selector_name, assessor_name,
                     pred_method_name, filename):
    """Append the validation and test evaluations of one experiment to `res_df` and save it.

    Each of the two `prediction_evaluations_*` arguments is a list of dictionaries of the form:
        {
            "llm": llm,
            "BrierScore": BrierScore,
            "Calibration": Calibration,
            "Refinement": Refinement,
            "AUROC": roc_auc,
            "Accuracy": accuracy,
            "Predictions": y_pred,
            "subset": subset
        }
    Every list is turned into a dataframe, tagged with the identifiers of the experiment and
    concatenated to the running results (validation first, then test).

    Returns:
        The updated results dataframe, which is also written to `filename` via `save_dataframe`.
    """
    # identifiers shared by all rows produced by this experiment
    experiment_fields = {
        "features": feature_name,
        "split": split,
        "selector": selector_name,
        "assessor": assessor_name,
        "predictive_method": pred_method_name,
    }

    updated_df = res_df
    for evaluations in (prediction_evaluations_validation, prediction_evaluations_test):
        tagged_df = pd.DataFrame(evaluations)
        for column, value in experiment_fields.items():
            tagged_df[column] = value
        updated_df = pd.concat([updated_df, tagged_df])

    # persist after every experiment, so that an interrupted run can be resumed
    save_dataframe(filename, updated_df)

    return updated_df


def evaluate_and_update(res_df, feature_name, split, selector_name, assessor_name_original, assessor_name_results,
                        pred_method_name, assessor, predictive_method,
                        filename, **kwargs):
    """Run one (assessor, base classifier) experiment unless its results are already in `res_df`.

    `assessor_name_original` is the method name passed to `assessor.predict`, while
    `assessor_name_results` is the label under which the results are stored (and looked up when
    deciding whether to skip). Extra `kwargs` are forwarded to `assessor.predict`.

    Returns:
        `res_df` unchanged if the experiment was skipped, otherwise the results dataframe with the
        validation and test evaluations appended (and saved to `filename`).
    """
    if _check_skip(res_df, feature_name, split, selector_name, assessor_name_results, pred_method_name):
        return res_df

    predictions_validation, predictions_test = assessor.predict(
        assessor_name_original, classifier=predictive_method, **kwargs)

    evaluations_validation = assessor.evaluate_predictions(predictions_validation, subset="validation")
    evaluations_test = assessor.evaluate_predictions(predictions_test, subset="test")

    return _concat_and_save(res_df, evaluations_validation, evaluations_test, feature_name, split,
                            selector_name, assessor_name_results, pred_method_name, filename)



In [None]:
def run_selectors(split_name, reference_datasets_dict, reference_datasets_dict_name, selector_methods, n_reference,
                  train_df, train_llms, n_embeddings_truncate, irt_file_prefix):
    """Select the reference samples of one split with every selector method and cache the result.

    Parameters:
        split_name: identifier of the split; the bool False is stored under the key "false".
        reference_datasets_dict: nested dict {split: {selector_name: list of train_df index labels}};
            it is updated in place and also returned.
        reference_datasets_dict_name: path of the JSON file the dict is dumped to after each selector.
        selector_methods: names of the selection methods to run.
        n_reference: number of samples requested from each selector.
        train_df: training dataframe; a truncated-embeddings column "openai_embeddings_subset"
            is added to it in place.
        train_llms: passed to `SampleSelector`.
        n_embeddings_truncate: number of leading embedding components to keep.
        irt_file_prefix: prefix of the IRT model folder, used for selectors with "IRT" in their name.

    Returns:
        The updated `reference_datasets_dict`.
    """
    print("split name: ", split_name)

    # JSON object keys are always strings, so a False key is written as "false". Normalise it up
    # front (as run_assessor does), otherwise a dict reloaded from the JSON file would never match
    # the False key: the cached selections would be recomputed and a duplicate "false" key dumped.
    if isinstance(split_name, bool) and not split_name:
        split_name = "false"

    if split_name not in reference_datasets_dict:
        reference_datasets_dict[split_name] = {}

    # truncate the embeddings
    train_df["openai_embeddings_subset"] = train_df["openai_embeddings_large"].apply(
        lambda x: x[:n_embeddings_truncate])

    # define the selector used to obtain the reference df with the various methods
    selector = SampleSelector(train_df, "openai_embeddings_subset", train_llms)

    # try all possible selection methods
    for selector_name in selector_methods:
        if selector_name in reference_datasets_dict[split_name]:
            print(f"{selector_name} was already computed")
            selected_df_indices = reference_datasets_dict[split_name][selector_name]
        else:
            print(f"Trying selector_name {selector_name}")
            if "IRT" in selector_name:
                selected_df = selector.select(selector_name, n_selected=n_reference,
                                              irt_path=f'data_irt/{irt_file_prefix}_irtmodel/')
            else:
                selected_df = selector.select(selector_name, n_selected=n_reference)
            if selected_df is None:
                print(f"Skipping {selector_name} as it did not return any samples")
                continue
            selected_df_indices = list(selected_df.index)

        print(len(selected_df_indices))
        reference_datasets_dict[split_name][selector_name] = selected_df_indices

        # save the dict at each iteration, so that an interrupted run keeps what was computed
        with open(reference_datasets_dict_name, "w") as f:
            json.dump(reference_datasets_dict, f)

    return reference_datasets_dict


In [None]:
def run_assessor(train_df, validation_df, test_df, train_llms, validation_llms, test_llms, reference_datasets_dict, results_df, results_filename,
                 split_name, selector_methods, assessor_methods_list, predictive_method_list,
                 n_embeddings_truncate=1000):
    """Run every (selector, assessor method, base classifier) experiment for one split.

    The three dataframes get a new column "openai_embeddings_subset" (added in place) holding the
    first `n_embeddings_truncate` components of "openai_embeddings_large". For each selector whose
    reference indices are available in `reference_datasets_dict[split_name]`, an
    `AssessorFromReference` is built and all entries of `assessor_methods_list` (tuples of
    results name, original method name, extra kwargs) are combined with all entries of
    `predictive_method_list` (tuples of classifier, kwargs, classifier name).

    Returns:
        The results dataframe updated by `evaluate_and_update` (which also saves it to
        `results_filename`).
    """
    embeddings_column = "openai_embeddings_subset"

    def _truncate(embedding):
        return embedding[:n_embeddings_truncate]

    # truncate the embeddings of the three datasets
    for frame in (train_df, validation_df, test_df):
        frame[embeddings_column] = frame["openai_embeddings_large"].apply(_truncate)

    # the reference dict comes from a JSON file, where the bool False is stored as the key "false"
    if split_name is False:
        split_name = "false"

    for selector_name in selector_methods:
        if selector_name not in reference_datasets_dict[split_name]:
            print(f"Skipping {selector_name} as it was not computed")
            continue

        # extract the reference instances from the training data
        reference_indices = reference_datasets_dict[split_name][selector_name]
        reference_df = train_df.loc[reference_indices]

        print("selector name: ", selector_name)

        assessor = AssessorFromReference(reference_df, train_df, validation_df, test_df, embeddings_column,
                                         train_llms, validation_llms, test_llms)

        for assessor_name_results, assessor_name_original, assessor_kwargs in assessor_methods_list:
            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                # assessor-specific kwargs take precedence over the classifier ones
                merged_kwargs = dict(kwargs)
                merged_kwargs.update(assessor_kwargs)
                results_df = evaluate_and_update(results_df, "openai", split_name,
                                                 selector_name, assessor_name_original, assessor_name_results,
                                                 pred_method_name,
                                                 assessor, predictive_method, filename=results_filename,
                                                 **merged_kwargs)

    return results_df

In [None]:
def plot_best_predictive_method_heatmap(all_datasets_assessors, feature="openai", metric="AUROC", use_sort_order=True):
    """Show, for every split and LLM, a heatmap of `metric` over (assessor, selector) pairs.

    For each (llm, features, split, assessor, selector) group only the row with the highest AUROC
    is kept; note that the best base classifier is always chosen by AUROC, whatever `metric` is
    displayed. One figure is produced per (split, llm) pair present in the data; splits are
    plotted in the order in which they first appear.

    Parameters:
        all_datasets_assessors: results dataframe with (at least) the columns "llm", "features",
            "split", "assessor", "selector", "AUROC" and `metric`.
        feature: value of the "features" column to plot.
        metric: column shown in the heatmap cells.
        use_sort_order: if True, LLMs are ordered as in `sort_models_order` (every LLM must be in
            that list); otherwise they are sorted by name.

    Raises:
        ValueError: if `feature` does not appear in the "features" column.
    """
    if feature not in all_datasets_assessors["features"].unique():
        raise ValueError(f"Feature {feature} not in the dataset")

    group_columns = ["llm", "features", "split", "assessor", "selector"]

    # keep, for each group, the rows reaching the best AUROC. `transform` is used rather than
    # `groupby.apply`, whose handling of the grouping columns changed in recent pandas versions.
    # The mask is converted to numpy because the results index may contain duplicated labels.
    best_auroc_in_group = all_datasets_assessors.groupby(group_columns)["AUROC"].transform("max")
    is_best = (all_datasets_assessors["AUROC"] == best_auroc_in_group).to_numpy()
    best_rows = all_datasets_assessors.loc[is_best]
    # ties: keep only the first row of each group
    best_rows = best_rows.drop_duplicates(subset=group_columns)
    # consider only the chosen feature; copy so that the relabelling below acts on an own frame
    best_rows = best_rows[best_rows["features"] == feature].copy()
    # sort using the order of the models
    best_rows = best_rows.sort_values(
        by=["llm"],
        key=(lambda llms: llms.apply(lambda name: sort_models_order.index(name))) if use_sort_order else None,
        kind="stable")

    # better labels for x and y axis of heatmap
    best_rows["selector"] = best_rows["selector"].apply(
        lambda x: x.replace("_", " ").replace("clustering", "Cluster").replace("factor analysis", "FA").replace("samples", "").replace("values", "").capitalize())
    best_rows["assessor"] = best_rows["assessor"].apply(
        lambda x: x.replace("_", " ").replace("partial ", "").capitalize())

    # create one heatmap for each split and for each LLM
    for split in best_rows["split"].unique():
        for llm in best_rows["llm"].unique():
            rows_split_llm = best_rows[(best_rows["split"] == split) & (best_rows["llm"] == llm)]
            if rows_split_llm.empty:
                # this LLM has no results for this split: a zero-sized figure would raise
                continue

            fig, ax = plt.subplots(figsize=(1 * len(rows_split_llm["selector"].unique()),
                                            1 * len(rows_split_llm["assessor"].unique())))
            # keyword arguments: positional arguments to `pivot` are not accepted by pandas >= 2.0
            heatmap_values = rows_split_llm.pivot(index="assessor", columns="selector", values=metric)
            sns.heatmap(data=heatmap_values, annot=True, fmt=".2f", cmap="viridis", vmin=0.5, vmax=1, ax=ax)
            ax.set_title(f"{metric} for different selectors and assessors\nfor {llm} and {feature} features\n{split}")

            # rotate x labels; this must happen before the figure is shown to have any effect
            for figure_ax in fig.axes:
                figure_ax.tick_params(axis="x", labelrotation=90)

            plt.show()
            # release the figure, as many of them are created in this loop
            plt.close(fig)


In [None]:
# I removed factor_analysis_LLM_success_features as that fails (singular matrix)
selector_methods = ["random", "random_best_of", "clustering_embeddings", "clustering_LLM_success", "clustering_IRT_values", "factor_analysis_embeddings", "factor_analysis_LLM_success_samples", "factor_analysis_IRT_values"]

In [None]:
# Each entry is a tuple (name under which the results are stored, name of the assessor method
# passed to `assessor.predict`, extra keyword arguments for that method). Several entries reuse
# the same underlying method with different keyword arguments.
assessor_methods_list = [
    ("baseline_reference_only", "reference_only", {}),
    ("calibrate_general_classifier", "calibrate_general_classifier", {}),
    ("baseline_all_train_llms", "calibrate_general_classifier", {"calibration_step": False}),
    ("concatenate_ref_success", "concatenate_ref_success", {}),
    ("concatenate_ref_similarity", "concatenate_ref_success", {"features": ["cosine"]}),
    ("concatenate_ref_similarity_partial_interaction", "concatenate_ref_success", {"features":["cosine"], "interaction_terms":"partial"}),
    # ("concatenate_ref_similarity_full_interaction", "concatenate_ref_success", {"features":["cosine"], "interaction_terms":"full"}),  # requires too much RAM
]

### KindsOfReasoning

Here I will keep all GPT4 versions out of the training set of LLMs.

In [None]:
results_filename = "results/generic_assessors_reasoning.pkl"

In [None]:
overwrite_res = False

results_df = load_with_conditions(results_filename, overwrite_res)

In [None]:
from src.utils import llms_reasoning, train_llms_reasoning, validation_llms_reasoning, test_llms_reasoning
llms, train_llms, validation_llms, test_llms = llms_reasoning, train_llms_reasoning, validation_llms_reasoning, test_llms_reasoning

I will first obtain the various reference dfs with the different selectors, so that I do not need to repeat that many times later on.

In [None]:
n_reference = 100
reference_datasets_dict_name = "results/generic_assessors_dict_reasoning.json"

In [None]:
# Start from an empty cache if no file exists yet (or if results must be overwritten); otherwise
# reuse the selections stored by a previous run. Note that keys loaded from JSON are strings.
if not os.path.exists(reference_datasets_dict_name) or overwrite_res:
    reference_datasets_dict = {}
else:
    with open(reference_datasets_dict_name, "r") as f:
        reference_datasets_dict = json.load(f)

The next cell runs the selector steps

In [None]:
# Compute (or reuse) the reference samples of every selector for each split of the reasoning data.
# `ood_split=False` presumably loads the in-distribution split -- TODO confirm in `load_reasoning`.
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    train_df, validation_df, test_df = load_reasoning(llms, ["openai_embeddings"], ood_split=split, base_path="../results/kindsofreasoning_embeddings")

    reference_datasets_dict = run_selectors(split, reference_datasets_dict, reference_datasets_dict_name, selector_methods, n_reference,
                                            train_df, train_llms, n_embeddings_truncate, irt_file_prefix=f"reasoning_{split}")

Now fit the classifiers on all splits, all reference datasets, all predictive frameworks, and all base classifiers.  

In [None]:
# load the reference dictionary
with open(reference_datasets_dict_name, "r") as f:
    reference_datasets_dict = json.load(f)

In [None]:
# For each split of the reasoning data, run all selectors, assessor methods and base classifiers;
# `run_assessor` skips the experiments already stored in `results_df` and saves after each new one.
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    train_df, validation_df, test_df = load_reasoning(llms, ["openai_embeddings"], ood_split=split, base_path="../results/kindsofreasoning_embeddings")

    print("split name: ", split)

    results_df = run_assessor(train_df,validation_df, test_df, train_llms, validation_llms, test_llms, reference_datasets_dict, results_df, results_filename,
                 split, selector_methods, assessor_methods_list, predictive_method_list, n_embeddings_truncate)

#### Plots


In [None]:
results_filename = "results/generic_assessors_reasoning.pkl"
all_datasets_assessors = load_with_conditions(results_filename)

In [None]:
all_datasets_assessors.shape

In [None]:
all_datasets_assessors.columns

In [None]:
all_datasets_assessors["features"].value_counts()

- Possible metrics are 'BrierScore', 'Calibration', 'Refinement', 'AUROC', 'Accuracy'
- the independent features are instead 'llm', 'features', 'split', 'assessor', 'selector'
- "features" is only "openai", and split has few values.

The following shows the performance of the best classifier for each assessor and selector on validation data and LLMs.

In [None]:
all_datasets_assessors_validation = all_datasets_assessors[all_datasets_assessors["subset"] == "validation"]

plot_best_predictive_method_heatmap(all_datasets_assessors_validation)

### HELM-Lite

In [None]:
results_filename = "results/generic_assessors_helm.pkl"

In [None]:
overwrite_res = False

results_df = load_with_conditions(results_filename, overwrite_res)

In [None]:
from src.utils import llms_helm, train_llms_helm, validation_llms_helm, test_llms_helm
llms, train_llms, validation_llms, test_llms = llms_helm, train_llms_helm, validation_llms_helm, test_llms_helm

In [None]:
n_reference = 100
reference_datasets_dict_name = "results/generic_assessors_dict_helm.json"

In [None]:
if not os.path.exists(reference_datasets_dict_name) or overwrite_res:
    reference_datasets_dict = {}
else:
    with open(reference_datasets_dict_name, "r") as f:
        reference_datasets_dict = json.load(f)

The next cell runs the selector steps

In [None]:
# Compute (or reuse) the reference samples of every selector for each split of HELM-Lite.
# `ood_split=False` presumably loads the in-distribution split -- TODO confirm in `load_helm_lite`.
for split in [False, "OOD_1", "OOD_2", "OOD_3"]:
    train_df, validation_df, test_df = load_helm_lite(llms, ["openai_embeddings"], ood_split=split, base_path="../results/helm_lite_v1.0.0_embeddings")

    reference_datasets_dict = run_selectors(split, reference_datasets_dict, reference_datasets_dict_name, selector_methods, n_reference,
                                            train_df, train_llms, n_embeddings_truncate, irt_file_prefix=f"helm_{split}")

Now fit the classifiers on all splits, all reference datasets, all predictive frameworks, and all base classifiers.  

In [None]:
# load the reference dictionary
with open(reference_datasets_dict_name, "r") as f:
    reference_datasets_dict = json.load(f)

In [None]:
# For each split of HELM-Lite, run all selectors, assessor methods and base classifiers;
# `run_assessor` skips the experiments already stored in `results_df` and saves after each new one.
for split in [False, "OOD_1", "OOD_2", "OOD_3"]:
    train_df, validation_df, test_df = load_helm_lite(llms, ["openai_embeddings"], ood_split=split, base_path="../results/helm_lite_v1.0.0_embeddings")

    print("split name: ", split)

    results_df = run_assessor(train_df, validation_df, test_df, train_llms, validation_llms, test_llms, reference_datasets_dict, results_df, results_filename,
                 split, selector_methods, assessor_methods_list, predictive_method_list, n_embeddings_truncate)

#### Plots


In [None]:
results_filename = "results/generic_assessors_helm.pkl"
all_datasets_assessors = load_with_conditions(results_filename)

In [None]:
all_datasets_assessors.shape

The following shows the performance of the best classifier for each assessor and selector on validation data and LLMs.

In [None]:
all_datasets_assessors_validation = all_datasets_assessors[all_datasets_assessors["subset"] == "validation"]

plot_best_predictive_method_heatmap(all_datasets_assessors_validation, use_sort_order=False)

## Select best assessor-selector

I will rely on the average win rate of each assessor-selector combination on all others, which has a 1-1 correspondence with the ranking, but it is easier to interpret and to average across scenarios (LLMs, splits, datasets).

For each dataset and split, I will select the best (selector, assessor setup, classifier) by considering the average win rate on validation data and LLMs. I will also do the same by restricting to the reference_only assessor setup, to determine the best classifier and selector. Then I will have a best setup for each dataset and split, and a best setup for each dataset and split with the reference_only assessor setup.



In [None]:
from src.utils import validation_llms_reasoning, validation_llms_helm

In [None]:
results_reference_reasoning = load_with_conditions("results/generic_assessors_reasoning.pkl")
results_reference_helm = load_with_conditions("results/generic_assessors_helm.pkl")

In [None]:
results_reference_reasoning.columns

In [None]:
split_reasoning = [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]
split_helm = [False, "OOD_1", "OOD_2", "OOD_3"]

In [None]:
from itertools import product
import numpy as np


def compute_win_rate_single_llm_split_dataset(results, considered_assessors, considered_selectors,
                                              considered_classifiers, llm, split, metric="AUROC", features="openai"):
    """Compute the average win rate of every assessor-selector-classifier combination over all others.

    For all combinations of the considered assessors, selectors and classifiers, a binary matrix is
    built where entry (i, j) is 1 if combination i has a strictly higher `metric` than combination j
    and 0 otherwise (on ties, the combination coming later in the enumeration wins). Only the rows
    of `results` for the given `llm`, `split` and `features` on the validation subset are used; if a
    combination has several rows, the first one is used.

    Parameters:
        results: results dataframe with columns "llm", "split", "features", "subset", "assessor",
            "selector", "predictive_method" and `metric`.
        considered_assessors, considered_selectors, considered_classifiers: values to combine.
        llm: LLM to consider.
        split: split to consider; the bool False is mapped to the string "false".
        metric: column used to compare the combinations (higher is better).
        features: value of the "features" column to consider.

    Returns:
        pd.Series indexed by "assessor___selector___classifier" with the fraction of the other
        combinations that each combination beats.

    Raises:
        AssertionError: if a considered assessor, selector or classifier is absent from the filtered
            results, or if some combination has no result (so not all comparisons could be made).
    """
    if isinstance(split, bool) and not split:
        split = "false"

    # Filter the DataFrame for the specific LLM, split, and features, and consider only the validation subset
    filtered_results = results[
        (results['llm'] == llm) & (results['split'] == split) & (results['features'] == features) & (
                    results['subset'] == "validation")]

    # Generate all possible combinations of assessors, selectors and classifiers
    combinations = list(product(considered_assessors, considered_selectors, considered_classifiers))
    n_combinations = len(combinations)

    # Check that every considered value is present (the available values are computed only once)
    available_assessors = set(filtered_results['assessor'].unique())
    available_selectors = set(filtered_results['selector'].unique())
    available_classifiers = set(filtered_results['predictive_method'].unique())
    for assessor, selector, classifier in combinations:
        assert assessor in available_assessors, f"Assessor {assessor} not in the results"
        assert selector in available_selectors, f"Selector {selector} not in the results"
        assert classifier in available_classifiers, f"Classifier {classifier} not in the results"

    # Look up the score of each combination once (first matching row), instead of re-filtering
    # the dataframe for every pair of combinations
    first_rows = filtered_results.drop_duplicates(subset=['assessor', 'selector', 'predictive_method'], keep="first")
    first_scores = dict(zip(
        zip(first_rows['assessor'], first_rows['selector'], first_rows['predictive_method']),
        first_rows[metric]))

    # Fill the binary win matrix by comparing each pair of combinations once
    win_matrix = np.zeros((n_combinations, n_combinations))
    for i, combination_i in enumerate(combinations):
        if combination_i not in first_scores:
            continue
        score_i = first_scores[combination_i]
        for j in range(i + 1, n_combinations):
            combination_j = combinations[j]
            if combination_j not in first_scores:
                continue
            if score_i > first_scores[combination_j]:
                win_matrix[i, j] = 1
            else:
                win_matrix[j, i] = 1

    # check if the right number of comparisons were made
    assert np.sum(win_matrix) == n_combinations * (n_combinations - 1) / 2, "Not all comparisons were made"

    # average of the win matrix over axis 1: the average win rate of each combination over all others
    average_win_rate = np.sum(win_matrix, axis=1) / (n_combinations - 1)

    # pandas series with the average win rate, indexed by the corresponding combination
    combinations_string = [f"{assessor}___{selector}___{classifier}" for assessor, selector, classifier in combinations]
    average_win_rate_series = pd.Series(average_win_rate, index=combinations_string)

    return average_win_rate_series

Test the above

In [None]:
considered_classifiers = [x[2] for x in predictive_method_list]

In [None]:
x = compute_win_rate_single_llm_split_dataset(results_reference_reasoning, ["concatenate_ref_success", "concatenate_ref_similarity"], ["random", "random_best_of"], considered_classifiers, 'gpt-3.5-turbo-0125', False, metric="AUROC", features="openai")

In [None]:
results_reference_reasoning.subset.unique()

In [None]:
llm = "gpt-4-0613"
split =  "OOD_1"
features= "openai"

results = results_reference_reasoning
results_val = results[results['subset'] == "validation"]
results_test = results[results['subset'] == "test"]

In [None]:
results_val.llm.unique()

In [None]:
results_test.llm.unique()

In [None]:
x.sort_values(ascending=False)

In [None]:
# now I need to make a function that averages that over multiple llms, splits and datasets
def compute_win_rate_multiple_llm_split_dataset(results_list_dataset, dataset_names, considered_llm_list, considered_split_list, considered_assessors, considered_selectors, considered_classifiers, metric="AUROC", features="openai"):
    """Compute the win rates of all combinations for several datasets, LLMs and splits.

    `results_list_dataset`, `dataset_names`, `considered_llm_list` and `considered_split_list` are
    parallel lists with one entry per dataset: the results dataframe, the dataset name, the LLMs and
    the splits to consider for that dataset.

    Returns:
        pd.DataFrame with one row per assessor-selector-classifier combination and one column
        named "{dataset_name}_{split}_{llm}" per dataset, split and LLM, holding the win rates
        returned by `compute_win_rate_single_llm_split_dataset`.
    """
    # collect the columns first and build the dataframe once, rather than growing it column by column
    win_rate_columns = {}

    for dataset_name, results, considered_llm, considered_split in zip(dataset_names, results_list_dataset, considered_llm_list, considered_split_list):
        for llm in considered_llm:
            for split in considered_split:
                # one column per dataset, split and llm
                win_rate_columns[f"{dataset_name}_{split}_{llm}"] = compute_win_rate_single_llm_split_dataset(
                    results, considered_assessors, considered_selectors, considered_classifiers, llm, split,
                    metric, features)

    return pd.DataFrame(win_rate_columns)


In [None]:
def compute_average_win_rate_from_computed_values(all_win_rates, dataset_names, considered_llm_list, considered_split_list, recursive_average=True):
    """Average previously computed win rates over LLMs, splits and datasets.

    The columns of `all_win_rates` are expected to be named "{dataset_name}_{split}_{llm}".
    `dataset_names`, `considered_llm_list` and `considered_split_list` are parallel lists giving,
    for each dataset, the LLMs and splits whose columns enter the average.

    If `recursive_average` is True, the win rates are first averaged within each dataset and the
    per-dataset averages are then averaged, so that every dataset has the same weight. Otherwise
    all the selected columns are averaged together.

    Returns:
        pd.Series with the average win rate of each row of `all_win_rates`.
    """
    per_dataset_means = pd.DataFrame()
    every_column = []

    for dataset_name, dataset_llms, dataset_splits in zip(dataset_names, considered_llm_list, considered_split_list):
        dataset_columns = [f"{dataset_name}_{split}_{llm}" for llm in dataset_llms for split in dataset_splits]
        every_column.extend(dataset_columns)
        per_dataset_means[dataset_name] = all_win_rates[dataset_columns].mean(axis=1)

    to_average = per_dataset_means if recursive_average else all_win_rates[every_column]
    return to_average.mean(axis=1)


**Notice the average is done in two steps: first I average over all considered splits and LLMs for a given dataset, and then I average over all datasets.** This is to avoid a dataset with more splits and LLMs to have more weight in the final average.

I will select the best combination of assessor-selector-classifier for each dataset and split by considering the average win rate over all others. In particular, I do this in 4 separate instances:
- the first to select the best of the generic assessor setups (hence considering all possible concatenations of features, selectors and classifiers)
- the second to consider the best reference_only baseline (hence considering only that as assessor and all possible combinations of selectors and classifiers)
- the third for the train_all baseline (hence considering only that and one possible choice of selector, as it does not use the selector information).
Notice that I also trained a `calibrate_general_classifier` which takes the baseline_all and recalibrates it using test data; however recalibrating does not change the AUC, so it will not be considered in the selection.
- Finally, I also consider a baseline where only the random selector is used, but for which the various assessor methods are all used.

In [None]:
considered_selectors = selector_methods
random_selector = ["random"]

In [None]:
considered_assessors_no_baseline = [
    "concatenate_ref_success",
    "concatenate_ref_similarity",
    "concatenate_ref_similarity_partial_interaction",
]
baseline_reference_only = ["baseline_reference_only"]
baseline_all_train_llms = ["baseline_all_train_llms"]

Compute all win rates for the considered methods and for the baselines.

In [None]:
win_rates_no_baseline = compute_win_rate_multiple_llm_split_dataset([results_reference_reasoning, results_reference_helm], ["reasoning", "helm"], [validation_llms_reasoning, validation_llms_helm], [split_reasoning, split_helm], considered_assessors_no_baseline, considered_selectors, considered_classifiers)

In [None]:
win_rates_baseline_reference_only = compute_win_rate_multiple_llm_split_dataset([results_reference_reasoning, results_reference_helm], ["reasoning", "helm"], [validation_llms_reasoning, validation_llms_helm], [split_reasoning, split_helm], baseline_reference_only, considered_selectors, considered_classifiers)

In [None]:
win_rates_baseline_all_train_llms = compute_win_rate_multiple_llm_split_dataset([results_reference_reasoning, results_reference_helm], ["reasoning", "helm"], [validation_llms_reasoning, validation_llms_helm], [split_reasoning, split_helm], baseline_all_train_llms, [considered_selectors[0]], considered_classifiers)  # only consider one selector for sake of efficiency, as they do not impact the AUC 

In [None]:
win_rates_random_selector = compute_win_rate_multiple_llm_split_dataset([results_reference_reasoning, results_reference_helm], ["reasoning", "helm"], [validation_llms_reasoning, validation_llms_helm], [split_reasoning, split_helm], considered_assessors_no_baseline, random_selector, considered_classifiers)

Now, for each dataset and split, I will select the best (selector, assessor setup, classifier) by considering the average win rate on validation data and LLMs. I will also do the same by restricting to the reference_only assessor setup, to determine the best classifier and selector. Then I will have a best setup for each dataset and split, and a best setup for each dataset and split with the reference_only assessor setup.

In [None]:
def get_best_method(average_win_rates, dataset_name, split):
    """
    Function to get the best method based on average win rates.

    Parameters:
    average_win_rates (pd.Series): A pandas series with average win rates. Its
        index labels must be strings of the form
        "<assessor>___<selector>___<classifier>".
    dataset_name (str): The name of the dataset.
    split (str or bool): The split of the dataset; stored unchanged in the
        "split" column of the result.

    Returns:
    pd.DataFrame: A one-row dataframe with columns "dataset", "split",
        "assessor", "selector", "classifier" and "average_win_rate".

    Raises:
    ValueError: If `average_win_rates` is empty or contains only NaN values,
        or if the best label does not split into exactly three parts.
    """
    # Fail early with a readable message instead of a cryptic idxmax error
    # (empty input) or an AttributeError on NaN (all-NaN input).
    if average_win_rates.dropna().empty:
        raise ValueError(
            f"No valid average win rate available for dataset={dataset_name!r}, split={split!r}."
        )

    # Get the index label of the maximum average win rate
    best_method = average_win_rates.idxmax()
    best_win_rate = average_win_rates.max()

    # Extract the assessor, selector and classifier from the index
    assessor, selector, classifier = best_method.split("___")

    # Create a DataFrame with the new row
    new_row = pd.DataFrame([{
        "dataset": dataset_name,
        "split": split,
        "assessor": assessor,
        "selector": selector,
        "classifier": classifier,
        "average_win_rate": best_win_rate
    }])

    return new_row

In [None]:
best_method_columns = ["dataset", "split", "assessor", "selector", "classifier", "average_win_rate"]

# Win-rate collections from which a best combination is selected, keyed by a
# short label identifying the family of methods they cover.
win_rate_tables = {
    "no_baseline": win_rates_no_baseline,
    "baseline_reference_only": win_rates_baseline_reference_only,
    "baseline_all_train_llms": win_rates_baseline_all_train_llms,
    "random_selector": win_rates_random_selector,
}

# Collect the one-row frames in plain lists and concatenate once at the end:
# growing a DataFrame with pd.concat inside the loop is quadratic and
# concatenating onto an empty frame triggers a pandas FutureWarning.
best_method_rows = {label: [] for label in win_rate_tables}

for dataset_name, splits, llms in zip(["reasoning", "helm"], [split_reasoning, split_helm], [validation_llms_reasoning, validation_llms_helm]):
    print(f"Dataset: {dataset_name}")
    for split in splits:
        for label, win_rates in win_rate_tables.items():
            # average over the validation LLMs of this single dataset and split
            average_win_rates = compute_average_win_rate_from_computed_values(win_rates, [dataset_name], [llms], [[split],])
            best_method_rows[label].append(get_best_method(average_win_rates, dataset_name, split))

best_method_tables = {
    label: pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=best_method_columns)
    for label, rows in best_method_rows.items()
}

best_method_no_baseline_df = best_method_tables["no_baseline"]
best_method_baseline_reference_only_df = best_method_tables["baseline_reference_only"]
best_method_baseline_all_train_llms_df = best_method_tables["baseline_all_train_llms"]
best_method_random_selector_df = best_method_tables["random_selector"]

# save these results
save_dataframe("results/generic_assessors_best_method_no_baseline_df.csv", best_method_no_baseline_df)
save_dataframe("results/generic_assessors_best_method_baseline_reference_only_df.csv", best_method_baseline_reference_only_df)
save_dataframe("results/generic_assessors_best_method_baseline_all_train_llms_df.csv", best_method_baseline_all_train_llms_df)
save_dataframe("results/generic_assessors_best_method_random_selector_df.csv", best_method_random_selector_df)

In [None]:
best_method_no_baseline_df


I will then perform the selection on multiple subsets of test llms, datasets and splits and see how robust it is.


### Everything

In [None]:
# Average win rates over both datasets, with all their validation LLMs and
# splits; the sorted series (best combination first) is the cell output.
average_win_rates_all = compute_average_win_rate_from_computed_values(
    win_rates_no_baseline,
    ["reasoning", "helm"],
    [validation_llms_reasoning, validation_llms_helm],
    [split_reasoning, split_helm],
)
average_win_rates_all.sort_values(ascending=False)

### KindsOfReasoning and HELM-Lite separately

In [None]:
# Reasoning dataset only, all of its splits; displayed best-first.
average_win_rates_reasoning = compute_average_win_rate_from_computed_values(
    win_rates_no_baseline,
    ["reasoning"],
    [validation_llms_reasoning],
    [split_reasoning],
)
average_win_rates_reasoning.sort_values(ascending=False)

In [None]:
# HELM dataset only, all of its splits; displayed best-first.
average_win_rates_helm = compute_average_win_rate_from_computed_values(
    win_rates_no_baseline,
    ["helm"],
    [validation_llms_helm],
    [split_helm],
)
average_win_rates_helm.sort_values(ascending=False)

### Random and OOD split separately

Considering the random split in the selection of the best assessor-selector combination and then seeing how the selection performs on the OOD split is probably the most interesting (and representative of real world) setup.


In [None]:
# Both datasets, but only the first entry of each split list (the random
# split); displayed best-first.
average_win_rates_random_split = compute_average_win_rate_from_computed_values(
    win_rates_no_baseline,
    ["reasoning", "helm"],
    [validation_llms_reasoning, validation_llms_helm],
    [[split_reasoning[0]], [split_helm[0]]],
)
average_win_rates_random_split.sort_values(ascending=False)

In [None]:
# Both datasets, every split except the first one of each list (i.e. the OOD
# splits); displayed best-first.
average_win_rates_ood_splits = compute_average_win_rate_from_computed_values(
    win_rates_no_baseline,
    ["reasoning", "helm"],
    [validation_llms_reasoning, validation_llms_helm],
    [split_reasoning[1:], split_helm[1:]],
)
average_win_rates_ood_splits.sort_values(ascending=False)

## Plots - all together

In [None]:
# Raw per-LLM results: generic (reference-based) and specific assessors, for
# both datasets.
(
    results_reference_reasoning,
    results_reference_helm,
    results_all_reasoning,
    results_all_helm,
) = [
    load_with_conditions(path)
    for path in (
        "results/generic_assessors_reasoning.pkl",
        "results/generic_assessors_helm.pkl",
        "results/specific_assessors_reasoning.pkl",
        "results/specific_assessors_helm.pkl",
    )
]

# Best-combination tables saved earlier in this notebook.
(
    best_method_no_baseline_df,
    best_method_baseline_reference_only_df,
    best_method_baseline_all_train_llms_df,
    best_method_random_selector_df,
) = [
    load_with_conditions(path)
    for path in (
        "results/generic_assessors_best_method_no_baseline_df.csv",
        "results/generic_assessors_best_method_baseline_reference_only_df.csv",
        "results/generic_assessors_best_method_baseline_all_train_llms_df.csv",
        "results/generic_assessors_best_method_random_selector_df.csv",
    )
]

In [None]:
best_method_no_baseline_df

Convert the one above to a latex table:

In [None]:
# Display names used in the LaTeX tables for the assessor setups ...
dict_assessor = {
    "concatenate_ref_similarity": "Similarity",
    "concatenate_ref_similarity_partial_interaction": "Similarity with interaction",
    "concatenate_ref_success": "Success",
}

# ... and for the base classifiers.
dict_classifier = {
    "xgboost": "XGBoost",
    "logistic_regression_l1_c=0.1": "Logistic Regression L1 C=0.1",
    "logistic_regression_l1_c=1": "Logistic Regression L1 C=1",
}


In [None]:
# make sure the output directory exists before writing the table
os.makedirs("tab", exist_ok=True)

latex_df = best_method_no_baseline_df.copy()
# format data
# The in-distribution split can show up as the boolean False or as the
# strings "False"/"false" (presumably depending on whether the table was just
# computed, reloaded from CSV, or already normalised by the plotting cell), so
# it is matched case-insensitively rather than with a plain str.replace.
latex_df["split"] = latex_df["split"].apply(lambda x: "In-distribution" if str(x).lower() == "false" else str(x).replace("_", " "))
latex_df["dataset"] = latex_df["dataset"].apply(lambda x: x.replace("reasoning", "KindsOfReasoning").replace("helm", "HELM-Lite"))
# a KeyError here means a selected assessor/classifier has no display name yet
latex_df["assessor"] = latex_df["assessor"].apply(lambda x: dict_assessor[x])
latex_df["classifier"] = latex_df["classifier"].apply(lambda x: dict_classifier[x])
# capitalize() lowercases everything after the first letter, so the acronyms are restored afterwards
latex_df["selector"] = latex_df["selector"].apply(lambda x: x.replace("_", " ").capitalize().replace("irt", "IRT").replace("llm", "LLM"))
# create hierarchical index
latex_df = latex_df.set_index(['dataset', 'split'])
# NOTE(review): on a MultiIndex this may not clear the level names - confirm against the generated table
latex_df.index.name = ""
# one column per (dataset, split); the win rate is not reported in the table
latex_df = latex_df.T.drop("average_win_rate")
latex_df.style.to_latex(buf="tab/best_combinations.txt", hrules=True)

Do two separate latex tables, one per dataset

In [None]:
# make sure the output directory exists before writing the tables
os.makedirs("tab", exist_ok=True)

for dataset in ["reasoning", "helm"]:
    latex_df = best_method_no_baseline_df[best_method_no_baseline_df["dataset"] == dataset].copy()
    # format data
    # The in-distribution split can show up as the boolean False or as the
    # strings "False"/"false", so it is matched case-insensitively rather
    # than with a plain str.replace.
    latex_df["split"] = latex_df["split"].apply(lambda x: "In-distribution" if str(x).lower() == "false" else str(x).replace("_", " "))
    # a KeyError here means a selected assessor/classifier has no display name yet
    latex_df["assessor"] = latex_df["assessor"].apply(lambda x: dict_assessor[x])
    latex_df["classifier"] = latex_df["classifier"].apply(lambda x: dict_classifier[x])
    # capitalize() lowercases everything after the first letter, so the acronyms are restored afterwards
    latex_df["selector"] = latex_df["selector"].apply(lambda x: x.replace("_", " ").capitalize().replace("irt", "IRT").replace("llm", "LLM"))
    latex_df = latex_df.set_index(['split'])
    latex_df.index.name = ""
    # one column per split; win rate and dataset name are not reported
    latex_df = latex_df.T.drop(["average_win_rate", "dataset"])
    # write both layouts: splits as columns, and splits as rows
    latex_df.style.to_latex(buf=f"tab/best_combinations_{dataset}.txt", hrules=True)
    latex_df.T.style.to_latex(buf=f"tab/best_combinations_{dataset}_transposed.txt", hrules=True)
    


In [None]:
latex_df.T

In [None]:
best_method_random_selector_df

In [None]:
best_method_baseline_reference_only_df

In [None]:
best_method_baseline_all_train_llms_df

The combination selected with the random-only selector is, in terms of win rate over the validation LLMs, always worse than the no-baseline one. However, on some of the test LLMs the AUC is worse with the latter.

`best_method_baseline_reference_only_df` contains the best classifier and selector (with the corresponding average win rate) for the reference_only baseline. 

I also need to add 
1) the best case where random instances are selected
2) the results with the train_all baseline -> how to select the best combination there? 

In [None]:
results_all_reasoning.columns

In [None]:
results_all_reasoning.shape

In [None]:
results_reference_reasoning["subset"].unique()

In [None]:
results_reference_reasoning.columns

In [None]:
# Split identifiers for each dataset; False is the first entry and is the one
# labelled "Random" in the plots.
split_reasoning = [
    False,
    "OOD_1",
    "OOD_2",
    "OOD_3",
    "OOD_4",
]
split_helm = [
    False,
    "OOD_1",
    "OOD_2",
    "OOD_3",
]

# Panel titles, position-aligned with the split lists above.
split_reasoning_plot = [
    "Random",
    "OOD 1",
    "OOD 2",
    "OOD 3",
    "OOD 4",
]
split_helm_plot = [
    "Random",
    "OOD 1",
    "OOD 2",
    "OOD 3",
]

Notice that I now have a best combination for each split and dataset, as I am using the validation data and LLMs to select them.

In [None]:
best_method_no_baseline_df

In [None]:
best_method_random_selector_df

In [None]:
# Inspect one specific configuration of the reasoning results: all five
# conditions are combined in a single boolean mask.
results_reference_reasoning_1 = results_reference_reasoning[
    (results_reference_reasoning["split"] == "OOD_1")
    & (results_reference_reasoning["selector"] == "random")
    & (results_reference_reasoning["assessor"] == "concatenate_ref_similarity_partial_interaction")
    & (results_reference_reasoning["predictive_method"] == "xgboost")
    & (results_reference_reasoning["subset"] == "test")
]
results_reference_reasoning_1

Create all plots (this is what I reported in the paper):

In [None]:
# Produces one figure per dataset (and per baselines / no-baselines variant):
# one panel per split, showing the test AUROC of every test LLM for the
# specific assessor and for the selected generic-assessor combinations.

# the figures are written to "fig/", which must exist before savefig is called
os.makedirs("fig", exist_ok=True)

# plot settings (independent of the loops below)
feature = "openai"
size = 4
other_dim_depends_on_n_LLM = False
add_fig_title = True

# Normalise the in-distribution split label of the best-method tables to the
# string "false", which is the value the reference results are filtered on
# below. str() makes this work whether the label is the boolean False or the
# strings "False"/"false". Done once, as it does not depend on the loops.
for best_method_df in (best_method_no_baseline_df, best_method_baseline_reference_only_df,
                       best_method_baseline_all_train_llms_df, best_method_random_selector_df):
    best_method_df["split"] = best_method_df["split"].apply(lambda x: "false" if str(x).lower() == "false" else x)

for orientation in ["h"]:  # add "v" to also produce the vertical layout
    for plot_with_baselines in [True, False]:

        list_best_method_df = [best_method_no_baseline_df, best_method_random_selector_df, best_method_baseline_reference_only_df, best_method_baseline_all_train_llms_df]
        list_names = ["Reference", "Random selector", "Reference only", "All train data", ]

        if not plot_with_baselines:
            # keep only the generic assessor ("Reference")
            list_best_method_df = list_best_method_df[:1]
            list_names = list_names[:1]

        # for both reasoning and helm, put the results with the best assessor and selector and those with the full dataset in a single dataframe.

        for results_reference, results_all, splits_list, splits_list_plot, dataset, dataset_name in zip([results_reference_helm, results_reference_reasoning], [results_all_helm, results_all_reasoning], [split_helm, split_reasoning], [split_helm_plot, split_reasoning_plot], ["HELM-Lite", "KindsOfReasoning"], ["helm", "reasoning"]):

            # only keep the test results in the reference setup:
            results_reference = results_reference[results_reference["subset"] == "test"]

            # rename the "AUROC" column into "AUROC_test"
            results_reference = results_reference.rename(columns={"AUROC": "AUROC_test"})

            # filter the _all to keep only the llms in the reference
            results_all = results_all[results_all["llm"].isin(results_reference["llm"].unique())]

            # only keep those where features=feature
            results_all = results_all[results_all["features"] == feature]

            # select the best predictive method (highest validation AUROC) and drop duplicates, for the "all" case
            results_all_best_predictive = results_all.groupby(["llm", "features", "split"]).apply(
                lambda x: x[x.AUROC_val == x.AUROC_val.max()]).reset_index(drop=True).drop_duplicates(subset=["llm", "features", "split"])

            # label these rows; assign returns a new frame, so no slice is written to
            results_all_best_predictive = results_all_best_predictive.assign(type="Specific assessor")

            # frames to stack for this dataset: the specific assessor first,
            # then one selection per (split, method) in loop order
            frames_to_plot = [results_all_best_predictive]

            for split in splits_list:
                if not split:
                    split = "false"

                results_reference_split = results_reference[results_reference["split"] == split]

                for best_method_df, name in zip(list_best_method_df, list_names):
                    # look up the selected combination for this dataset and split (once)
                    best_row = best_method_df[(best_method_df["dataset"] == dataset_name) & (best_method_df["split"] == split)]
                    if best_row.empty:
                        raise IndexError(f"No best method stored for dataset={dataset_name!r}, split={split!r} ({name}).")
                    best_assessor, best_selector, best_classifier = best_row[["assessor", "selector", "classifier"]].iloc[0]

                    # filter for the best assessor, selector and classifier
                    results_reference_split_best = results_reference_split[
                        (results_reference_split["assessor"] == best_assessor) & (results_reference_split["selector"] == best_selector) & (results_reference_split["predictive_method"] == best_classifier)]

                    # drop the "assessor", "selector" and "classifier" columns and
                    # tag the rows with the method name (on a new frame, so the
                    # filtered slice is never mutated)
                    results_reference_split_best = results_reference_split_best.drop(columns=["assessor", "selector", "predictive_method"]).assign(type=name)

                    frames_to_plot.append(results_reference_split_best)

            # concatenate everything at once
            total_df_dataset = pd.concat(frames_to_plot)

            n_splits = len(splits_list)
            # one panel per split: side by side for "h", stacked for "v"
            rescaling_factor = 8 / len(total_df_dataset.llm.unique()) if other_dim_depends_on_n_LLM else 1
            if orientation == "h":
                fig, axes = plt.subplots(1, n_splits, figsize=(size / 4 * 1.8 * n_splits, size / rescaling_factor), sharey=True, sharex=True, squeeze=False)
            else:
                fig, axes = plt.subplots(n_splits, 1, figsize=(size / rescaling_factor, size / 4 * 7 / 4 * n_splits), sharex=True, sharey=True, squeeze=False)
            # squeeze=False always returns a 2D array; flatten it so axes[i] works for any number of splits
            axes = axes.ravel()

            # make sure the split name is uniform (back to False, as in splits_list)
            total_df_dataset["split"] = total_df_dataset["split"].apply(lambda x: False if x == "false" else x)

            # fix the hue order so every panel (and the legend) maps methods to the same colors,
            # even if a method has no rows for some split
            categories = total_df_dataset['type'].unique()
            hue_order = list(categories)

            for i, split in enumerate(splits_list):
                df_split = total_df_dataset[total_df_dataset["split"] == split]
                if orientation == "h":
                    sns.barplot(data=df_split, x="AUROC_test", y="llm", hue="type", hue_order=hue_order, ax=axes[i], legend=False, orient=orientation)
                else:
                    sns.barplot(data=df_split, x="llm", y="AUROC_test", hue="type", hue_order=hue_order, ax=axes[i], legend=False, orient=orientation)
                axes[i].set_title(f"Split: {splits_list_plot[i]}")
                if orientation == "h":
                    axes[i].set_xlim(xmin=0.4)
                    axes[i].set_xlabel("AUC")
                    # set ticks and labels at 0.5 and 0.75
                    axes[i].set_xticks([0.5, 0.75])
                else:
                    axes[i].set_ylim(ymin=0.4)
                    axes[i].set_ylabel("AUC")
                    # set ticks and labels at 0.5 and 0.75
                    axes[i].set_yticks([0.5, 0.75])
            # rotate ticks
            if orientation == "h":
                axes[0].tick_params(axis="y", labelrotation=0)
                axes[0].set_ylabel("Test LLM")
            else:
                axes[-1].tick_params(axis="x", labelrotation=90)
                axes[-1].set_xlabel("Test LLM")

            if add_fig_title:
                fig.suptitle(f"{dataset}")

            # add the legend only to the HELM-Lite figure:
            if dataset == "HELM-Lite":
                # Get the current color palette
                current_palette = sns.color_palette()

                # Create a dictionary that maps each category to a color
                # (same order as hue_order, hence same colors as the bars)
                color_dict = dict(zip(categories, current_palette))

                legend_elements = [mpatches.Patch(color=color_dict['Specific assessor'], label='Specific assessor'),
                                   mpatches.Patch(color=color_dict['Reference'], label='Generic assessor')]
                if plot_with_baselines:
                    legend_elements += [mpatches.Patch(color=color_dict['Random selector'], label='Random selector'),
                                        mpatches.Patch(color=color_dict['Reference only'], label='Reference only'),
                                        mpatches.Patch(color=color_dict['All train data'], label='All train data')]
                # put the legend below the panels if "h"
                if orientation == "h" and other_dim_depends_on_n_LLM:
                    fig.subplots_adjust(bottom=0.2)
                    fig.legend(loc="upper center", bbox_to_anchor=(0.5, -0.1), ncol=2,
                               handles=legend_elements, title="Method")
                else:
                    fig.subplots_adjust(right=0.9)
                    fig.legend(loc="center left", bbox_to_anchor=(0.91, 0.5), ncol=1, handles=legend_elements, title="Method")

            # savefig
            fig.savefig(f"fig/Fig_2_{dataset}_{orientation}_{'baselines' if plot_with_baselines else 'no_baselines'}.pdf", bbox_inches='tight')
            plt.show()
            # release the figure: several are created in this loop
            plt.close(fig)
            