This notebook contains experiments with the generic assessor in which the number of reference instances is varied, to understand its impact on the performance of the classifiers.

Note that running this notebook may take a long time and require a substantial amount of RAM.

In [None]:
!python --version

In [None]:
import os, json
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.results_loaders import sort_models_order, load_helm_lite, load_reasoning
from src.utils import load_with_conditions, save_dataframe
from src.classification_utils import predictive_method_list
from src.reference_benchmark import SampleSelector, AssessorFromReference

# enable reloading of modules
%load_ext autoreload
%autoreload 2

##  Experiment design
Take a fixed method and test the various predictive methods by increasing the number of reference points.

In [None]:
# Number of leading embedding dimensions kept when the "openai_embeddings_large" vectors are
# truncated into the "openai_embeddings_subset" column by run_selectors / run_assessor.
n_embeddings_truncate = 1024

In [None]:
def _check_skip(res_df, feature_name, split, selector_name, n_ref, assessor_name, pred_method_name):
    """pred_method_name is the name of the base classifier, while assessor_name is the name of the method that builds on the base classifier using the reference dataset
    
    This checks if the experiment has already been done and should be skipped (notice that it does not check for each llm indepenendently as they are all done together"""
    if len(res_df) > 0 and len(res_df[(
                                              res_df["features"] == feature_name) & (res_df["split"] == split) &
                                      (res_df["selector"] == selector_name) & (
                                              res_df["assessor"] == assessor_name) & (
                                              res_df["predictive_method"] == pred_method_name) & (
                                              res_df["n_ref"] == n_ref)]) > 0:
        print(f"Skipping {feature_name}, {split},  {selector_name}, {assessor_name}, {pred_method_name}, {n_ref}")
        return True
    else:
        print(f"Doing {feature_name}, {split},  {selector_name}, {assessor_name}, {pred_method_name}, {n_ref}")
        return False


def _concat_and_save(res_df, prediction_evaluations_validation, prediction_evaluations_test, feature_name, split, selector_name, n_ref, assessor_name,
                     pred_method_name, filename):


    for prediction_evaluations in [prediction_evaluations_validation, prediction_evaluations_test]:
        # prediction_evaluations is a list of dictionaries: 
        # {
        #     "llm": llm,
        #     "BrierScore": BrierScore,
        #     "Calibration": Calibration,
        #     "Refinement": Refinement,
        #     "AUROC": roc_auc,
        #     "Accuracy": accuracy,
        #     "Predictions": y_pred,
        #     "subset": subset
        # }
        # transform into dataframe and add all the other fields
        new_df = pd.DataFrame(prediction_evaluations)
        new_df["features"] = feature_name
        new_df["split"] = split
        new_df["selector"] = selector_name
        new_df["assessor"] = assessor_name
        new_df["n_ref"] = n_ref
        new_df["predictive_method"] = pred_method_name
    
        # now concatenate to the previous one: 
        res_df = pd.concat([res_df, new_df])    # prediction_evaluations is a list of dictionaries: 
    
    # save the dataframe
    save_dataframe(filename, res_df)

    return res_df


def evaluate_and_update(res_df, feature_name, split, selector_name, n_ref, assessor_name_original, assessor_name_results,
                        pred_method_name,
                        assessor, predictive_method,
                        filename, **kwargs):
    """Run one assessor/classifier combination unless it is already stored in ``res_df``.

    ``assessor_name_original`` is the method name passed to ``assessor.predict``, while
    ``assessor_name_results`` is the label under which the results are stored and looked up.
    Extra keyword arguments are forwarded to ``assessor.predict``.

    Returns ``res_df`` unchanged when the configuration is skipped. Otherwise returns the
    frame extended with the validation and test evaluations, which has also been saved to
    ``filename``.
    """
    if _check_skip(res_df, feature_name, split, selector_name, n_ref, assessor_name_results, pred_method_name):
        return res_df

    # predict returns a pair: validation results first, test results second
    predictions_val, predictions_test = assessor.predict(assessor_name_original, classifier=predictive_method,
                                                         **kwargs)
    evaluations_val = assessor.evaluate_predictions(predictions_val, subset="validation")
    evaluations_test = assessor.evaluate_predictions(predictions_test, subset="test")

    return _concat_and_save(res_df, evaluations_val, evaluations_test, feature_name, split, selector_name, n_ref,
                            assessor_name_results, pred_method_name, filename)



In [None]:
def run_selectors(split_name, reference_datasets_dict, reference_datasets_dict_name, selector_methods, n_reference_list,
                  train_df, train_llms, n_embeddings_truncate, irt_file_prefix):
    """Select reference instances for every selector and reference-set size of one split.

    The selected row indices are cached in ``reference_datasets_dict`` under
    ``[split][selector_name][str(n_reference)]``. The whole dictionary is written as JSON
    to ``reference_datasets_dict_name`` after each (selector, size) pair, so an interrupted
    run can be resumed.

    Parameters
    ----------
    split_name : the split identifier; the boolean ``False`` is stored under the key
        ``"false"`` (see the note in the body).
    reference_datasets_dict : nested dict used as cache; updated in place and returned.
    reference_datasets_dict_name : path of the JSON file the cache is dumped to.
    selector_methods : names of the selection methods passed to ``SampleSelector.select``.
    n_reference_list : reference-set sizes to try.
    train_df : training dataframe; gains an "openai_embeddings_subset" column (in place)
        holding the first ``n_embeddings_truncate`` entries of "openai_embeddings_large".
    train_llms : passed through to ``SampleSelector``.
    n_embeddings_truncate : number of leading embedding entries to keep.
    irt_file_prefix : used to build the ``irt_path`` for selectors whose name contains "IRT".

    Returns
    -------
    The updated ``reference_datasets_dict``.
    """
    print("split name: ", split_name)

    # JSON object keys are always strings: json.dump writes the key False as "false", so a
    # dictionary reloaded from disk holds "false". Normalise here (as run_assessor does) so
    # that a reloaded cache is actually hit and no duplicate "false" key is written.
    if isinstance(split_name, bool) and not split_name:
        split_name = "false"

    split_cache = reference_datasets_dict.setdefault(split_name, {})

    # truncate the embeddings
    train_df["openai_embeddings_subset"] = train_df["openai_embeddings_large"].apply(
        lambda x: x[:n_embeddings_truncate])

    selector = SampleSelector(train_df, "openai_embeddings_subset", train_llms)

    for selector_name in selector_methods:
        selector_cache = split_cache.setdefault(selector_name, {})

        for n_reference in n_reference_list:
            # Entries are stored under string keys (and JSON round-trips them as strings),
            # so the lookup has to use the string form as well.
            n_reference_key = str(n_reference)

            if n_reference_key in selector_cache:
                print(f"{selector_name} with n_reference {n_reference} already computed")
                selected_df_indeces = selector_cache[n_reference_key]
            else:
                print(f"Trying {selector_name} with n_reference {n_reference}")
                if "IRT" in selector_name:
                    selected_df = selector.select(selector_name, n_selected=n_reference,
                                                  irt_path=f'data_irt/{irt_file_prefix}_irtmodel/')
                else:
                    selected_df = selector.select(selector_name, n_selected=n_reference)

                if selected_df is None:
                    print(f"Skipping {selector_name} as it did not return any samples")
                    continue

                selected_df_indeces = list(selected_df.index)
                selector_cache[n_reference_key] = selected_df_indeces

            print(len(selected_df_indeces))

            # save the dict at each iteration
            with open(reference_datasets_dict_name, "w") as f:
                json.dump(reference_datasets_dict, f)

    return reference_datasets_dict
    

In [None]:
def run_assessor(train_df, validation_df, test_df, train_llms, validation_llms, test_llms, reference_datasets_dict, results_df, results_filename,
                 split_name, selector_methods, n_reference_list, assessor_methods_list, predictive_method_list,
                 n_embeddings_truncate=1000):
    """Fit and evaluate every assessor/classifier pair for each stored reference set.

    For every selector in ``selector_methods`` and size in ``n_reference_list`` present in
    ``reference_datasets_dict[split_name]``, the reference rows are taken from ``train_df``
    and an ``AssessorFromReference`` is built. All combinations of ``assessor_methods_list``
    (tuples of results name, original name, kwargs) and ``predictive_method_list`` (tuples of
    classifier, kwargs, name) are then passed to ``evaluate_and_update``. Selectors or sizes
    missing from the dictionary are reported and skipped.

    The three dataframes are modified in place: each gains an "openai_embeddings_subset"
    column with the first ``n_embeddings_truncate`` entries of "openai_embeddings_large".
    Note that the default of ``n_embeddings_truncate`` here is 1000.

    Returns the updated ``results_df``.
    """
    # truncate the embeddings of all three frames
    for frame in (train_df, validation_df, test_df):
        frame["openai_embeddings_subset"] = frame["openai_embeddings_large"].apply(
            lambda embedding: embedding[:n_embeddings_truncate])

    # the boolean False split is looked up under the string key "false"
    if split_name is False:
        split_name = "false"

    for selector_name in selector_methods:
        if selector_name not in reference_datasets_dict[split_name]:
            print(f"Skipping {selector_name} as it was not computed")
            continue
        references_by_size = reference_datasets_dict[split_name][selector_name]

        for n_ref in n_reference_list:
            size_key = str(n_ref)
            if size_key not in references_by_size:
                print(f"Skipping {selector_name} with n_ref {n_ref} as it was not computed")
                continue

            # rebuild the reference dataframe from the stored row indices
            reference_df = train_df.loc[references_by_size[size_key]]

            print("selector name: ", selector_name)

            assessor = AssessorFromReference(reference_df, train_df, validation_df, test_df,
                                             "openai_embeddings_subset", train_llms, validation_llms,
                                             test_llms)

            for assessor_name_results, assessor_name_original, assessor_kwargs in assessor_methods_list:
                for predictive_method, kwargs, pred_method_name in predictive_method_list:
                    # assessor-specific kwargs take precedence over the classifier's kwargs
                    merged_kwargs = dict(kwargs)
                    merged_kwargs.update(assessor_kwargs)
                    results_df = evaluate_and_update(results_df, "openai", split_name,
                                                     selector_name, n_ref, assessor_name_original,
                                                     assessor_name_results, pred_method_name,
                                                     assessor, predictive_method, filename=results_filename,
                                                     **merged_kwargs)

    return results_df

In [None]:
def plot_best_predictive_method_n_ref_line(all_datasets_assessors, feature="openai", selector="clustering_embeddings",
                                           metric="AUROC", use_sort_order=True, hide_legend=True, scale_axis=False,
                                           col_order=None, considered_predictive_method="xgboost", subset="test", font_scale=1, legend_text_size=None, height=5, aspect=1, legend_loc=(1.03, 0.6), panel_space=None):
    """Plot ``metric`` against the number of reference instances, one panel per split.

    The results are restricted to one ``subset``, one predictive method, one feature set and
    one selector. A seaborn point plot is then drawn with ``n_ref`` on the x axis, one
    column per split, and one hue per "llm, assessor" combination.

    Parameters
    ----------
    all_datasets_assessors : results dataframe with (at least) the columns "features",
        "subset", "predictive_method", "selector", "llm", "assessor", "split", "n_ref" and
        ``metric``. It is not modified.
    feature, selector, considered_predictive_method, subset : values used to filter rows.
    metric : name of the column plotted on the y axis.
    use_sort_order : if True, rows are ordered by the position of the LLM in
        ``sort_models_order``; otherwise they are sorted by LLM name.
    hide_legend : if True, no legend is drawn and all legend options are ignored.
    scale_axis : if True, the y axis of every panel is limited to [0.5, 1].
    col_order, height, aspect : forwarded to ``sns.catplot``.
    font_scale : forwarded to ``sns.set_context``.
    legend_text_size : font size of the legend entries (the title uses this size plus one).
    legend_loc : anchor point of the legend.
    panel_space : horizontal spacing between panels (``wspace``).

    Raises
    ------
    ValueError
        If ``feature`` does not occur in the "features" column.
    """
    if feature not in all_datasets_assessors["features"].unique():
        raise ValueError(f"Feature {feature} not in the dataset")

    # keep only the requested subset, predictive method, feature set and selector
    keep = (
        (all_datasets_assessors["subset"] == subset)
        & (all_datasets_assessors["predictive_method"] == considered_predictive_method)
        & (all_datasets_assessors["features"] == feature)
        & (all_datasets_assessors["selector"] == selector)
    )
    # explicit copy: the columns added below must never touch the caller's dataframe
    plot_df = all_datasets_assessors[keep].copy()

    # sort using the order of the models
    plot_df = plot_df.sort_values(
        by=["llm"],
        key=(lambda llms: llms.apply(lambda name: sort_models_order.index(name))) if use_sort_order else None)

    # rename the splits to their display labels
    split_labels = {
        "False": "In distribution",
        "false": "In distribution",
        "OOD_1": "OOD 1",
        "OOD_2": "OOD 2",
        "OOD_3": "OOD 3",
        "OOD_4": "OOD 4",
    }
    plot_df["split"] = plot_df["split"].astype(str).replace(split_labels)

    # concatenate llm and assessor to obtain one hue level per combination
    plot_df["llm_assessor"] = plot_df["llm"] + ", " + plot_df["assessor"]

    sns.set_context("notebook", font_scale=font_scale)

    catplot = sns.catplot(data=plot_df, x='n_ref', y=metric, hue='llm_assessor', col="split",
                          kind="point", legend=not hide_legend,
                          col_order=col_order, height=height, aspect=aspect)

    catplot.fig.subplots_adjust(wspace=panel_space)

    for ax in catplot.axes.flat:
        if scale_axis:
            ax.set_ylim(0.5, 1)
        # thinner lines and smaller points
        for line in ax.lines:
            line.set_linewidth(1)
            line.set_markersize(2)
        # rotate x labels
        ax.tick_params(axis="x", labelrotation=90)

    # With legend=False the grid has no legend object, so only style it when it exists.
    legend = catplot._legend
    if legend is not None:
        legend.set_title("Features")
        if legend_text_size is not None:
            plt.setp(legend.get_texts(), fontsize=f'{legend_text_size}')  # for legend text
            plt.setp(legend.get_title(), fontsize=f'{legend_text_size+1}')  # for legend title

        # Move legend to an appropriate position
        legend.set_bbox_to_anchor(legend_loc)
        legend.set_frame_on(False)  # remove the legend frame for a cleaner look
 

In [None]:
# selector_methods = ["random", "clustering_embeddings", "clustering_LLM_success", "clustering_IRT_values", "factor_analysis_embeddings", "factor_analysis_LLM_success_samples", "factor_analysis_LLM_success_features", "factor_analysis_IRT_values"]
# we will consider here a single selector and predictive method:
selector_methods = ["clustering_embeddings"]

In [None]:
# Each entry is a tuple (name used in the results table, name of the assessor method that is
# actually run, extra keyword arguments for that method); run_assessor unpacks it in this order.
assessor_methods_list = [
    ("baseline_reference_only", "reference_only", {}),
    ("calibrate_general_classifier", "calibrate_general_classifier", {}),
    # ("baseline_all_train_llms", "calibrate_general_classifier", {"calibration_step": False}),  # does not depend on the reference set, so can skip this
    ("concatenate_ref_success", "concatenate_ref_success", {}),
    ("concatenate_ref_similarity", "concatenate_ref_success", {"features": ["cosine"]}),
    ("concatenate_ref_similarity_partial_interaction", "concatenate_ref_success", {"features":["cosine"], "interaction_terms":"partial"}),
    # ("concatenate_ref_similarity_full_interaction", "concatenate_ref_success", {"features":["cosine"], "interaction_terms":"full"}),  # disabled: this fails because it exhausts the RAM
]

In [None]:
# Reference-set sizes to sweep over (number of reference instances requested from the selector).
n_reference_list = [1, 3, 10, 30, 100, 300, 1000]

### KindsOfReasoning


In [None]:
results_filename = "results/generic_assessors_reasoning_increasing_n_ref.pkl"

In [None]:
overwrite_res = False

n_ref_df = load_with_conditions(results_filename, overwrite_res)

In [None]:
from src.utils import llms_reasoning, train_llms_reasoning, validation_llms_reasoning, test_llms_reasoning
llms, train_llms, validation_llms, test_llms = llms_reasoning, train_llms_reasoning, validation_llms_reasoning, test_llms_reasoning

I will first obtain the various reference dfs with the different selectors, so that I do not need to repeat that many times later on.

In [None]:
reference_datasets_dict_name = "results/generic_assessors_dict_reasoning_increasing_n_ref.json"

In [None]:
if not os.path.exists(reference_datasets_dict_name) or overwrite_res:
    reference_datasets_dict = {}
else:
    with open(reference_datasets_dict_name, "r") as f:
        reference_datasets_dict = json.load(f)

The next cell runs the selector steps

In [None]:
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    train_df, validation_df, test_df = load_reasoning(llms, ["openai_embeddings"], ood_split=split, base_path="../results/kindsofreasoning_embeddings")
    
    reference_datasets_dict = run_selectors(split, reference_datasets_dict, reference_datasets_dict_name, selector_methods, n_reference_list,
                                            train_df,  train_llms, n_embeddings_truncate, irt_file_prefix=f"reasoning_{split}")

Now fit the classifiers on all splits, all reference datasets, all predictive frameworks, and all base classifiers.  

In [None]:
# load the reference dictionary
with open(reference_datasets_dict_name, "r") as f:
    reference_datasets_dict = json.load(f)

In [None]:
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    train_df, validation_df, test_df = load_reasoning(llms, ["openai_embeddings"], ood_split=split, base_path="../results/kindsofreasoning_embeddings")

    n_ref_df = run_assessor(train_df, validation_df, test_df, train_llms, validation_llms, test_llms, reference_datasets_dict, n_ref_df, results_filename,
                 split, selector_methods, n_reference_list, assessor_methods_list, predictive_method_list, n_embeddings_truncate)

#### Plots


In [None]:
results_filename = "results/generic_assessors_reasoning_increasing_n_ref.pkl"
all_datasets_assessors = load_with_conditions(results_filename)

In [None]:
# Display names for the assessors shown in the plots; assessors not listed keep their name.
dict_assessor = {
    'concatenate_ref_similarity': 'Similarity',
    'concatenate_ref_similarity_partial_interaction': 'Similarity with interaction',
    'concatenate_ref_success': 'Embeddings',
}

all_datasets_assessors["assessor"] = all_datasets_assessors["assessor"].map(
    lambda name: dict_assessor.get(name, name))

In [None]:
all_datasets_assessors.shape

In [None]:
all_datasets_assessors.columns

In [None]:
for column_name in ['features', 'split', 'selector', 'assessor', 'n_ref', 'predictive_method', 'llm', 'subset']:
    print(column_name)
    print(all_datasets_assessors[column_name].unique())
    print()

In [None]:
all_datasets_assessors_val = all_datasets_assessors[all_datasets_assessors["subset"] == "validation"]
all_datasets_assessors_test = all_datasets_assessors[all_datasets_assessors["subset"] == "test"]

In [None]:
all_datasets_assessors_val.llm.unique()

Remove the baselines from the plot:

In [None]:
all_datasets_assessor_plot = all_datasets_assessors[~all_datasets_assessors["assessor"].isin(["baseline_reference_only", "calibrate_general_classifier"])]

In [None]:
plot_best_predictive_method_n_ref_line(all_datasets_assessor_plot, use_sort_order=False, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"], hide_legend=False, subset="test",legend_text_size=16, aspect=0.7, font_scale=1.5, panel_space=0.1, legend_loc=(1, 0.5))
plt.savefig("fig/reasoning_results_increasing_n_ref_test.pdf")

In [None]:
plot_best_predictive_method_n_ref_line(all_datasets_assessor_plot, use_sort_order=False, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"], hide_legend=False, subset="validation", legend_text_size=16, aspect=0.7, font_scale=1.5, panel_space=0.1, legend_loc=(1, 0.5))
plt.savefig("fig/reasoning_results_increasing_n_ref_val.pdf")

### HELM-Lite


In [None]:
# Same path convention as the KindsOfReasoning section, and the same file that the HELM-Lite
# plotting cell loads, so the results are written to the place they are later read from.
results_filename = "results/generic_assessors_helm_increasing_n_ref.pkl"

In [None]:
overwrite_res = False

n_ref_df = load_with_conditions(results_filename, overwrite_res)

In [None]:
from src.utils import llms_helm, train_llms_helm, validation_llms_helm, test_llms_helm
llms, train_llms, validation_llms, test_llms = llms_helm, train_llms_helm, validation_llms_helm, test_llms_helm

In [None]:
reference_datasets_dict_name = "results/generic_assessors_dict_helm_increasing_n_ref.json"

In [None]:
if not os.path.exists(reference_datasets_dict_name) or overwrite_res:
    reference_datasets_dict = {}
else:
    with open(reference_datasets_dict_name, "r") as f:
        reference_datasets_dict = json.load(f)

The next cell runs the selector steps

In [None]:
for split in [False, "OOD_1", "OOD_2", "OOD_3"]:
    train_df, validation_df, test_df = load_helm_lite(llms, ["openai_embeddings"], ood_split=split, base_path="../results/helm_lite_v1.0.0_embeddings")

    reference_datasets_dict = run_selectors(split, reference_datasets_dict, reference_datasets_dict_name, selector_methods, n_reference_list,
                                            train_df, train_llms, n_embeddings_truncate, irt_file_prefix=f"helm_{split}")

Now fit the classifiers on all splits, all reference datasets, all predictive frameworks, and all base classifiers.  

In [None]:
# load the reference dictionary
with open(reference_datasets_dict_name, "r") as f:
    reference_datasets_dict = json.load(f)

In [None]:
for split in [False, "OOD_1", "OOD_2", "OOD_3"]:
    train_df, validation_df, test_df = load_helm_lite(llms, ["openai_embeddings"], ood_split=split, base_path="../results/helm_lite_v1.0.0_embeddings")

    n_ref_df = run_assessor(train_df, validation_df, test_df, train_llms, validation_llms, test_llms, reference_datasets_dict, n_ref_df, results_filename,
                 split, selector_methods, n_reference_list, assessor_methods_list, predictive_method_list, n_embeddings_truncate)

#### Plots


In [None]:
results_filename = "results/generic_assessors_helm_increasing_n_ref.pkl"
all_datasets_assessors = load_with_conditions(results_filename)

In [None]:
# Display names for the assessors shown in the plots; assessors not listed keep their name.
dict_assessor = {
    'concatenate_ref_similarity': 'Similarity',
    'concatenate_ref_similarity_partial_interaction': 'Similarity with interaction',
    'concatenate_ref_success': 'Embeddings',
}

all_datasets_assessors["assessor"] = all_datasets_assessors["assessor"].map(
    lambda name: dict_assessor.get(name, name))

In [None]:
all_datasets_assessors.shape

In [None]:
all_datasets_assessors.columns

In [None]:
all_datasets_assessor_plot = all_datasets_assessors[~all_datasets_assessors["assessor"].isin(["baseline_reference_only", "calibrate_general_classifier"])]

In [None]:
plot_best_predictive_method_n_ref_line(all_datasets_assessor_plot, use_sort_order=False, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3"], hide_legend=False, subset="test", legend_text_size=16, aspect=0.7, font_scale=1.5, panel_space=0.1, legend_loc=(1, 0.5) )
plt.savefig("fig/helm_results_increasing_n_ref_test.pdf")

In [None]:
plot_best_predictive_method_n_ref_line(all_datasets_assessor_plot, use_sort_order=False, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3"], hide_legend=False, subset="validation", legend_text_size=16, aspect=0.7, font_scale=1.5, panel_space=0.1, legend_loc=(1, 0.5))
plt.savefig("fig/helm_results_increasing_n_ref_val.pdf")