Code for training assessors specific to each LLM and evaluating their performance.

Note that running this notebook may take a long time and require a substantial amount of RAM.

In [None]:
!python --version

In [None]:
from src.classification_utils import _evaluate_predictive_method_from_arrays

import os
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from src.results_loaders import sort_models_order, load_reasoning, load_helm_lite, ngram_vectorize_new, select_features
from src.classification_utils import evaluate_predictive_method, predictive_method_list
from src.utils import load_with_conditions, save_dataframe

# enable reloading of modules
%load_ext autoreload
%autoreload 2

In [15]:
# Create the output folders for results and figures.
# exist_ok=True makes each call a no-op when the folder is already there and
# avoids the race between checking for the folder and creating it.
os.makedirs("results", exist_ok=True)
os.makedirs("fig", exist_ok=True)


## Evaluate performance of a classifier for each LLM independently



In [None]:
def _check_skip(res_df, pred_method_name, feature_name, split, llm):
    """Report whether a configuration is already stored in ``res_df``.

    A configuration is the combination of predictive method, feature name,
    split and LLM. A message saying whether the configuration is skipped or
    computed is printed in both cases.

    Returns True when at least one matching row exists, False otherwise
    (including when ``res_df`` is empty).
    """
    already_done = False
    # the columns are only looked up when there are rows, so an empty dataframe without columns is fine
    if len(res_df) > 0:
        same_config = (
            (res_df["predictive_method"] == pred_method_name)
            & (res_df["features"] == feature_name)
            & (res_df["split"] == split)
            & (res_df["llm"] == llm)
        )
        already_done = bool(same_config.any())

    if already_done:
        print(f"Skipping {split}, {feature_name}, {pred_method_name} for {llm} because it is already in the dataframe")
    else:
        print(f"Doing {split}, {feature_name}, {pred_method_name} for {llm}")
    return already_done

def _concat_and_save(res_df, pred_method_name, feature_name, split, llm,
                     BrierScore_val, Calibration_val, Refinement_val, roc_auc_val,
                     BrierScore_test, Calibration_test, Refinement_test, roc_auc_test,
                     trained_method, filename):
    """Append one result row to ``res_df``, save the result to ``filename`` and return it.

    The row holds the configuration (predictive method, features, split, LLM),
    the validation and test metrics, and the trained method itself.
    """
    # the new row is built with index label 0, exactly like every previously appended row
    new_row = pd.DataFrame(
        {
            "predictive_method": pred_method_name,
            "features": feature_name,
            "split": split,
            "llm": llm,
            "BrierScore_val": BrierScore_val,
            "Calibration_val": Calibration_val,
            "Refinement_val": Refinement_val,
            "AUROC_val": roc_auc_val,
            "BrierScore_test": BrierScore_test,
            "Calibration_test": Calibration_test,
            "Refinement_test": Refinement_test,
            "AUROC_test": roc_auc_test,
            "trained_classifier": trained_method,
        },
        index=[0],
    )
    updated_df = pd.concat([res_df, new_row])

    # persist after every new row so that an interrupted run can be resumed
    save_dataframe(filename, updated_df)
    return updated_df

def evaluate_and_update(res_df, train_df, validation_df, test_df, features, predictive_method,
                        pred_method_name, feature_name, split, llm,
                        filename, **kwargs):
    """Evaluate one predictive method for one LLM unless that configuration is already in ``res_df``.

    ``evaluate_predictive_method`` is called twice with ``f"Success_{llm}"`` as the
    target: first on ``test_df``, asking for the trained method back, then on
    ``validation_df``, passing that trained method in. The metrics of both calls
    are appended to ``res_df`` and saved to ``filename``.

    Returns ``res_df`` unchanged when the configuration is skipped, otherwise
    the dataframe extended with the new row.
    """
    if _check_skip(res_df, pred_method_name, feature_name, split, llm):
        return res_df

    target_column = f"Success_{llm}"

    # test split first: this call also hands back the trained method
    test_outcome = evaluate_predictive_method(
        train_df, test_df, features, target_column,
        predictive_method=predictive_method,
        return_trained_method=True,
        **kwargs,
    )
    BrierScore_test, Calibration_test, Refinement_test, roc_auc_test, trained_method = test_outcome

    # validation split: reuse the trained method obtained above
    validation_outcome = evaluate_predictive_method(
        train_df, validation_df, features, target_column,
        predictive_method=predictive_method,
        trained_method=trained_method,
        **kwargs,
    )
    BrierScore_val, Calibration_val, Refinement_val, roc_auc_val = validation_outcome

    return _concat_and_save(res_df, pred_method_name, feature_name, split, llm,
                            BrierScore_val, Calibration_val, Refinement_val, roc_auc_val,
                            BrierScore_test, Calibration_test, Refinement_test, roc_auc_test,
                            trained_method, filename)


def evaluate_and_update_arrays(res_df,  X_train, train_labels, X_val, val_labels,
                               X_test, test_labels, predictive_method,
                               pred_method_name, feature_name, split, llm,
                               filename,
                               **kwargs):
    """Array-based counterpart of ``evaluate_and_update``.

    Takes feature matrices and label vectors for the train, validation and test
    partitions instead of dataframes. ``_evaluate_predictive_method_from_arrays``
    is called first on the test partition, asking for the trained method back,
    and then on the validation partition with that trained method passed in.
    The metrics of both calls are appended to ``res_df`` and saved to ``filename``.

    Returns ``res_df`` unchanged when the configuration is already present,
    otherwise the dataframe extended with the new row.
    """
    if _check_skip(res_df, pred_method_name, feature_name, split, llm):
        return res_df

    # test partition first: this call also hands back the trained method
    test_outcome = _evaluate_predictive_method_from_arrays(
        X_train, train_labels, X_test, test_labels,
        predictive_method=predictive_method,
        return_trained_method=True,
        **kwargs,
    )
    BrierScore_test, Calibration_test, Refinement_test, roc_auc_test, trained_method = test_outcome

    # validation partition: reuse the trained method obtained above
    validation_outcome = _evaluate_predictive_method_from_arrays(
        X_train, train_labels, X_val, val_labels,
        predictive_method=predictive_method,
        trained_method=trained_method,
        **kwargs,
    )
    BrierScore_val, Calibration_val, Refinement_val, roc_auc_val = validation_outcome

    return _concat_and_save(res_df, pred_method_name, feature_name, split, llm,
                            BrierScore_val, Calibration_val, Refinement_val, roc_auc_val,
                            BrierScore_test, Calibration_test, Refinement_test, roc_auc_test,
                            trained_method, filename)

In [None]:
def plot_best_predictive_method_per_feature(all_datasets_assessors, sort=False, set_axis_bounds=True, metric="AUROC_test", yaxixlabel=None, col_order=None, font_scale=1, legend_text_size=None, height=5, aspect=1, legend_loc=(1.03, 0.6), panel_space=None):
    """Draw a bar plot of ``metric`` per LLM, with one panel per split and one hue per feature type.

    For every (llm, features, split) group only the row(s) with the highest
    ``AUROC_val`` are kept, i.e. the predictive method is selected on the
    validation split; rows tied at the maximum are all kept.

    The function returns nothing; the figure is left as the current pyplot
    figure, so it can be saved afterwards with ``plt.savefig``.

    Parameters
    ----------
    all_datasets_assessors : DataFrame with at least the columns "llm",
        "features", "split", "AUROC_val" and the column named by ``metric``.
    sort : if True, LLMs are ordered by their position in ``sort_models_order``
        (an LLM missing from that list raises ValueError); otherwise they are
        sorted by name.
    set_axis_bounds : if True, the y axis of every panel is limited to [0.5, 1].
    metric : name of the column plotted on the y axis.
    yaxixlabel : optional y-axis label, applied to the first axes only
        (the parameter name is spelled this way; keep it for callers).
    col_order : order of the split panels, using the renamed split labels
        ("In distribution", "OOD 1", ...).
    font_scale : font scale passed to ``sns.set_context``.
    legend_text_size : optional font size of the legend entries; the legend
        title is one unit larger.
    height, aspect : panel size parameters forwarded to ``sns.catplot``.
    legend_loc : anchor passed to the legend's ``set_bbox_to_anchor``.
    panel_space : horizontal space between panels (``wspace``).
    """
    # Extract best method based on validation split (ties at the maximum AUROC_val are all kept)
    best_predictive_method_per_feature = all_datasets_assessors.groupby(["llm", "features", "split"]).apply(lambda x: x[x.AUROC_val == x.AUROC_val.max()]).reset_index(drop=True)
    
    # Sort using the order of the models
    if sort:
        best_predictive_method_per_feature = best_predictive_method_per_feature.sort_values(by=["llm"], key=lambda x: x.apply(lambda y: sort_models_order.index(y)))
    else:
        best_predictive_method_per_feature = best_predictive_method_per_feature.sort_values(by=["llm"])
    
    # Turn the raw split identifiers (False, "OOD_1", ...) into the labels shown as panel titles
    best_predictive_method_per_feature["split"] = best_predictive_method_per_feature["split"].astype(str).replace("False", "In distribution").replace("OOD_1", "OOD 1").replace("OOD_2", "OOD 2").replace("OOD_3", "OOD 3").replace("OOD_4", "OOD 4")

    # Plot using seaborn
    # Adjust font scale
    sns.set_context("notebook", font_scale=font_scale)
    
    # The hue order is fixed to the four feature types used in this notebook
    catplot = sns.catplot(
        data=best_predictive_method_per_feature,
        x='llm', y=metric, hue='features', col="split",
        kind='bar', hue_order=["openai", "fasttext", "ngrams_1", "word2vec"],
        col_order=col_order, 
        height=height, aspect=aspect
    )

    # Rotate x labels (and set the custom y label on the first axes only)
    for i, ax in enumerate(plt.gcf().axes):
        plt.sca(ax)
        plt.xticks(rotation=90)
        if i == 0 and yaxixlabel is not None:
            ax.set_ylabel(yaxixlabel)
    
    # Horizontal spacing between the split panels
    catplot.fig.subplots_adjust(wspace=panel_space)
    
    # Adjust legend font size
    catplot._legend.set_title("Features")
    if legend_text_size is not None:
        plt.setp(catplot._legend.get_texts(), fontsize=f'{legend_text_size}')  # for legend text
        plt.setp(catplot._legend.get_title(), fontsize=f'{legend_text_size+1}')  # for legend title

    # Move legend to an appropriate position
    catplot._legend.set_bbox_to_anchor(legend_loc)
    catplot._legend.set_frame_on(False)  # Optionally, remove the legend frame for better appearance

    # Set axis bounds
    if set_axis_bounds:
        for ax in plt.gcf().axes:
            ax.set_ylim(0.5, 1)

### KindsOfReasoning

In [None]:
filename = "results/specific_assessors_reasoning.pkl"

In [None]:
overwrite_res = False

all_datasets_assessors = load_with_conditions(filename, overwrite_res)

In [None]:
llms = [
    'text-ada-001',
    'text-babbage-001',
    'text-curie-001',
    'text-davinci-001',
    'text-davinci-002',
    'text-davinci-003',
    'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-0613',
    'gpt-3.5-turbo-1106',
    'gpt-3.5-turbo-0125',
    'gpt-4-0314',
    'gpt-4-0613',
    'gpt-4-1106-preview',
    'gpt-4-0125-preview',
]

Split into different cells for OpenAI embeddings, word2vec, fasttext and n-grams.

OpenAI embeddings

In [None]:
# Train and evaluate every predictive method on the OpenAI embeddings, for each split and LLM.
for split in (False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"):
    train_df, validation_df, test_df = load_reasoning(
        llms,
        ["openai_embeddings"],
        ood_split=split,
        base_path="../results/kindsofreasoning_embeddings/",
    )

    for llm in llms:
        success_col = f"Success_{llm}"
        # skip LLMs whose success labels take a single value in some partition;
        # partitions are checked in train/validation/test order and only the first offender is reported
        partition_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for predictive_method, kwargs, pred_method_name in predictive_method_list:
            all_datasets_assessors = evaluate_and_update(
                all_datasets_assessors, train_df, validation_df, test_df,
                ["openai_embeddings_large"], predictive_method, pred_method_name,
                "openai", split, llm, filename, **kwargs,
            )


Note: the data stored with the embeddings does not have the `sampling` column, which is needed to extract the system prompt. Do not remove that column anymore, and re-compute the embeddings!

Word2vec + fasttext

In [None]:
# Train and evaluate every predictive method on the word2vec and fasttext embeddings, for each split and LLM.
for split in (False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"):
    train_df, validation_df, test_df = load_reasoning(
        llms,
        ["word2vec", "fasttext"],
        ood_split=split,
        base_path="../results/kindsofreasoning/",
    )

    for llm in llms:
        success_col = f"Success_{llm}"
        # skip LLMs whose success labels take a single value in some partition;
        # partitions are checked in train/validation/test order and only the first offender is reported
        partition_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for embedding_type in ("word2vec", "fasttext"):
            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                all_datasets_assessors = evaluate_and_update(
                    all_datasets_assessors, train_df, validation_df, test_df,
                    [f"{embedding_type}_embeddings"], predictive_method, pred_method_name,
                    embedding_type, split, llm, filename, **kwargs,
                )


ngrams

In [None]:
# Train and evaluate every predictive method on n-gram features, for each split and LLM.
for split in (False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"):
    train_df, validation_df, test_df = load_reasoning(
        llms,
        [],
        ood_split=split,
        base_path="../results/kindsofreasoning/",
    )

    for n_gram_size in [1]:
        # the n-gram matrices are computed once per split and shared by all LLMs
        X_train_ngrams, X_val_ngrams, X_test_ngrams, vectorizer = ngram_vectorize_new(
            train_df["prompt"], validation_df["prompt"], test_df["prompt"],
            ngram_range=(1, n_gram_size),
        )

        for llm in llms:
            success_col = f"Success_{llm}"
            # skip LLMs whose success labels take a single value in some partition;
            # partitions are checked in train/validation/test order and only the first offender is reported
            partition_checks = (
                (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
                (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
                (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
            )
            skip_message = next(
                (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
                None,
            )
            if skip_message is not None:
                print(skip_message)
                continue

            # feature selection uses the training labels of the current LLM, so it is redone per LLM
            X_train_ngrams_selected, X_val_ngrams_selected, X_test_ngrams_selected, selector = select_features(
                X_train_ngrams, train_df[success_col], X_val_ngrams, X_test_ngrams
            )

            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                all_datasets_assessors = evaluate_and_update_arrays(
                    all_datasets_assessors,
                    X_train_ngrams_selected, train_df[success_col],
                    X_val_ngrams_selected, validation_df[success_col],
                    X_test_ngrams_selected, test_df[success_col],
                    predictive_method, pred_method_name, f"ngrams_{n_gram_size}",
                    split, llm, filename, **kwargs,
                )

In [None]:
all_datasets_assessors

In [None]:
all_datasets_assessors.split.unique()

#### Plots


In [None]:
filename = "results/specific_assessors_reasoning.pkl"
all_datasets_assessors = load_with_conditions(filename)

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors, sort=True, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"], metric="AUROC_val", legend_text_size=24, aspect=0.9, font_scale=2, panel_space=0.1, legend_loc=(0.99, 0.6))
plt.savefig("fig/reasoning_results_full_dataset_val.pdf", bbox_inches="tight")

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors, sort=True, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"], metric="AUROC_test", legend_text_size=24, aspect=0.9, font_scale=2, panel_space=0.1, legend_loc=(0.99, 0.6))
plt.savefig("fig/reasoning_results_full_dataset_test.pdf", bbox_inches="tight")

Compute the number of times each feature "wins" over the others, for each split separately.

In [None]:
# for each llm and split, keep the row(s) with the highest validation AUROC over all features and predictive methods
best_over_feat_predictive_method = (
    all_datasets_assessors
    .groupby(["llm", "split"])
    .apply(lambda group: group[group["AUROC_val"] == group["AUROC_val"].max()])
    .reset_index(drop=True)
)
# then, for each split separately, measure how often each feature type appears among those winning rows
for split in best_over_feat_predictive_method["split"].unique():
    is_current_split = best_over_feat_predictive_method["split"] == split
    # number of winning rows per feature type
    counts_split = best_over_feat_predictive_method.loc[is_current_split, "features"].value_counts()
    # normalize by the total number of winning rows (this equals the number of llms only when there are no ties)
    counts_split = counts_split / counts_split.sum()
    print(f"Split {split}:")
    print(counts_split)

Here, OpenAI embeddings win more frequently.

### HELM-Lite

In [None]:
filename = "results/specific_assessors_helm.pkl"

In [None]:
overwrite_res = False

all_datasets_assessors = load_with_conditions(filename, overwrite_res)

In [None]:
llms_dict = {'01-ai/yi-34b': '01-ai/yi-34b',
             '01-ai/yi-6b': '01-ai/yi-6b',
             'AlephAlpha/luminous-base': 'AlephAlpha/luminous-base',
             'AlephAlpha/luminous-extended': 'AlephAlpha/luminous-extended',
             'AlephAlpha/luminous-supreme': 'AlephAlpha/luminous-supreme',
             'ai21/j2-grande': 'ai21/j2-grande',
             'ai21/j2-jumbo': 'ai21/j2-jumbo',
             'anthropic/claude-2.0': 'anthropic/claude-2.0',
             'anthropic/claude-2.1': 'anthropic/claude-2.1',
             'anthropic/claude-instant-1.2': 'anthropic/claude-instant-1.2',
             'anthropic/claude-v1.3': 'anthropic/claude-v1.3',
             'cohere/command': 'cohere/command',
             'cohere/command-light': 'cohere/command-light',
             'google/text-bison@001': 'google/text-bison@001',
             'google/text-unicorn@001': 'google/text-unicorn@001',
             'meta/llama-2-13b': 'meta/llama-2-13b',
             'meta/llama-2-70b': 'meta/llama-2-70b',
             'meta/llama-2-7b': 'meta/llama-2-7b',
             'meta/llama-65b': 'meta/llama-65b',
             'mistralai/mistral-7b-v0.1': 'mistralai/mistral-7b-v0.1',
             'mistralai/mixtral-8x7b-32kseqlen': 'mistralai/mixtral-8x7b-32kseqlen',
             'gpt-3.5-turbo-0613': 'openai/gpt-3.5-turbo-0613',
             'gpt-4-0613': 'openai/gpt-4-0613',
             'gpt-4-1106-preview': 'openai/gpt-4-1106-preview',
             'text-davinci-002': 'openai/text-davinci-002',
             'text-davinci-003': 'openai/text-davinci-003',
             'tiiuae/falcon-40b': 'tiiuae/falcon-40b',
             'tiiuae/falcon-7b': 'tiiuae/falcon-7b',
             'writer/palmyra-x-v2': 'writer/palmyra-x-v2',
             'writer/palmyra-x-v3': 'writer/palmyra-x-v3'}
llms = list(llms_dict.keys())

Split into different cells for OpenAI embeddings, word2vec, fasttext and n-grams (to avoid memory issues).

OpenAI embeddings

In [None]:
# Train and evaluate every predictive method on the OpenAI embeddings of HELM-Lite, for each split and LLM.
for split in (False, "OOD_1", "OOD_2", "OOD_3"):
    train_df, validation_df, test_df = load_helm_lite(
        llms,
        ["openai_embeddings"],
        ood_split=split,
        base_path="../results/helm_lite_v1.0.0_embeddings/",
    )

    for llm in llms:
        success_col = f"Success_{llm}"
        # skip LLMs whose success labels take a single value in some partition;
        # partitions are checked in train/validation/test order and only the first offender is reported
        partition_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for predictive_method, kwargs, pred_method_name in predictive_method_list:
            all_datasets_assessors = evaluate_and_update(
                all_datasets_assessors, train_df, validation_df, test_df,
                ["openai_embeddings_large"], predictive_method, pred_method_name,
                "openai", split, llm, filename, **kwargs,
            )

Word2vec + fasttext

In [None]:
# Train and evaluate every predictive method on the word2vec and fasttext embeddings of HELM-Lite.
for split in (False, "OOD_1", "OOD_2", "OOD_3"):
    train_df, validation_df, test_df = load_helm_lite(
        llms,
        ["word2vec", "fasttext"],
        ood_split=split,
        base_path="../results/helm_lite_v1.0.0/",
    )

    # report the partition sizes for the current split
    print(f'train_df samples: {train_df.shape[0]}')
    print(f'test_df samples: {test_df.shape[0]}')

    for llm in llms:
        success_col = f"Success_{llm}"
        # skip LLMs whose success labels take a single value in some partition;
        # partitions are checked in train/validation/test order and only the first offender is reported
        partition_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for embedding_type in ("word2vec", "fasttext"):
            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                all_datasets_assessors = evaluate_and_update(
                    all_datasets_assessors, train_df, validation_df, test_df,
                    [f"{embedding_type}_embeddings"], predictive_method, pred_method_name,
                    embedding_type, split, llm, filename, **kwargs,
                )


ngrams

In [None]:
# Train and evaluate every predictive method on n-gram features of HELM-Lite, for each split and LLM.
for split in (False, "OOD_1", "OOD_2", "OOD_3"):
    train_df, validation_df, test_df = load_helm_lite(
        llms,
        [],
        ood_split=split,
        base_path="../results/helm_lite_v1.0.0/",
    )

    for n_gram_size in [1]:
        # the n-gram matrices are computed once per split and shared by all LLMs
        X_train_ngrams, X_val_ngrams, X_test_ngrams, vectorizer = ngram_vectorize_new(
            train_df["prompt"], validation_df["prompt"], test_df["prompt"],
            ngram_range=(1, n_gram_size),
        )

        for llm in llms:
            success_col = f"Success_{llm}"
            # skip LLMs whose success labels take a single value in some partition;
            # partitions are checked in train/validation/test order and only the first offender is reported
            partition_checks = (
                (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
                (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
                (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
            )
            skip_message = next(
                (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
                None,
            )
            if skip_message is not None:
                print(skip_message)
                continue

            # feature selection uses the training labels of the current LLM, so it is redone per LLM
            X_train_ngrams_selected, X_val_ngrams_selected, X_test_ngrams_selected, selector = select_features(
                X_train_ngrams, train_df[success_col], X_val_ngrams, X_test_ngrams
            )

            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                all_datasets_assessors = evaluate_and_update_arrays(
                    all_datasets_assessors,
                    X_train_ngrams_selected, train_df[success_col],
                    X_val_ngrams_selected, validation_df[success_col],
                    X_test_ngrams_selected, test_df[success_col],
                    predictive_method, pred_method_name, f"ngrams_{n_gram_size}",
                    split, llm, filename, **kwargs,
                )

#### Plots


In [None]:
filename = "results/specific_assessors_helm.pkl"

In [None]:
all_datasets_assessors = load_with_conditions(filename)

In [None]:
len(all_datasets_assessors.llm.unique())

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors, sort=False, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3"], metric="AUROC_val", legend_text_size=24, aspect=1.2, font_scale=1.3, panel_space=0.1, legend_loc=(1.01, 0.5))
plt.savefig("fig/helm_results_full_dataset_val.pdf", bbox_inches="tight")

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors, sort=False, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3"], metric="AUROC_test", legend_text_size=24, aspect=1.2, font_scale=1.3, panel_space=0.1, legend_loc=(1.01, 0.5))
plt.savefig("fig/helm_results_full_dataset_test.pdf", bbox_inches="tight")

Compute the number of times each feature "wins" over the others, for each split separately.

In [None]:
# for each llm and split, keep the row(s) with the highest validation AUROC over all features and predictive methods
best_over_feat_predictive_method = (
    all_datasets_assessors
    .groupby(["llm", "split"])
    .apply(lambda group: group[group["AUROC_val"] == group["AUROC_val"].max()])
    .reset_index(drop=True)
)
# then, for each split separately, measure how often each feature type appears among those winning rows
for split in best_over_feat_predictive_method["split"].unique():
    is_current_split = best_over_feat_predictive_method["split"] == split
    # number of winning rows per feature type
    counts_split = best_over_feat_predictive_method.loc[is_current_split, "features"].value_counts()
    # normalize by the total number of winning rows (this equals the number of llms only when there are no ties)
    counts_split = counts_split / counts_split.sum()
    print(f"Split {split}:")
    print(counts_split)

Here instead I get a much more mixed picture, with fasttext dominating ood_2, while openai embeddings are still the top or very close in the other 3 cases.

### KindsOfReasoning subsampled to have same samples size as HELM-Lite

This is to check whether the better performance on KindsOfReasoning is due to its larger sample size. The dataset is reduced to:

train_df samples: 3000

test_df samples: 1285

In principle I should not reduce the test set as well. Maybe re-run without doing so.

In [None]:
n_train = 3000
n_test = 1285

In [None]:
filename = "results/specific_assessors_reasoning_subsampled.pkl"

In [None]:
overwrite_res = False

all_datasets_assessors_subsampled = load_with_conditions(filename, overwrite_res)

In [None]:
llms = [
    'text-ada-001',
    'text-babbage-001',
    'text-curie-001',
    'text-davinci-001',
    'text-davinci-002',
    'text-davinci-003',
    'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-0613',
    'gpt-3.5-turbo-1106',
    'gpt-3.5-turbo-0125',
    'gpt-4-0314',
    'gpt-4-0613',
    'gpt-4-1106-preview',
    'gpt-4-0125-preview',
]

Split into different cells for OpenAI embeddings, word2vec, fasttext and n-grams.

OpenAI embeddings

In [None]:
# Train and evaluate every predictive method on the OpenAI embeddings of the subsampled
# KindsOfReasoning data, for each split and LLM.
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    # the training-set size comes from the `n_train` constant defined earlier in this section
    # instead of a repeated literal; base_path carries the same trailing slash as in the full-data run
    train_df, validation_df, test_df = load_reasoning(
        llms,
        ["openai_embeddings"],
        ood_split=split,
        subsampled_n_train=n_train,
        base_path="../results/kindsofreasoning_embeddings/",
    )

    for llm in llms:
        success_col = f"Success_{llm}"
        # skip LLMs whose success labels take a single value in some partition;
        # partitions are checked in train/validation/test order and only the first offender is reported
        partition_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for predictive_method, kwargs, pred_method_name in predictive_method_list:
            all_datasets_assessors_subsampled = evaluate_and_update(
                all_datasets_assessors_subsampled, train_df, validation_df, test_df,
                ["openai_embeddings_large"], predictive_method, pred_method_name,
                "openai", split, llm, filename, **kwargs,
            )


Note: the data stored with the embeddings does not have the `sampling` column, which is needed to extract the system prompt. Do not remove that column anymore, and re-compute the embeddings!

Word2vec + fasttext

In [None]:
# Train and evaluate every predictive method on the word2vec and fasttext embeddings of the
# subsampled KindsOfReasoning data, for each split and LLM.
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    # the training-set size comes from the `n_train` constant defined earlier in this section
    # instead of a repeated literal
    train_df, validation_df, test_df = load_reasoning(
        llms,
        ["word2vec", "fasttext"],
        ood_split=split,
        subsampled_n_train=n_train,
        base_path="../results/kindsofreasoning/",
    )

    for llm in llms:
        success_col = f"Success_{llm}"
        # skip LLMs whose success labels take a single value in some partition;
        # partitions are checked in train/validation/test order and only the first offender is reported
        partition_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for embedding_type in ["word2vec", "fasttext"]:
            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                all_datasets_assessors_subsampled = evaluate_and_update(
                    all_datasets_assessors_subsampled, train_df, validation_df, test_df,
                    [f"{embedding_type}_embeddings"], predictive_method, pred_method_name,
                    embedding_type, split, llm, filename, **kwargs,
                )


ngrams

In [None]:
# Train and evaluate every predictive method on n-gram features of the subsampled
# KindsOfReasoning data, for each split and LLM.
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    # the training-set size comes from the `n_train` constant defined earlier in this section
    # instead of a repeated literal
    train_df, validation_df, test_df = load_reasoning(
        llms,
        [],
        ood_split=split,
        subsampled_n_train=n_train,
        base_path="../results/kindsofreasoning/",
    )

    for n_gram_size in [1]:
        # the n-gram matrices are computed once per split and shared by all LLMs
        X_train_ngrams, X_val_ngrams, X_test_ngrams, vectorizer = ngram_vectorize_new(
            train_df["prompt"], validation_df["prompt"], test_df["prompt"],
            ngram_range=(1, n_gram_size),
        )

        for llm in llms:
            success_col = f"Success_{llm}"
            # skip LLMs whose success labels take a single value in some partition;
            # partitions are checked in train/validation/test order and only the first offender is reported
            partition_checks = (
                (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
                (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
                (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
            )
            skip_message = next(
                (message for frame, message in partition_checks if len(frame[success_col].unique()) < 2),
                None,
            )
            if skip_message is not None:
                print(skip_message)
                continue

            # feature selection uses the training labels of the current LLM, so it is redone per LLM
            X_train_ngrams_selected, X_val_ngrams_selected, X_test_ngrams_selected, selector = select_features(
                X_train_ngrams, train_df[success_col], X_val_ngrams, X_test_ngrams
            )

            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                all_datasets_assessors_subsampled = evaluate_and_update_arrays(
                    all_datasets_assessors_subsampled,
                    X_train_ngrams_selected, train_df[success_col],
                    X_val_ngrams_selected, validation_df[success_col],
                    X_test_ngrams_selected, test_df[success_col],
                    predictive_method, pred_method_name, f"ngrams_{n_gram_size}",
                    split, llm, filename, **kwargs,
                )

In [None]:
len(all_datasets_assessors_subsampled)

In [None]:
all_datasets_assessors_subsampled.split.unique()

#### Plots

This is for the subsampled setup.

In [None]:
filename = "results/specific_assessors_reasoning_subsampled.pkl"
all_datasets_assessors_subsampled = load_with_conditions(filename)

In [None]:
all_datasets_assessors_subsampled.llm.unique()

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors_subsampled, sort=True, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"], metric="AUROC_val", legend_text_size=24, aspect=0.9, font_scale=2, panel_space=0.1, legend_loc=(0.99, 0.6))
plt.savefig("fig/reasoning_results_subsampled_dataset_val.pdf", bbox_inches="tight")

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors_subsampled, sort=True, col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"], metric="AUROC_test", legend_text_size=24, aspect=0.9, font_scale=2, panel_space=0.1, legend_loc=(0.99, 0.6))
plt.savefig("fig/reasoning_results_subsampled_dataset_test.pdf", bbox_inches="tight")

Compute the number of times each feature "wins" over the others, for each split separately.

In [None]:
# for each llm and split, keep the row(s) with the highest validation AUROC over all features and predictive methods
best_over_feat_predictive_method = (
    all_datasets_assessors_subsampled
    .groupby(["llm", "split"])
    .apply(lambda group: group[group["AUROC_val"] == group["AUROC_val"].max()])
    .reset_index(drop=True)
)
# then, for each split separately, measure how often each feature type appears among those winning rows
for split in best_over_feat_predictive_method["split"].unique():
    is_current_split = best_over_feat_predictive_method["split"] == split
    # number of winning rows per feature type
    counts_split = best_over_feat_predictive_method.loc[is_current_split, "features"].value_counts()
    # normalize by the total number of winning rows (this equals the number of llms only when there are no ties)
    counts_split = counts_split / counts_split.sum()
    print(f"Split {split}:")
    print(counts_split)

Results do not seem to change much. The main difference is that the OpenAI embeddings win less frequently than before.
#### Now compute the pairwise difference between the full data and the subsampled setup.

In [None]:
filename = "results/specific_assessors_reasoning_subsampled.pkl"
all_datasets_assessors_subsampled = load_with_conditions(filename)

In [None]:
filename = "results/specific_assessors_reasoning.pkl"
all_datasets_assessors = load_with_conditions(filename)

In [None]:
# select the best predictive method for each feature, llm and split
all_datasets_assessors_subsampled = all_datasets_assessors_subsampled.groupby(["llm", "features", "split"]).apply(lambda x: x[x.AUROC_val == x.AUROC_val.max()]).reset_index(drop=True)
all_datasets_assessors = all_datasets_assessors.groupby(["llm", "features", "split"]).apply(lambda x: x[x.AUROC_val == x.AUROC_val.max()]).reset_index(drop=True)

In [None]:
# merge the subsampled results onto the full ones using ["llm", "features", "split"];
# "predictive_method" is not part of the key, so the selected method may differ between the two setups.
# Columns present in both frames keep their name for the full data and get the "_subsampled" suffix for the subsampled data.
all_datasets_assessors = pd.merge(all_datasets_assessors, all_datasets_assessors_subsampled, on=["llm", "features", "split"], suffixes=("", "_subsampled"))
# compute difference in AUROC (full data minus subsampled data)
all_datasets_assessors["AUROC_test_diff"] = all_datasets_assessors["AUROC_test"] - all_datasets_assessors["AUROC_test_subsampled"]
all_datasets_assessors["AUROC_val_diff"] = all_datasets_assessors["AUROC_val"] - all_datasets_assessors["AUROC_val_subsampled"]

In [None]:
all_datasets_assessors.shape

In [None]:
all_datasets_assessors.columns

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors, sort=True, set_axis_bounds=False, metric="AUROC_test_diff", yaxixlabel="AUROC - AUROC_subsampled")
plt.savefig("fig/reasoning_results_diff_full_subsampled_test.pdf", bbox_inches="tight")

In [None]:
plot_best_predictive_method_per_feature(all_datasets_assessors, sort=True, set_axis_bounds=False, metric="AUROC_val_diff", yaxixlabel="AUROC - AUROC_subsampled")
plt.savefig("fig/reasoning_results_diff_full_subsampled_val.pdf", bbox_inches="tight")

Adding training samples is not beneficial for the random (in-distribution) split, but it generally is for the OOD splits (except for n-grams).

The fact that ngrams_1 works much better with a larger number of samples for some LLMs in the OOD split is not that surprising: it is likely due to the fact that some words in the OOD set that were not seen for the subsampled train dataset were instead seen for the full dataset. Other large variations happen with word2vec, likely for the same reason.

## How many OpenAI embeddings are needed?

I will do this with the OpenAI ones, as I am not sure it makes much sense for the other embeddings (the new OpenAI embeddings use a special technique that makes a subset of them applicable).

Notice that, for the KindsOfReasoning dataset, this has been run with the full dataset (i.e., no subsampling was applied).

Define the embedding sizes to try:


In [None]:
embedding_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 3072]

Run the code

In [None]:
def _check_skip(res_df, pred_method_name, feature_name, split, llm, embedding_size):
    """Tell whether a configuration already has a row in ``res_df``.

    A configuration is the combination of predictive method, feature set, LLM,
    split and embedding size. A message stating the decision is printed.

    Returns ``True`` when at least one matching row exists (the run can be
    skipped) and ``False`` otherwise, including when ``res_df`` has no rows.
    """
    already_done = False
    # a results table without rows is never indexed by column
    if len(res_df) > 0:
        same_configuration = (
            (res_df["predictive_method"] == pred_method_name)
            & (res_df["features"] == feature_name)
            & (res_df["llm"] == llm)
            & (res_df["split"] == split)
            & (res_df["embedding_size"] == embedding_size)
        )
        already_done = bool(same_configuration.any())

    if already_done:
        print(f"Skipping {split}, {feature_name}, {pred_method_name}, {embedding_size} for {llm} because it is already in the dataframe")
    else:
        print(f"Doing {split}, {feature_name}, {pred_method_name}, {embedding_size} for {llm}")
    return already_done

def _concat_and_save(res_df, pred_method_name, feature_name, split, llm, embedding_size,
                     BrierScore_val, Calibration_val, Refinement_val, roc_auc_val,
                     BrierScore_test, Calibration_test, Refinement_test, roc_auc_test,
                     trained_method, filename):
    """Append one result row to ``res_df``, save the table and return it.

    The row stores the configuration (predictive method, features, split, LLM,
    embedding size), the validation and test metrics, and the trained method
    under "trained_classifier". The enlarged table is passed to
    ``save_dataframe`` together with ``filename`` after every call.

    The new row always has index label 0 and the existing index is kept, so
    index labels in the returned table are not unique.
    """
    new_row = pd.DataFrame(
        {
            "predictive_method": pred_method_name,
            "features": feature_name,
            "split": split,
            "llm": llm,
            "embedding_size": embedding_size,
            "BrierScore_val": BrierScore_val,
            "Calibration_val": Calibration_val,
            "Refinement_val": Refinement_val,
            "AUROC_val": roc_auc_val,
            "BrierScore_test": BrierScore_test,
            "Calibration_test": Calibration_test,
            "Refinement_test": Refinement_test,
            "AUROC_test": roc_auc_test,
            "trained_classifier": trained_method,
        },
        index=[0],
    )
    updated_df = pd.concat([res_df, new_row])

    save_dataframe(filename, updated_df)
    return updated_df

def evaluate_and_update(res_df, train_df, validation_df, test_df, features, predictive_method,
                        pred_method_name, feature_name, split, llm, embedding_size,
                        filename,
                        **kwargs):
    """Evaluate one configuration unless it is already stored, then record it.

    When ``_check_skip`` reports that the configuration is already in
    ``res_df``, the table is returned untouched. Otherwise
    ``evaluate_predictive_method`` is called twice with the target column
    ``Success_<llm>``: first on ``test_df`` (also returning the trained
    method), then on ``validation_df`` with that trained method passed back in
    (presumably so that it is not fitted a second time -- confirm in
    ``evaluate_predictive_method``). The metrics are appended and saved through
    ``_concat_and_save``.

    ``kwargs`` are forwarded unchanged to both evaluation calls.
    Returns the (possibly enlarged) results table.
    """
    if _check_skip(res_df, pred_method_name, feature_name, split, llm, embedding_size):
        return res_df

    target_column = f"Success_{llm}"

    # test metrics, plus the trained method that is reused for the validation metrics
    (test_brier, test_calibration, test_refinement, test_auroc,
     trained_method) = evaluate_predictive_method(
        train_df, test_df,
        features,
        target_column,
        predictive_method=predictive_method,
        return_trained_method=True,
        **kwargs,
    )

    val_brier, val_calibration, val_refinement, val_auroc = evaluate_predictive_method(
        train_df, validation_df,
        features,
        target_column,
        predictive_method=predictive_method,
        trained_method=trained_method,
        **kwargs,
    )

    return _concat_and_save(
        res_df, pred_method_name, feature_name, split, llm, embedding_size,
        val_brier, val_calibration, val_refinement, val_auroc,
        test_brier, test_calibration, test_refinement, test_auroc,
        trained_method, filename,
    )

In [None]:
def plot_best_predictive_method_per_feature(increasing_n_embeddings_assessors, sort=False, col_order=None, metric="AUROC_test", font_scale=1, legend_text_size=None, height=5, aspect=1, legend_loc=(1.03, 0.6), panel_space=None):
    """Plot ``metric`` against embedding size, one line per LLM and one panel per split.

    For every (llm, features, embedding_size, split) group only the row(s) with
    the highest ``AUROC_val`` are kept, so the predictive method is always
    selected on validation AUROC, whichever ``metric`` is plotted.

    NOTE: earlier cells of the notebook call a function with this same name
    using other keyword arguments (``set_axis_bounds``, ``yaxixlabel``);
    running this cell rebinds the name to the definition below.

    Parameters
    ----------
    increasing_n_embeddings_assessors : DataFrame with (at least) the columns
        "llm", "features", "embedding_size", "split", "AUROC_val" and ``metric``.
    sort : if True, order the LLMs as in ``sort_models_order`` (every LLM must
        be listed there); otherwise order them alphabetically.
    col_order : order of the split panels, using the display names
        ("In distribution", "OOD 1", ...).
    metric : column shown on the y axis.
    font_scale : font scale given to ``sns.set_context``.
    legend_text_size : font size of the legend entries (the legend title is
        one point larger); ``None`` keeps the defaults.
    height, aspect : size of each panel, forwarded to ``sns.catplot``.
    legend_loc : anchor of the legend.
    panel_space : horizontal space between panels (``wspace``).
    """
    # for each set of features, extract the best predictive method
    best_predictive_method_per_feature = increasing_n_embeddings_assessors.groupby(["llm", "features", "embedding_size", "split"]).apply(lambda x: x[x.AUROC_val == x.AUROC_val.max()]).reset_index(drop=True)
    # sort using the order of the models
    if sort:
        best_predictive_method_per_feature = best_predictive_method_per_feature.sort_values(by=["llm"], key=lambda x: x.apply(lambda y: sort_models_order.index(y)))
    else:
        best_predictive_method_per_feature = best_predictive_method_per_feature.sort_values(by=["llm"])
    # human-readable panel names (the in-distribution split is stored as False)
    split_display_names = {
        "False": "In distribution",
        "OOD_1": "OOD 1",
        "OOD_2": "OOD 2",
        "OOD_3": "OOD 3",
        "OOD_4": "OOD 4",
    }
    best_predictive_method_per_feature["split"] = best_predictive_method_per_feature["split"].astype(str).replace(split_display_names)

    # plot using seaborn:
    # one panel per split, embedding size on the x axis, one hue per LLM
    # Adjust font scale
    sns.set_context("notebook", font_scale=font_scale)

    catplot = sns.catplot(data=best_predictive_method_per_feature, x='embedding_size', y=metric, hue='llm', col="split", kind='point', col_order=col_order, height=height, aspect=aspect)

    catplot.fig.subplots_adjust(wspace=panel_space)

    for ax in catplot.fig.axes:
        # set ylim between 0.5 and 1
        ax.set_ylim(0.5, 1)
        # make lines thinner and points smaller
        for line in ax.lines:
            line.set_linewidth(1)
            line.set_markersize(2)
        # rotate x labels
        plt.sca(ax)
        plt.xticks(rotation=90)

    legend = catplot._legend
    # the hue encodes the LLM, so the legend is titled accordingly
    legend.set_title("LLM")
    # Adjust legend font size
    if legend_text_size is not None:
        plt.setp(legend.get_texts(), fontsize=legend_text_size)  # for legend text
        plt.setp(legend.get_title(), fontsize=legend_text_size + 1)  # for legend title

    # Move legend to an appropriate position
    legend.set_bbox_to_anchor(legend_loc)
    legend.set_frame_on(False)  # Optionally, remove the legend frame for better appearance
 
       

### KindsOfReasoning

In [None]:
# LLMs evaluated on KindsOfReasoning; each name selects a `Success_<llm>` column in the loop below.
llms = [
    'text-ada-001',
    'text-babbage-001',
    'text-curie-001',
    'text-davinci-001',
    'text-davinci-002',
    'text-davinci-003',
    'gpt-3.5-turbo-0301',
    'gpt-3.5-turbo-0613',
    'gpt-3.5-turbo-1106',
    'gpt-3.5-turbo-0125',
    'gpt-4-0314',
    'gpt-4-0613',
    'gpt-4-1106-preview',
    'gpt-4-0125-preview',
]

In [None]:
# Results file for the KindsOfReasoning embedding-size experiment (rewritten after every evaluated configuration).
filename = "results/specific_assessor_reasoning_increasing_n_emb.pkl"

In [None]:
# NOTE(review): presumably True discards previously saved results instead of
# resuming from them -- confirm in `load_with_conditions`.
overwrite_res = False

increasing_n_embeddings_assessors = load_with_conditions(filename, overwrite_res)

In [None]:
# For every split, LLM and embedding size, evaluate each predictive method on
# the leading `embedding_size` entries of the "openai_embeddings_large" vectors.
for split in [False, "OOD_1", "OOD_2", "OOD_3", "OOD_4"]:
    train_df, validation_df, test_df = load_reasoning(
        llms,
        ["openai_embeddings"],
        ood_split=split,
        base_path="../results/kindsofreasoning_embeddings",
    )

    for llm in llms:
        success_column = f"Success_{llm}"
        # the LLM is skipped at the first frame (train, validation, test order)
        # whose success column holds fewer than two distinct values
        degenerate_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in degenerate_checks if len(frame[success_column].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for embedding_size in embedding_sizes:
            # truncate the embeddings of the three frames to the current size
            for frame in (train_df, validation_df, test_df):
                frame["openai_embeddings_subset"] = frame["openai_embeddings_large"].apply(lambda x: x[:embedding_size])

            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                increasing_n_embeddings_assessors = evaluate_and_update(
                    increasing_n_embeddings_assessors,
                    train_df, validation_df, test_df,
                    ["openai_embeddings_subset"],
                    predictive_method, pred_method_name,
                    "openai", split, llm, embedding_size,
                    filename,
                    **kwargs,
                )

#### Plots

In [None]:
# Same path as the one written by the KindsOfReasoning run cell, so the plots can be made without re-running it.
filename = "results/specific_assessor_reasoning_increasing_n_emb.pkl"

In [None]:
# Reload the saved KindsOfReasoning results for plotting (second argument of `load_with_conditions` left at its default).
increasing_n_embeddings_assessors = load_with_conditions(filename)

In [None]:
# KindsOfReasoning: default metric ("AUROC_test") against embedding size, one panel per split.
plot_best_predictive_method_per_feature(
    increasing_n_embeddings_assessors,
    sort=True,
    col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"],
    legend_text_size=16,
    aspect=0.7,
    font_scale=1.5,
    panel_space=0.1,
    legend_loc=(1, 0.5),
)
plt.savefig("fig/reasoning_results_increasing_n_emb_test.pdf")

In [None]:
# KindsOfReasoning: validation AUROC against embedding size, one panel per split.
plot_best_predictive_method_per_feature(
    increasing_n_embeddings_assessors,
    sort=True,
    col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3", "OOD 4"],
    metric="AUROC_val",
    legend_text_size=16,
    aspect=0.7,
    font_scale=1.5,
    panel_space=0.1,
    legend_loc=(1, 0.5),
)
plt.savefig("fig/reasoning_results_increasing_n_emb_val.pdf")

### HELM-Lite

In [None]:
# Keys are the LLM names used to build the `Success_<llm>` column names below;
# values look like provider-prefixed model identifiers (only the OpenAI entries
# differ from their key). Only the keys are used in this part of the notebook.
llms_dict = {'01-ai/yi-34b': '01-ai/yi-34b',
             '01-ai/yi-6b': '01-ai/yi-6b',
             'AlephAlpha/luminous-base': 'AlephAlpha/luminous-base',
             'AlephAlpha/luminous-extended': 'AlephAlpha/luminous-extended',
             'AlephAlpha/luminous-supreme': 'AlephAlpha/luminous-supreme',
             'ai21/j2-grande': 'ai21/j2-grande',
             'ai21/j2-jumbo': 'ai21/j2-jumbo',
             'anthropic/claude-2.0': 'anthropic/claude-2.0',
             'anthropic/claude-2.1': 'anthropic/claude-2.1',
             'anthropic/claude-instant-1.2': 'anthropic/claude-instant-1.2',
             'anthropic/claude-v1.3': 'anthropic/claude-v1.3',
             'cohere/command': 'cohere/command',
             'cohere/command-light': 'cohere/command-light',
             'google/text-bison@001': 'google/text-bison@001',
             'google/text-unicorn@001': 'google/text-unicorn@001',
             'meta/llama-2-13b': 'meta/llama-2-13b',
             'meta/llama-2-70b': 'meta/llama-2-70b',
             'meta/llama-2-7b': 'meta/llama-2-7b',
             'meta/llama-65b': 'meta/llama-65b',
             'mistralai/mistral-7b-v0.1': 'mistralai/mistral-7b-v0.1',
             'mistralai/mixtral-8x7b-32kseqlen': 'mistralai/mixtral-8x7b-32kseqlen',
             'gpt-3.5-turbo-0613': 'openai/gpt-3.5-turbo-0613',
             'gpt-4-0613': 'openai/gpt-4-0613',
             'gpt-4-1106-preview': 'openai/gpt-4-1106-preview',
             'text-davinci-002': 'openai/text-davinci-002',
             'text-davinci-003': 'openai/text-davinci-003',
             'tiiuae/falcon-40b': 'tiiuae/falcon-40b',
             'tiiuae/falcon-7b': 'tiiuae/falcon-7b',
             'writer/palmyra-x-v2': 'writer/palmyra-x-v2',
             'writer/palmyra-x-v3': 'writer/palmyra-x-v3'}
llms = list(llms_dict.keys())

In [None]:
# Results file for the HELM-Lite embedding-size experiment (rewritten after every evaluated configuration).
filename = "results/specific_assessors_helm_increasing_n_emb.pkl"

In [None]:
# NOTE(review): presumably True discards previously saved results instead of
# resuming from them -- confirm in `load_with_conditions`.
overwrite_res = False

increasing_n_embeddings_assessors = load_with_conditions(filename, overwrite_res)

In [None]:
# For every split, LLM and embedding size, evaluate each predictive method on
# the leading `embedding_size` entries of the "openai_embeddings_large" vectors.
for split in [False, "OOD_1", "OOD_2", "OOD_3"]:
    train_df, validation_df, test_df = load_helm_lite(
        llms,
        ["openai_embeddings"],
        ood_split=split,
        base_path="../results/helm_lite_v1.0.0_embeddings/",
    )

    for llm in llms:
        success_column = f"Success_{llm}"
        # the LLM is skipped at the first frame (train, validation, test order)
        # whose success column holds fewer than two distinct values
        degenerate_checks = (
            (train_df, f"Skipping {llm} because there is only one value in the 'Success' column for the train df"),
            (validation_df, f"Skipping {llm} because there is only one value in the 'Success' column for the validation df"),
            (test_df, f"Skipping {llm} because there is only one value in the 'Success' column for the test df"),
        )
        skip_message = next(
            (message for frame, message in degenerate_checks if len(frame[success_column].unique()) < 2),
            None,
        )
        if skip_message is not None:
            print(skip_message)
            continue

        for embedding_size in embedding_sizes:
            # truncate the embeddings of the three frames to the current size
            for frame in (train_df, validation_df, test_df):
                frame["openai_embeddings_subset"] = frame["openai_embeddings_large"].apply(lambda x: x[:embedding_size])

            for predictive_method, kwargs, pred_method_name in predictive_method_list:
                increasing_n_embeddings_assessors = evaluate_and_update(
                    increasing_n_embeddings_assessors,
                    train_df, validation_df, test_df,
                    ["openai_embeddings_subset"],
                    predictive_method, pred_method_name,
                    "openai", split, llm, embedding_size,
                    filename,
                    **kwargs,
                )

#### Plots


In [None]:
# Must be the path written by the HELM-Lite run cell ("specific", not "specifc");
# with a different path the plots below would not pick up the saved results.
filename = "results/specific_assessors_helm_increasing_n_emb.pkl"

In [None]:
# Reload the saved HELM-Lite results for plotting (second argument of `load_with_conditions` left at its default).
increasing_n_embeddings_assessors = load_with_conditions(filename)

In [None]:
# HELM-Lite: default metric ("AUROC_test") against embedding size, one panel per split.
plot_best_predictive_method_per_feature(
    increasing_n_embeddings_assessors,
    sort=False,
    col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3"],
    legend_text_size=16,
    aspect=0.7,
    font_scale=1.5,
    panel_space=0.1,
    legend_loc=(1, 0.5),
)
plt.savefig("fig/helm_results_increasing_n_emb_test.pdf")

In [None]:
# HELM-Lite: validation AUROC against embedding size, one panel per split.
# `metric="AUROC_val"` has to be passed explicitly: the function defaults to
# "AUROC_test", which would make this "_val" figure a copy of the test figure.
plot_best_predictive_method_per_feature(
    increasing_n_embeddings_assessors,
    sort=False,
    col_order=["In distribution", "OOD 1", "OOD 2", "OOD 3"],
    metric="AUROC_val",
    legend_text_size=16,
    aspect=0.7,
    font_scale=1.5,
    panel_space=0.1,
    legend_loc=(1, 0.5),
)
plt.savefig("fig/helm_results_increasing_n_emb_val.pdf")