In [None]:
# Print the version of the `python` executable found on the shell PATH.
!python --version

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from tqdm.notebook import tqdm

from src.results_loaders import CLadder, ProntoQA, HelmResultsLoader, EvalsResultsLoader
from src.utils import load_with_conditions, save_dataframe, Cohen_correction, initialize_instance

# enable reloading of modules
%load_ext autoreload
%autoreload 2

## How does the ability of predicting ground truth affect LLM performance

First, list the HELM scenarios, the evals tasks, and the datasets that have specialised loaders.

In [None]:
# (scenario, subscenario) pairs for the HELM results; all of them are LegalBench subscenarios.
helm_list = [
    ("legalbench", subscenario)
    for subscenario in (
        "abercrombie",
        "corporate_lobbying",
        "function_of_decision_section",
        "proa",
        "international_citizenship_questions",
    )
]

In [None]:
# Names of the evals tasks paired with EvalsResultsLoader when the dataset list is assembled.
evals_list = [
    "fantasy_reasoning",
    "neubaroco",
    "moral_permissibility",
    "causal_judgment",
    "metaphor_boolean",
    "commonsense_qa_2",
    "space_nli",
    "anli",
    "wanli",
    "babi_task_16",
    "formal_fallacies_syllogisms_negation",
]

In [None]:
# Every dataset to analyse, as a tuple that starts with the loader class and is followed by
# the identifiers listed above (none for CLadder / ProntoQA). These tuples are what the
# loops below hand to `initialize_instance`.
total_list = [
    *((EvalsResultsLoader, eval_name) for eval_name in evals_list),
    *((HelmResultsLoader, scenario, subscenario) for scenario, subscenario in helm_list),
    (CLadder,),
    (ProntoQA,),
]

### Stratify the performance of LLMs according to if the ngrams approach predicted the ground truth correctly or not

Loop over all datasets, then for any of them extract the prediction of performance on the ngrams, and stratify the performance of the various LLMs on the test dataset

I do this only for the datasets where the ground truth is a "choice".

In [None]:
# now need to implement computation of random guesses using the observed proportions of real labels and the predictions
def compute_random_guesses(labels, predictions):
    """Chance agreement between `labels` and `predictions`, as used in Cohen's kappa.

    For every class, the number of times it occurs among the labels is multiplied by the
    number of times it occurs among the predictions; the products are summed and divided
    by ``len(labels) ** 2``. Classes that appear on only one side contribute nothing.

    Parameters:
    labels: sequence of true classes.
    predictions: sequence of predicted classes, same length as `labels`.

    Returns:
    The expected accuracy of a guesser that follows the two observed class distributions.
    """
    assert len(labels) == len(predictions)

    true_freq = pd.Series(labels).value_counts()
    predicted_freq = pd.Series(predictions).value_counts()

    # only classes seen on both sides give a non-zero product
    shared_classes = set(true_freq.index) & set(predicted_freq.index)
    chance_hits = sum(true_freq[cls] * predicted_freq[cls] for cls in shared_classes)

    return chance_hits / len(labels) ** 2

In [None]:
# Load the stored ground-truth prediction results through the project helper
# `load_with_conditions` (src.utils). The result is used as a pandas DataFrame below.
filename = "results/ground_truth_prediction_performance.pkl"
primal_performance_df = load_with_conditions(filename)

In [None]:
# Quick size check of the loaded results table: (rows, columns).
primal_performance_df.shape

In [None]:
# For every (dataset, features) pair keep ONE complete row: the one with the highest
# validation accuracy, taking the first such row (in table order) when several tie.
#
# Why not `groupby(...).apply(...)` followed by `.first()`:
#  * `GroupBy.first()` returns the first non-null value of each column, so with NaNs in a
#    tied group it can combine values coming from different rows;
#  * `groupby.apply` operating on the grouping columns is deprecated in recent pandas, and
#    the following `reset_index(drop=True)` relied on those columns being kept.
# `transform("max")` + `head(1)` selects whole rows and avoids both problems.
best_predictive_method_per_feature_primal = (
    primal_performance_df
    .loc[lambda d: d["Accuracy_val"] == d.groupby(["dataset", "features"])["Accuracy_val"].transform("max")]
    .groupby(["dataset", "features"])
    .head(1)  # first row of each group, in original row order
    .sort_values(["dataset", "features"])  # same group ordering the groupby-based version produced
    .reset_index(drop=True)
)

Compute Cohen correction for the predictive performance of the simple methods

In [None]:
# Number of (dataset, features) combinations retained, and number of columns.
best_predictive_method_per_feature_primal.shape

In [None]:
# List the columns available in the selected-rows table.
best_predictive_method_per_feature_primal.columns

In [None]:
# Count how many different values per row in the "instance_level_predictions_test" column
best_predictive_method_per_feature_primal["instance_level_predictions_test"].apply(lambda x: 0 if len(x.shape)==0 else len(set(x))).value_counts()

The following computes the random guesses, that are used for the Cohen correction.

In [None]:
# Create the two result columns filled with NaN; rows skipped in the loop keep NaN.
best_predictive_method_per_feature_primal["cohen_correction_n_choices"] = np.nan
best_predictive_method_per_feature_primal["cohen_correction_proportions"] = np.nan

for dataset in tqdm(total_list):
    instance, dataset_name, ideal_col_name, group = initialize_instance(dataset)

    # Re-create the split with a fixed seed so the test rows are the same on every run of this cell
    train_df, val_df, test_df = instance.train_val_test_split(
        discard_na_rows=False,
        rng=np.random.RandomState(42),
        train_size=0.6,
        val_size=0.2,
    )

    primal_train_labels, primal_test_labels = train_df[ideal_col_name], test_df[ideal_col_name]

    # The encoder is fitted on the training labels and then applied to the test labels
    label_encoder = LabelEncoder()
    primal_train_labels_encoded = label_encoder.fit_transform(primal_train_labels)
    primal_test_labels_encoded = label_encoder.transform(test_df[ideal_col_name])

    # rows of the summary table that belong to this dataset
    indices = best_predictive_method_per_feature_primal.loc[
        best_predictive_method_per_feature_primal["dataset"] == dataset_name
    ].index

    for i in indices:
        # the two baseline methods are left without a correction
        if best_predictive_method_per_feature_primal.loc[i, "predictive_method"] in ("most_likely_answer", "random_guess"):
            continue

        instance_level_predictions = best_predictive_method_per_feature_primal.loc[i, "instance_level_predictions_test"]

        # chance level 1: uniform guess over the distinct test labels
        random_guess_n_choices = 1 / primal_test_labels.nunique(dropna=False)
        # chance level 2: agreement expected from the observed label / prediction frequencies
        random_guess_proportion = compute_random_guesses(primal_test_labels_encoded, instance_level_predictions)

        test_accuracy = best_predictive_method_per_feature_primal.loc[i, "Accuracy_test"]
        cohen_correction_n_choices = Cohen_correction(test_accuracy, random_guess_n_choices)
        cohen_correction_proportion = Cohen_correction(test_accuracy, random_guess_proportion)

        best_predictive_method_per_feature_primal.loc[i, "cohen_correction_n_choices"] = cohen_correction_n_choices
        best_predictive_method_per_feature_primal.loc[i, "cohen_correction_proportions"] = cohen_correction_proportion


In [None]:
# Inspect the n-choices Cohen correction; rows for the baseline methods were skipped and remain NaN.
best_predictive_method_per_feature_primal["cohen_correction_n_choices"]

In [None]:
# Feature sets to analyse: {1,2}-grams x {presence, simple_frequency, tfidf} x {plain, gpt2 suffix},
# followed by the readability/diversity metrics.
features_list = [
    f"{n}-grams_{weighting}{suffix}"
    for n in (1, 2)
    for weighting in ("presence", "simple_frequency", "tfidf")
    for suffix in ("", "_gpt2")
] + ["readability_diversity_metrics"]

In [None]:
# Output path for the stratified results computed in the next cell (all predictive methods included).
stratified_success_file = "results/stratified_successes.csv"

The following computes the LLM performance (raw and Cohen-corrected), stratified by whether the simple method predicted the ground truth correctly or incorrectly on each test instance.

In [None]:
stratified_successes = []
for features in features_list:
    for dataset in tqdm(total_list):
        instance, dataset_name, ideal_col_name, group = initialize_instance(dataset)

        # Fixed seed: the test rows must line up with the stored instance-level predictions
        train_df, val_df, test_df = instance.train_val_test_split(
            discard_na_rows=False,
            rng=np.random.RandomState(42),
            train_size=0.6,
            val_size=0.2,
        )

        primal_train_labels = train_df[ideal_col_name]
        primal_test_labels = test_df[ideal_col_name]

        # Encoder fitted on the training labels, then applied to the test labels
        label_encoder = LabelEncoder()
        primal_train_labels_encoded = label_encoder.fit_transform(primal_train_labels)
        primal_test_labels_encoded = label_encoder.transform(test_df[ideal_col_name])

        # Row of the summary table for this (dataset, features) pair
        best_predictive_method_feature_primal = best_predictive_method_per_feature_primal[
            (best_predictive_method_per_feature_primal["dataset"] == dataset_name)
            & (best_predictive_method_per_feature_primal["features"] == features)
        ]
        instance_level_predictions = best_predictive_method_feature_primal["instance_level_predictions_test"].iloc[0]
        # True where the simple method recovered the ground truth of the test instance
        instance_level_correct = instance_level_predictions == primal_test_labels_encoded

        random_guess_n_choices = 1 / primal_test_labels.nunique(dropna=False)

        for llm in instance.llms:
            llm_success = test_df[f"Success_{llm}"]

            # mean LLM success on each of the two splits ...
            split_means = {
                "successfully_predicted": llm_success[instance_level_correct].mean(),
                "unsuccessfully_predicted": llm_success[~instance_level_correct].mean(),
            }
            # ... and its Cohen correction against the uniform n-choices chance level
            split_corrections = {
                split_name: Cohen_correction(split_mean, random_guess_n_choices)
                for split_name, split_mean in split_means.items()
            }

            stratified_successes.extend(
                {
                    "llm": llm,
                    "dataset": dataset_name,
                    "average_success": split_means[split_name],
                    "cohen_correction_n_choices": split_corrections[split_name],
                    "on": split_name,
                    "features": features,
                }
                for split_name in split_means
            )

        # checkpoint everything gathered so far after each dataset
        stratified_successes_df = pd.DataFrame(stratified_successes)
        save_dataframe(stratified_success_file, stratified_successes_df)


Now do the same but without XGBoost (as that is a complex algorithm that LLMs may be unable to fully capture):

In [None]:
# Drop the XGBoost rows before choosing the best method for each (dataset, features) pair.
primal_performance_df_no_xgboost = primal_performance_df[primal_performance_df["predictive_method"] != "XGBoost"]

# Keep ONE complete row per (dataset, features): the highest validation accuracy, first row
# (in table order) on ties. `GroupBy.first()` is avoided on purpose: it returns the first
# non-null value per column and can therefore mix values from different tied rows; and
# `groupby.apply` on the grouping columns is deprecated in recent pandas.
best_predictive_method_per_feature_primal_no_xgboost = (
    primal_performance_df_no_xgboost
    .loc[lambda d: d["Accuracy_val"] == d.groupby(["dataset", "features"])["Accuracy_val"].transform("max")]
    .groupby(["dataset", "features"])
    .head(1)  # first row of each group, in original row order
    .sort_values(["dataset", "features"])
    .reset_index(drop=True)
)

In [None]:
# Output path for the no-XGBoost variant.
# NOTE(review): this overwrites the `stratified_success_file` variable set earlier, so any
# later cell that reads it gets the no-XGBoost path — confirm that this is intended.
stratified_success_file = "results/stratified_successes_no_xgboost.csv"

In [None]:
stratified_successes_no_xgboost = []
for features in features_list:
    for dataset in tqdm(total_list):
        instance, dataset_name, ideal_col_name, group = initialize_instance(dataset)

        # Fixed seed: the test rows must line up with the stored instance-level predictions
        train_df, val_df, test_df = instance.train_val_test_split(
            discard_na_rows=False,
            rng=np.random.RandomState(42),
            train_size=0.6,
            val_size=0.2,
        )

        primal_train_labels = train_df[ideal_col_name]
        primal_test_labels = test_df[ideal_col_name]

        # Encoder fitted on the training labels, then applied to the test labels
        label_encoder = LabelEncoder()
        primal_train_labels_encoded = label_encoder.fit_transform(primal_train_labels)
        primal_test_labels_encoded = label_encoder.transform(test_df[ideal_col_name])

        # Row of the no-XGBoost summary table for this (dataset, features) pair
        best_predictive_method_feature_primal = best_predictive_method_per_feature_primal_no_xgboost[
            (best_predictive_method_per_feature_primal_no_xgboost["dataset"] == dataset_name)
            & (best_predictive_method_per_feature_primal_no_xgboost["features"] == features)
        ]
        instance_level_predictions = best_predictive_method_feature_primal["instance_level_predictions_test"].iloc[0]
        # True where the simple method recovered the ground truth of the test instance
        instance_level_correct = instance_level_predictions == primal_test_labels_encoded

        random_guess_n_choices = 1 / primal_test_labels.nunique(dropna=False)

        for llm in instance.llms:
            llm_success = test_df[f"Success_{llm}"]

            # mean LLM success on each of the two splits ...
            split_means = {
                "successfully_predicted": llm_success[instance_level_correct].mean(),
                "unsuccessfully_predicted": llm_success[~instance_level_correct].mean(),
            }
            # ... and its Cohen correction against the uniform n-choices chance level
            split_corrections = {
                split_name: Cohen_correction(split_mean, random_guess_n_choices)
                for split_name, split_mean in split_means.items()
            }

            stratified_successes_no_xgboost.extend(
                {
                    "llm": llm,
                    "dataset": dataset_name,
                    "average_success": split_means[split_name],
                    "cohen_correction_n_choices": split_corrections[split_name],
                    "on": split_name,
                    "features": features,
                }
                for split_name in split_means
            )

        # checkpoint everything gathered so far after each dataset
        stratified_successes_no_xgboost_df = pd.DataFrame(stratified_successes_no_xgboost)
        save_dataframe(stratified_success_file, stratified_successes_no_xgboost_df)


## Plots showing the Cohen corrected stratified performance, for each LLM and dataset pair 

The accuracy of the simple classifier has two available corrections (proportions and n_choices), while the accuracy of the LLMs has only the n_choices one. Make the plots with that one for now.

Load the file:

In [None]:
# Reload the stratified results from disk.
# NOTE(review): `stratified_success_file` was last assigned the no-XGBoost path, while the
# regression plots below merge with `best_predictive_method_per_feature_primal` (XGBoost
# included) — confirm which results file is meant to be loaded here.
stratified_successes_df = load_with_conditions(stratified_success_file)

In [None]:
# Columns available in the reloaded stratified results.
stratified_successes_df.columns

In [None]:
# One row per (dataset, llm, features), with the Cohen-corrected LLM accuracy of the two
# splits side by side in the columns "successfully_predicted" / "unsuccessfully_predicted".
pivoted_table = (
    stratified_successes_df
    .pivot_table(
        index=["dataset", "llm", "features"],
        columns="on",
        values="cohen_correction_n_choices",
    )
    .reset_index()
)
# positive values: the LLM does better where the simple method got the ground truth right
pivoted_table["cohen_correction_difference"] = pivoted_table["successfully_predicted"].sub(
    pivoted_table["unsuccessfully_predicted"]
)
pivoted_table

The following scatter plots show, for each dataset and LLM, the performance of the LLM on the two splits defined by the predictions of the simple model; there is one figure per feature set.

In [None]:
for features in features_list:
    pivoted_table_features = pivoted_table[pivoted_table["features"] == features]
    # One point per (dataset, LLM): colour encodes the dataset, marker style the LLM
    plt.figure(figsize=(10, 10))
    sns.scatterplot(data=pivoted_table_features, x="unsuccessfully_predicted", y="successfully_predicted", hue="dataset", style="llm")
    plt.title(f"Features: {features}")
    # Axis labels must follow the plotted columns: x is the unsuccessfully predicted split and
    # y the successfully predicted one (the labels were previously swapped).
    plt.xlabel("Cohen corrected performance of LLM on unsuccessfully predicted with simple method")
    plt.ylabel("Cohen corrected performance of LLM on successfully predicted with simple method")
    # hide legend (one entry per dataset and per LLM would crowd the figure)
    plt.legend().remove()
    plt.show()

In [None]:
# Summary statistics of the split difference, one text block per feature set.
for features in features_list:
    difference_summary = pivoted_table.loc[
        pivoted_table["features"] == features, "cohen_correction_difference"
    ].describe()
    print(f"Features: {features}", difference_summary, "", sep="\n")

Plot the differences against the Cohen-corrected accuracy of the methods based on simple features

In [None]:
for features in features_list:

    # Cohen-corrected accuracy of the best simple method for this feature set, one row per dataset
    best_predictive_method_per_feature_primal_feature = best_predictive_method_per_feature_primal.loc[
        best_predictive_method_per_feature_primal["features"] == features,
        ["dataset", "cohen_correction_n_choices"],
    ]
    # Restrict the LLM table to this feature set first, then attach the simple-method accuracy
    final_table = pd.merge(
        pivoted_table[pivoted_table["features"] == features],
        best_predictive_method_per_feature_primal_feature,
        on="dataset",
        how="left",
    )

    # Regress the split difference on the simple-method accuracy.
    # missing="drop": the correction is NaN where the best method is a baseline, and the
    # difference is NaN when one of the two splits is empty; without dropping those rows OLS
    # is handed non-finite data. seaborn's regplot below drops the same rows by default.
    X = sm.add_constant(final_table["cohen_correction_n_choices"])  # adds the intercept term
    model = sm.OLS(final_table["cohen_correction_difference"], X, missing="drop").fit()

    p_values = model.pvalues
    print(f"P-values for features {features}:\n", p_values)

    # scatter plot with fitted regression line; regplot creates no legend, so none is removed
    plt.figure(figsize=(10, 10))
    sns.regplot(data=final_table, x="cohen_correction_n_choices", y="cohen_correction_difference")
    plt.xlabel("Simple method Cohen corrected accuracy")
    plt.ylabel("Difference in Cohen-corrected accuracy of LLMs")
    plt.title(f"Features: {features}")
    plt.show()

## Plots aggregating the performance of the LLMs on the various datasets
Keep only the features that have the highest validation accuracy for each dataset, and then see if there is an effect on the difference in LLM performance. 

In [None]:
def scatterplot_with_color(df, x_column, y_column, color_column, cmap='plasma', alpha=0.7):
    """
    Draw a scatter plot of two DataFrame columns on a new 8x6 figure, colouring each point by a
    third column through a colormap, and add a colorbar, axis labels and a title.

    The figure is not shown; the caller decides when to call `plt.show()` or save it.

    Parameters:
    df (DataFrame): The DataFrame containing the data.
    x_column (str): Column plotted on the x-axis.
    y_column (str): Column plotted on the y-axis.
    color_column (str): Column whose values determine the colour of the points.
    cmap (str): Colormap used for the points. Defaults to 'plasma'.
    alpha (float): Opacity of the points. Defaults to 0.7.

    Returns:
    The object returned by `plt.scatter`.
    """
    plt.figure(figsize=(8, 6))

    x_values, y_values, color_values = df[x_column], df[y_column], df[color_column]
    points = plt.scatter(x_values, y_values, c=color_values, cmap=cmap, s=50, alpha=alpha)

    plt.colorbar(points, label=color_column)
    plt.xlabel(x_column)
    plt.ylabel(y_column)
    plt.title(f'Scatterplot of {y_column} vs {x_column}\ncolored by {color_column}')

    return points


In [None]:
from scipy import stats
from statsmodels.stats.multitest import multipletests

def corrected_paired_t_test(
    df,
    groups_col,
    alternative="greater",
    success_col="Normalised LLM accuracy on successfully predicted split (Cohen correction)",
    failure_col="Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)",
    alpha=0.05,
    method="fdr_bh",
):
    """
    Run one paired t-test per group and correct the p-values for multiple comparisons.

    For every distinct value of `groups_col`, the rows of that group are tested with
    `scipy.stats.ttest_rel(success_col, failure_col)`. Groups whose test yields NaN are
    dropped, and the remaining p-values are corrected with
    `statsmodels.stats.multitest.multipletests`.

    Parameters:
    df (DataFrame): Data with one row per paired observation.
    groups_col (str): Column defining the groups (e.g. the LLM or the dataset).
    alternative (str): Alternative hypothesis passed to `ttest_rel`. The default "greater"
        tests whether `success_col` is larger than `failure_col`.
    success_col (str): First column of the pair. Defaults to the column name used in this notebook.
    failure_col (str): Second column of the pair. Defaults to the column name used in this notebook.
    alpha (float): Family-wise level passed to `multipletests`. Defaults to 0.05.
    method (str): Correction method passed to `multipletests`. Defaults to "fdr_bh"
        (Benjamini-Hochberg).

    Returns:
    DataFrame with columns [groups_col, 't_stat', 'p_value', 'p_value_corrected', 'significant'],
    one row per group with a valid test. It is empty (same columns) if no group has one.
    """
    per_group_results = []
    for group in df[groups_col].unique():
        group_data = df[df[groups_col] == group]
        t_stat, p_val = stats.ttest_rel(
            group_data[success_col],
            group_data[failure_col],
            alternative=alternative,
        )
        per_group_results.append((group, t_stat, p_val))

    # Passing the column names explicitly keeps them present even when there are no groups
    results_df = pd.DataFrame(per_group_results, columns=[groups_col, "t_stat", "p_value"]).dropna()

    # Nothing left to correct: return the empty table instead of calling multipletests on no p-values
    if results_df.empty:
        results_df["p_value_corrected"] = pd.Series(dtype=float)
        results_df["significant"] = pd.Series(dtype=bool)
        return results_df

    rejected, pvals_corrected, _, _ = multipletests(results_df["p_value"], alpha=alpha, method=method)

    results_df["p_value_corrected"] = pvals_corrected
    results_df["significant"] = rejected

    return results_df

In [None]:
# Maps the short LLM identifiers used in the results to "<provider>/<model>" display names.
# Every OpenAI model maps to "openai/<same name>"; the single exception is listed explicitly.
models_dict = {
    **{
        openai_model: f"openai/{openai_model}"
        for openai_model in (
            "davinci",
            "gpt-3.5-turbo-0613",
            "gpt-4-1106-preview",
            "text-davinci-001",
            "text-davinci-002",
            "text-davinci-003",
            "ada",
            "babbage",
            "curie",
            "gpt-3.5-turbo-0125",
            "gpt-3.5-turbo-0301",
            "gpt-3.5-turbo-1106",
            "gpt-4-0125-preview",
            "gpt-4-0314",
            "gpt-4-0613",
            "text-ada-001",
            "text-babbage-001",
            "text-curie-001",
        )
    },
    "llama007": "meta/llama-1-7b",
}


In [None]:
# Load the ground-truth prediction results again (same file and helper as at the top of the
# notebook), so this section starts from the unfiltered table.
filename = "results/ground_truth_prediction_performance.pkl"
primal_performance_df = load_with_conditions(filename)

In [None]:
# Size of the reloaded results table: (rows, columns).
primal_performance_df.shape

Compute the overall LLM accuracy:

In [None]:
overall_LLM_accuracy = []

for dataset in tqdm(total_list):
    instance, dataset_name, ideal_col_name, group = initialize_instance(dataset)

    train_df, val_df, test_df = instance.train_val_test_split(
        discard_na_rows=False,
        rng=np.random.RandomState(42),
        train_size=0.6,
        val_size=0.2,
    )

    # Chance level: uniform guess over the distinct labels of the whole dataset (not only the test split)
    primal_labels = instance.results_df[ideal_col_name]
    random_guess_n_choices = 1 / primal_labels.nunique(dropna=False)

    # Plain and Cohen-corrected test accuracy of every LLM evaluated on this dataset
    for llm in instance.llms:
        llm_success = test_df[f"Success_{llm}"].mean()
        llm_success_Cohen_correction = Cohen_correction(llm_success, random_guess_n_choices)

        overall_LLM_accuracy.append(
            {
                "LLM": llm,
                "Dataset": dataset_name,
                "Overall LLM accuracy": llm_success,
                "Normalised overall LLM accuracy (Cohen correction)": llm_success_Cohen_correction,
            }
        )

overall_LLM_accuracy_df = pd.DataFrame(overall_LLM_accuracy)


In [None]:
# Columns of the per-(LLM, dataset) accuracy table built above.
overall_LLM_accuracy_df.columns

### Accuracy of the LLMs on the various datasets 

Make a violinplot showing the performance of the various LLMs on the various datasets

In [None]:
# Violin plot of the Cohen-corrected LLM accuracy, one violin per dataset
plt.figure(figsize=(20*2/3, 6))

# Take the colours from the two ends of a diverging palette
cmap = sns.diverging_palette(220, 20, as_cmap=True)
cmap_min = cmap(-50)
cmap_max = cmap(250)

# Alphabetical dataset order, so the violins and the tick labels built below line up
overall_LLM_accuracy_df = overall_LLM_accuracy_df.sort_values(by="Dataset")

# Tick label = dataset name followed by the number of distinct LLMs evaluated on it
llms_per_dataset = overall_LLM_accuracy_df.groupby("Dataset")["LLM"].nunique()
ticks = [
    f"{dataset_label} ({llms_per_dataset[dataset_label]})"
    for dataset_label in overall_LLM_accuracy_df["Dataset"].unique()
]

sns.violinplot(
    data=overall_LLM_accuracy_df,
    x='Dataset',
    y="Normalised overall LLM accuracy (Cohen correction)",
    inner='quartile',
    linewidth=1.5,
    color=cmap_max,
)

# replace the default tick labels with the annotated ones, rotated for readability
plt.xticks(ticks=range(len(ticks)), labels=ticks, rotation=45, ha='right')

plt.xlabel("Dataset", fontsize=14, fontweight='bold')
plt.ylabel("Accuracy (Cohen correction)", fontsize=14, fontweight='bold')
plt.title('Performance of LLMs on different datasets', fontsize=16, fontweight='bold')

# reference line at 0
plt.axhline(y=0, color='k', lw=1)

# save both raster and vector versions, then display
plt.tight_layout()
for figure_path in ("fig/LLM_accuracy_violinplot.png", "fig/LLM_accuracy_violinplot.pdf"):
    plt.savefig(figure_path)
plt.show()

### Table of LLMs and dataset

In [None]:
def map_llm_names(elem):
    """Return the display name registered for `elem` in `models_dict`, or `elem` itself if there is none."""
    return models_dict.get(elem, elem)

# Mark every (LLM, Dataset) combination that was evaluated
overall_LLM_accuracy_df['presence'] = 1

# LLM x Dataset presence matrix: 1 where the pair exists, 0 where it does not,
# with the LLM identifiers replaced by their display names and rows sorted by that name
pivot_df = (
    overall_LLM_accuracy_df
    .pivot(index='LLM', columns='Dataset', values='presence')
    .fillna(0)
    .reset_index()
    .assign(LLM=lambda table: table["LLM"].apply(map_llm_names))
    .sort_values(by="LLM")
)

# Convert the DataFrame to a LaTeX table with ticks and crosses
def df_to_latex_with_ticks_and_crosses(df):
    latex_str = df.to_latex(escape=True, index=False)
    latex_str = latex_str.replace('1.0', '\\textcolor{green}{\\ding{51}}')  # Green tick
    latex_str = latex_str.replace('0.0', '\\textcolor{red}{\\ding{55}}')    # Red cross
    return latex_str

# Preamble of a minimal standalone document; pifont and xcolor are loaded
# for the tick/cross symbols and their colours
document_preamble = "".join([
    "\\documentclass{article}\n",
    "\\usepackage{pifont}\n",
    "\\usepackage{xcolor}\n",
    "\\begin{document}\n",
])

# Render the presence matrix and wrap it into the document
latex_table = document_preamble + df_to_latex_with_ticks_and_crosses(pivot_df) + "\\end{document}"

# Write the document to disk
with open("tab/llm_dataset_table.tex", "w") as tex_file:
    tex_file.write(latex_table)


### Aggregated plots, stratified across correctly and incorrectly predicted splits, with XGBoost

For each dataset, keep the best predictive method and features

In [None]:
# The baselines are not candidate predictive methods, so leave them out of the selection
baseline_methods = ["most_likely_answer", "random_guess"]
primal_performance_df_no_baselines = primal_performance_df[~primal_performance_df["predictive_method"].isin(baseline_methods)]

# For each dataset keep the single row with the highest validation accuracy (the first
# row wins ties). idxmax picks one whole row per dataset, whereas groupby(...).first()
# takes the first non-null value column by column and could therefore combine values
# coming from different tied rows.
# Rows with a missing Accuracy_val can never be the best one, so they are dropped up
# front; the index is reset so that the labels returned by idxmax are unique.
candidate_rows = primal_performance_df_no_baselines.dropna(subset=["Accuracy_val"]).reset_index(drop=True)
best_row_labels = candidate_rows.groupby("dataset")["Accuracy_val"].idxmax()
# one row per dataset, ordered by dataset name
best_predictive_method_primal = candidate_rows.loc[best_row_labels].reset_index(drop=True)

Compute Cohen correction for the predictive performance of the simple methods

In [None]:
# Add the Cohen-corrected test accuracy of each selected method. The chance level is a
# uniform random guess over the distinct labels observed in the test split.
best_predictive_method_primal["cohen_correction_n_choices"] = np.nan

for dataset in tqdm(total_list):
    instance, dataset_name, ideal_col_name, group = initialize_instance(dataset)

    # rows of the summary dataframe that belong to this dataset
    indices = best_predictive_method_primal[best_predictive_method_primal["dataset"] == dataset_name].index
    if len(indices) == 0:
        # nothing to update for this dataset, so do not bother splitting it
        continue

    # Split the dataset (fixed seed, so the test split is the same on every run)
    train_df, val_df, test_df = instance.train_val_test_split(discard_na_rows=False, rng=np.random.RandomState(42), train_size=0.6, val_size=0.2)
    primal_test_labels = test_df[ideal_col_name]

    # accuracy of a uniform random guess; it only depends on the dataset, not on the row
    random_guess_n_choices = 1 / len(primal_test_labels.unique())

    for i in indices:
        # the baselines themselves get no corrected score
        if best_predictive_method_primal.loc[i, "predictive_method"] in ["most_likely_answer", "random_guess"]:
            continue
        best_predictive_method_primal.loc[i, "cohen_correction_n_choices"] = Cohen_correction(
            best_predictive_method_primal.loc[i, "Accuracy_test"], random_guess_n_choices
        )


In [None]:
best_predictive_method_primal.columns

Table with the best features and predictive method

In [None]:
# make a latex table
print(best_predictive_method_primal[['dataset', 'predictive_method', 'features']].to_latex(index=False))

In [None]:
# make ascii table
print(best_predictive_method_primal[['dataset', 'predictive_method', 'features']].to_string(index=False))

Load the stratified successes file 

In [None]:
stratified_success_file = "results/stratified_successes.csv"
# load 
stratified_successes_df = load_with_conditions(stratified_success_file)

In [None]:
stratified_successes_df.shape

In [None]:
stratified_successes_df.columns

In [None]:
print(stratified_successes_df.isna().sum())

In [None]:
# select (not drop) the rows that contain at least one missing value, so they can be inspected
stratified_successes_df_na = stratified_successes_df[stratified_successes_df.isna().any(axis=1)]

In [None]:
stratified_successes_df_na.dataset.unique()

That dataset has some na values because there was perfect predictability with n-grams -> no way to compute the performance of LLMs on the unsuccessfully predicted instances.

I need to discard this from the analysis then

In [None]:
# discard every row of the dataset that produced the missing values
stratified_successes_df = stratified_successes_df.loc[stratified_successes_df["dataset"].ne("legalbench_corporate_lobbying")]

Compute the difference of LLM performance in the cases successfully and unsuccessfully predicted by the simple method

In [None]:
# Spread the values of "on" into columns (one row per dataset / llm / features),
# then add the gap between the two splits as a new column
stratified_successes_df = (
    stratified_successes_df
    .pivot_table(values="cohen_correction_n_choices", index=["dataset", "llm", "features"], columns="on")
    .reset_index()
    .assign(cohen_correction_difference=lambda wide: wide["successfully_predicted"] - wide["unsuccessfully_predicted"])
)
stratified_successes_df.head()

In [None]:
stratified_successes_df.columns

In [None]:
# stratified_successes_df.drop(["successfully_predicted", "unsuccessfully_predicted"], axis=1, inplace=True)
stratified_successes_df = stratified_successes_df.rename(columns={"cohen_correction_difference": "Difference in normalised LLM accuracy (Cohen correction)", "successfully_predicted": "Normalised LLM accuracy on successfully predicted split (Cohen correction)", "unsuccessfully_predicted": "Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)"})

In [None]:
stratified_successes_df.columns

In [None]:
stratified_successes_df.shape

For each dataset, I should only keep the rows where the features are the best ones, i.e. those in the best_predictive_method_primal dataframe 
(notice that the results in stratified_successes_df were already computed with the best predictive method)

The following cell merges the two dataframes and thus selects the best features for the stratified successes df.

In [None]:
# Inner join (the pandas default): only the (dataset, features) pairs that appear in
# best_predictive_method_primal survive, and that frame's remaining columns
# (e.g. predictive_method, cohen_correction_n_choices) are appended to every kept row
stratified_successes_df = stratified_successes_df.merge(best_predictive_method_primal, on=["dataset", "features"])

In [None]:
stratified_successes_df.shape

In [None]:
stratified_successes_df.columns

In [None]:
stratified_successes_df = stratified_successes_df.drop(columns=["Accuracy_val", "Accuracy_test", "instance_level_predictions_test", "group", "trained_classifier", "instance_level_predictions_val"])

In [None]:
stratified_successes_df.columns

In [None]:
rename_dict = {
    'llm': 'LLM',
    'dataset': 'Dataset',
    'average_success': 'average_success',
    'on': 'on',
    'features': 'Features',
    'cohen_correction_n_choices': 'Normalised n-grams accuracy (Cohen correction)',
    'successfully_predicted': 'Successfully predicted',
}

In [None]:
stratified_successes_df = stratified_successes_df.rename(columns=rename_dict)

In [None]:
stratified_successes_df.head()

Merge with the overall LLM accuracy dataframe:

In [None]:
# Inner join (the pandas default) on (LLM, Dataset): rows without a matching entry in
# overall_LLM_accuracy_df are dropped, the others gain that frame's columns
stratified_successes_df = stratified_successes_df.merge(overall_LLM_accuracy_df, on=["LLM", "Dataset"])

In [None]:
stratified_successes_df.head()

In [None]:
stratified_successes_df.columns

Plot of the difference between the LLM performance on the successfully and unsuccessfully predicted instances, against the Cohen-corrected accuracy of the n-grams

In [None]:
cmap = sns.diverging_palette(220, 20, as_cmap=True)

scatterplot_with_color(stratified_successes_df, "Normalised n-grams accuracy (Cohen correction)", "Difference in normalised LLM accuracy (Cohen correction)", "Normalised overall LLM accuracy (Cohen correction)", cmap=cmap)

In [None]:
# diverging seaborn palette between hues 220 and 20, returned as a matplotlib colormap
# NOTE(review): nothing here centres the palette on 0.2, and `cmap` is not passed to the
# call below, so the plot presumably uses scatterplot_with_color's own default — confirm intent
cmap = sns.diverging_palette(220, 20, as_cmap=True)

scatterplot_with_color(stratified_successes_df, "Normalised n-grams accuracy (Cohen correction)", "Normalised overall LLM accuracy (Cohen correction)", "Difference in normalised LLM accuracy (Cohen correction)")

In [None]:
# cmap = sns.diverging_palette(220, 20, as_cmap=True)

scatterplot_with_color(stratified_successes_df, "Normalised overall LLM accuracy (Cohen correction)", "Difference in normalised LLM accuracy (Cohen correction)", "Normalised n-grams accuracy (Cohen correction)")

#### LLM boxplots

Make a boxplot for each LLM, side by side for the successfully and unsuccessfully predicted by n-gram splits.

I need to exclude the datasets where the n-grams have poor predictive power.

The cohen_correction_n_choices column contains the corrected accuracy on the test split of the dataset

In [None]:
best_predictive_method_primal.shape

In [None]:
threshold = 0.2
# Cohen-corrected n-gram accuracy of the best method of each dataset
ngram_corrected_accuracy = best_predictive_method_primal["cohen_correction_n_choices"]
# keep the datasets strictly above the threshold and exclude those at or below it
# (a NaN score satisfies neither comparison, so such a dataset ends up in neither list)
datasets_to_keep = best_predictive_method_primal.loc[ngram_corrected_accuracy > threshold, "dataset"]
datasets_to_exclude = best_predictive_method_primal.loc[ngram_corrected_accuracy <= threshold, "dataset"]
len(datasets_to_keep)

In [None]:
list(datasets_to_exclude)

In [None]:
# restrict to the datasets on which the n-grams are predictive enough
is_kept_dataset = stratified_successes_df["Dataset"].isin(datasets_to_keep)
stratified_successes_df_datasets_to_keep = stratified_successes_df.loc[is_kept_dataset]

# the boxplot only needs the LLM and its accuracy on each of the two splits
split_accuracy_columns = [
    "Normalised LLM accuracy on successfully predicted split (Cohen correction)",
    "Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)",
]
stratified_successes_df_boxplot = stratified_successes_df_datasets_to_keep[["LLM", *split_accuracy_columns]]

In [None]:
stratified_successes_df_boxplot.head()
# meld
stratified_successes_df_boxplot = stratified_successes_df_boxplot.melt(id_vars=["LLM"], var_name="Split", value_name="Normalised LLM accuracy (Cohen correction)")


In [None]:
stratified_successes_df_boxplot.head()

In [None]:
stratified_successes_df_boxplot.shape

In [None]:
rename_dict = {'Normalised LLM accuracy on successfully predicted split (Cohen correction)': 'Successfully predicted with n-grams',
       'Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)': 'Unsuccessfully predicted with n-grams'}

In [None]:
stratified_successes_df_boxplot["Split"] = stratified_successes_df_boxplot["Split"].replace(rename_dict)

In [None]:
stratified_successes_df_boxplot["LLM"] = stratified_successes_df_boxplot["LLM"].replace(models_dict)

In [None]:
# Order the boxes alphabetically by LLM display name
sorted_llms_by_name = sorted(stratified_successes_df_boxplot['LLM'].unique())

# One pair of boxes per LLM: its accuracy on the two n-gram splits (hue="Split")
plt.figure(figsize=(20, 8))
sns.boxplot(x='LLM', y="Normalised LLM accuracy (Cohen correction)", data=stratified_successes_df_boxplot, 
            order=sorted_llms_by_name, hue="Split")
plt.xticks(rotation=90)
plt.title('Boxplot of LLMs Sorted by Model Name')
plt.xlabel('LLM')
# the plotted values are the per-split accuracies, not the overall accuracy
plt.ylabel('Normalised LLM accuracy (Cohen correction)')
plt.show()

This is quite interesting, looks like some of the families may rely on shortcuts substantially. 


Do paired t-tests at the family level. 

In [None]:
# Take an explicit copy: the following cells overwrite/add columns on this frame, and
# doing that on a slice of a filtered dataframe triggers pandas' SettingWithCopyWarning
stratified_successes_df_t_test = stratified_successes_df_datasets_to_keep[["LLM", "Normalised LLM accuracy on successfully predicted split (Cohen correction)", "Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)", "Dataset"]].copy()

In [None]:
# rename LLM using model_dict
stratified_successes_df_t_test["LLM"] = stratified_successes_df_t_test["LLM"].replace(models_dict)

In [None]:
stratified_successes_df_t_test["LLM_family"] = stratified_successes_df_t_test["LLM"].apply(lambda x: x.split("/")[0])
stratified_successes_df_t_test["LLM_family"].unique()

In [None]:
corrected_paired_t_test(stratified_successes_df_t_test, "LLM_family", alternative="greater")

### Aggregated plots, stratified across correctly and incorrectly predicted splits, without XGBoost
Keep only the features that have the highest validation accuracy for each dataset, and then see if there is an effect on the difference in LLM performance. 

For each dataset, keep the best predictive method and features

In [None]:
# Leave out the baselines and, for this section, XGBoost as well
excluded_methods = ["most_likely_answer", "random_guess", "xgboost"]
primal_performance_df_no_baselines = primal_performance_df[~primal_performance_df["predictive_method"].isin(excluded_methods)]

# For each dataset keep the single row with the highest validation accuracy (the first
# row wins ties). idxmax picks one whole row per dataset, whereas groupby(...).first()
# takes the first non-null value column by column and could therefore combine values
# coming from different tied rows.
# Rows with a missing Accuracy_val can never be the best one, so they are dropped up
# front; the index is reset so that the labels returned by idxmax are unique.
candidate_rows = primal_performance_df_no_baselines.dropna(subset=["Accuracy_val"]).reset_index(drop=True)
best_row_labels = candidate_rows.groupby("dataset")["Accuracy_val"].idxmax()
# one row per dataset, ordered by dataset name
best_predictive_method_primal = candidate_rows.loc[best_row_labels].reset_index(drop=True)

Compute Cohen correction for the predictive performance of the simple methods

In [None]:
# Add the Cohen-corrected test accuracy of each selected method. The chance level is a
# uniform random guess over the distinct labels observed in the test split.
best_predictive_method_primal["cohen_correction_n_choices"] = np.nan

for dataset in tqdm(total_list):
    instance, dataset_name, ideal_col_name, group = initialize_instance(dataset)

    # rows of the summary dataframe that belong to this dataset
    indices = best_predictive_method_primal[best_predictive_method_primal["dataset"] == dataset_name].index
    if len(indices) == 0:
        # nothing to update for this dataset, so do not bother splitting it
        continue

    # Split the dataset (fixed seed, so the test split is the same on every run)
    train_df, val_df, test_df = instance.train_val_test_split(discard_na_rows=False, rng=np.random.RandomState(42), train_size=0.6, val_size=0.2)
    primal_test_labels = test_df[ideal_col_name]

    # accuracy of a uniform random guess; it only depends on the dataset, not on the row
    random_guess_n_choices = 1 / len(primal_test_labels.unique())

    for i in indices:
        # the baselines themselves get no corrected score
        if best_predictive_method_primal.loc[i, "predictive_method"] in ["most_likely_answer", "random_guess"]:
            continue
        best_predictive_method_primal.loc[i, "cohen_correction_n_choices"] = Cohen_correction(
            best_predictive_method_primal.loc[i, "Accuracy_test"], random_guess_n_choices
        )


In [None]:
best_predictive_method_primal.columns

Table with the best features and predictive method

In [None]:
# Display names for the raw identifiers found in the "features" column
# (n-gram order, weighting scheme and, where present, the GPT-2 variant)
features_labels = {
    'Cohen_1-grams_presence': "1-grams Presence",
    'Cohen_1-grams_presence_gpt2': "1-grams Presence (GPT-2)",
    'Cohen_1-grams_simple_frequency': "1-grams Frequency",
    'Cohen_1-grams_simple_frequency_gpt2': "1-grams Frequency (GPT-2)",
    'Cohen_1-grams_tfidf': "1-grams TF-IDF",
    'Cohen_1-grams_tfidf_gpt2': "1-grams TF-IDF (GPT-2)",
    'Cohen_2-grams_presence': "2-grams Presence",
    'Cohen_2-grams_presence_gpt2': "2-grams Presence (GPT-2)",
    'Cohen_2-grams_simple_frequency': "2-grams Frequency",
    'Cohen_2-grams_simple_frequency_gpt2': "2-grams Frequency (GPT-2)",
    'Cohen_2-grams_tfidf': "2-grams TF-IDF",
    'Cohen_2-grams_tfidf_gpt2': "2-grams TF-IDF (GPT-2)",
    'Cohen_readability_diversity_metrics': "Readability & Diversity Metrics"
}
# Display names for the raw identifiers found in the "predictive_method" column
# NOTE(review): the L2 entry is labelled C=1 although its key carries no C value — confirm
predictive_method_labels = {
    'logistic_regression_l1_c=1': "Log Reg (L1, C=1)",
    'logistic_regression_l1_c=0.1': "Log Reg (L1, C=0.1)",
    'logistic_regression_l2': "Log Reg (L2, C=1)",
}

In [None]:
# Build a relabelled copy for the table; best_predictive_method_primal keeps its raw identifiers
best_predictive_method_primal_for_table = best_predictive_method_primal.assign(
    features=best_predictive_method_primal["features"].replace(features_labels),
    predictive_method=best_predictive_method_primal["predictive_method"].replace(predictive_method_labels),
)

# LaTeX source of the dataset / method / features table
table_columns = ['dataset', 'predictive_method', 'features']
print(best_predictive_method_primal_for_table[table_columns].to_latex(index=False))

In [None]:
# make ascii table
print(best_predictive_method_primal[['dataset', 'predictive_method', 'features']].to_string(index=False))

Load the stratified successes file 

In [None]:
stratified_success_file = "results/stratified_successes_no_xgboost.csv"
# load 
stratified_successes_df = load_with_conditions(stratified_success_file)

In [None]:
stratified_successes_df.shape

In [None]:
print(stratified_successes_df.isna().sum())

In [None]:
# select (not drop) the rows that contain at least one missing value, so they can be inspected
stratified_successes_df_na = stratified_successes_df[stratified_successes_df.isna().any(axis=1)]

In [None]:
stratified_successes_df_na.dataset.unique()

That dataset has some na values because there was perfect predictability with n-grams -> no way to compute the performance of LLMs on the unsuccessfully predicted instances.

I need to discard this from the analysis then

In [None]:
# discard every row of the dataset that produced the missing values
stratified_successes_df = stratified_successes_df.loc[stratified_successes_df["dataset"].ne("legalbench_corporate_lobbying")]

In [None]:
stratified_successes_df.columns

Compute the difference of LLM performance in the cases successfully and unsuccessfully predicted by the simple method

In [None]:
# Spread the values of "on" into columns (one row per dataset / llm / features),
# then add the gap between the two splits as a new column
stratified_successes_df = (
    stratified_successes_df
    .pivot_table(values="cohen_correction_n_choices", index=["dataset", "llm", "features"], columns="on")
    .reset_index()
    .assign(cohen_correction_difference=lambda wide: wide["successfully_predicted"] - wide["unsuccessfully_predicted"])
)
stratified_successes_df.head()

In [None]:
stratified_successes_df.columns

In [None]:
# stratified_successes_df.drop(["successfully_predicted", "unsuccessfully_predicted"], axis=1, inplace=True)
stratified_successes_df = stratified_successes_df.rename(columns={"cohen_correction_difference": "Difference in normalised LLM accuracy (Cohen correction)", "successfully_predicted": "Normalised LLM accuracy on successfully predicted split (Cohen correction)", "unsuccessfully_predicted": "Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)"})

In [None]:
stratified_successes_df.columns

In [None]:
stratified_successes_df.shape

For each dataset, I should only keep the rows where the features are the best ones, i.e. those in the best_predictive_method_primal dataframe 
(notice that the results in stratified_successes_df were already computed with the best predictive method)

The following cell merges the two dataframes and thus selects the best features for the stratified successes df.

In [None]:
# Inner join (the pandas default): only the (dataset, features) pairs that appear in
# best_predictive_method_primal survive, and that frame's remaining columns
# (e.g. predictive_method, cohen_correction_n_choices) are appended to every kept row
stratified_successes_df = stratified_successes_df.merge(best_predictive_method_primal, on=["dataset", "features"])

In [None]:
stratified_successes_df.shape

In [None]:
stratified_successes_df.columns

In [None]:
stratified_successes_df = stratified_successes_df.drop(columns=["Accuracy_val", "Accuracy_test", "instance_level_predictions_test", "group", "trained_classifier", "instance_level_predictions_val"])

In [None]:
stratified_successes_df.columns

In [None]:
rename_dict = {
    'llm': 'LLM',
    'dataset': 'Dataset',
    'on': 'on',
    'features': 'Features',
    'cohen_correction_n_choices': 'Normalised n-grams accuracy (Cohen correction)',
    'successfully_predicted': 'Successfully predicted',
}

In [None]:
stratified_successes_df = stratified_successes_df.rename(columns=rename_dict)

In [None]:
stratified_successes_df.head()

Merge with the overall LLM accuracy dataframe:

In [None]:
# Inner join (the pandas default) on (LLM, Dataset): rows without a matching entry in
# overall_LLM_accuracy_df are dropped, the others gain that frame's columns
stratified_successes_df = stratified_successes_df.merge(overall_LLM_accuracy_df, on=["LLM", "Dataset"])

In [None]:
stratified_successes_df.head()

In [None]:
stratified_successes_df.columns

In [None]:
len(stratified_successes_df["Normalised overall LLM accuracy (Cohen correction)"].unique())

In [None]:
#cmap = sns.diverging_palette(220, 20, as_cmap=True)

scatterplot_with_color(stratified_successes_df, "Normalised n-grams accuracy (Cohen correction)", "Difference in normalised LLM accuracy (Cohen correction)", "Normalised overall LLM accuracy (Cohen correction)")#, cmap=cmap)

In [None]:
scatterplot_with_color(stratified_successes_df, "Normalised n-grams accuracy (Cohen correction)", "Normalised overall LLM accuracy (Cohen correction)", "Difference in normalised LLM accuracy (Cohen correction)")

In [None]:
# Diverging palette (hues 220 -> 20) with a dark centre, used to colour by n-gram accuracy
cmap = sns.diverging_palette(220, 20, s=100, l=60, sep=1, center='dark', as_cmap=True)

# marker transparency, so that overlapping points stay visible
alpha = 0.55

# columns shown on the x axis, the y axis and the colour scale
x_column = "Normalised overall LLM accuracy (Cohen correction)"
y_column = "Difference in normalised LLM accuracy (Cohen correction)"
color_column = "Normalised n-grams accuracy (Cohen correction)"

plt.figure(figsize=(8, 6))
scatter = plt.scatter(
    stratified_successes_df[x_column],
    stratified_successes_df[y_column],
    c=stratified_successes_df[color_column],
    cmap=cmap,
    s=50,
    alpha=alpha,
    marker="o",
)

# reference lines through the origin
plt.axhline(0, color='black', lw=1)
plt.axvline(0, color='black', lw=1)

plt.title('Difference in LLM accuracy between instances successfully and unsuccessfully predicted by n-grams\nvs overall LLM accuracy, colored by n-gram accuracy')
plt.ylabel("Difference in LLM accuracy between splits, Cohen's kappa")
plt.xlabel("Overall LLM accuracy, Cohen's kappa")
plt.colorbar(scatter, label="N-grams accuracy, Cohen's kappa")

# tighten the layout before writing the figure in both formats
plt.tight_layout()
for figure_path in ("fig/scatterplot.pdf", "fig/scatterplot.png"):
    plt.savefig(figure_path)

#### LLM boxplots

Make a boxplot for each LLM, side by side for the successfully and unsuccessfully predicted by n-gram splits.

I need to exclude the datasets where the n-grams have poor predictive power.

The cohen_correction_n_choices column contains the corrected accuracy on the test split of the dataset

In [None]:
threshold = 0.2
# Cohen-corrected n-gram accuracy of the best method of each dataset
ngram_corrected_accuracy = best_predictive_method_primal["cohen_correction_n_choices"]
# keep the datasets strictly above the threshold and exclude those at or below it
# (a NaN score satisfies neither comparison, so such a dataset ends up in neither list)
datasets_to_keep = best_predictive_method_primal.loc[ngram_corrected_accuracy > threshold, "dataset"]
datasets_to_exclude = best_predictive_method_primal.loc[ngram_corrected_accuracy <= threshold, "dataset"]
len(datasets_to_keep)

In [None]:
list(datasets_to_exclude)

In [None]:
# restrict to the datasets on which the n-grams are predictive enough
is_kept_dataset = stratified_successes_df["Dataset"].isin(datasets_to_keep)
stratified_successes_df_datasets_to_keep = stratified_successes_df.loc[is_kept_dataset]

# the boxplot only needs the LLM and its accuracy on each of the two splits
split_accuracy_columns = [
    "Normalised LLM accuracy on successfully predicted split (Cohen correction)",
    "Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)",
]
stratified_successes_df_boxplot = stratified_successes_df_datasets_to_keep[["LLM", *split_accuracy_columns]]

In [None]:
stratified_successes_df_boxplot.head()
# meld
stratified_successes_df_boxplot = stratified_successes_df_boxplot.melt(id_vars=["LLM"], var_name="Split", value_name="Normalised LLM accuracy (Cohen correction)")

In [None]:
stratified_successes_df_boxplot.head()

In [None]:
stratified_successes_df_boxplot.shape

In [None]:
rename_dict = {'Normalised LLM accuracy on successfully predicted split (Cohen correction)': 'Successfully predicted with n-grams',
       'Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)': 'Unsuccessfully predicted with n-grams'}

In [None]:
stratified_successes_df_boxplot["Split"] = stratified_successes_df_boxplot["Split"].replace(rename_dict)

In [None]:
len(stratified_successes_df_boxplot["LLM"].unique())

In [None]:
stratified_successes_df_boxplot["LLM"] = stratified_successes_df_boxplot["LLM"].replace(models_dict)

In [None]:
# opacity applied to both box colours
alpha = 0.9
# boxes are ordered alphabetically by LLM display name
sorted_llms_by_name = sorted(stratified_successes_df_boxplot['LLM'].unique())

plt.figure(figsize=(20*2/3, 8))

# Pick one colour from each side of the diverging palette and override its alpha channel
cmap = sns.diverging_palette(220, 20, as_cmap=True)
cmap_min = [*cmap(-50)[:-1], alpha]
cmap_max = [*cmap(250)[:-1], alpha]

# one colour per split
split_palette = {
    'Successfully predicted with n-grams': cmap_min,
    'Unsuccessfully predicted with n-grams': cmap_max,
}
sns.boxplot(
    x='LLM',
    y="Normalised LLM accuracy (Cohen correction)",
    data=stratified_successes_df_boxplot,
    order=sorted_llms_by_name,
    hue="Split",
    palette=split_palette,
)
plt.xticks(rotation=90, ha='right', fontsize=10)
plt.yticks(rotation=0, fontsize=10)

title = 'Performance of each LLM on the successfully and unsuccessfully predicted instances by n-grams of different datasets'
plt.title(title, fontsize=16, fontweight='bold')
plt.xlabel('LLM', fontsize=12, fontweight='bold')
plt.ylabel("Accuracy (Cohen's kappa)", fontsize=12, fontweight='bold')
plt.tight_layout()

# write the figure in both formats, then display it
for figure_path in ("fig/llm_boxplot.png", "fig/llm_boxplot.pdf"):
    plt.savefig(figure_path)
plt.show()

This is quite interesting: it looks like some of the families may rely on shortcuts substantially. This plot groups by LLM, while the scatterplots above do not. Conversely, this one does not display the performance of the n-gram classifier or the overall accuracy of the LLM (though the latter can be roughly inferred from the two boxplots).



#### t-tests


In [None]:
# Take an explicit copy: the following cells overwrite/add columns on this frame, and
# doing that on a slice of a filtered dataframe triggers pandas' SettingWithCopyWarning
stratified_successes_df_t_test = stratified_successes_df_datasets_to_keep[["LLM", "Normalised LLM accuracy on successfully predicted split (Cohen correction)", "Normalised LLM accuracy on unsuccessfully predicted split (Cohen correction)", "Dataset"]].copy()

In [None]:
# rename LLM using model_dict
stratified_successes_df_t_test["LLM"] = stratified_successes_df_t_test["LLM"].replace(models_dict)

In [None]:
# Hand-curated families for the OpenAI models; every other model is grouped by the
# provider prefix of its name (see model_family)
llm_model_families = {
    'openai/ada': 'OpenAI/GPT-3-base',
    'openai/babbage': 'OpenAI/GPT-3-base',
    'openai/curie': 'OpenAI/GPT-3-base',
    'openai/davinci': 'OpenAI/GPT-3-base',
    'openai/text-ada-001': 'OpenAI/GPT-3-instruct',
    'openai/text-babbage-001': 'OpenAI/GPT-3-instruct',
    'openai/text-curie-001': 'OpenAI/GPT-3-instruct',
    'openai/text-davinci-001': 'OpenAI/GPT-3-instruct',
    'openai/text-davinci-002': 'OpenAI/GPT-3.5',
    'openai/text-davinci-003': 'OpenAI/GPT-3.5',
    'openai/gpt-3.5-turbo-0125': 'OpenAI/GPT-3.5',
    'openai/gpt-3.5-turbo-0301': 'OpenAI/GPT-3.5',
    'openai/gpt-3.5-turbo-0613': 'OpenAI/GPT-3.5',
    'openai/gpt-3.5-turbo-1106': 'OpenAI/GPT-3.5',
    'openai/gpt-4-0125-preview': 'OpenAI/GPT-4',
    'openai/gpt-4-0314': 'OpenAI/GPT-4',
    'openai/gpt-4-0613': 'OpenAI/GPT-4',
    'openai/gpt-4-1106-preview': 'OpenAI/GPT-4',
}

def model_family(llm):
    """Return the model family of the LLM called ``llm``.

    Models listed in ``llm_model_families`` get their curated family. Any other
    model is assigned the part of its name before the first "/". That fallback now
    also covers names starting with "openai" that are missing from the table, which
    previously raised a ``KeyError``.

    Parameters
    ----------
    llm : str
        Model name, typically of the form "<provider>/<model>".

    Returns
    -------
    str
        The family name.
    """
    return llm_model_families.get(llm, llm.split("/")[0])

In [None]:
stratified_successes_df_t_test["LLM_family"] = stratified_successes_df_t_test["LLM"].apply(model_family)
stratified_successes_df_t_test["LLM_family"].unique()

In [None]:
res_df_family = corrected_paired_t_test(stratified_successes_df_t_test, "LLM_family", alternative="greater")
res_df_family

Count the number of LLMs per each model family:


In [None]:
llm_family_counts = stratified_successes_df_t_test.groupby("LLM_family")["LLM"].nunique()

In [None]:
llm_family_counts = llm_family_counts.rename("Number of LLMs")

In [None]:
# merge that with the res_df
res_df_family = res_df_family.merge(llm_family_counts, left_on="LLM_family", right_index=True)
res_df_family[["LLM_family", 'p_value', 'p_value_corrected', 'Number of LLMs']]

In [None]:
# make now a latex table considering the LLM_famliy, p_value and p_value_corrected columns
print(res_df_family[['LLM_family', 'p_value', 'p_value_corrected', 'Number of LLMs']].to_latex(index=False))