In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import accuracy_score, precision_score, recall_score,f1_score, fbeta_score, matthews_corrcoef

  from .autonotebook import tqdm as notebook_tqdm


In [21]:
# Train splits of the BABE dataset and of the BABE-icl-pool dataset.
dataset = load_dataset('mediabiasgroup/BABE')['train'].to_pandas()
pool = load_dataset('mediabiasgroup/BABE-icl-pool')['train'].to_pandas()


def _load_runs(csv_path):
    """Read one runs CSV and merge it on ``text`` with the ``text`` column of ``dataset``."""
    return pd.read_csv(csv_path).merge(dataset['text'], on='text')


# One frame per prompting setup; each CSV lives in its own directory.
df_0_shot = _load_runs('./zero-shot/all_runs_with_errors.csv')
df_0_shot_sys = _load_runs('./zero-shot-system_prompt/all_runs_with_errors.csv')
df_0_shot_cot = _load_runs('./zero-shot-cot/all_runs_with_errors.csv')
df_2_shot = _load_runs('./2-shot/all_runs_with_errors.csv')
df_4_shot = _load_runs('./4-shot/all_runs_with_errors.csv')
df_8_shot = _load_runs('./8-shot/all_runs_with_errors.csv')
df_2_shot_cot = _load_runs('./2-shot-CoT/all_runs_with_errors.csv')
df_4_shot_cot = _load_runs('./4-shot-CoT/all_runs_with_errors.csv')
df_8_shot_cot = _load_runs('./8-shot-CoT/all_runs_with_errors.csv')

# Sanity check

In [22]:

# Every run frame paired with the run-type label it is reported under.
dfs = [
    (df_0_shot, "0 Shot"),
    (df_0_shot_sys, "0 Shot + System Prompt"),
    (df_0_shot_cot, "0 Shot CoT"),
    (df_2_shot, "2 Shot"),
    (df_4_shot, "4 Shot"),
    (df_8_shot, "8 Shot"),
    (df_2_shot_cot, "2 Shot CoT"),
    (df_4_shot_cot, "4 Shot CoT"),
    (df_8_shot_cot, "8 Shot CoT"),
]

# The dataset comes first; it has 100 more rows than each run frame.
print(dataset.shape)
for run_frame, _run_label in dfs:
    print(run_frame.shape)

(3121, 9)
(3021, 22)
(3021, 22)
(3021, 21)
(3021, 22)
(3021, 22)
(3021, 19)
(3021, 19)
(3021, 19)
(3021, 19)


# Compute accuracy, precision, recall, F1, F-beta (β = 0.5), MCC and unusable examples

In [23]:
def compute_measures(df, label_column_name, ground_truth_column, model_name,
                     run_type, verbose=False
                     ):
    """
    Score one prediction column against the ground truth for one run type.

    A row is *unusable* when its prediction equals '?'. Unusable rows are
    counted and left out of every metric. The remaining predictions and
    ground-truth values are cast to int and scored with the scikit-learn
    metric functions using their default parameters (F-beta uses beta=0.5).

    Args:
        df (pd.DataFrame): dataframe with all the runs
        label_column_name (str): column holding the predictions to score
        ground_truth_column (str): column holding the ground truth
        model_name (str): display name stored in the result row
        run_type (str): type of run (e.g. 0-shot, 2-shot, etc.)
        verbose (bool, optional): print the unusable count and the value
            counts of the usable predictions. Defaults to False.

    Returns:
        pd.DataFrame: a single row with the columns model_name, run_type,
        fbeta, f1, mcc, accuracy, precision, recall and unusable_examples.

    Raises:
        AssertionError: if usable and unusable rows do not add up to len(df).
    """
    predictions = df[label_column_name]

    # Split the rows into scorable ones and the '?' ones that are only counted.
    df_usable = df[predictions != '?']
    unusable_examples = len(df[predictions == '?'])

    if verbose:
        print(
            f'Number of unusable examples: {unusable_examples}, in {label_column_name}, {run_type}')
        print(df_usable[label_column_name].value_counts())

    # Sanity check: every row is either usable or unusable.
    assert len(df) == unusable_examples + len(df_usable)

    y_pred = df_usable[label_column_name].astype(int)
    y_true = df_usable[ground_truth_column].astype(int)

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    fbeta = fbeta_score(y_true, y_pred, beta=0.5)
    f1 = f1_score(y_true, y_pred)
    mcc = matthews_corrcoef(y_true, y_pred)

    # Key order fixes the column order of the result row.
    row = {
        "model_name": model_name,
        'run_type': run_type,
        'fbeta': fbeta,
        'f1': f1,
        'mcc': mcc,
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'unusable_examples': unusable_examples,
    }
    return pd.DataFrame(row, index=[0])



In [25]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score

# Map each prediction column to the display name of the model behind it.
# NOTE(review): some display names end with a space ('Falcon-7B-Instruct ',
# 'FLAN-T5-Large ', 'FLAN-T5-XL '); they are written to the CSV verbatim —
# confirm whether that is intended before stripping them.
name_model_tuples = [
    ('falcon_7b_label', 'Falcon-7B-Instruct '),
    ('flan_t5_base_label', 'FLAN-T5-Base'),
    ('flan_t5_large_label', 'FLAN-T5-Large '),
    ('flan_t5_xl_label', 'FLAN-T5-XL '),
    ('flan_ul2_label', 'Flan-UL2'),
    ('gpt_3_5_label', 'GPT-3.5 Turbo'),
    ('gpt_4_label', 'GPT-4 Turbo'),
    ('llama_7b_label', 'LLama 2 7B Chat'),
    ('llama_13b_label', 'LLama 2 13B Chat'),
    ('mistral_7b_label', 'Mistral-7B-v0.1 Instruct'),
    ('mixtral_8x7b_label', 'Mixtral-8x7B Instruct'),
    ('openchat_label', 'OpenChat 3.5'),
    ('zephyr_label', 'Zephyr 7B β')
]

# Gather one single-row frame per (run type, model) pair and concatenate once
# at the end, instead of re-copying a growing DataFrame on every iteration.
result_rows = []

# The loop variable is `run_type`, not `type`, so the builtin `type` stays
# usable in every later cell of the notebook.
for df, run_type in dfs:
    for label_column_name, model_name in name_model_tuples:
        # A run frame may lack some models' columns; those pairs are skipped.
        if label_column_name in df.columns:
            result_rows.append(
                compute_measures(df=df,
                                 label_column_name=label_column_name,
                                 ground_truth_column="label",
                                 model_name=model_name,
                                 run_type=run_type,
                                 verbose=True))

# Fail with a clear message rather than a KeyError from sort_values below.
if not result_rows:
    raise ValueError(
        "No label column from name_model_tuples was found in any run DataFrame.")

results_df = pd.concat(result_rows, ignore_index=True)

# Sort the results by model name and run type
results_df = results_df.sort_values(by=['model_name', 'run_type'])
results_df.to_csv("benchmark_results.csv", index=False)

Number of unusable examples: 0, in falcon_7b_label, 0 Shot
falcon_7b_label
1    1677
0    1344
Name: count, dtype: int64
Number of unusable examples: 0, in flan_t5_base_label, 0 Shot
flan_t5_base_label
1    2540
0     481
Name: count, dtype: int64
Number of unusable examples: 0, in flan_t5_large_label, 0 Shot
flan_t5_large_label
1    1792
0    1229
Name: count, dtype: int64
Number of unusable examples: 0, in flan_t5_xl_label, 0 Shot
flan_t5_xl_label
1    1852
0    1169
Name: count, dtype: int64
Number of unusable examples: 0, in flan_ul2_label, 0 Shot
flan_ul2_label
1    1558
0    1463
Name: count, dtype: int64
Number of unusable examples: 0, in gpt_3_5_label, 0 Shot
gpt_3_5_label
1    2055
0     966
Name: count, dtype: int64
Number of unusable examples: 0, in gpt_4_label, 0 Shot
gpt_4_label
0    1663
1    1358
Name: count, dtype: int64
Number of unusable examples: 0, in llama_7b_label, 0 Shot
llama_7b_label
1    1888
0    1133
Name: count, dtype: int64
Number of unusable examples: 0, 

In [26]:
# Read the benchmark table back from benchmark_results.csv.
d = pd.read_csv('benchmark_results.csv')

In [30]:
# Pairwise correlation between the metric columns of the benchmark table.
metric_columns = ['f1', 'fbeta', 'mcc', 'precision', 'recall']
d[metric_columns].corr()

Unnamed: 0,f1,fbeta,mcc,precision,recall
f1,1.0,0.712814,0.719579,0.340637,0.664552
fbeta,0.712814,1.0,0.978281,0.89991,-0.033293
mcc,0.719579,0.978281,1.0,0.869912,-0.009105
precision,0.340637,0.89991,0.869912,1.0,-0.438057
recall,0.664552,-0.033293,-0.009105,-0.438057,1.0


# Compute Krippendorff's alpha

In [24]:
from krippendorff import alpha
import numpy as np
import itertools

# Prediction columns that can be passed to the Krippendorff's alpha helpers.
runs = [
    'falcon_7b_label',
    'flan_t5_base_label',
    'flan_t5_large_label',
    'flan_t5_xl_label',
    'flan_ul2_label',
    'gpt_3_5_label',
    'gpt_4_label',
    'llama_7b_label',
    'llama_13b_label',
    'mistral_7b_label',
    'mixtral_8x7b_label',
    'openchat_label',
    'zephyr_label',
]


def compute_krippendorff_alpha(dataframe, columns, missing_data='?',
                               verbose=False
                               ):
    """
    Compute Krippendorff's alpha over a set of DataFrame columns.

    Parameters:
    - dataframe: pd.DataFrame, the DataFrame containing the data.
    - columns: list, the column names whose values are compared.
    - missing_data: value replaced by NaN before the computation
      (default '?').
    - verbose: bool, when True print the shape of the array handed to
      `alpha`.

    Returns:
    - the value returned by `alpha` for the selected columns.
    """
    # Keep only the requested columns and turn the missing marker into NaN.
    ratings = dataframe[columns].replace(missing_data, np.nan)

    # Build the array from plain lists, one row per requested column.
    rows = []
    for name in columns:
        rows.append(ratings[name].tolist())
    reliability = np.array(rows)

    if verbose:
        print(reliability.shape)

    return alpha(reliability_data=reliability)


def compute_krippendorff_alpha_for_k_runs(df, runs, k=None):
    """
    Find the k-column combination of `runs` with the highest Krippendorff's alpha.

    Every combination of `k` columns is scored with
    `compute_krippendorff_alpha`, and each score is printed as it is computed.

    Parameters:
    - df: pd.DataFrame, passed through to `compute_krippendorff_alpha`.
    - runs: sequence of column names to choose combinations from.
    - k: int, size of each combination. Defaults to len(runs), i.e. the
      single combination containing every column.

    Returns:
    - (best_alpha, best_combination): the highest alpha found and the tuple
      of column names that produced it. When no combination yields a valid
      (non-NaN) alpha, e.g. because k > len(runs), the result is (0, None).
    """
    if k is None:
        k = len(runs)

    best_combination = None
    # Placeholder returned only when no combination could be scored.
    best_alpha = 0

    for combination in itertools.combinations(runs, k):
        alpha_value = compute_krippendorff_alpha(df, list(combination))

        # Print alpha for the current combination
        print(f"Combination: {combination}, Alpha: {alpha_value}")

        # A NaN score cannot be ranked, so it never becomes the best.
        if np.isnan(alpha_value):
            continue

        # Alpha can be negative, so the first valid score is the baseline
        # instead of comparing against 0.
        if best_combination is None or alpha_value > best_alpha:
            best_alpha = alpha_value
            best_combination = combination

    # Print the best combination and alpha
    print(f"\nBest Combination: {best_combination}, Best Alpha: {best_alpha}")
    return best_alpha, best_combination
