In [7]:
import pandas as pd
from datasets import load_dataset
from krippendorff import alpha
import numpy as np
import itertools
from tqdm import tqdm

In [2]:
df_0_shot = pd.read_csv('./zero-shot/all_runs_with_errors.csv')
df_0_shot_sys = pd.read_csv(
    './zero-shot-system_prompt/all_runs_with_errors.csv')
df_0_shot_cot = pd.read_csv('./zero-shot-cot/all_runs_with_errors.csv')
df_2_shot = pd.read_csv('./2-shot/all_runs_with_errors.csv')
df_4_shot = pd.read_csv('./4-shot/all_runs_with_errors.csv')
df_8_shot = pd.read_csv('./8-shot/all_runs_with_errors.csv')
df_2_shot_cot = pd.read_csv('./2-shot-CoT/all_runs_with_errors.csv')
df_4_shot_cot = pd.read_csv('./4-shot-CoT/all_runs_with_errors.csv')
df_8_shot_cot = pd.read_csv('./8-shot-CoT/all_runs_with_errors.csv')

In [3]:
# Sanity Check
dataset = load_dataset('mediabiasgroup/BABE-v4')
df_babe = pd.DataFrame(dataset['train'])
print(df_babe.shape)  # 100 samples bigger than other dfs
print(df_0_shot.shape)
print(df_0_shot_sys.shape)
print(df_0_shot_cot.shape)
print(df_2_shot.shape)
print(df_4_shot.shape)
print(df_8_shot.shape)
print(df_2_shot_cot.shape)
print(df_4_shot_cot.shape)
print(df_8_shot_cot.shape)

dfs = [(df_0_shot, "0 Shot"),
       (df_0_shot_sys, "0 Shot + System Prompt"),
       (df_0_shot_cot, "0 Shot CoT"),
       (df_2_shot, "2 Shot"),
       (df_4_shot, "4 Shot"),
       (df_8_shot, "8 Shot"),
       (df_2_shot_cot, "2 Shot CoT"),
       (df_4_shot_cot, "4 Shot CoT"),
       (df_8_shot_cot, "8 Shot CoT")]

(4121, 8)
(4021, 21)
(4021, 21)
(50, 21)
(4021, 21)
(4021, 21)
(4021, 18)
(4021, 18)
(4021, 18)
(4021, 18)


In [4]:

# split the following list into 2
name_by_label = dict([
    ('falcon_7b_label', 'Falcon-7B-Instruct '),
    ('flan_t5_base_label', 'FLAN-T5-Base'),
    ('flan_t5_large_label', 'FLAN-T5-Large '),
    ('flan_t5_xl_label', 'FLAN-T5-XL '),
    ('flan_ul2_label', 'Flan-UL2'),
    ('gpt_3_5_label', 'GPT-3.5 Turbo'),
    ('gpt_4_label', 'GPT-4 Turbo'),
    ('llama_7b_label', 'LLama 2 7B Chat'),
    ('llama_13b_label', 'LLama 2 13B Chat'),
    ('mistral_7b_label', 'Mistral-7B-v0.1 Instruct'),
    ('mixtral_8x7b_label', 'Mixtral-8x7B Instruct'),
    ('openchat_label', 'OpenChat 3.5'),
    ('zephyr_label', 'Zephyr 7B β')
])

runs = list(name_by_label.keys())


def compute_krippendorff_alpha(dataframe, columns, missing_data='?',
                               verbose=False, runs=runs
                               ):
    """
    Compute Krippendorff's alpha for inter-rater reliability.

    Parameters:
    - dataframe: pd.DataFrame, the DataFrame containing the data.
    - columns: list, the list of column names to calculate alpha for.

    Returns:
    - alpha_value: float, Krippendorff's alpha value.
    """

    # Exclude columns that are not in the dataframe and print a warning
    columns = [col for col in columns if col in dataframe.columns]
    if len(columns) < len(runs) and verbose:
        print(
            f"Warning: {len(runs) - len(columns)} runs are not in the dataframe")

    # Extract the relevant columns from the DataFrame
    data_subset = dataframe[columns]
    data_subset = data_subset.replace(missing_data, np.nan)

    # cast all columns to int except np.nan
    for col in columns:
        data_subset[col] = pd.to_numeric(data_subset[col])

    # Ensure that the data is in a format suitable for krippendorff
    data_list = np.array([data_subset[col].tolist() for col in columns])

    if verbose:
        print(data_list.shape)

    # Calculate Krippendorff's alpha
    alpha_value = alpha(reliability_data=data_list)

    return alpha_value


def compute_krippendorff_alpha_for_k_runs(df, runs, k=None, verbose=False, columns=runs):
    # Initialize variables to store the best combination and alpha

    # exclude runs that are not in the dataframe
    runs = [run for run in tqdm(runs) if run in df.columns]

    if k is None:
        k = len(runs)

    best_combination = None
    best_alpha = 0  # Assuming alpha ranges from 0 to 1

    # Iterate through all possible combinations
    for combination in itertools.combinations(runs, k):

        alpha_value = compute_krippendorff_alpha(df, list(combination), verbose=verbose, runs=columns)

        if verbose:
            # Print alpha for the current combination
            print(f"Combination: {combination}, Alpha: {alpha_value}")

        # Update best combination and alpha if a higher alpha is found
        if alpha_value > best_alpha:
            best_alpha = alpha_value
            best_combination = combination

    # Print the best combination and alpha
    print(
        f"Best Alpha: {best_alpha}, Best Combination: {best_combination}")

    return best_alpha, best_combination


In [5]:
# Compute Krippendorff's alpha for all runs per dataframe and store it to csv
result_data = []
for df, name in dfs:
    print(name)
    score = compute_krippendorff_alpha(df, runs, verbose=False)
    result_data.append({'run_type': name, 'Alpha': score, "Alpha (rounded)": round(score, 2)})

result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_type.csv', index=False)

0 Shot
0 Shot + System Prompt
0 Shot CoT
2 Shot
4 Shot
8 Shot
2 Shot CoT
4 Shot CoT
8 Shot CoT


In [8]:
# Compute best Krippendorff's alpha for all combinations of 2 per dataframe and store it to csv
result_data = []

for df, name in dfs:
    print(name)
    score, combination = compute_krippendorff_alpha_for_k_runs(df, runs, k=2,
                                                               verbose=False)

    names = [name_by_label[run] for run in combination]
    result_data.append(
        {'run_type': name, 'Alpha': score, "Alpha (rounded)": round(score, 2), 'model_1': names[0],
         'model_2': names[1]})

result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_type_2.csv', index=False)
result_df

0 Shot


100%|██████████| 13/13 [00:00<00:00, 148978.01it/s]


Best Alpha: 0.603429667337622, Best Combination: ('gpt_3_5_label', 'zephyr_label')
0 Shot + System Prompt


100%|██████████| 13/13 [00:00<00:00, 258416.83it/s]


Best Alpha: 0.701053892374637, Best Combination: ('gpt_3_5_label', 'gpt_4_label')
0 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 225313.85it/s]


Best Alpha: 0.7261663286004056, Best Combination: ('gpt_4_label', 'llama_13b_label')
2 Shot


100%|██████████| 13/13 [00:00<00:00, 276781.48it/s]


Best Alpha: 0.6449231092223087, Best Combination: ('mixtral_8x7b_label', 'zephyr_label')
4 Shot


100%|██████████| 13/13 [00:00<00:00, 114310.17it/s]


Best Alpha: 0.7118286104946041, Best Combination: ('openchat_label', 'zephyr_label')
8 Shot


100%|██████████| 13/13 [00:00<00:00, 165230.16it/s]


Best Alpha: 0.7661947884539435, Best Combination: ('openchat_label', 'zephyr_label')
2 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 186095.40it/s]


Best Alpha: 0.719000423123165, Best Combination: ('gpt_4_label', 'mixtral_8x7b_label')
4 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 101161.32it/s]


Best Alpha: 0.7546042687825425, Best Combination: ('gpt_4_label', 'mixtral_8x7b_label')
8 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 206537.70it/s]


Best Alpha: 0.753608415965835, Best Combination: ('gpt_4_label', 'mixtral_8x7b_label')


Unnamed: 0,run_type,Alpha,Alpha (rounded),model_1,model_2
0,0 Shot,0.60343,0.6,GPT-3.5 Turbo,Zephyr 7B β
1,0 Shot + System Prompt,0.701054,0.7,GPT-3.5 Turbo,GPT-4 Turbo
2,0 Shot CoT,0.726166,0.73,GPT-4 Turbo,LLama 2 13B Chat
3,2 Shot,0.644923,0.64,Mixtral-8x7B Instruct,Zephyr 7B β
4,4 Shot,0.711829,0.71,OpenChat 3.5,Zephyr 7B β
5,8 Shot,0.766195,0.77,OpenChat 3.5,Zephyr 7B β
6,2 Shot CoT,0.719,0.72,GPT-4 Turbo,Mixtral-8x7B Instruct
7,4 Shot CoT,0.754604,0.75,GPT-4 Turbo,Mixtral-8x7B Instruct
8,8 Shot CoT,0.753608,0.75,GPT-4 Turbo,Mixtral-8x7B Instruct


In [9]:
# Compute best Krippendorff's alpha for all combinations of 3 per dataframe and store it to csv
result_data = []
for df, name in dfs:
    print(name)
    score, combination = compute_krippendorff_alpha_for_k_runs(df, runs, k=3,
                                                               verbose=False)

    names = [name_by_label[run] for run in combination]
    result_data.append(
        {'run_type': name, 'Alpha': score, "Alpha (rounded)": round(score, 2), 'model_1': names[0], 'model_2': names[1],
         'model_3': names[2]})

result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_type_3.csv', index=False)
result_df

0 Shot


100%|██████████| 13/13 [00:00<00:00, 100973.99it/s]


Best Alpha: 0.5692184310334636, Best Combination: ('flan_ul2_label', 'gpt_4_label', 'zephyr_label')
0 Shot + System Prompt


100%|██████████| 13/13 [00:00<00:00, 212163.24it/s]


Best Alpha: 0.6269059614723216, Best Combination: ('flan_ul2_label', 'gpt_3_5_label', 'gpt_4_label')
0 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 188020.52it/s]


Best Alpha: 0.4929070929070929, Best Combination: ('gpt_3_5_label', 'openchat_label', 'zephyr_label')
2 Shot


100%|██████████| 13/13 [00:00<00:00, 185462.42it/s]


Best Alpha: 0.6265886133378524, Best Combination: ('gpt_3_5_label', 'openchat_label', 'zephyr_label')
4 Shot


100%|██████████| 13/13 [00:00<00:00, 232025.33it/s]


Best Alpha: 0.67621374672692, Best Combination: ('gpt_3_5_label', 'openchat_label', 'zephyr_label')
8 Shot


100%|██████████| 13/13 [00:00<00:00, 212163.24it/s]


Best Alpha: 0.6978531172885809, Best Combination: ('gpt_3_5_label', 'openchat_label', 'zephyr_label')
2 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 135974.94it/s]


Best Alpha: 0.6473883973435703, Best Combination: ('gpt_4_label', 'llama_13b_label', 'mixtral_8x7b_label')
4 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 138390.74it/s]


Best Alpha: 0.681981283778573, Best Combination: ('gpt_4_label', 'llama_13b_label', 'mixtral_8x7b_label')
8 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 184209.30it/s]


Best Alpha: 0.686709095619648, Best Combination: ('llama_13b_label', 'openchat_label', 'zephyr_label')


Unnamed: 0,run_type,Alpha,Alpha (rounded),model_1,model_2,model_3
0,0 Shot,0.569218,0.57,Flan-UL2,GPT-4 Turbo,Zephyr 7B β
1,0 Shot + System Prompt,0.626906,0.63,Flan-UL2,GPT-3.5 Turbo,GPT-4 Turbo
2,0 Shot CoT,0.492907,0.49,GPT-3.5 Turbo,OpenChat 3.5,Zephyr 7B β
3,2 Shot,0.626589,0.63,GPT-3.5 Turbo,OpenChat 3.5,Zephyr 7B β
4,4 Shot,0.676214,0.68,GPT-3.5 Turbo,OpenChat 3.5,Zephyr 7B β
5,8 Shot,0.697853,0.7,GPT-3.5 Turbo,OpenChat 3.5,Zephyr 7B β
6,2 Shot CoT,0.647388,0.65,GPT-4 Turbo,LLama 2 13B Chat,Mixtral-8x7B Instruct
7,4 Shot CoT,0.681981,0.68,GPT-4 Turbo,LLama 2 13B Chat,Mixtral-8x7B Instruct
8,8 Shot CoT,0.686709,0.69,LLama 2 13B Chat,OpenChat 3.5,Zephyr 7B β


In [10]:
# Compute best Krippendorff's alpha for all combinations of 4 per dataframe and store it to csv
result_data = []
for df, name in dfs:
    print(name)
    score, combination = compute_krippendorff_alpha_for_k_runs(df, runs, k=4,
                                                               verbose=False)

    names = [name_by_label[run] for run in combination]

    result_data.append(
        {'run_type': name, 'Alpha': score, "Alpha (rounded)": round(score, 2), 'model_1': names[0], 'model_2': names[1],
         'model_3': names[2], 'model_4': names[3]})

result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_type_4.csv', index=False)
result_df

0 Shot


100%|██████████| 13/13 [00:00<00:00, 148168.35it/s]


Best Alpha: 0.5396141441157991, Best Combination: ('flan_ul2_label', 'gpt_3_5_label', 'gpt_4_label', 'zephyr_label')
0 Shot + System Prompt


100%|██████████| 13/13 [00:00<00:00, 192671.21it/s]


Best Alpha: 0.5896219023590168, Best Combination: ('flan_ul2_label', 'gpt_3_5_label', 'gpt_4_label', 'openchat_label')
0 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 260889.72it/s]


Best Alpha: 0.45735626194198364, Best Combination: ('gpt_3_5_label', 'gpt_4_label', 'openchat_label', 'zephyr_label')
2 Shot


100%|██████████| 13/13 [00:00<00:00, 101727.52it/s]


Best Alpha: 0.6163975276546141, Best Combination: ('gpt_3_5_label', 'gpt_4_label', 'openchat_label', 'zephyr_label')
4 Shot


100%|██████████| 13/13 [00:00<00:00, 218103.81it/s]


Best Alpha: 0.6648947092005815, Best Combination: ('gpt_3_5_label', 'gpt_4_label', 'openchat_label', 'zephyr_label')
8 Shot


100%|██████████| 13/13 [00:00<00:00, 132666.55it/s]


Best Alpha: 0.6898387886144812, Best Combination: ('gpt_3_5_label', 'gpt_4_label', 'openchat_label', 'zephyr_label')
2 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 156234.82it/s]


Best Alpha: 0.6126426441299856, Best Combination: ('gpt_3_5_label', 'gpt_4_label', 'llama_13b_label', 'mixtral_8x7b_label')
4 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 188020.52it/s]


Best Alpha: 0.6571949340426286, Best Combination: ('gpt_4_label', 'llama_7b_label', 'llama_13b_label', 'zephyr_label')
8 Shot CoT


100%|██████████| 13/13 [00:00<00:00, 147766.81it/s]


Best Alpha: 0.6756858837566528, Best Combination: ('gpt_3_5_label', 'llama_13b_label', 'openchat_label', 'zephyr_label')


Unnamed: 0,run_type,Alpha,Alpha (rounded),model_1,model_2,model_3,model_4
0,0 Shot,0.539614,0.54,Flan-UL2,GPT-3.5 Turbo,GPT-4 Turbo,Zephyr 7B β
1,0 Shot + System Prompt,0.589622,0.59,Flan-UL2,GPT-3.5 Turbo,GPT-4 Turbo,OpenChat 3.5
2,0 Shot CoT,0.457356,0.46,GPT-3.5 Turbo,GPT-4 Turbo,OpenChat 3.5,Zephyr 7B β
3,2 Shot,0.616398,0.62,GPT-3.5 Turbo,GPT-4 Turbo,OpenChat 3.5,Zephyr 7B β
4,4 Shot,0.664895,0.66,GPT-3.5 Turbo,GPT-4 Turbo,OpenChat 3.5,Zephyr 7B β
5,8 Shot,0.689839,0.69,GPT-3.5 Turbo,GPT-4 Turbo,OpenChat 3.5,Zephyr 7B β
6,2 Shot CoT,0.612643,0.61,GPT-3.5 Turbo,GPT-4 Turbo,LLama 2 13B Chat,Mixtral-8x7B Instruct
7,4 Shot CoT,0.657195,0.66,GPT-4 Turbo,LLama 2 7B Chat,LLama 2 13B Chat,Zephyr 7B β
8,8 Shot CoT,0.675686,0.68,GPT-3.5 Turbo,LLama 2 13B Chat,OpenChat 3.5,Zephyr 7B β


# Compute Krippendorff's alpha per model

In [11]:
def create_df(model_name, dfs):
    res = pd.DataFrame()

    # add the 'text' colum from first df as row
    res['text'] = dfs[0][0]['text']

    for df, name in dfs:

        if model_name in df.columns:
            #merge on 'text' and rename to 'name' not only suffix
            res = res.merge(df[['text', model_name]], on='text', how='left')
            res = res.rename(columns={model_name: name})

    return res

In [12]:
# Compute Krippendorff's alpha per model
columns = ["0 Shot", "0 Shot + System Prompt", "0 Shot CoT", "2 Shot", "4 Shot", "8 Shot", "2 Shot CoT", "4 Shot CoT",
           "8 Shot CoT"]

result_data = []
for babel_label in runs:
    print(babel_label)
    df = create_df(babel_label, dfs)
    score = compute_krippendorff_alpha(df, columns, verbose=True, runs=columns)
    result_data.append({'model': name_by_label[babel_label], 'Alpha': score, "Alpha (rounded)": round(score, 2)})

result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_model.csv', index=False)
result_df

falcon_7b_label
(9, 4021)
flan_t5_base_label
(5, 4021)
flan_t5_large_label
(5, 4021)
flan_t5_xl_label
(5, 4021)
flan_ul2_label
(9, 4021)
gpt_3_5_label
(9, 4021)
gpt_4_label
(9, 4021)
llama_7b_label
(9, 4021)
llama_13b_label
(9, 4021)
mistral_7b_label
(9, 4021)
mixtral_8x7b_label
(9, 4021)
openchat_label
(9, 4021)
zephyr_label
(9, 4021)


Unnamed: 0,model,Alpha,Alpha (rounded)
0,Falcon-7B-Instruct,0.126202,0.13
1,FLAN-T5-Base,0.093991,0.09
2,FLAN-T5-Large,0.194449,0.19
3,FLAN-T5-XL,0.469199,0.47
4,Flan-UL2,0.578945,0.58
5,GPT-3.5 Turbo,0.663122,0.66
6,GPT-4 Turbo,0.780733,0.78
7,LLama 2 7B Chat,0.338822,0.34
8,LLama 2 13B Chat,0.384463,0.38
9,Mistral-7B-v0.1 Instruct,0.346708,0.35


In [13]:
# top 2 runs per model 
result_data = []
for babel_label in runs:
    print(babel_label)
    df = create_df(babel_label, dfs)
    score, combination = compute_krippendorff_alpha_for_k_runs(df, columns, k=2,
                                                               verbose=False, columns=columns)
    result_data.append(
        {'model': name_by_label[babel_label], 'Alpha': score, "Alpha (rounded)": round(score, 2), 'run_1': combination[0],
         'run_2': combination[1]})
    
result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_model_2.csv', index=False)
result_df

falcon_7b_label


100%|██████████| 9/9 [00:00<00:00, 63980.91it/s]


Best Alpha: 0.461073456677047, Best Combination: ('4 Shot', '8 Shot')
flan_t5_base_label


100%|██████████| 9/9 [00:00<00:00, 76106.32it/s]


Best Alpha: 0.31312594666783344, Best Combination: ('0 Shot', '0 Shot + System Prompt')
flan_t5_large_label


100%|██████████| 9/9 [00:00<00:00, 104857.60it/s]


Best Alpha: 0.4315787985466062, Best Combination: ('2 Shot', '4 Shot')
flan_t5_xl_label


100%|██████████| 9/9 [00:00<00:00, 73156.47it/s]


Best Alpha: 0.7828947368421053, Best Combination: ('0 Shot CoT', '4 Shot')
flan_ul2_label


100%|██████████| 9/9 [00:00<00:00, 79976.14it/s]


Best Alpha: 0.7465937179719058, Best Combination: ('2 Shot', '4 Shot')
gpt_3_5_label


100%|██████████| 9/9 [00:00<00:00, 73728.00it/s]


Best Alpha: 0.8128520225294419, Best Combination: ('4 Shot', '8 Shot')
gpt_4_label


100%|██████████| 9/9 [00:00<00:00, 80659.69it/s]


Best Alpha: 0.8861392161757966, Best Combination: ('2 Shot CoT', '4 Shot CoT')
llama_7b_label


100%|██████████| 9/9 [00:00<00:00, 89451.98it/s]


Best Alpha: 0.6463631664543787, Best Combination: ('4 Shot CoT', '8 Shot CoT')
llama_13b_label


100%|██████████| 9/9 [00:00<00:00, 85211.59it/s]


Best Alpha: 0.738510917544824, Best Combination: ('4 Shot CoT', '8 Shot CoT')
mistral_7b_label


100%|██████████| 9/9 [00:00<00:00, 90960.81it/s]


Best Alpha: 0.607256523139082, Best Combination: ('0 Shot', '0 Shot + System Prompt')
mixtral_8x7b_label


100%|██████████| 9/9 [00:00<00:00, 62601.55it/s]


Best Alpha: 0.8496536361364617, Best Combination: ('4 Shot CoT', '8 Shot CoT')
openchat_label


100%|██████████| 9/9 [00:00<00:00, 95808.97it/s]

Best Alpha: 0.7786367986080127, Best Combination: ('4 Shot', '8 Shot')
zephyr_label



100%|██████████| 9/9 [00:00<00:00, 101475.10it/s]


Best Alpha: 0.7901164170832768, Best Combination: ('4 Shot', '8 Shot')


Unnamed: 0,model,Alpha,Alpha (rounded),run_1,run_2
0,Falcon-7B-Instruct,0.461073,0.46,4 Shot,8 Shot
1,FLAN-T5-Base,0.313126,0.31,0 Shot,0 Shot + System Prompt
2,FLAN-T5-Large,0.431579,0.43,2 Shot,4 Shot
3,FLAN-T5-XL,0.782895,0.78,0 Shot CoT,4 Shot
4,Flan-UL2,0.746594,0.75,2 Shot,4 Shot
5,GPT-3.5 Turbo,0.812852,0.81,4 Shot,8 Shot
6,GPT-4 Turbo,0.886139,0.89,2 Shot CoT,4 Shot CoT
7,LLama 2 7B Chat,0.646363,0.65,4 Shot CoT,8 Shot CoT
8,LLama 2 13B Chat,0.738511,0.74,4 Shot CoT,8 Shot CoT
9,Mistral-7B-v0.1 Instruct,0.607257,0.61,0 Shot,0 Shot + System Prompt


In [14]:
# top 3 runs per model 
result_data = []
for babel_label in runs:
    print(babel_label)
    df = create_df(babel_label, dfs)
    score, combination = compute_krippendorff_alpha_for_k_runs(df, columns, k=3,
                                                               verbose=False, columns=columns)
    result_data.append(
        {'model': name_by_label[babel_label], 'Alpha': score, "Alpha (rounded)": round(score, 2), 'run_1': combination[0],
         'run_2': combination[1], 'run_3': combination[2]})

result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_model_3.csv', index=False)
result_df

falcon_7b_label


100%|██████████| 9/9 [00:00<00:00, 68759.08it/s]


Best Alpha: 0.44971081957963044, Best Combination: ('0 Shot CoT', '4 Shot', '8 Shot')
flan_t5_base_label


100%|██████████| 9/9 [00:00<00:00, 66342.24it/s]


Best Alpha: 0.3118649920351201, Best Combination: ('0 Shot', '0 Shot + System Prompt', '0 Shot CoT')
flan_t5_large_label


100%|██████████| 9/9 [00:00<00:00, 107240.73it/s]


Best Alpha: 0.4172444274449998, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot')
flan_t5_xl_label


100%|██████████| 9/9 [00:00<00:00, 91180.52it/s]


Best Alpha: 0.7175203707283418, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot')
flan_ul2_label


100%|██████████| 9/9 [00:00<00:00, 94371.84it/s]


Best Alpha: 0.7427422229160174, Best Combination: ('0 Shot CoT', '4 Shot', '8 Shot')
gpt_3_5_label


100%|██████████| 9/9 [00:00<00:00, 84072.91it/s]


Best Alpha: 0.8090386131245364, Best Combination: ('0 Shot CoT', '4 Shot', '8 Shot')
gpt_4_label


100%|██████████| 9/9 [00:00<00:00, 94136.50it/s]


Best Alpha: 0.8841264733198549, Best Combination: ('0 Shot CoT', '2 Shot CoT', '4 Shot CoT')
llama_7b_label


100%|██████████| 9/9 [00:00<00:00, 75497.47it/s]


Best Alpha: 0.643822991760752, Best Combination: ('0 Shot CoT', '4 Shot CoT', '8 Shot CoT')
llama_13b_label


100%|██████████| 9/9 [00:00<00:00, 115793.67it/s]


Best Alpha: 0.7355260190773996, Best Combination: ('0 Shot CoT', '4 Shot CoT', '8 Shot CoT')
mistral_7b_label


100%|██████████| 9/9 [00:00<00:00, 92295.20it/s]


Best Alpha: 0.6009037304452467, Best Combination: ('0 Shot', '0 Shot + System Prompt', '0 Shot CoT')
mixtral_8x7b_label


100%|██████████| 9/9 [00:00<00:00, 48457.94it/s]


Best Alpha: 0.8454467598218173, Best Combination: ('0 Shot CoT', '4 Shot CoT', '8 Shot CoT')
openchat_label


100%|██████████| 9/9 [00:00<00:00, 110376.42it/s]


Best Alpha: 0.7762071993362333, Best Combination: ('0 Shot CoT', '4 Shot', '8 Shot')
zephyr_label


100%|██████████| 9/9 [00:00<00:00, 94846.07it/s]


Best Alpha: 0.7862925286331757, Best Combination: ('0 Shot CoT', '4 Shot', '8 Shot')


Unnamed: 0,model,Alpha,Alpha (rounded),run_1,run_2,run_3
0,Falcon-7B-Instruct,0.449711,0.45,0 Shot CoT,4 Shot,8 Shot
1,FLAN-T5-Base,0.311865,0.31,0 Shot,0 Shot + System Prompt,0 Shot CoT
2,FLAN-T5-Large,0.417244,0.42,0 Shot CoT,2 Shot,4 Shot
3,FLAN-T5-XL,0.71752,0.72,0 Shot CoT,2 Shot,4 Shot
4,Flan-UL2,0.742742,0.74,0 Shot CoT,4 Shot,8 Shot
5,GPT-3.5 Turbo,0.809039,0.81,0 Shot CoT,4 Shot,8 Shot
6,GPT-4 Turbo,0.884126,0.88,0 Shot CoT,2 Shot CoT,4 Shot CoT
7,LLama 2 7B Chat,0.643823,0.64,0 Shot CoT,4 Shot CoT,8 Shot CoT
8,LLama 2 13B Chat,0.735526,0.74,0 Shot CoT,4 Shot CoT,8 Shot CoT
9,Mistral-7B-v0.1 Instruct,0.600904,0.6,0 Shot,0 Shot + System Prompt,0 Shot CoT


In [15]:
# top 4 runs per model
result_data = []
for babel_label in runs:
    print(babel_label)
    df = create_df(babel_label, dfs)
    score, combination = compute_krippendorff_alpha_for_k_runs(df, columns, k=4,
                                                               verbose=False, columns=columns)
    result_data.append(
        {'model': name_by_label[babel_label], 'Alpha': score, "Alpha (rounded)": round(score, 2), 'run_1': combination[0],
         'run_2': combination[1], 'run_3': combination[2], 'run_4': combination[3]})
    
result_df = pd.DataFrame(result_data)
result_df.to_csv('krippendorff_alpha_per_model_4.csv', index=False)
result_df

falcon_7b_label


100%|██████████| 9/9 [00:00<00:00, 62601.55it/s]


Best Alpha: 0.39076571815145245, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot', '8 Shot')
flan_t5_base_label


100%|██████████| 9/9 [00:00<00:00, 70956.27it/s]


Best Alpha: 0.14358877887689447, Best Combination: ('0 Shot', '0 Shot + System Prompt', '0 Shot CoT', '2 Shot')
flan_t5_large_label


100%|██████████| 9/9 [00:00<00:00, 16740.02it/s]


Best Alpha: 0.2035555541576436, Best Combination: ('0 Shot', '0 Shot + System Prompt', '0 Shot CoT', '2 Shot')
flan_t5_xl_label


100%|██████████| 9/9 [00:00<00:00, 69905.07it/s]


Best Alpha: 0.6365377793837467, Best Combination: ('0 Shot + System Prompt', '0 Shot CoT', '2 Shot', '4 Shot')
flan_ul2_label


100%|██████████| 9/9 [00:00<00:00, 59540.59it/s]


Best Alpha: 0.736434721396401, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot', '8 Shot')
gpt_3_5_label


100%|██████████| 9/9 [00:00<00:00, 79807.05it/s]


Best Alpha: 0.7627887677473331, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot', '8 Shot')
gpt_4_label


100%|██████████| 9/9 [00:00<00:00, 69905.07it/s]


Best Alpha: 0.8665777085391551, Best Combination: ('0 Shot CoT', '2 Shot CoT', '4 Shot CoT', '8 Shot CoT')
llama_7b_label


100%|██████████| 9/9 [00:00<00:00, 108473.38it/s]


Best Alpha: 0.6294201655960702, Best Combination: ('0 Shot CoT', '2 Shot CoT', '4 Shot CoT', '8 Shot CoT')
llama_13b_label


100%|██████████| 9/9 [00:00<00:00, 95325.09it/s]


Best Alpha: 0.7104564784845082, Best Combination: ('0 Shot CoT', '2 Shot CoT', '4 Shot CoT', '8 Shot CoT')
mistral_7b_label


100%|██████████| 9/9 [00:00<00:00, 105443.40it/s]


Best Alpha: 0.5314739338983899, Best Combination: ('0 Shot CoT', '2 Shot CoT', '4 Shot CoT', '8 Shot CoT')
mixtral_8x7b_label


100%|██████████| 9/9 [00:00<00:00, 100663.30it/s]


Best Alpha: 0.8154698954253053, Best Combination: ('0 Shot CoT', '2 Shot CoT', '4 Shot CoT', '8 Shot CoT')
openchat_label


100%|██████████| 9/9 [00:00<00:00, 112347.43it/s]


Best Alpha: 0.7385638119107168, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot', '8 Shot')
zephyr_label


100%|██████████| 9/9 [00:00<00:00, 92295.20it/s]


Best Alpha: 0.7566765821902389, Best Combination: ('0 Shot CoT', '2 Shot', '4 Shot', '8 Shot')


Unnamed: 0,model,Alpha,Alpha (rounded),run_1,run_2,run_3,run_4
0,Falcon-7B-Instruct,0.390766,0.39,0 Shot CoT,2 Shot,4 Shot,8 Shot
1,FLAN-T5-Base,0.143589,0.14,0 Shot,0 Shot + System Prompt,0 Shot CoT,2 Shot
2,FLAN-T5-Large,0.203556,0.2,0 Shot,0 Shot + System Prompt,0 Shot CoT,2 Shot
3,FLAN-T5-XL,0.636538,0.64,0 Shot + System Prompt,0 Shot CoT,2 Shot,4 Shot
4,Flan-UL2,0.736435,0.74,0 Shot CoT,2 Shot,4 Shot,8 Shot
5,GPT-3.5 Turbo,0.762789,0.76,0 Shot CoT,2 Shot,4 Shot,8 Shot
6,GPT-4 Turbo,0.866578,0.87,0 Shot CoT,2 Shot CoT,4 Shot CoT,8 Shot CoT
7,LLama 2 7B Chat,0.62942,0.63,0 Shot CoT,2 Shot CoT,4 Shot CoT,8 Shot CoT
8,LLama 2 13B Chat,0.710456,0.71,0 Shot CoT,2 Shot CoT,4 Shot CoT,8 Shot CoT
9,Mistral-7B-v0.1 Instruct,0.531474,0.53,0 Shot CoT,2 Shot CoT,4 Shot CoT,8 Shot CoT
