In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Read the per-choice correlation table that every plot in this notebook is built from.
all_data = pd.read_csv(
    filepath_or_buffer='../model_results/analysed_data/correlations_per_choice.csv',
)

In [None]:
# Values used below to filter `all_data` and to index the plot arrays:
# models ('model_name' column), answer options ('choice' column) and
# question subsets ('question_subset' column).
model_names = ['Llama3-8b', 'Llama3-70b', 'Yi-34b', 'Mistral-7b']
choices = ['correct_answer', 'distractor1', 'distractor2']
question_types = list(range(1, 6))

# Sizes of the choice / question-type axes, reused by the plotting code.
n_choices = len(choices)
n_question_types = len(question_types)

## Define Plot-Making Function

In [None]:
def create_plot(data, significances, plot_title, y_axis_title, output_filename, lower_y_limit):
    """Draw a 2x2 grid of grouped bar charts (one panel per model) and save it as a PNG.

    Each panel shows, for every question type, one bar per answer choice.
    Bars whose significance flag equals 1 are marked with an asterisk placed
    just beyond the tip of the bar (above positive bars, below negative ones).

    Parameters
    ----------
    data : array-like indexed as ``data[model, choice, question_type]``
        Bar heights. The first axis is walked in the order of the module-level
        ``model_names``; the other two have ``n_choices`` and
        ``n_question_types`` entries.
    significances : array-like with the same indexing as ``data``
        An entry equal to 1 puts an asterisk on the corresponding bar.
    plot_title : str
        Figure-level title.
    y_axis_title : str
        Shared y-axis label drawn at the left of the figure.
    output_filename : str
        File name without extension; the figure is written to
        ``plots/<output_filename>.png``.
    lower_y_limit : float
        Lower bound of the y-axis for all panels (the upper bound is fixed at 1).

    Returns
    -------
    None
        The figure is saved to disk and shown.
    """
    import os  # local import: only needed to make sure the output folder exists

    legend_labels = ['Correct Answer', 'Distractor 1', 'Distractor 2']
    colors = ['#1c7c54', '#ba274a', '#b8b42d']
    width = 0.25  # the width of the bars
    marker_offset = 0.005  # gap between the tip of a bar and its asterisk

    # Set up the figure and subplots
    fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10),
                             sharey=True, sharex=True)

    # Left edge of each question-type group on the x-axis
    positions = np.arange(n_question_types)
    tick_labels = [f"Type {type_idx + 1}" for type_idx in range(n_question_types)]

    legend_handles = []
    for idx, ax in enumerate(axes.flatten()):
        for bar_idx in range(n_choices):
            bar_x = positions + bar_idx * width
            # Each bar series is one answer choice, so label it as such.
            bars = ax.bar(bar_x, data[idx, bar_idx], width,
                          label=legend_labels[bar_idx], color=colors[bar_idx])
            if idx == 0:
                # All panels share the same colours; one set of handles is enough.
                legend_handles.append(bars)

            for pos_idx, pos in enumerate(positions):
                if significances[idx, bar_idx, pos_idx] != 1:
                    continue
                value = data[idx, bar_idx, pos_idx]
                # Put the marker beyond the bar tip: a negative bar extends
                # downwards, so "value + offset" would land inside the bar.
                if value >= 0:
                    marker_y, marker_va = value + marker_offset, 'bottom'
                else:
                    marker_y, marker_va = value - marker_offset, 'top'
                ax.text(pos + bar_idx * width, marker_y, '*', ha='center',
                        va=marker_va, fontsize=16, color='black')

        # Model name as an in-axes title
        ax.text(0.5, 0.93, model_names[idx], fontsize=20,
                horizontalalignment='center',
                transform=ax.transAxes)

        # Centre the tick under the middle bar of each group
        ax.set_xticks(positions + width)
        ax.set_xticklabels(tick_labels, fontsize=17)

        # Same y-range on every panel; the upper limit is always 1
        ax.set_ylim(lower_y_limit, 1)

    # Shared legend below the panels, with explicit handles so labels and
    # colours cannot get out of sync.
    fig.legend(handles=legend_handles, labels=legend_labels, loc='upper center',
               fontsize=17, bbox_to_anchor=(0.52, -0.03), ncol=3)

    # x axis label
    fig.text(0.54, 0, 'Question Type', ha='center', fontsize=23)

    # Main y-axis label
    fig.text(0.02, 0.5, y_axis_title,
             va='center', rotation='vertical', fontsize=23)

    # Main title
    fig.suptitle(plot_title, fontsize=25, x=0.54, y=0.95)

    # Adjust the layout to prevent overlapping
    fig.tight_layout(rect=[0.05, 0.05, 1, 0.96])

    # Create the output folder on first use so saving does not fail
    output_path = 'plots/' + output_filename + '.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    fig.savefig(output_path, dpi=200, bbox_inches='tight')

    # Show plot
    plt.show()

## 1st Token Probability & All Questions

In [None]:
# Build (model, choice, question type) arrays of correlations and significance flags.
# `correctness` is matched as a string against the 'model_correctness' column:
# '-1' for both, 'False' for incorrect, 'True' for correct.
correctness = '-1'
correlation_of_interest = 'spearman_logit'
correlation_of_interest_p_value = 'spearman_p_logit'

array_shape = (len(model_names), n_choices, n_question_types)
data = np.zeros(array_shape)
significances = np.zeros(array_shape)
for model_idx, model_name in enumerate(model_names):
    for choice_idx, choice in enumerate(choices):
        for type_idx, question_type in enumerate(question_types):
            subset = all_data[(all_data['model_name'] == model_name)
                              & (all_data['choice'] == choice)
                              & (all_data['question_subset'] == question_type)
                              & (all_data['model_correctness'] == correctness)]
            # Each combination must match exactly one row; fail loudly otherwise
            # instead of relying on implicit Series -> scalar conversion.
            if len(subset) != 1:
                raise ValueError(
                    f'Expected exactly 1 row for model_name={model_name!r}, choice={choice!r}, '
                    f'question_subset={question_type!r}, model_correctness={correctness!r}; '
                    f'found {len(subset)}.')
            data[model_idx, choice_idx, type_idx] = subset[correlation_of_interest].iloc[0]
            significances[model_idx, choice_idx, type_idx] = (
                subset[correlation_of_interest_p_value].iloc[0] < 0.05)

create_plot(data, significances, '1st Token Probability (All MCQs)', 'Spearman Correlation', 'spear_1st_token_all', lower_y_limit=-0.25)

## 1st Token Probability & Only Correct Questions

In [None]:
# Build (model, choice, question type) arrays of correlations and significance flags.
# `correctness` is matched as a string against the 'model_correctness' column:
# '-1' for both, 'True' for correct
# (presumably 'False' selects incorrect answers - confirm against the CSV).
correctness = 'True'
correlation_of_interest = 'spearman_logit'
correlation_of_interest_p_value = 'spearman_p_logit'

array_shape = (len(model_names), n_choices, n_question_types)
data = np.zeros(array_shape)
significances = np.zeros(array_shape)
for model_idx, model_name in enumerate(model_names):
    for choice_idx, choice in enumerate(choices):
        for type_idx, question_type in enumerate(question_types):
            subset = all_data[(all_data['model_name'] == model_name)
                              & (all_data['choice'] == choice)
                              & (all_data['question_subset'] == question_type)
                              & (all_data['model_correctness'] == correctness)]
            # Each combination must match exactly one row; fail loudly otherwise
            # instead of relying on implicit Series -> scalar conversion.
            if len(subset) != 1:
                raise ValueError(
                    f'Expected exactly 1 row for model_name={model_name!r}, choice={choice!r}, '
                    f'question_subset={question_type!r}, model_correctness={correctness!r}; '
                    f'found {len(subset)}.')
            data[model_idx, choice_idx, type_idx] = subset[correlation_of_interest].iloc[0]
            significances[model_idx, choice_idx, type_idx] = (
                subset[correlation_of_interest_p_value].iloc[0] < 0.05)

create_plot(data, significances, '1st Token Probability (Correct MCQs)', 'Spearman Correlation', 'spear_1st_token_correct', lower_y_limit=-0.25)

## Order Sensitivity & All Questions

In [None]:
# Build (model, choice, question type) arrays of correlations and significance flags.
# `correctness` is matched as a string against the 'model_correctness' column:
# '-1' for both
# (presumably 'True' / 'False' select correct / incorrect answers - confirm against the CSV).
correctness = '-1'
correlation_of_interest = 'spearman_order_prob'
correlation_of_interest_p_value = 'spearman_p_order_prob'

array_shape = (len(model_names), n_choices, n_question_types)
data = np.zeros(array_shape)
significances = np.zeros(array_shape)
for model_idx, model_name in enumerate(model_names):
    for choice_idx, choice in enumerate(choices):
        for type_idx, question_type in enumerate(question_types):
            subset = all_data[(all_data['model_name'] == model_name)
                              & (all_data['choice'] == choice)
                              & (all_data['question_subset'] == question_type)
                              & (all_data['model_correctness'] == correctness)]
            # Each combination must match exactly one row; fail loudly otherwise
            # instead of relying on implicit Series -> scalar conversion.
            if len(subset) != 1:
                raise ValueError(
                    f'Expected exactly 1 row for model_name={model_name!r}, choice={choice!r}, '
                    f'question_subset={question_type!r}, model_correctness={correctness!r}; '
                    f'found {len(subset)}.')
            data[model_idx, choice_idx, type_idx] = subset[correlation_of_interest].iloc[0]
            significances[model_idx, choice_idx, type_idx] = (
                subset[correlation_of_interest_p_value].iloc[0] < 0.05)

create_plot(data, significances, 'Choice Order Sensitivity (All MCQs)', 'Spearman Correlation', 'spear_order_all', lower_y_limit=-0.25)

## Order Sensitivity & Only Correct Questions

In [None]:
# Build (model, choice, question type) arrays of correlations and significance flags.
# `correctness` is matched as a string against the 'model_correctness' column:
# '-1' for both, 'True' for correct
# (presumably 'False' selects incorrect answers - confirm against the CSV).
correctness = 'True'

correlation_of_interest = 'spearman_order_prob'
correlation_of_interest_p_value = 'spearman_p_order_prob'

array_shape = (len(model_names), n_choices, n_question_types)
data = np.zeros(array_shape)
significances = np.zeros(array_shape)
for model_idx, model_name in enumerate(model_names):
    for choice_idx, choice in enumerate(choices):
        for type_idx, question_type in enumerate(question_types):
            subset = all_data[(all_data['model_name'] == model_name)
                              & (all_data['choice'] == choice)
                              & (all_data['question_subset'] == question_type)
                              & (all_data['model_correctness'] == correctness)]
            # Each combination must match exactly one row; fail loudly otherwise
            # instead of relying on implicit Series -> scalar conversion.
            if len(subset) != 1:
                raise ValueError(
                    f'Expected exactly 1 row for model_name={model_name!r}, choice={choice!r}, '
                    f'question_subset={question_type!r}, model_correctness={correctness!r}; '
                    f'found {len(subset)}.')
            data[model_idx, choice_idx, type_idx] = subset[correlation_of_interest].iloc[0]
            significances[model_idx, choice_idx, type_idx] = (
                subset[correlation_of_interest_p_value].iloc[0] < 0.05)

create_plot(data, significances, 'Choice Order Sensitivity (Correct MCQs)', 'Spearman Correlation', 'spear_order_correct', lower_y_limit=-0.4)