In [2]:
import json
import matplotlib.pyplot as plt
import numpy as np


In [6]:
# Models to compare, in the left-to-right order used on the x-axis
models = ['ada', 'babbage', 'curie', 'GPT-3.5-turbo']

# Tick labels for the x-axis, keyed by model name
models_to_short_names = {
    'ada': 'ada',
    'babbage': 'babbage',
    'curie': 'curie',
    'text-davinci-003': 'davinci',
    'davinci': 'davinci',
    'GPT-3.5-turbo': '3.5-turbo',
}

# Parse one results file per model; a missing file raises FileNotFoundError
data = {}
for model in models:
    with open(f'results/CC_results_{model}.json', 'r') as infile:
        data[model] = json.load(infile)

# Every stored count is divided by this value to turn it into an accuracy
# (presumably the number of questions per results file -- TODO confirm)
dataset_size = 1000


def _accuracy_per_model(count_key):
    """Return data[model][count_key] / dataset_size for each entry of `models`, in order."""
    return [data[name][count_key] / dataset_size for name in models]


# Accuracies computed from the *_EM counts
direct_answer_accuracy_EM = _accuracy_per_model('full_question_direct_answer_correct_EM')
chain_of_thought_accuracy_EM = _accuracy_per_model('full_question_chain_of_thought_correct_EM')
self_ask_accuracy_EM = _accuracy_per_model('full_question_self_ask_correct_EM')
subquestions_accuracy_EM = _accuracy_per_model('both_subquestions_correct_EM')

# Accuracies computed from the *_CEM counts
direct_answer_accuracy_CEM = _accuracy_per_model('full_question_direct_answer_correct_CEM')
chain_of_thought_accuracy_CEM = _accuracy_per_model('full_question_chain_of_thought_correct_CEM')
self_ask_accuracy_CEM = _accuracy_per_model('full_question_self_ask_correct_CEM')
subquestions_accuracy_CEM = _accuracy_per_model('both_subquestions_correct_CEM')

# Bar geometry shared by every plot: one group of two bars per model
bar_width = 0.35
x = np.arange(len(models))

# Function to create a bar plot for each prompt methodology
def create_plot(title, full_question_accuracy, subquestions_accuracy, file_name):
    """Draw, save and show a two-bar-per-model accuracy chart with gap labels.

    For every model the left bar shows ``full_question_accuracy`` and the
    right bar shows ``subquestions_accuracy``. A dotted vertical line on the
    left bar spans the two values and is labelled with the relative gap
    ``(sub - full) / sub * 100``, which is also printed.

    Relies on the module-level ``x`` (group positions), ``bar_width``,
    ``models`` and ``models_to_short_names``.

    Args:
        title: Title placed above the axes.
        full_question_accuracy: One accuracy per model, in ``models`` order.
        subquestions_accuracy: One accuracy per model, in ``models`` order.
        file_name: Path handed to ``plt.savefig``.
    """
    fig, ax = plt.subplots()

    ax.bar(x - bar_width / 2, full_question_accuracy, width=bar_width, label='Compositional Question Correct')
    ax.bar(x + bar_width / 2, subquestions_accuracy, width=bar_width, label='Both Subquestions Correct')

    for group_center, full_acc, sub_acc in zip(x, full_question_accuracy, subquestions_accuracy):
        if sub_acc == 0:
            # The gap is expressed as a fraction of the subquestion accuracy,
            # so it is undefined here; skip the label instead of letting a
            # ZeroDivisionError abort the whole figure.
            continue

        comp_gap = (sub_acc - full_acc) / sub_acc * 100
        # The marker sits on the centre of the left (full-question) bar
        x_pos = group_center - bar_width / 2
        ax.vlines(x_pos, full_acc, sub_acc, linestyle='dotted', color='black')
        # Echo the unrounded gap; the label on the plot keeps one decimal
        print(comp_gap)
        ax.annotate(f'{comp_gap:.1f}%', xy=(x_pos, (full_acc + sub_acc) / 2), fontsize=12, color='black', ha='right')

    ax.set_title(title)
    ax.set_xlabel('Model')
    ax.set_ylabel('Accuracy')
    ax.set_xticks(x)
    ax.set_xticklabels([models_to_short_names[model] for model in models])

    # Move the legend below the plot
    ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.15), ncol=2)

    # Save before show(): the figure may no longer be available afterwards
    plt.tight_layout()
    plt.savefig(file_name)
    plt.show()

# Create and save plots for each prompt methodology, using the EM accuracies
create_plot('Compositionality Gap with Direct Answer Prompt', direct_answer_accuracy_EM, subquestions_accuracy_EM, 'direct_answer_EM_plot.png')
create_plot('Chain-of-thought', chain_of_thought_accuracy_EM, subquestions_accuracy_EM, 'chain_of_thought_EM_plot.png')
create_plot('Self-ask', self_ask_accuracy_EM, subquestions_accuracy_EM, 'self_ask_EM_plot_.png')

# Same three plots from the CEM accuracies. These calls must receive the
# *_CEM lists: passing the *_EM lists would write EM figures under CEM names.
create_plot('Compositionality Gap with Direct Answer Prompt', direct_answer_accuracy_CEM, subquestions_accuracy_CEM, 'direct_answer_CEM_plot.png')
create_plot('Chain-of-thought', chain_of_thought_accuracy_CEM, subquestions_accuracy_CEM, 'chain_of_thought_CEM_plot.png')
create_plot('Self-ask', self_ask_accuracy_CEM, subquestions_accuracy_CEM, 'self_ask_CEM_plot_.png')


FileNotFoundError: [Errno 2] No such file or directory: 'results/CC_results_GPT-3.5-turbo.json'