In [1]:
import pandas as pd
import sys
import os
import inspect

# Make the project root (the parent of the notebooks directory) importable.
# The kernel's working directory is used as the notebook directory:
# inspect.getfile() on a notebook cell frame returns an IPython-internal
# pseudo/temporary filename rather than the notebook's own path, so deriving
# the directory from it is unreliable.
# NOTE(review): assumes the kernel was started in the notebooks directory
# (Jupyter's default) — confirm if the notebook is run from elsewhere.
currentdir = os.getcwd()
parentdir = os.path.dirname(currentdir)
# Guard the insert so re-running this cell does not stack duplicate entries.
if parentdir not in sys.path:
    sys.path.insert(0, parentdir)
# Load IPython's autoreload extension and switch it to mode 2 so that edits to
# imported project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2
# NOTE(review): reloading the extension right after loading it looks redundant
# — confirm before removing.
%reload_ext autoreload

### Evaluation using Mistral

In [54]:
# Merge the per-sample Mistral evaluation files into one CSV per prompting
# strategy, adding a `score_<metric>` column extracted from each
# `explanation_<metric>` text and a `prompting` column.
output_evaluation_folder_path = 'output_evaluation/'
output_evaluation_mistral_folder_name = output_evaluation_folder_path + 'Mistral/'

prompting_strategies = ['NoContext', 'Context']
# Metrics to extract; the order fixes the order of the added score columns.
score_metrics = ['accuracy', 'content_preservation', 'fluency']

for prompting in prompting_strategies:
    # Collect the frames and concatenate once at the end; concatenating inside
    # the loop re-copies the growing DataFrame for every file.
    evaluation_frames = []
    # Running index of matching files for this strategy (used in diagnostics only).
    file_id = 0
    for subdir, _dirs, files in os.walk(output_evaluation_mistral_folder_name + prompting):
        for file in files:
            filepath = os.path.join(subdir, file)
            if not filepath.endswith("mistral-medium.csv"):
                continue

            temp_output = pd.read_csv(filepath)
            for metric in score_metrics:
                try:
                    # Parse the raw cell text rather than the printed repr of a
                    # Series: the score is whatever follows the first ': ' on
                    # the first line of the explanation.
                    # The first row's score is broadcast to every row of the
                    # file (assumes one evaluation per file — TODO confirm).
                    explanation = str(temp_output['explanation_' + metric].iloc[0])
                    score = explanation.split('\n')[0].split(': ')[1].strip()
                except (IndexError, KeyError) as e:
                    # Missing column, empty file, or no "<label>: <score>" on
                    # the first line: report it and leave the score blank so it
                    # can be filled in manually afterwards.
                    print(file_id)
                    print(filepath)
                    print("score not found", metric, e)
                    score = ""
                temp_output['score_' + metric] = score
            temp_output['prompting'] = prompting

            evaluation_frames.append(temp_output)
            file_id = file_id + 1

    # pd.concat raises on an empty list, so fall back to an empty frame when no
    # matching file was found (an empty CSV is still written, as before).
    if evaluation_frames:
        output_df = pd.concat(evaluation_frames, ignore_index=True)
    else:
        output_df = pd.DataFrame()

    # save to csv
    output_df.to_csv(output_evaluation_folder_path + 'Evaluation_' + prompting + '_mistral-medium.csv', index=False)



682
output_evaluation/Mistral/NoContext/evaluation_NoContext_1153_mistral-medium.csv
score not found list index out of range


### Evaluation using GPT

In [55]:
# Merge the per-sample GPT-4 evaluation files into one CSV per prompting
# strategy, adding a `score_<metric>` column extracted from each
# `explanation_<metric>` text and a `prompting` column.
output_evaluation_folder_path = 'output_evaluation/'
# Named for what it holds: this cell previously stored the GPT folder in the
# Mistral-named variable, silently overwriting the Mistral path.
output_evaluation_gpt_folder_name = output_evaluation_folder_path + 'GPT/'

prompting_strategies = ['NoContext', 'Context']
# Metrics to extract; the order fixes the order of the added score columns.
score_metrics = ['accuracy', 'content_preservation', 'fluency']

for prompting in prompting_strategies:
    # Collect the frames and concatenate once at the end; concatenating inside
    # the loop re-copies the growing DataFrame for every file.
    evaluation_frames = []
    # Running index of matching files for this strategy (used in diagnostics only).
    file_id = 0
    for subdir, _dirs, files in os.walk(output_evaluation_gpt_folder_name + prompting):
        for file in files:
            filepath = os.path.join(subdir, file)
            if not filepath.endswith("gpt-4.csv"):
                continue

            temp_output = pd.read_csv(filepath)
            for metric in score_metrics:
                try:
                    # Parse the raw cell text rather than the printed repr of a
                    # Series: the score is whatever follows the first ': ' on
                    # the first line of the explanation.
                    # The first row's score is broadcast to every row of the
                    # file (assumes one evaluation per file — TODO confirm).
                    explanation = str(temp_output['explanation_' + metric].iloc[0])
                    score = explanation.split('\n')[0].split(': ')[1].strip()
                except (IndexError, KeyError) as e:
                    # Missing column, empty file, or no "<label>: <score>" on
                    # the first line: report it and leave the score blank so it
                    # can be filled in manually afterwards.
                    print(file_id)
                    print(filepath)
                    print("score not found", metric, e)
                    score = ""
                temp_output['score_' + metric] = score
            temp_output['prompting'] = prompting

            evaluation_frames.append(temp_output)
            file_id = file_id + 1

    # pd.concat raises on an empty list, so fall back to an empty frame when no
    # matching file was found (an empty CSV is still written, as before).
    if evaluation_frames:
        output_df = pd.concat(evaluation_frames, ignore_index=True)
    else:
        output_df = pd.DataFrame()

    # save to csv
    output_df.to_csv(output_evaluation_folder_path + 'Evaluation_' + prompting + '_gpt-4.csv', index=False)



### Summarize scores

In [74]:
# The aggregated files were checked manually after post-processing. In a few
# cases the score had not been extracted correctly from the explanation; those
# scores were adjusted by hand. The `*_corrected.csv` files loaded below are
# the result of that manual pass.

mistral_noContext = pd.read_csv(output_evaluation_folder_path + 'Evaluation_NoContext_mistral-medium_corrected.csv')
# Read the Mistral file for the Context strategy; this line previously loaded
# the GPT-4 Context file, so both "context" frames held the same GPT-4 data.
mistral_context = pd.read_csv(output_evaluation_folder_path + 'Evaluation_Context_mistral-medium_corrected.csv')
gpt_noContext = pd.read_csv(output_evaluation_folder_path + 'Evaluation_NoContext_gpt-4_corrected.csv')
gpt_context = pd.read_csv(output_evaluation_folder_path + 'Evaluation_Context_gpt-4_corrected.csv')

In [85]:
# Mean of every score column per (model, promptID, shots) group of the
# `mistral_noContext` frame, written out as a summary CSV (group keys kept as index).
summ_mistral_noContext = (
    mistral_noContext
    .groupby(['model', 'promptID', 'shots'])
    .agg(dict.fromkeys(['score_accuracy', 'score_content_preservation', 'score_fluency'], 'mean'))
)
summ_mistral_noContext.to_csv(
    output_evaluation_folder_path + 'Evaluation_NoContext_mistral-medium_summary.csv',
    index=True,
)

In [86]:
# Average the three score columns of `mistral_context` within each
# (model, promptID, shots) group and store the table as a summary CSV.
summ_mistral_context = mistral_context.groupby(
    ['model', 'promptID', 'shots']
).agg(
    {
        column: 'mean'
        for column in ('score_accuracy', 'score_content_preservation', 'score_fluency')
    }
)
summ_mistral_context.to_csv(
    output_evaluation_folder_path + 'Evaluation_Context_mistral-medium_summary.csv',
    index=True,
)


In [87]:
# Mean score per (model, promptID, shots) for the GPT-4 NoContext evaluation.
# Group the GPT frame: this previously summarised `mistral_noContext`, so the
# GPT-4 summary file contained Mistral numbers.
summ_gpt_noContext = gpt_noContext.groupby(['model', 'promptID', 'shots']).agg({'score_accuracy': 'mean', 'score_content_preservation': 'mean', 'score_fluency': 'mean'})
summ_gpt_noContext.to_csv(output_evaluation_folder_path + 'Evaluation_NoContext_gpt-4_summary.csv', index=True)

In [88]:
# Mean score per (model, promptID, shots) for the GPT-4 Context evaluation.
# Group the GPT frame: this previously summarised `mistral_context` instead of
# `gpt_context`.
summ_gpt_context = gpt_context.groupby(['model', 'promptID', 'shots']).agg({'score_accuracy': 'mean', 'score_content_preservation': 'mean', 'score_fluency': 'mean'})
summ_gpt_context.to_csv(output_evaluation_folder_path + 'Evaluation_Context_gpt-4_summary.csv', index=True)