In [78]:
import pandas as pd
import configparser
import sys
import os
import inspect
import evaluation as eval

# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [10]:
config = configparser.ConfigParser()
# Read the configuration file. ConfigParser.read() silently skips files it
# cannot open and returns the list of files it actually parsed, so fail early
# with a clear message instead of a later NoSectionError.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini was not found (or could not be read) in the current working directory"
    )
# Values from the [credentials] section of config.ini.
api_key_openai = config.get('credentials', 'api_key_openai')
api_key_mistral = config.get('credentials', 'api_key_mistral')
surfdrive_url_transcript_sentences = config.get('credentials', 'surfdrive_url_transcript_sentences')

# Folder (relative to the working directory) holding the LLM output CSVs.
output_evaluation_folder_path = 'output_evaluation/'

### Input

In [21]:
# read output of LLM
fileWithoutContext = 'Prompts_NoContext_complete_output.csv'
fileWithContext = 'Prompts_Context_complete_output.csv'

# Each frame is loaded from its own file; the previous code referenced an
# undefined name `file` for both reads.
outputWithoutContext = pd.read_csv(os.path.join(output_evaluation_folder_path, fileWithoutContext))
outputWithContext = pd.read_csv(os.path.join(output_evaluation_folder_path, fileWithContext))

# read rewritten sentences by participants, keeping only the columns used below
transcriptSentences = pd.read_csv(surfdrive_url_transcript_sentences).reset_index()[['user', 'original', 'your_text']]

# merge each LLM output with the participants' rewrites
# (pd.merge default: inner join on the listed key columns)
data_for_evaluation_withoutContext = pd.merge(outputWithoutContext, transcriptSentences, on=['user', 'original'])
data_for_evaluation_withContext = pd.merge(outputWithContext, transcriptSentences, on=['user', 'original'])


### Evaluation Prompts


In [83]:
# Prompt fragments passed to the evaluation helpers. Fragments containing
# '{}' are templates with a single placeholder for a paragraph.
prompts_dict = dict(
    # Shared opening used by every evaluation prompt
    prompt_llm='You are an expert in text style transfer. Here is paragraph S1: {} ',

    # Style-transfer accuracy
    prompt_accuracy_s2='and paragraph S2: {} ',
    prompt_accuracy_inference=(
        'How different is the conversational style of paragraph S2 compared to S1 '
        'on a continuous scale from 1 (completely identical styles) to 10 '
        '(completely different styles)? Result = (Only provide the score)'
    ),

    # Content preservation
    prompt_content_preservation_s2='and paragraph S2: {} ',
    prompt_content_preservation_inference=(
        'How much does S1 preserve the content of S2 on a continuous scale '
        'from 1 (completely different topic) to 10 (identical topic)? '
        'Result = (Only provide the score)'
    ),

    # Fluency
    prompt_fluency_inference=(
        'on a scale from 1 to 10 where 1 (lowest coherent) and 10 (highest coherent)? '
        'Result = (Only provide the score)'
    ),
)



### Evaluation using Mistral

In [ ]:
# without context: run the Mistral evaluation helper on the frame merged from
# the no-context LLM output.
# NOTE(review): the third argument looks like a run label -- confirm its
# meaning in evaluation.get_evaluation_mistral.
output_withoutContext = eval.get_evaluation_mistral(prompts_dict, data_for_evaluation_withoutContext, 'withoutContext')

In [ ]:
# with context: run the Mistral evaluation helper on the frame merged from the
# with-context LLM output (the without-context frame was previously passed
# here, which did not match the 'withContext' label).
output_withContext = eval.get_evaluation_mistral(prompts_dict, data_for_evaluation_withContext, 'withContext')

### Evaluation using GPT

In [ ]:
# without context: run the GPT evaluation helper on the frame merged from the
# no-context LLM output.
# NOTE(review): this reassigns `output_withoutContext`, the same name the
# Mistral cell assigns -- keep a separate reference if both results are needed.
output_withoutContext = eval.get_evaluation_gpt(prompts_dict, data_for_evaluation_withoutContext, 'withoutContext')

In [ ]:
# with context: run the GPT evaluation helper on the frame merged from the
# with-context LLM output (the without-context frame was previously passed
# here, which did not match the 'withContext' label).
# NOTE(review): this reassigns `output_withContext`, the same name the Mistral
# cell assigns -- keep a separate reference if both results are needed.
output_withContext = eval.get_evaluation_gpt(prompts_dict, data_for_evaluation_withContext, 'withContext')

### Scratch testing (ignore)

In [ ]:
# Scratch cell: build the three evaluation prompts for every row and attach a
# hard-coded placeholder score.
# Work on a copy so the placeholder 'score' column is not written into the
# frame that the evaluation cells consume.
output_df = data_for_evaluation_withContext.copy()
scores = []
for index, row in output_df.iterrows():

    prompt_accuracy = eval.get_accuracy_prompt(prompts_dict, row)
    prompt_content_preservation = eval.get_content_preservation_prompt(prompts_dict, row)
    prompt_fluency = eval.get_fluency_prompt(prompts_dict, row)

    # placeholder value; no model is queried for a score in this cell
    score = 8
    scores.append(score)

    # Show an example prompt for rows whose first column equals 0.
    # .iloc makes the positional access explicit; plain row[0] relies on the
    # deprecated positional fallback of Series.__getitem__.
    if row.iloc[0] == 0:
        print(prompt_accuracy)

output_df['score'] = scores
