In [90]:
import pandas as pd
import sys
import os
import inspect
import s5_postprocess_evaluation as pe
import configparser
import s6_display_results as dr
import s4_evaluation as evl
# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### For the reflection example, we are taking the following usecase: mistral medium (as tst and evaluator), non-parallel data prompt, 5 shots and no-context

In [91]:
df = dr.get_input_data()
###  'cutting' the input dataframe, evaluator is mistral medium, non-parallel data only (promptID 2), 
df = df[(df.evaluator == 'mistral-medium') & (df.isParallel == False) & (df.shots == 5) & (df.prompting == 'NoContext') & (df.model == 'mistral-medium')]
df.head(3)

Unnamed: 0,fileID,user,promptID,model,shots,runID,original,rewritten_sentence,your_text,explanation_accuracy,explanation_content_preservation,explanation_fluency,score_accuracy,score_content_preservation,score_fluency,prompting,evaluator,isParallel
4,231,U1,2,mistral-medium,5,911714,I'm all about fast food. I'm having some fries...,"Hey, I'm all about that fast food, y'all! I'm ...","I just love fast food! I’ll have some fries, s...",Score: 4\n\nExplanation: Both paragraphs S1 an...,Score: 10\nExplanation: Both S1 and S2 convey ...,Score: 9\n\nExplanation: The text is highly co...,4.0,10.0,9.0,NoContext,mistral-medium,False
24,231,U1,2,mistral-medium,5,911714,Just having this vegan hotdog from the school ...,"Ah, just grabbing this vegan hotdog from the s...",Im having this vegan hotdog at school today. I...,Score: 3\n\nExplanation: Both paragraphs conve...,Score: 10\n\nExplanation: Both paragraphs disc...,Score: 9\n\nExplanation: The text is coherent ...,3.0,10.0,9.0,NoContext,mistral-medium,False
118,202,U6,2,mistral-medium,5,209327,"I went to the ""Groene Burger"" fast food restau...","So, I just popped by the 'Groene Burger' fast ...",I went to the “Groene Burger” fast food joint....,Score: 8\n\nExplanation: Paragraph S1 has a mo...,Score: 10\n\nExplanation: Both S1 and S2 conve...,Score: 10\nExplanation: The text is highly coh...,8.0,10.0,10.0,NoContext,mistral-medium,False


In [92]:
df.shape

(96, 18)

### Perform the evaluation step (one more time) - only on accuracy 

In [67]:
prompts_dict_new_accuracy = {
    # Generic prompt
    'prompt_llm': 'You are an expert in text style transfer. Here is text T1, supposedly writen in a style of person X: {} ',
                 
    # Accuracy
    'prompt_s2': 'and here is another text T2, containing are a set of unrelated sentences (separated with ;), that are actually written by person X: {} ',
    'prompt_inference': 'How different is the conversational style in T1 and T2 on a continuous scale from 1 (completely identical styles) to 10 (completely different styles)? For scoring focus on semantics and syntax. Difference in discussed topics or contexts is irrelevant for the score. Result = . Format result as "score" and "explanation".',
}

In [72]:
evl_df = pd.DataFrame()
for index, row in df.iterrows():
    evl_df = pd.concat([evl_df, evl.get_updated_evaluation_mistral(prompts_dict_new_accuracy, row, 'NoContext_' + format(index))], ignore_index=True)


In [231]:
evl_df.to_csv('f8_llm_evaluation_data/Mistral/0_loop_updated_accuracy_evaluation_summary.csv')

In [94]:
eval_step = evl_df[['rewritten_sentence','score_accuracy','explanation_accuracy']]
eval_step

Unnamed: 0,rewritten_sentence,score_accuracy,explanation_accuracy
0,"Hey, I'm all about that fast food, y'all! I'm ...",8,Score: 8\n\nExplanation: The conversational st...
1,"Ah, just grabbing this vegan hotdog from the s...",6,Score: 6\n\nExplanation: The conversational st...
2,"So, I just popped by the 'Groene Burger' fast ...",6,Score: 6\n\nExplanation: Although both texts a...
3,"So, you know that vegan fried chicken from KFC...",6,Score: 6\n\nExplanation:\n\nAlthough both text...
4,"So you know what, I just went to this 'Groene ...",7,Score: 7\n\nExplanation: Although both texts a...
...,...,...,...
91,"Hey, just had my first vegan cake at 'groene b...",7,Score: 7\n\nExplanation: The conversational st...
92,"Sure thing, getting my daily dose of vitamins ...",6,Score: 6\n\nExplanation: The syntax and semant...
93,"So, you know what, I'm totally into fast food....",6,Score: 6\n\nExplanation:\n\nAlthough the texts...
94,"You know what, I'm just vibing with this vegan...",7,Score: 7\n\nExplanation: The conversational st...


In [109]:
# for i, value in eval_step['explanation_accuracy'].items():
#     # Find the index of "Explanation:"
#     index = value.index("Explanation:") + len("Explanation:")
#     # Extract the text after "Explanation:"
#     value = value[index:]
#     # print(i,value,'\n')
#     eval_step['explanation_accuracy'].iloc[i] = value

In [96]:
df['new_score_accuracy'] = eval_step['score_accuracy']
df['new_explanation_accuracy'] = eval_step['explanation_accuracy']

In [97]:
df['score_accuracy'].astype(float).mean(),df['new_score_accuracy'].astype(float).mean()

(4.895833333333333, 7.0)

### Perform the feedback step

In [130]:
prompts_dict_feedback = {
    # Generic prompt
    'prompt_llm': 'You are an AI model that improves a text style transfer based on provided feedback. Here is text T1, supposedly writen in a style of person X: {} ',
                 
    # Accuracy
    'prompt_s2': 'and here is another text T2, containing are a set of unrelated sentences (separated with ;), that are actually written by person X: {} ',
    'prompt_inference': 'Use the following feedback on text style difference between T1 and T2: {}, to rewrite T1 so that it is more similar in style to T2. Keep the style conversational and informal. Result = . Format result as "sentence" and "explanation".',
}

In [158]:
feedback_df = pd.DataFrame()
for index, row in df.iterrows():
    feedback_df = pd.concat([feedback_df, evl.get_updated_evaluation_mistral(prompts_dict_feedback, row, 'NoContext_' + format(index),'feedback_generation')], ignore_index=True)


In [230]:
feedback_df.to_csv('f8_llm_evaluation_data/Mistral/0_loop_updated_feedback_summary.csv')

In [200]:
def remove_quotes(s):
    if s.startswith(' "'):
        s = s[2:]
    if s.endswith('"'):
        s = s[:-1]
    return s

In [215]:
for i,r in feedback_df.iterrows():
    s = remove_quotes(r['explanation_feedback'].split(':')[1].split('\n')[0])
    if s:
        feedback_df['tst_feedback'].iloc[i] = s 
    else:
        # some sentences are messed up, as Sentence: is followed with new line, so they require different handling 
        feedback_df['tst_feedback'].iloc[i] = r['explanation_feedback'].split(':')[1].strip().split('\n')[0]

### Evaluate the feedback - regenerate accuracy

In [221]:
prompts_dict_new_accuracy = {
    # Generic prompt
    'prompt_llm': 'You are an expert in text style transfer. Here is text T1, supposedly writen in a style of person X: {} ',
                 
    # Accuracy
    'prompt_s2': 'and here is another text T2, containing are a set of unrelated sentences (separated with ;), that are actually written by person X: {} ',
    'prompt_inference': 'How different is the conversational style in T1 and T2 on a continuous scale from 1 (completely identical styles) to 10 (completely different styles)? For scoring focus on semantics and syntax. Difference in discussed topics or contexts is irrelevant for the score. Result = . Format result as "score" and "explanation".',
}

In [224]:
feedback_tst_evl_df = pd.DataFrame()
for index, row in feedback_df.iterrows():
    feedback_tst_evl_df = pd.concat([feedback_tst_evl_df, evl.get_updated_evaluation_mistral(prompts_dict_new_accuracy, row, 'NoContext_' + format(index),'feedback_evaluation')], ignore_index=True)


In [229]:
feedback_tst_evl_df.to_csv('f8_llm_evaluation_data/Mistral/0_loop_accuracy_feedback_summary.csv')

In [232]:
feedback_tst_evl_df.columns

Index(['index', 'fileID', 'user', 'promptID', 'model', 'shots', 'runID',
       'original', 'rewritten_sentence', 'your_text', 'explanation_accuracy',
       'explanation_content_preservation', 'explanation_fluency',
       'score_accuracy', 'score_content_preservation', 'score_fluency',
       'prompting', 'evaluator', 'isParallel', 'new_score_accuracy',
       'new_explanation_accuracy', 'tst_feedback', 'explanation_feedback',
       'accuracy_feedback', 'explanation_feedback_evaluation'],
      dtype='object')

In [237]:
''.join(feedback_tst_evl_df['explanation_feedback_evaluation'][0:2])

'Score: 7\n\nExplanation: While both texts are written in an informal style, there are notable differences in semantics and syntax that contribute to a score of 7 on a 10-point scale.\n\nIn T1, the text is descriptive and focuses on the speaker\'s current actions and plans, using present continuous tense ("I\'m munching", "sipping", "looking forward") and concrete nouns ("fries", "soy burger", "coke", "football game"). The sentence structure is simple and straightforward.\n\nIn contrast, T2 consists of several unrelated sentences that are more interactive and dialogue-based, with the speaker responding to different prompts or addressing different people ("tell me about your family", "I\'ll do the export", "im curious", "no skiing for me"). The sentences are shorter and more fragmented, with the use of ellipses and exclamation marks to convey the speaker\'s tone. There is also more variation in sentence structure, with questions, statements, and commands.\n\nOverall, while both texts re

In [228]:
df['score_accuracy'].astype(float).mean(),df['new_score_accuracy'].astype(float).mean(),feedback_tst_evl_df['accuracy_feedback'].astype(float).mean()

(4.895833333333333, 7.0, 5.6875)

# Second loop

In [None]:
feedback_df = pd.DataFrame()
for index, row in feedback_tst_evl_df.iterrows():
    feedback_df = pd.concat([feedback_df, evl.get_updated_evaluation_mistral(prompts_dict_feedback, row, 'NoContext_' + format(index),'feedback_generation')], ignore_index=True)
