In [6]:
import pandas as pd
import sys
import os
import inspect
import s5_postprocess_evaluation as pe
import s6_display_results as dr
import s4_evaluation as evl
import warnings
from tqdm import tqdm

# Ignore all warnings
warnings.filterwarnings("ignore")
# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### For the reflection example, we use the following use case: Mistral Medium (as both the TST model and the evaluator), the non-parallel data prompt, 5 shots, and no context

In [8]:
# Load the evaluation records through the project's display helper.
df = dr.get_input_data()

# Cut the input down to the single configuration studied in this notebook:
# mistral-medium as both style-transfer model and evaluator, non-parallel data
# only (promptID 2), 5 shots, and the 'NoContext' prompting variant.
selection = (
    (df.evaluator == 'mistral-medium')
    & (df.isParallel == False)
    & (df.shots == 5)
    & (df.prompting == 'NoContext')
    & (df.model == 'mistral-medium')
)
df = df[selection]
df = evl.extract_explanation(df, 'explanation_accuracy')

# Keep only the first two rows for the demo. The explicit .copy() makes `df`
# an independent frame: the refinement loop later adds columns to it, and
# assigning into a bare slice is a SettingWithCopy hazard (hidden here because
# all warnings are silenced).
df = df.reset_index(drop=True).iloc[:2].copy()
df

Unnamed: 0,fileID,user,promptID,model,shots,runID,neutral_sentence,tst_sentence_0,user_sentence,explanation_accuracy,explanation_content_preservation,explanation_fluency,score_accuracy,score_content_preservation,score_fluency,prompting,evaluator,isParallel
0,231,U1,2,mistral-medium,5,911714,I'm all about fast food. I'm having some fries...,"Hey, I'm all about that fast food, y'all! I'm ...","I just love fast food! I’ll have some fries, s...","Both paragraphs S1 and S2 have an informal, c...",Score: 10\nExplanation: Both S1 and S2 convey ...,Score: 9\n\nExplanation: The text is highly co...,4.0,10.0,9.0,NoContext,mistral-medium,False
1,231,U1,2,mistral-medium,5,911714,Just having this vegan hotdog from the school ...,"Ah, just grabbing this vegan hotdog from the s...",Im having this vegan hotdog at school today. I...,Both paragraphs convey a similar informal and...,Score: 10\n\nExplanation: Both paragraphs disc...,Score: 9\n\nExplanation: The text is coherent ...,3.0,10.0,9.0,NoContext,mistral-medium,False


In [47]:
# Fragments of the style-accuracy evaluation prompt, keyed in the order they
# appear in the prompt. The '{}' markers are placeholders; presumably they are
# filled in by the s4_evaluation helpers — verify there.
# Note the scale stated in prompt_p3: 1 = identical styles, 10 = completely
# different styles, i.e. a LOWER score means a closer style match.
accuracy_prompt_dict = dict(
    # Role statement plus the style-transferred text (T1).
    prompt_p1='''You are an expert in text style transfer. Here is text T1, supposedly writen in a style of person X: {} ''',

    # Reference sentences actually written by the target person (T2).
    prompt_p2='''and here is another text T2, containing are a set of unrelated sentences (separated with a semicolon ;), that are actually written by person X: {} ''',

    # Scoring instruction and expected answer format.
    prompt_p3='''How different is the conversational style in T1 and T2 on a continuous scale from 1 (completely identical styles)
    to 10 (completely different styles)? For scoring focus on semantics and syntax.
    Difference in discussed topics or contexts is irrelevant for the score. Result = . Format result as "score" and "explanation".''',
)

# Fragments of the refinement prompt that asks the model to rewrite T1 using
# the evaluator's feedback. Each fragment carries one '{}' placeholder;
# presumably they are filled in by the s4_evaluation helpers — verify there.
refine_prompt_dict = dict(
    # Role statement plus the current style-transferred text (T1).
    prompt_p1='''You are an AI model that improves a text style transfer based on provided feedback. 
    Here is text T1, supposedly writen in a style of person X: {} ''',

    # Reference sentences actually written by the target person (T2).
    prompt_p2='''and here is another text T2, containing are a set of unrelated sentences (separated with ;),
    that are actually written by person X: {} ''',

    # Rewrite instruction: feedback placeholder, constraints, answer format.
    prompt_p3='''Use the following feedback on text style difference between T1 and T2: {},
    to rewrite T1 so that it is more similar in style to T2. Keep the style conversational and informal. Keep the original number of sentences. Do not add new sentences based on learned feedback. Do not add context based on the sentences written from person X.
    Result = . Format result as "sentence" and "explanation".''',
)

In [26]:

# Configuration of the evaluate -> refine cycle.
loops = list(range(3))     # loop ids 0, 1, 2
loops_length = len(loops)  # number of evaluation rounds
loop_id = 0                # initial value; the loop cell rebinds it on every iteration
random_shots = True        # flag forwarded to evl.get_accuracy_score in the loop cell

In [48]:
# Evaluate -> refine cycle.
# Every pass scores the current 'tst_sentence_<loop_id>' against the user's own
# sentences; every pass except the last then asks the model to rewrite the
# sentence using that feedback, producing 'tst_sentence_<loop_id + 1>'.
# Column assignments into `df` align on the index: `df` was reset to 0..n-1 and
# the concatenated frames use ignore_index=True.
# (assumes each evl call returns exactly one row per input row — TODO confirm)
for loop_id in tqdm(loops):
    # --- 1. Score style accuracy of the current sentences -------------------
    # Collect the per-row frames and concatenate once, instead of re-copying a
    # growing DataFrame on every row.
    evl_frames = [
        evl.get_accuracy_score(accuracy_prompt_dict, row, 'NoContext_' + str(index), loop_id, random_shots)
        for index, row in df.iterrows()
    ]
    evl_df = pd.concat(evl_frames, ignore_index=True)

    evl_df = evl.extract_explanation(evl_df, 'explanation_accuracy_' + str(loop_id))
    evl_df.to_csv('f8_llm_evaluation_data/Mistral/accuracy_update_summary_loop_' + str(loop_id) + '.csv')

    df['score_accuracy_' + str(loop_id)] = evl_df['score_accuracy_' + str(loop_id)]
    df['explanation_accuracy_' + str(loop_id)] = evl_df['explanation_accuracy_' + str(loop_id)]

    # --- 2. Refine, except after the final evaluation -----------------------
    # Compare against the last configured loop id rather than len(loops) - 1,
    # so the check stays correct when `loops` is not exactly 0..n-1.
    if loop_id != loops[-1]:
        feedback_frames = [
            evl.get_refinement_feedback(refine_prompt_dict, row, 'NoContext_' + str(index), loop_id)
            for index, row in df.iterrows()
        ]
        feedback_df = pd.concat(feedback_frames, ignore_index=True)

        feedback_df = evl.extract_feedback(feedback_df, loop_id)
        feedback_df.to_csv('f8_llm_evaluation_data/Mistral/' + "refine_summary_loop_" + str(loop_id) + '.csv')

        df['tst_sentence_' + str(loop_id + 1)] = feedback_df['tst_sentence_' + str(loop_id + 1)]
        # NOTE(review): this stores the *input* sentence of this round
        # ('tst_sentence_<loop_id>') under an "explanation" column name, so
        # 'explanation_tst_feedback_<loop_id>' merely duplicates
        # 'tst_sentence_<loop_id>'. It probably should read the explanation
        # column produced by evl.extract_feedback — confirm that column's name
        # and switch the right-hand side accordingly.
        df['explanation_tst_feedback_' + str(loop_id)] = feedback_df['tst_sentence_' + str(loop_id)]

  0%|          | 0/3 [00:00<?, ?it/s]

Refinement Feedback Prompt Query:  You are an AI model that improves a text style transfer based on provided feedback. 
    Here is text T1, supposedly writen in a style of person X: {Hey, I'm all about that fast food, y'all! I'm chowing down on some fries, a soy burger, and a coke over at the burger joint near school. And come the weekend, I'm all about the football game!} and here is another text T2, containing are a set of unrelated sentences (separated with ;),
    that are actually written by person X: {ah I think its fine! this is probably related to their privacy statement; tell me about your family, cos we are running out of time; i ll do the export, so yeah we are done!; im curious, I ll start with some analysis later. thanks Kim!!; no skiing for me, I never learned it. I think im a bit scared to do that} Use the following feedback on text style difference between T1 and T2: {
The conversational styles in T1 and T2 are quite different in terms of semantics and syntax. In T1, t

 33%|███▎      | 1/3 [00:35<01:10, 35.25s/it]

Refinement Feedback Prompt Response:  Sentence:
Ah, grabbing this vegan hotdog from the school canteen, it's got a nice crunch to it and a bit of a kick! Yeah, just relaxing in the afternoon with the playstation, I think I might challenge my friend to a game later.

Explanation:
In order to make T1 more similar in style to T2, I added a conversational element to the second sentence by including a potential plan to challenge a friend to a game later. This adds a social aspect that is present in T2, while still keeping the focus on the speaker's personal experience. I also used contractions and ellipsis to make the syntax more similar to T2. The semantics are still focused on the speaker's personal experience, but the addition of a potential social plan makes the text more conversational and varied in syntax like T2.  

Refinement Feedback Prompt Query:  You are an AI model that improves a text style transfer based on provided feedback. 
    Here is text T1, supposedly writen in a style 

 67%|██████▋   | 2/3 [01:15<00:38, 38.29s/it]

Refinement Feedback Prompt Response:  Sentence: "Ah, think this vegan hotdog's alright from the school canteen, it's got a nice crunch to it! So, what about chillin' with the playstation this afternoon, I might challenge my friend to a game later."

Explanation: I've rephrased the first sentence to make it more similar to the conversational style of T2 by turning it into a statement addressed to the listener and incorporating future tense ("I might challenge"). I also added a question ("So, what about...") to mimic the question-based sentences found in T2. The second sentence was adjusted to employ future tense and conditional constructions, as seen in T2. However, I did not add any new idiomatic expressions since the instructions asked to avoid adding new content based on learned feedback.  



100%|██████████| 3/3 [01:36<00:00, 32.05s/it]


In [42]:
# List the columns now present on df, including the per-loop score,
# explanation and rewritten-sentence columns added by the loop cell.
df.columns

Index(['fileID', 'user', 'promptID', 'model', 'shots', 'runID',
       'neutral_sentence', 'tst_sentence_0', 'user_sentence',
       'explanation_accuracy', 'explanation_content_preservation',
       'explanation_fluency', 'score_accuracy', 'score_content_preservation',
       'score_fluency', 'prompting', 'evaluator', 'isParallel',
       'score_accuracy_0', 'explanation_accuracy_0', 'tst_sentence_1',
       'explanation_tst_feedback_0', 'score_accuracy_1',
       'explanation_accuracy_1', 'tst_sentence_2',
       'explanation_tst_feedback_1', 'score_accuracy_2',
       'explanation_accuracy_2'],
      dtype='object')

In [43]:
# Mean style-difference score of the baseline column followed by each loop.
# NOTE(review): 'score_accuracy' comes from the input data; confirm it uses the
# same scale as the loop scores (1 = identical, 10 = different) before
# comparing the numbers.
tuple(
    df[score_col].astype(float).mean()
    for score_col in ['score_accuracy', 'score_accuracy_0', 'score_accuracy_1', 'score_accuracy_2']
)

(3.5, 7.5, 6.5, 6.5)

In [44]:
# Display the frame with the per-loop scores, explanations and rewrites added.
df

Unnamed: 0,fileID,user,promptID,model,shots,runID,neutral_sentence,tst_sentence_0,user_sentence,explanation_accuracy,...,score_accuracy_0,explanation_accuracy_0,tst_sentence_1,explanation_tst_feedback_0,score_accuracy_1,explanation_accuracy_1,tst_sentence_2,explanation_tst_feedback_1,score_accuracy_2,explanation_accuracy_2
0,231,U1,2,mistral-medium,5,911714,I'm all about fast food. I'm having some fries...,"Hey, I'm all about that fast food, y'all! I'm ...","I just love fast food! I’ll have some fries, s...","Both paragraphs S1 and S2 have an informal, c...",...,8,The conversational styles in T1 and T2 are si...,"Hey, it seems fine to me. I'm having some fri...","Hey, I'm all about that fast food, y'all! I'm ...",6,The conversational styles in T1 and T2 are mo...,"Ah, seems good to me! Probably related to thei...","Hey, it seems fine to me. I'm having some fri...",7,The conversational styles of T1 and T2 are qu...
1,231,U1,2,mistral-medium,5,911714,Just having this vegan hotdog from the school ...,"Ah, just grabbing this vegan hotdog from the s...",Im having this vegan hotdog at school today. I...,Both paragraphs convey a similar informal and...,...,7,The conversational styles in T1 and T2 are si...,"Ah, I think this vegan hotdog from the school...","Ah, just grabbing this vegan hotdog from the s...",7,The conversational styles in T1 and T2 are qu...,"Ah, I reckon the vegan hotdog from the school ...","Ah, I think this vegan hotdog from the school...",6,\n\nAlthough both T1 and T2 are written in an ...


In [45]:
# Redundant with the import in the setup cell; kept so this cell can also be
# run on its own.
import s6_display_results as dr

# Columns to show side by side: the neutral source sentence, the user's own
# sentence, and the style-transferred sentence from each loop.
col_display = [
    'neutral_sentence',
    'user_sentence',
    'tst_sentence_0',
    'tst_sentence_1',
    'tst_sentence_2',
]

In [46]:
# Render the project's interactive viewer for df, restricted to the columns
# listed in col_display.
dr.display_interactive_dataframe(df,col_display)

interactive(children=(Dropdown(description='User:', options=('All', 'U1'), value='All'), Dropdown(description=…