In [3]:
import pandas as pd
import configparser
import sys
import os
import inspect
import s3_tst_evaluation as s3
import random

# access parent directory from notebooks directory
currentdir = os.path.dirname(os.path.abspath(inspect.getfile(inspect.currentframe())))
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [4]:
config = configparser.ConfigParser()
# Read the configuration file
config.read('config.ini')
api_key_openai = config.get('credentials', 'api_key_openai')
api_key_mistral = config.get('credentials', 'api_key_mistral')
surfdrive_url_transcript_sentences = config.get('credentials', 'surfdrive_url_transcript_sentences')

output_evaluation_folder_path = '03_tst_evaluation/'

### Input

In [142]:
## TST accuracy

prompt = """
You are a linguistics expert. Your goal is to assess which message most closely matches the conversational style of person X.

Here is an example sentence from person X: {}

Messages for comparison:
Here is message A: {}
Here is message B: {}

Determine which message, A or B, is more likely to have been written by person X based on their conversational style. Consider the use of function words (e.g., pronouns like "I", "we", "you"), tone (e.g., formality, analytical vs. narrative style), and other stylistic features (e.g., emotional tone, complexity, frequently used phrases, fillers, punctuation).

Return results format in a json object as: {"answer"= [A or B], "explanation"=""}
"""


prompt_system = """
You are a linguistics expert. Your goal is to assess how similar the conversational style of example sentences from person X and the target message are. For conversational style, please consider 1. how person X uses function words (e.g., pronouns such as "I", "we", "you"), 2. which tone person X prefers (e.g., formal/informal vs. analytical vs. narrative), 3. often used stylistic words (e.g., phrases, fillers words), 4. emoji or emoticon-use in the provided example sentences and 5. other stylistic characteristics you find.
Determine on a discrete scale from 1 to 5 the style similarity between the example sentences and the target message, where [1 = completely different styles, 2 = different styles, 3 = not identical nor different styles, 4 = identical styles, 5 = completely identical styles]?
Return results in a json object as: {"score"="", "explanation"=""}
"""
prompt_user = """
Here are the example sentences from person X: {}

Here is the target message: {}
"""



#### Create random sets to compare messages 

In [72]:
random.seed(43)
# List of sets
sets = ['BU01', 'BU02', 'BU03', 'BU04', 'BU05', 'BU06', 'BU07', 'BU08']

# Dictionary to hold matches for each set
matches = {set_name: random.choices([s for s in sets if s != set_name], k=10) for set_name in sets}

# Creating a list of match pairs
match_pairs = []

# Generate match pairs for each set
for set_name in sets:
    for match in matches[set_name]:
        match_pairs.append([set_name, match])

# Convert the list of match pairs to a DataFrame with two columns: Message A and Message B
df_evaluation_accuracy = pd.DataFrame(match_pairs, columns=['user_message_A', 'user_message_B'])

# Add 'Message ID' column that runs from 2 to 11 repeatedly
df_evaluation_accuracy['message_id'] = [i % 10 + 2 for i in range(len(df_evaluation_accuracy))]

# Display the DataFrame
print(df_evaluation_accuracy)

df_evaluation_accuracy.to_csv("03_tst_evaluation/random_user_matching.csv", index=False)

   user_message_A user_message_B  message_id
0            BU01           BU02           2
1            BU01           BU06           3
2            BU01           BU03           4
3            BU01           BU05           5
4            BU01           BU06           6
..            ...            ...         ...
75           BU08           BU07           7
76           BU08           BU07           8
77           BU08           BU01           9
78           BU08           BU06          10
79           BU08           BU07          11

[80 rows x 3 columns]


### Evaluation Accuracy - Message comparison head-to-head

### GPT all shots

In [140]:
example_messages_file = pd.read_csv('01_processed_input_data/all_shots_data_gpt.csv')

In [110]:
# GPT all shots
tst_output = pd.read_csv('02_tst_output/gpt-4o/parallel_all_shots_processed.csv')
df = s3.get_df_eval_acc(df_evaluation_accuracy, tst_output, example_messages_file)
gpt_all = s3.tst_eval(prompt, df, 'gpt-4o', 'all')


Processing LLM TST evaluation...: 100%|██████████| 80/80 [03:54<00:00,  2.93s/it]


In [113]:
gpt_all_parsed = s3.parse_tst_data(gpt_all)
df_temp = gpt_all_parsed[['model', 'shots', 'message_id', 'user_mA', 'user_mB', 'message_A', 'message_B', 'score', 'explanation' ]]
df_temp.to_csv('03_tst_evaluation/tst_eval_gpt-4o_all_processed.csv')

wronglyParsed:  0


In [136]:
summary = s3.eval_summary(df_temp)
print(summary)
print(summary['score_num'].mean())

  user_mA  score_num                 model shots
0    BU01          3  mistral-large-latest   all
1    BU02          6  mistral-large-latest   all
2    BU03          0  mistral-large-latest   all
3    BU04          2  mistral-large-latest   all
4    BU05          2  mistral-large-latest   all
5    BU06          5  mistral-large-latest   all
6    BU07          4  mistral-large-latest   all
7    BU08          3  mistral-large-latest   all
3.125


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [125]:
# GPT all shots
tst_output = pd.read_csv('02_tst_output/gpt-4o/parallel_all_shots_processed.csv')
df = s3.get_df_eval_acc(df_evaluation_accuracy, tst_output, example_messages_file)
gpt_all = s3.tst_eval(prompt, df, 'gpt-4o', 'all')

Processing LLM TST evaluation...: 100%|██████████| 80/80 [03:47<00:00,  2.84s/it]


In [128]:
gpt_all_parsed = s3.parse_tst_data(gpt_all)
df_temp = gpt_all_parsed[['model', 'shots', 'message_id', 'user_mA', 'user_mB', 'message_A', 'message_B', 'score', 'explanation' ]]
df_temp.to_csv('03_tst_evaluation/tst_eval_gpt-4o_all_processed_02.csv')
df_temp.loc[:, 'correct_answer'] = df_temp['score'].str.count('A')

summary = df_temp.groupby('user_mA').agg({
    'correct_answer': 'sum',        # Sum the counts of 'hello'
    'model': 'first',   # Keep the first location (same value for all rows in each group)
    'shots': 'first'      # Keep the first status (same value for all rows in each group)
}).reset_index()
print(summary)

summary['correct_answer'].mean()

wronglyParsed:  0
  user_mA  correct_answer              model shots
0    BU01               4  gpt-4o-2024-05-13   all
1    BU02              10  gpt-4o-2024-05-13   all
2    BU03               3  gpt-4o-2024-05-13   all
3    BU04               2  gpt-4o-2024-05-13   all
4    BU05               3  gpt-4o-2024-05-13   all
5    BU06               5  gpt-4o-2024-05-13   all
6    BU07               1  gpt-4o-2024-05-13   all
7    BU08               4  gpt-4o-2024-05-13   all


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_temp.loc[:, 'correct_answer'] = df_temp['score'].str.count('A')


4.0

### GPT 20 shots

In [135]:
# GPT 20 shots
tst_output = pd.read_csv('02_tst_output/gpt-4o/parallel_20_shots_processed.csv')
df = s3.get_df_eval_acc(df_evaluation_accuracy, tst_output, example_messages_file)
gpt_all = s3.tst_eval(prompt, df, 'gpt-4o', '20')


Processing LLM TST evaluation...: 100%|██████████| 80/80 [04:01<00:00,  3.02s/it]


Unnamed: 0,model,shots,message_id,user_mA,user_mB,message_A,message_B,tst_eval,query,prompt_tokens,completion_tokens,object
0,gpt-4o-2024-05-13,20,2,BU01,BU02,Reducing how much meat we eat can help save a ...,so reducing how much meat we eat can help save...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1003,110,chat.completion
1,gpt-4o-2024-05-13,20,3,BU01,BU06,"If everyone ate less animal foods, the CO2 fro...","if everyone ate less animal foods, the CO2 fro...","{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1029,85,chat.completion
2,gpt-4o-2024-05-13,20,4,BU01,BU03,Animal farming is responsible for like 80% of ...,Animal farming is responsible for 80% of the l...,"{\n ""answer"": ""A"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1006,121,chat.completion
3,gpt-4o-2024-05-13,20,5,BU01,BU05,80% of young people think that living sustaina...,80% of young people believe that living sustai...,"{\n ""answer"": ""A"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,960,146,chat.completion
4,gpt-4o-2024-05-13,20,6,BU01,BU06,"Cutting down on meat, dairy, and sugary foods ...","so reducing how much meat, dairy, and sugary f...","{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,977,129,chat.completion
...,...,...,...,...,...,...,...,...,...,...,...,...
75,gpt-4o-2024-05-13,20,7,BU08,BU07,Fruits and veggies are packed with important n...,Fruits and veggies are packed with important n...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,978,171,chat.completion
76,gpt-4o-2024-05-13,20,8,BU08,BU07,Fiber is super important for keeping our stoma...,Fiber is super important for keeping our stoma...,"{\n ""answer"": ""A"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,989,129,chat.completion
77,gpt-4o-2024-05-13,20,9,BU08,BU01,Veggies and fruits have these awesome plant su...,Veggies and fruits have these awesome plant su...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1004,129,chat.completion
78,gpt-4o-2024-05-13,20,10,BU08,BU06,More than 70% of farm animals in the UK are ra...,so more than 70% of farm animals in the UK are...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1007,136,chat.completion


In [ ]:
gpt_all_parsed = s3.parse_tst_data(df)
df_temp = gpt_all_parsed[['model', 'shots', 'message_id', 'user_mA', 'user_mB', 'message_A', 'message_B', 'score', 'explanation' ]]

df_temp.to_csv('03_tst_evaluation/tst_eval_gpt-4o_20_processed.csv')
df_temp.loc[:, 'correct_answer'] = df_temp['score'].str.count('A')

summary = df_temp.groupby('user_mA').agg({
    'correct_answer': 'sum',        # Sum the counts of 'hello'
    'model': 'first',   # Keep the first location (same value for all rows in each group)
    'shots': 'first'      # Keep the first status (same value for all rows in each group)
}).reset_index()
print(summary)

summary['correct_answer'].mean()

### Mistral all shots

In [129]:
# Mistral all shots
tst_output = pd.read_csv('02_tst_output/mistral-large-latest/parallel_all_shots_processed.csv')
df = s3.get_df_eval_acc(df_evaluation_accuracy, tst_output, example_messages_file)
mistral_all = s3.tst_eval(prompt, df, 'mistral-large-latest', 'all')

Processing LLM TST evaluation...: 100%|██████████| 80/80 [04:18<00:00,  3.24s/it]


In [131]:
mistral_all

Unnamed: 0,model,shots,message_id,user_mA,user_mB,message_A,message_B,tst_eval,query,prompt_tokens,completion_tokens,object,score,explanation
0,mistral-large-latest,all,2,BU01,BU02,"So, reducing how much meat we eat can save a t...","like, reducing how much meat we eat can actual...","{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1107,101,chat.completion,B,Message B is more likely to have been written ...
1,mistral-large-latest,all,3,BU01,BU06,"If everyone ate less animal foods, the CO2 fro...","If everyone ate less animal foods, the CO2 fro...","{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1149,73,chat.completion,B,"Message B contains the phrase 'hahaha', which ..."
2,mistral-large-latest,all,4,BU01,BU03,Animal farming is like responsible for 80% of ...,"I mean, animal farming is like responsible for...","{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1117,85,chat.completion,B,Message B more closely matches person X's conv...
3,mistral-large-latest,all,5,BU01,BU05,Like 80% of young people think living sustaina...,80% of young people believe that living sustai...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1059,70,chat.completion,B,Message B more closely matches the conversatio...
4,mistral-large-latest,all,6,BU01,BU06,"Reducing meat, dairy, and sugary foods can rea...","Reducing how much meat, dairy, and sugary food...","{\n ""answer"": ""A"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1105,116,chat.completion,A,Message A more closely matches the conversatio...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,mistral-large-latest,all,7,BU08,BU07,"So, fruits and veggies are packed with all the...",Fruits and veggies are packed with important n...,"{\n ""answer"": ""A"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1083,98,chat.completion,A,Message A more closely matches the conversatio...
76,mistral-large-latest,all,8,BU08,BU07,So fiber is really important for keeping our s...,Fiber is so important for keeping our stomachs...,"{\n ""answer"": ""A"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1075,111,chat.completion,A,Message A more closely matches the conversatio...
77,mistral-large-latest,all,9,BU08,BU01,So veggies and fruits have these super helpful...,Veggies and fruits have some good stuff for ou...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1079,70,chat.completion,B,Message B more closely matches the conversatio...
78,mistral-large-latest,all,10,BU08,BU06,More than 70% of farm animals in the UK are ra...,More than 70% of farm animals in the UK are ra...,"{\n ""answer"": ""B"",\n ""explanation"": ""Message...",\nTask: You are a linguistics expert. Your goa...,1089,120,chat.completion,B,Message B is more likely to have been written ...


In [ ]:
mistral_all_parsed = s3.parse_tst_data(mistral_all)
df_temp = mistral_all_parsed[['model', 'shots', 'message_id', 'user_mA', 'user_mB', 'message_A', 'message_B', 'score', 'explanation' ]]
df_temp.to_csv('03_tst_evaluation/tst_eval_mistral-large-latest_all_processed.csv')


In [133]:
df_temp

Unnamed: 0,model,shots,message_id,user_mA,user_mB,message_A,message_B,score,explanation,correct_answer
0,mistral-large-latest,all,2,BU01,BU02,"So, reducing how much meat we eat can save a t...","like, reducing how much meat we eat can actual...",B,Message B is more likely to have been written ...,0
1,mistral-large-latest,all,3,BU01,BU06,"If everyone ate less animal foods, the CO2 fro...","If everyone ate less animal foods, the CO2 fro...",B,"Message B contains the phrase 'hahaha', which ...",0
2,mistral-large-latest,all,4,BU01,BU03,Animal farming is like responsible for 80% of ...,"I mean, animal farming is like responsible for...",B,Message B more closely matches person X's conv...,0
3,mistral-large-latest,all,5,BU01,BU05,Like 80% of young people think living sustaina...,80% of young people believe that living sustai...,B,Message B more closely matches the conversatio...,0
4,mistral-large-latest,all,6,BU01,BU06,"Reducing meat, dairy, and sugary foods can rea...","Reducing how much meat, dairy, and sugary food...",A,Message A more closely matches the conversatio...,1
...,...,...,...,...,...,...,...,...,...,...
75,mistral-large-latest,all,7,BU08,BU07,"So, fruits and veggies are packed with all the...",Fruits and veggies are packed with important n...,A,Message A more closely matches the conversatio...,1
76,mistral-large-latest,all,8,BU08,BU07,So fiber is really important for keeping our s...,Fiber is so important for keeping our stomachs...,A,Message A more closely matches the conversatio...,1
77,mistral-large-latest,all,9,BU08,BU01,So veggies and fruits have these super helpful...,Veggies and fruits have some good stuff for ou...,B,Message B more closely matches the conversatio...,0
78,mistral-large-latest,all,10,BU08,BU06,More than 70% of farm animals in the UK are ra...,More than 70% of farm animals in the UK are ra...,B,Message B is more likely to have been written ...,0


In [134]:
df_temp.loc[:, 'correct_answer'] = df_temp['score'].str.count('A')

summary = df_temp.groupby('user_mA').agg({
    'correct_answer': 'sum',        # Sum the counts of 'hello'
    'model': 'first',   # Keep the first location (same value for all rows in each group)
    'shots': 'first'      # Keep the first status (same value for all rows in each group)
}).reset_index()
print(summary)

summary['correct_answer'].mean()

  user_mA  correct_answer                 model shots
0    BU01               3  mistral-large-latest   all
1    BU02               6  mistral-large-latest   all
2    BU03               0  mistral-large-latest   all
3    BU04               2  mistral-large-latest   all
4    BU05               2  mistral-large-latest   all
5    BU06               5  mistral-large-latest   all
6    BU07               4  mistral-large-latest   all
7    BU08               3  mistral-large-latest   all


3.125

### TST Accuracy - Message vs shots


In [186]:
prompt_system = """
You are a linguistics expert. Your goal is to assess how similar the conversational style of example sentences from person X and the target message are. For conversational style, please consider 1. how person X uses function words (e.g., pronouns such as "I", "we", "you"), 2. which tone person X prefers (e.g., formal/informal vs. analytical vs. narrative), 3. often used stylistic words (e.g., phrases, fillers words), 4. emoji or emoticon-use in the provided example sentences and 5. other stylistic characteristics you find.
\n#####\n
"""

prompt_user = """
Determine on a discrete scale from 1 to 5 the style similarity between the example sentences and the target message, where [1 = completely different styles, 2 = different styles, 3 = not identical nor different styles, 4 = identical styles, 5 = completely identical styles].
\n#####\n
Here are the example sentences from person X: {}
\n#####\n
Here is the target message: {}
\n#####\n
Return results in a json object as "score"="", "explanation"=""
"""

In [187]:
user_chats = pd.read_csv('01_processed_input_data/all_shots_data_gpt.csv')
examples = user_chats.groupby('username')['original'].apply('\n'.join).reset_index()

In [188]:
# GPT 0.2 all shots
model_tst = 'gpt-4o'
temp_tst = 0.2
tst_output = pd.read_csv('02_tst_output/'+model_tst+'_'+str(temp_tst)+'_parallel_all_shots_processed.csv')
input_data = pd.merge(tst_output, examples, on='username')

In [189]:
model = 'gpt-4o'
eval_df = s3.tst_eval_accuracy(prompt_system, prompt_user, input_data, model, 'all', 0.7, 43)
eval_df.to_csv('03_tst_evaluation/'+model_tst+'_'+str(temp_tst)+'_all_shots_eval_'+model+'.csv')

Processing LLM TST evaluation...: 100%|██████████| 96/96 [03:03<00:00,  1.91s/it]


In [190]:
model = 'mistral-large-latest'
temp = 0.7
eval_df = s3.tst_eval_accuracy(prompt_system, prompt_user, input_data, model, 'all', temp, 43)
eval_df.to_csv('03_tst_evaluation/'+model+'_'+str(temp)+'_all_shots.csv')

Processing LLM TST evaluation...: 100%|██████████| 96/96 [07:18<00:00,  4.57s/it]


In [191]:
# GPT 0.7 all shots
model_tst = 'gpt-4o'
temp_tst = 0.7
tst_output = pd.read_csv('02_tst_output/'+model_tst+'_'+str(temp_tst)+'_parallel_all_shots_processed.csv')
input_data = pd.merge(tst_output, examples, on='username')

In [192]:
model = 'gpt-4o'
eval_df = s3.tst_eval_accuracy(prompt_system, prompt_user, input_data, model, 'all', 0.7, 43)
eval_df.to_csv('03_tst_evaluation/'+model_tst+'_'+str(temp_tst)+'_all_shots_eval_'+model+'.csv')

Processing LLM TST evaluation...: 100%|██████████| 96/96 [03:15<00:00,  2.04s/it]


In [ ]:
model = 'mistral-large-latest'
eval_df = s3.tst_eval_accuracy(prompt_system, prompt_user, input_data, model, 'all', 0.7, 43)
eval_df.to_csv('03_tst_evaluation/'+model_tst+'_'+str(temp_tst)+'_all_shots_eval_'+model+'.csv')