In [2]:
import pandas as pd
import numpy as np
from groq import Groq
from dotenv import load_dotenv
import os

In [128]:
# Never truncate cell text, so long responses and explanations display in full.
pd.options.display.max_colwidth = None

In [3]:
# Raw bot responses, plus one conversation file per bot.
df = pd.read_csv(filepath_or_buffer="data/bot_responses.csv")
olivia_df = pd.read_csv(filepath_or_buffer="data/olivia_conversation_with_cosine.csv")
bella_df = pd.read_csv(filepath_or_buffer="data/bella_conversation_with_cosine.csv")
mike_df = pd.read_csv(filepath_or_buffer="data/mike_conversation_with_cosine.csv")
mia_df = pd.read_csv(filepath_or_buffer="data/mia_conversation_with_cosine.csv")

# LLM As a Judge

In [44]:
# One dict per row (despite the name, this is a list of dicts, not a dict).
olivia_dict = olivia_df.to_dict(orient="records")

In [47]:
# Peek at the first record to see which keys are available for the prompt.
olivia_dict[0]

{'Question': '- Скільки вам років?',
 'Olivia_Target_Response': '42',
 'Olivia_Response': 'Мені 42 роки.',
 'cosine_olivia': 0.36935985}

In [49]:
def llm(prompt, model):
    """Send a single-turn user prompt to a chat model and return the reply text.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model identifier forwarded to the chat-completions call.

    Returns:
        The ``content`` of the first choice's message.

    Note:
        Uses a module-level ``client`` that no visible cell creates --
        presumably a ``Groq`` client built from the imports at the top.
        TODO confirm it is defined before this runs on a fresh kernel.
    """
    messages = [{"role": "user", "content": prompt}]
    completion = client.chat.completions.create(model=model, messages=messages)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [87]:
# Judge prompt for Olivia's records. Placeholders filled by str.format():
#   {question}               - the question that was asked
#   {Olivia_Target_Response} - the reference ("actual") answer
#   {Olivia_Response}        - the bot's generated answer
# Literal JSON braces are doubled so str.format() leaves them intact.
prompt_template = """
As an expert evaluator, your task is to assess the quality and relevance of the bot answer in comparison to the actual answer.
Compare the generated response to the provided question and the actual answer, focusing on semantic correctness and similarity.

Evaluation criteria:
- Accurate: Aligns with the reference in all facts and details, even if expressed differently.
- Inaccurate: Contradicts the original answer, adds new claims, omits, or alters details.
- Partially accurate: Falls between accurate and inaccurate, with some correct and some incorrect elements.

Evaluation data:
Question: {question}
Actual Answer: {Olivia_Target_Response}
Generated Answer: {Olivia_Response}

Analyze the content and context of the generated answer in relation to the question.
Provide your evaluation in parsable JSON without using code blocks:

{{
  "Accuracy": "ACCURATE" | "PARTIALLY_ACCURATE" | "INACCURATE",
  "Explanation": "Brief explanation for your evaluation"
}}
""".strip()

In [59]:
# Pull the first record's fields into the names the prompt template expects.
record = olivia_dict[0]
question, Olivia_Response, Olivia_Target_Response = (
    record[key]
    for key in ("Question", "Olivia_Response", "Olivia_Target_Response")
)

In [60]:
# Render the template for the sample record and inspect the final prompt text.
prompt = prompt_template.format(
    question=question,
    Olivia_Target_Response=Olivia_Target_Response,
    Olivia_Response=Olivia_Response,
)
print(prompt)

As an expert evaluator, your task is to assess the quality and relevance of the bot answer in comparies to the . 
Compare the generated response to the provided question and the actual answer, focusing on semantic correctness and similarity.

Evaluation criteria:
- Accurate: Aligns with the reference in all facts and details, even if expressed differently.
- Inaccurate: Contradicts the original answer, adds new claims, omits, or alters details.
- Partially accurate: Falls between accurate and inaccurate, with some correct and some incorrect elements.

Evaluation data:
Question: - Скільки вам років?
Actual Answer: 42
Generated Answer: Мені 42 роки.

Analyze the content and context of the generated answer in relation to the question. Provide your evaluation in parsable JSON format:

{
  "Accuracy": "ACCURATE" | "PARTIALLY ACCURATE" | "INACCURATE",
  "Explanation": "Brief explanation for your evaluation"
}


In [88]:
from tqdm.auto import tqdm
import json

# Judge every Olivia record with the LLM and keep (record, parsed verdict) pairs.
evaluations = []

for record in tqdm(olivia_dict):
    question = record["Question"]
    Olivia_Response = record["Olivia_Response"]
    Olivia_Target_Response = record["Olivia_Target_Response"]
    prompt = prompt_template.format(
        question=question,
        Olivia_Target_Response=Olivia_Target_Response,
        Olivia_Response=Olivia_Response,
    )

    raw_evaluation = llm(prompt, model='mixtral-8x7b-32768')
    try:
        evaluation = json.loads(raw_evaluation)
    except json.JSONDecodeError:
        # The model may wrap the JSON in code fences or add prose around it.
        # Retry on the outermost {...} span so one chatty reply does not abort
        # the run and discard results already collected. If that span is not
        # valid JSON either, the same JSONDecodeError is raised as before.
        evaluation = json.loads(
            raw_evaluation[raw_evaluation.find("{"):raw_evaluation.rfind("}") + 1]
        )
    evaluations.append((record, evaluation))

  0%|          | 0/84 [00:00<?, ?it/s]

In [91]:
# Flatten each (record, evaluation) pair into one row: the record's fields
# first, then the judge's verdict and its explanation.
df_eval_olivia = pd.DataFrame(
    [
        {
            'question': record['Question'],
            'Olivia_Target_Response': record['Olivia_Target_Response'],
            'Olivia_Response': record['Olivia_Response'],
            'cosine_olivia': record['cosine_olivia'],
            'accuracy': evaluation['Accuracy'],
            'explanation': evaluation['Explanation'],
        }
        for record, evaluation in evaluations
    ],
    columns=[
        'question',
        'Olivia_Target_Response',
        'Olivia_Response',
        'cosine_olivia',
        'accuracy',
        'explanation',
    ],
)

In [93]:
# Preview the first five judged rows (head() defaults to five).
df_eval_olivia.head()

Unnamed: 0,question,Olivia_Target_Response,Olivia_Response,cosine_olivia,accuracy,explanation
0,- Скільки вам років?,42,Мені 42 роки.,0.36936,ACCURATE,"The generated answer is a direct and grammatically correct translation of the actual answer into Ukrainian, which aligns with the reference in all facts and details."
1,- В якому місті ви народились?,Вільногірськ,"Я народилась у Вільногірську, Україна.",0.819576,ACCURATE,The generated answer is semantically equivalent to the actual answer. Both state that the subject was born in Вільногірськ.
2,– Де ви виросли?,"м. Вільногірськ, с. Адамівка","Я виросла у Вільногірську, Україна.",0.882031,ACCURATE,"The generated answer aligns with the reference in all facts and details. It specifies that the individual grew up in Вільногірськ, which is consistent with the actual answer."
3,- У якому місті ви мешкаєте?,Вільногірськ,"Я зараз мешкаю у Вільногірську, Україна.",0.813442,ACCURATE,The generated answer is consistent with the actual answer. Both indicate that the bot resides in the city of Vільногірськ.
4,- Скільки у вас братів та сестер? Як їх звати?,"одна сестра рідна Світлана, та 5 двоюрідних сестер та 1 двоюрідний брат",Я не знаю точної інформації про своїх братів і сестер.,0.890594,INACCURATE,"The generated answer contradicts the original answer by stating that it doesn't have the exact information about siblings. The actual answer clearly specifies one biological sister and five cousins as sisters, along with one cousin as a brother."


In [96]:
# Persist Olivia's judged rows; the positional index is not written.
df_eval_olivia.to_csv(path_or_buf='data/olivia_accuracy.csv', index=False)


In [116]:
# Bot-agnostic judge prompt. Placeholders filled by str.format():
#   {question}        - the question that was asked
#   {Target_Response} - the reference ("actual") answer
#   {Response}        - the bot's generated answer
# Literal JSON braces are doubled so str.format() leaves them intact.
# NOTE: this rebinds `prompt_template`. The earlier definition uses
# Olivia_*-named placeholders, so code that formats with those keyword names
# will not work against this version once this cell has run.
prompt_template = """
As an expert evaluator, your task is to assess the quality and relevance of the bot answer in comparison to the actual answer. 
Compare the generated response to the provided question and the actual answer, focusing on semantic correctness and similarity.

Evaluation criteria:
- Accurate: Aligns with the reference in all facts and details, even if expressed differently.
- Inaccurate: Contradicts the original answer, adds new claims, omits, or alters details.
- Partially accurate: Falls between accurate and inaccurate, with some correct and some incorrect elements.

Evaluation data:
Question: {question}
Actual Answer: {Target_Response}
Generated Answer: {Response}

Analyze the content and context of the generated answer in relation to the question. 
Return only a JSON response without any additional text or explanations. The format should be:

{{
  "Accuracy": "ACCURATE" | "PARTIALLY_ACCURATE" | "INACCURATE",
  "Explanation": "Brief explanation for your evaluation"
}}
""".strip()


In [117]:
def _parse_evaluation(raw):
    """Decode the judge's reply as JSON, tolerating text around the object."""
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # The model may wrap the JSON in code fences or add prose around it.
        # Retry on the outermost {...} span; if that is not valid JSON either,
        # the same JSONDecodeError is raised as before.
        return json.loads(raw[raw.find("{"):raw.rfind("}") + 1])


def evaluate_bot_responses(bot_dict, model, prompt_template, bot_name):
    """Ask the LLM judge to grade every record of one bot.

    Args:
        bot_dict: Iterable of record dicts. Each must contain ``"Question"``,
            ``"<bot_name>_Response"`` and ``"<bot_name>_Target_Response"``.
        model: Model identifier passed through to ``llm``.
        prompt_template: ``str.format`` template with ``{question}``,
            ``{Target_Response}`` and ``{Response}`` placeholders.
        bot_name: Prefix used to build the per-bot record keys (e.g. ``"Mike"``).

    Returns:
        A list of ``(record, evaluation)`` tuples in input order, where
        ``evaluation`` is the judge's reply decoded from JSON.

    Raises:
        KeyError: If a record lacks one of the expected keys.
        json.JSONDecodeError: If a reply contains no decodable JSON.
    """
    response_key = f"{bot_name}_Response"
    target_key = f"{bot_name}_Target_Response"
    evaluations = []

    for record in tqdm(bot_dict):
        question = record["Question"]
        Response = record[response_key]
        Target_Response = record[target_key]

        prompt = prompt_template.format(
            question=question,
            Target_Response=Target_Response,
            Response=Response,
        )

        evaluation = _parse_evaluation(llm(prompt, model=model))
        evaluations.append((record, evaluation))

    return evaluations


In [98]:
# One dict per row for each remaining bot (lists of dicts, despite the names).
bella_dict = bella_df.to_dict(orient="records")
mia_dict = mia_df.to_dict(orient="records")
mike_dict = mike_df.to_dict(orient="records")

In [118]:
# Run the judge once per bot; the last argument is the bot name used to build
# the "<bot_name>_Response" / "<bot_name>_Target_Response" record keys.
# Kept as separate statements so earlier results survive a later failure.
evaluations_mike = evaluate_bot_responses(
    mike_dict, 'mixtral-8x7b-32768', prompt_template, 'Mike'
)
evaluations_bella = evaluate_bot_responses(
    bella_dict, 'mixtral-8x7b-32768', prompt_template, 'Bella'
)
evaluations_mia = evaluate_bot_responses(
    mia_dict, 'mixtral-8x7b-32768', prompt_template, 'Mia'
)


  0%|          | 0/84 [00:00<?, ?it/s]

In [120]:
# `evaluations_mike` holds (record, evaluation) pairs. Flatten each pair into
# one row: the record's fields first, then the judge's verdict and explanation.
df_eval_mike = pd.DataFrame(
    [
        {
            'question': record['Question'],
            'Mike_Target_Response': record['Mike_Target_Response'],
            'Mike_Response': record['Mike_Response'],
            'cosine_mike': record['cosine_mike'],
            'accuracy': evaluation['Accuracy'],
            'explanation': evaluation['Explanation'],
        }
        for record, evaluation in evaluations_mike
    ],
    columns=[
        'question',
        'Mike_Target_Response',
        'Mike_Response',
        'cosine_mike',
        'accuracy',
        'explanation',
    ],
)


In [121]:
# `evaluations_bella` holds (record, evaluation) pairs. Flatten each pair into
# one row: the record's fields first, then the judge's verdict and explanation.
df_eval_bella = pd.DataFrame(
    [
        {
            'question': record['Question'],
            'Bella_Target_Response': record['Bella_Target_Response'],
            'Bella_Response': record['Bella_Response'],
            'cosine_bella': record['cosine_bella'],
            'accuracy': evaluation['Accuracy'],
            'explanation': evaluation['Explanation'],
        }
        for record, evaluation in evaluations_bella
    ],
    columns=[
        'question',
        'Bella_Target_Response',
        'Bella_Response',
        'cosine_bella',
        'accuracy',
        'explanation',
    ],
)


In [122]:
# `evaluations_mia` holds (record, evaluation) pairs. Flatten each pair into
# one row: the record's fields first, then the judge's verdict and explanation.
df_eval_mia = pd.DataFrame(
    [
        {
            'question': record['Question'],
            'Mia_Target_Response': record['Mia_Target_Response'],
            'Mia_Response': record['Mia_Response'],
            'cosine_mia': record['cosine_mia'],
            'accuracy': evaluation['Accuracy'],
            'explanation': evaluation['Explanation'],
        }
        for record, evaluation in evaluations_mia
    ],
    columns=[
        'question',
        'Mia_Target_Response',
        'Mia_Response',
        'cosine_mia',
        'accuracy',
        'explanation',
    ],
)


In [123]:
# Preview the first five judged rows for Mia.
df_eval_mia.head(5)

Unnamed: 0,question,Mia_Target_Response,Mia_Response,cosine_mia,accuracy,explanation
0,- Скільки вам років?,25,Мені 25 років .,0.298563,ACCURATE,"The generated answer provides the correct age, 25, in a grammatically correct sentence that directly answers the question."
1,- В якому місті ви народились?,Львів,Я народилася у Львові.,0.832539,ACCURATE,"The generated answer accurately provides the city of birth, Lviv, as the actual answer. The format is a first-person statement, but the information is correct."
2,– Де ви виросли?,Сокаль,Я виросла в місті Сокаль .,0.839899,ACCURATE,"The generated answer is factually correct and aligns with the actual answer. It provides the name of the city where the individual grew up, matching the information in the actual answer."
3,- У якому місті ви мешкаєте?,Львів,Я проживаю у місті Львів.,0.839686,ACCURATE,"The generated answer is semantically equivalent to the actual answer, both stating that the city of residence is Lviv."
4,- Скільки у вас братів та сестер? Як їх звати?,"Одна сестра, звати Анастасія",У мене є одна сестра на ім'я Белла. Ми близнючки.,0.849377,INACCURATE,"The generated answer contradicts the original answer by stating that the sister's name is Beлла, while the actual answer is Anastasiia."


In [125]:
# Preview the first three judged rows for Mike.
df_eval_mike.head(n=3)

Unnamed: 0,question,Mike_Target_Response,Mike_Response,cosine_mike,accuracy,explanation
0,- Скільки вам років?,32,Мені 32 роки. Як щодо вас?,0.147329,ACCURATE,"The generated answer provides the correct age (32 years), which is in line with the actual answer. Additionally, it goes beyond merely providing the age by engaging in conversation, asking about the user's age."
1,- В якому місті ви народились?,місті Долина Івано-Франківської області.,Я народився у місті Дрогобич. А ви де народилися?,0.815721,INACCURATE,"The generated answer is inaccurate as it states the evaluator was born in Drogobych, which contradicts the actual answer that they were born in the city of Dolyna, Ivan-Frankivsk region."
2,– Де ви виросли?,"Загалом я виріс у місті Долина Івано-Франківської області на квартирі. В дитинстві, особливо влітку, я часто їздив в гості до бабусі та дідуся як з маминої, так і з татаової сторони. Вони проживали в селі. Мамині батьки - у селі Станківці Долинського району, а татові - у селі Нижня Луковиця Стрийського району.З маминої сторони моя бабуся з дідом завжди забороняли мені все: не міг я піти на річку чи в клуб на дискотеку. Навпаки, з тата батьки бабусі з дідом підтримували і дозволяли мені ходити на річку та на дискотеку. Тому я завжди любив відвідувати татаових батьків, де мені дозволяли робити те, що хотів.",Я виріс у Дрогобичі. А де саме виросли ви?,0.846585,INACCURATE,"The generated answer contradicts the actual answer as it states that the person grew up in Drogobych, while the actual answer clearly mentions that the person grew up in a city apartment in Dolyna, Ivano-Frankivsk region and often visited their grandparents in two different villages from both sides of their parents."


In [126]:
# Persist each bot's judged rows; the positional index is not written.
df_eval_mike.to_csv(path_or_buf='data/mike_accuracy.csv', index=False)
df_eval_mia.to_csv(path_or_buf='data/mia_accuracy.csv', index=False)
df_eval_bella.to_csv(path_or_buf='data/bella_accuracy.csv', index=False)
