# Evaluation Notebook:
## Author: William Diaz
### Artificial Agents @ JHU
**Evaluation of the Critic Agent using a modified version of the HotpotQA dataset.**
***We drop all context for the questions and instead rely on the search and reasoning capabilities of each model we evaluate.***

In [1]:
%load_ext autoreload
%autoreload 2

In [1]:
# import packages
import importlib
import os
import time
import random
import csv
import json
import requests
from dotenv import load_dotenv
from datasets import load_dataset
from tqdm import tqdm

import sys
import os
current_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(current_dir, "../.."))
sys.path.append(project_root)

from AgentFactory.critic_search.main import main as query_critic_agent

import nest_asyncio
nest_asyncio.apply()


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Fixed API endpoints for the two hosted services queried below.
PERPLEXITY_BASE_URL = "https://api.perplexity.ai"
GPT4O_BASE_URL = "https://api.openai.com"

# Load environment variables, then read the credentials from the process
# environment; either key is None when its variable is not set.
load_dotenv()
PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY")
GPT4O_API_KEY = os.environ.get("GPT4O_API_KEY")

In [5]:
# Load the dataset: the "fullwiki" configuration of hotpot_qa. The recorded
# cell output shows train, validation and test splits being generated.
dataset = load_dataset("hotpot_qa", "fullwiki")

Downloading data: 100%|██████████| 566M/566M [01:10<00:00, 8.01MB/s] 
Downloading data: 100%|██████████| 47.5M/47.5M [00:05<00:00, 8.98MB/s]
Downloading data: 100%|██████████| 46.2M/46.2M [00:05<00:00, 8.22MB/s]
Generating train split: 100%|██████████| 90447/90447 [00:11<00:00, 7892.51 examples/s] 
Generating validation split: 100%|██████████| 7405/7405 [00:00<00:00, 8157.17 examples/s] 
Generating test split: 100%|██████████| 7405/7405 [00:00<00:00, 10007.55 examples/s]


In [None]:
# we will sample from the validation dataset, though this should not matter
validation_data = dataset['validation']
sample_size = 200
# Draw the indices from a dedicated, seeded generator so that re-running this
# cell on a fresh kernel selects the same questions, without touching the
# state of the global `random` module.
sample_seed = 42
random_indices = random.Random(sample_seed).sample(range(len(validation_data)), sample_size)
sampled_data = [validation_data[i] for i in random_indices]
# Sanity check: field names of the first record, then the sample size.
print(sampled_data[0].keys())

print(len(sampled_data))


dict_keys(['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'])
200


In [None]:
# Persist the sample to disk; the next cell reads this file back into `sampled_data`.
with open("hotpot_data.json", "w") as f:
    json.dump(sampled_data, f, indent=4) 

In [3]:
# Reload the persisted sample so the evaluation can start from the JSON file.
with open("hotpot_data.json", "r") as f:
    sampled_data = json.loads(f.read())

# Sanity check: field names of the first record, then the number of records.
print(sampled_data[0].keys())

print(len(sampled_data))

dict_keys(['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'])
200


In [4]:
# Prepare output CSV
output_file = "hotpotqa_critic_gpt4omini.csv"

# Column order of the results file: identifiers, then the answers and the
# evaluator verdict, then the HotpotQA metadata carried along with each row.
# In this notebook the "perplexity_answer" column is filled with the critic
# agent's answer by the evaluation loop.
fieldnames = [
    "id", "question",
    "ground_truth_answer", "perplexity_answer", "binary_success",
    "context", "type", "level", "supporting_facts",
]

In [5]:
def exponential_backoff_retry(func, max_retries=5, initial_wait=1):
    """Call ``func`` until it succeeds, doubling the pause after each failure.

    Args:
        func: Zero-argument callable to invoke.
        max_retries: Maximum number of attempts; must be at least 1.
        initial_wait: Seconds to sleep after the first failed attempt. The
            pause is doubled after every further failure.

    Returns:
        Whatever ``func`` returns on its first successful call.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Previously this case
            returned ``None`` without ever calling ``func``.
        Exception: The exception raised by the final attempt, re-raised
            unchanged.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    wait = initial_wait
    for attempt in range(max_retries):
        try:
            return func()
        except Exception:
            # Out of attempts: let the caller see the original error and traceback.
            if attempt == max_retries - 1:
                raise
            time.sleep(wait)
            wait *= 2

In [7]:
def query_perplexity(question):
    """Send ``question`` to the Perplexity chat-completions endpoint.

    The request is wrapped in ``exponential_backoff_retry``; after a successful
    reply the function sleeps 1.3 seconds before returning.

    Args:
        question: Text sent as the user message.

    Returns:
        The ``content`` of the first choice's message in the JSON reply.
    """
    system_message = (
        "You are an artificial intelligence assistant and you need to "
        "engage in a helpful, detailed, polite conversation with a user. "
        "The user will ask you a question. "
        "You will break down the question, search online for relevant information "
        "and analyze the returned search results, reasoning "
        "through ambiguity and providing an accurate fact backed answer to the user."
    )
    request_kwargs = dict(
        headers={
            "Content-Type": "application/json",
            "Accept": "application/json",
            "Authorization": f"Bearer {PERPLEXITY_API_KEY}",
        },
        json={
            "model": "llama-3.1-sonar-small-128k-online",
            "messages": [
                {"role": "system", "content": system_message},
                {"role": "user", "content": question},
            ],
        },
        timeout=30,
    )

    def do_request():
        reply = requests.post(f"{PERPLEXITY_BASE_URL}/chat/completions", **request_kwargs)
        # HTTP error statuses become exceptions so the retry wrapper sees them.
        reply.raise_for_status()
        # Expected reply shape: {'choices': [{'message': {'content': 'answer'}}], ...}
        return reply.json()["choices"][0]["message"]["content"]

    answer = exponential_backoff_retry(do_request)
    # Sleep to avoid rate limits
    time.sleep(1.3)
    return answer


# Single-question smoke test of the Perplexity client.
query = "At which university does the biographer of John Clare teach English Literature?"
answer = query_perplexity(question=query)
print(answer)
# query = "What is going on with the attempted Coup in Korea?" # real time question
# answer = query_perplexity(question=query)
# print(answer)

The biographer of John Clare, Jonathan Bate, teaches English Literature at the University of Oxford. He is also the Provost of Worcester College, Oxford[3]. Additionally, he has held positions at other universities, including King Alfred Professor of English Literature at Liverpool University and Professor of Shakespeare and Renaissance Literature at the University of Warwick[3].


In [6]:
# The evaluator will now compare the answer against the gold:

def query_gpt4o_evaluator(question, ground_truth, model_answer):
    """Ask GPT-4o whether ``model_answer`` matches ``ground_truth``.

    The request is wrapped in ``exponential_backoff_retry``, so an HTTP error
    or an unparseable verdict triggers a fresh request. After a successful
    verdict the function sleeps 0.13 seconds before returning.

    Args:
        question: The question that was asked.
        ground_truth: The reference answer; interpolated into the prompt as text.
        model_answer: The answer being graded.

    Returns:
        True if the evaluator replied "True", False if it replied "False".

    Raises:
        ValueError: If the reply is neither verdict on the final attempt.
        Exception: Any request error from the final attempt, re-raised by
            the retry wrapper.
    """
    def do_request():
        headers = {
            "Content-Type": "application/json",
            "Authorization": f"Bearer {GPT4O_API_KEY}"
        }
        system_prompt = (
            "You are a reasoning assistant. You are given a question, a ground truth answer, "
            "and a model's answer. You must determine if the model's answer fully and "
            "correctly answers the question according to the ground truth. "
            "If it does, respond with 'True'. If not, respond with 'False'. No explanations, elaborations, or additional information please."
        )
        user_prompt = (
            f"Question: {question}\n"
            f"Ground Truth: {ground_truth}\n"
            f"Model Answer: {model_answer}\n\n"
            "Does the model answer match the ground truth fully and correctly?"
        )
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]

        json_payload = {
            "model": "gpt-4o",  # Replace with actual model name for gpt4o
            "messages": messages
        }
        response = requests.post(
            f"{GPT4O_BASE_URL}/v1/chat/completions",
            headers=headers,
            json=json_payload,
            timeout=30
        )
        response.raise_for_status()
        resp_json = response.json()
        content = resp_json["choices"][0]["message"]["content"].strip()
        # Expecting either "True" or "False". Compare case-insensitively and
        # ignore wrapping quotes or a trailing period (e.g. "'True'", "False.")
        # so harmless formatting does not cost extra evaluator calls.
        verdict = content.strip("'\".").strip().lower()
        if verdict == "true":
            return True
        if verdict == "false":
            return False
        # f-string so the offending reply actually appears in the message.
        raise ValueError(f"Evaluator returned unexpected content: {content}")

    result = exponential_backoff_retry(do_request)
    # A short sleep to avoid hitting rate limit for evaluator as well
    time.sleep(0.13)
    return result

# Single-example smoke test of the evaluator.
query = "At which university does the biographer of John Clare teach English Literature?"
answer = "The biographer of John Clare, Jonathan Bate, teaches English Literature at the University of Oxford. Specifically, he is Professor of English Literature at the University of Oxford and also serves as the Provost of Worcester College, Oxford[1][2][3]."
gold = "University of Oxford"
query_gpt4o_evaluator(question=query, model_answer=answer, ground_truth=gold)


True

In [9]:
# Smoke test: run the critic agent on a single question and print its answer.
query = "At which university does the biographer of John Clare teach English Literature?"
answer = query_critic_agent(TASK=query)
print(answer)

[32m2024-12-07 05:04:23.479[0m | [34m[1mDEBUG   [0m | [36mAgentFactory.critic_search.tools.tool_registry[0m:[36mregister[0m:[36m43[0m - [34m[1mRegistered tool: search[0m
<module>:3: No type or annotation for parameter 'urls'
<module>:4: No type or annotation for parameter 'elements'
[32m2024-12-07 05:04:23.480[0m | [34m[1mDEBUG   [0m | [36mAgentFactory.critic_search.tools.tool_registry[0m:[36mregister[0m:[36m43[0m - [34m[1mRegistered tool: scrape[0m
[32m2024-12-07 05:04:23.481[0m | [1mINFO    [0m | [36mAgentFactory.critic_search.main[0m:[36mmain[0m:[36m30[0m - [1m
[0m


Database initialized at critic_search/.data/adapter_usage.sqlite


[32m2024-12-07 05:04:29.229[0m | [34m[1mDEBUG   [0m | [36mAgentFactory.critic_search.base_agent[0m:[36mcommon_chat[0m:[36m87[0m - [34m[1mllm_response:
 ChatCompletionMessage(content='To effectively answer the user\'s question regarding the university where the biographer of John Clare teaches English Literature, we can break down the query into several levels and utilize various search techniques. Here’s a structured approach:\n\n### Level 1: Identify the Biographer of John Clare\nWe need to find out who has written a biography of John Clare, as this is the key to our main query.\n\n**Query 1:** \n- **Search Term:** "biographer of John Clare"\n\n### Level 2: Find Teaching Affiliations\nOnce we identify the biographer, we need to find out their current teaching position, specifically which university they are associated with.\n\n**Query 2:**\n- **Search Term:** "[Biographer\'s Name] teaches English Literature"  \n  (Replace [Biographer\'s Name] with the name found in Query 


The biographer of John Clare, Jonathan Bate, teaches English Literature at the University of Oxford.



In [None]:
# Resume support: collect the ids already present in the results file, or
# create the file with only a header row when it does not exist yet.
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        processed_ids = {row["id"] for row in csv.DictReader(f)}
else:
    processed_ids = set()
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fieldnames=fieldnames).writeheader()

# Main loop: append one graded row per question not already in the file.
with open(output_file, "a", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    for entry in tqdm(sampled_data[:50]):
        entry_id = entry["id"]
        if entry_id in processed_ids:
            continue

        # Everything taken straight from the dataset record; the nested
        # context / supporting_facts structures are stored as JSON text.
        result_row = {
            "id": entry_id,
            "question": entry["question"],
            "ground_truth_answer": entry["answer"],
            "context": json.dumps(entry["context"]),
            "type": entry["type"],
            "level": entry["level"],
            "supporting_facts": json.dumps(entry["supporting_facts"]),
        }

        # Ask the critic agent, then let GPT4o grade its answer against the gold.
        result_row["perplexity_answer"] = query_critic_agent(TASK=result_row["question"])
        result_row["binary_success"] = int(
            query_gpt4o_evaluator(
                result_row["question"],
                result_row["ground_truth_answer"],
                result_row["perplexity_answer"],
            )
        )

        writer.writerow(result_row)
        f.flush()  # push each row to disk so an interrupted run can resume

print("Evaluation complete. Results saved to", output_file)

  0%|          | 0/50 [00:00<?, ?it/s]

[32m2024-12-07 05:11:21.448[0m | [34m[1mDEBUG   [0m | [36mAgentFactory.critic_search.tools.tool_registry[0m:[36mregister[0m:[36m43[0m - [34m[1mRegistered tool: search[0m
<module>:3: No type or annotation for parameter 'urls'
<module>:4: No type or annotation for parameter 'elements'
[32m2024-12-07 05:11:21.450[0m | [34m[1mDEBUG   [0m | [36mAgentFactory.critic_search.tools.tool_registry[0m:[36mregister[0m:[36m43[0m - [34m[1mRegistered tool: scrape[0m
[32m2024-12-07 05:11:21.451[0m | [1mINFO    [0m | [36mAgentFactory.critic_search.main[0m:[36mmain[0m:[36m30[0m - [1m
[0m


Database initialized at critic_search/.data/adapter_usage.sqlite


[32m2024-12-07 05:11:29.817[0m | [34m[1mDEBUG   [0m | [36mAgentFactory.critic_search.base_agent[0m:[36mcommon_chat[0m:[36m87[0m - [34m[1mllm_response:
 ChatCompletionMessage(content='To answer the user\'s question about Alexander Trachtenberg\'s conviction during the Second Red Scare, we can break down the query into several components and make use of the search techniques outlined. The main components to focus on are:\n\n1. Alexander Trachtenberg\n2. Second Red Scare\n3. Conviction\n4. Specific Act (which Act?)\n\nBased on this breakdown, we can design a series of search queries that utilize the various techniques mentioned. Here’s how we can approach it:\n\n### Primary Search Queries\n1. **Exact Search for Conviction and Act:**\n   - Query: `"Alexander Trachtenberg conviction under Act"`\n   \n2. **Focus on the Second Red Scare Context:**\n   - Query: `"Second Red Scare Alexander Trachtenberg conviction"`\n   \n3. **Specific Acts Mentioned:**\n   - Query: `"Alexander Tra

In [39]:
import pandas as pd

# Source file (may contain repeated ids) and destination for the cleaned copy.
# NOTE(review): no cell visible in this notebook writes perplexity_large.csv —
# presumably it comes from a separate run; confirm.
input_csv = "perplexity_large.csv"
output_csv = "perplexity_large_clean.csv"

df = pd.read_csv(input_csv)

# Keep only the first row seen for each 'id'.
is_repeat = df.duplicated(subset='id', keep='first')
df_cleaned = df[~is_repeat]

df_cleaned.to_csv(output_csv, index=False)

print(f"Duplicates removed. Cleaned CSV saved to {output_csv}.")

Duplicates removed. Cleaned CSV saved to perplexity_large_clean.csv.


## ComplexTempQA attempt

In [63]:
from itertools import islice


# Open the ComplexTempQA train split in streaming mode and materialize only
# the first 100 records as an in-memory list.
data_stream = load_dataset("DataScienceUIBK/ComplexTempQA", split="train", streaming=True)
data_comptempqa = list(islice(data_stream, 100))

In [132]:
from datetime import datetime

def custom_serializer(obj):
    """
    ``default`` hook for ``json.dump``: encode datetimes as ISO 8601 text.

    Any other type is rejected with a TypeError naming the type.
    """
    if not isinstance(obj, datetime):
        raise TypeError(f"Type {type(obj)} not serializable")
    return obj.isoformat()

# Write the records to disk; custom_serializer handles the datetime values
# that json cannot encode on its own.
with open("comptempqa_data.json", "w") as f:
    json.dump(data_comptempqa, f, indent=4, default=custom_serializer) 

In [None]:
def custom_deserializer(d):
    """
    ``object_hook`` for ``json.load``: turn ISO 8601 strings back into datetimes.

    Top-level string values are converted whenever ``datetime.fromisoformat``
    accepts them (unchanged behavior). String items inside list values are now
    converted as well, so a list of datetimes such as ``timeframe`` survives
    the JSON round trip. List items are only converted when they have the
    ``YYYY-MM-DD...`` shape that ``datetime.isoformat()`` emits, so lists of
    plain digit strings (e.g. entity ids) are left alone.

    Args:
        d: A dict decoded from JSON; it is updated in place.

    Returns:
        The same dict ``d``, with recognised strings replaced by datetimes.
    """
    def _to_datetime(text):
        # Return the parsed datetime, or the original text if it is not one.
        try:
            return datetime.fromisoformat(text)
        except ValueError:
            return text

    def _looks_like_isoformat(text):
        # Cheap shape check for the dashed date prefix "YYYY-MM-DD".
        return len(text) >= 10 and text[4] == "-" and text[7] == "-"

    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for key, value in d.items():
        if isinstance(value, str):
            d[key] = _to_datetime(value)
        elif isinstance(value, list):
            d[key] = [
                _to_datetime(item)
                if isinstance(item, str) and _looks_like_isoformat(item)
                else item
                for item in value
            ]
    return d

# Read the records back; custom_deserializer is applied to every decoded JSON object.
with open("comptempqa_data.json", "r") as f:
    data_comptempqa = json.load(f, object_hook=custom_deserializer)

In [None]:
# Show one record so its fields can be checked against the columns below.
print(data_comptempqa[0])

# Prepare output CSV
# NOTE(review): the double underscore in the file name looks unintentional —
# confirm before renaming, since existing partial results use this path.
output_file = "comptempqa__critic_gpt4omini.csv"
fieldnames = [
    "id", "question",
    "ground_truth_answer", "model_answer", "binary_success",
    "rating", "timeframe", "type",
]


{'id': 1, 'question': 'Did the Winter Olympic Games in 1994 which had 1737 participants had a higher number of participants than the sports season in 2006 in Germany which had 32 participants?', 'answer': ['yes'], 'type': '2a', 'rating': 1, 'timeframe': [datetime.datetime(1994, 2, 1, 0, 0), datetime.datetime(2006, 7, 9, 0, 0)], 'question_entity': ['9663', '37285'], 'answer_entity': ['224013'], 'question_country_entity': ['20', '183'], 'is_unnamed': 1, 'answer_country_entity': None}


In [None]:
# Check if there's already a partial file
processed_ids = set()
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        reader = csv.DictReader(f)
        for row in reader:
            processed_ids.add(row["id"])

print(processed_ids)

# write out headers
if not os.path.exists(output_file):
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

# Main loop
with open(output_file, "a", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    for entry in tqdm(data_comptempqa[:50]):
        entry_id = entry["id"]
        # Ids read back from the CSV are strings, while the dataset ids are
        # ints (the printed sample record has 'id': 1). Compare as text so a
        # resumed run really skips rows that were already written.
        if str(entry_id) in processed_ids:
            continue

        question = entry["question"]
        ground_truth_answer = entry["answer"]
        rating = entry['rating']
        timeframe = entry['timeframe']
        entry_type = entry["type"]

        # Query the critic agent for an answer (keyword call, matching the
        # HotpotQA loop above)
        model_answer = query_critic_agent(TASK=question)

        # Evaluate correctness with GPT4o
        binary_success = query_gpt4o_evaluator(question, ground_truth_answer, model_answer)

        # Write result; "type" is a declared column and was previously left blank
        writer.writerow({
            "id": entry_id,
            "question": question,
            "ground_truth_answer": ground_truth_answer,
            "model_answer": model_answer,
            "binary_success": int(binary_success),
            "rating": rating,
            "timeframe": timeframe,
            "type": entry_type,
        })

        f.flush()  # Ensure we write after each entry

print("Evaluation complete. Results saved to", output_file)

set()


100%|██████████| 100/100 [12:46<00:00,  7.67s/it]

Evaluation complete. Results saved to comptempqa_perplexity_small.csv





## WikiQA

In [None]:
# Prepare output CSV
output_file = "wikiqa_critic_gpt4omini.csv"
fieldnames = [
    "id", "question",
    "model_answer", "ground_truth_answer", "binary_success",
    "type",
]

# Resume support: collect the ids already present in the results file, or
# create the file with only a header row when it does not exist yet.
if os.path.exists(output_file):
    with open(output_file, "r", encoding="utf-8") as f:
        processed_ids = {row["id"] for row in csv.DictReader(f)}
else:
    processed_ids = set()
    with open(output_file, "w", encoding="utf-8", newline="") as f:
        csv.DictWriter(f, fieldnames=fieldnames).writeheader()

In [59]:
# Load WikiQA; the recorded cell output shows test, validation and train splits.
dataset_temp = load_dataset("microsoft/wiki_qa")

Generating test split: 100%|██████████| 6165/6165 [00:00<00:00, 147545.19 examples/s]
Generating validation split: 100%|██████████| 2733/2733 [00:00<00:00, 618219.87 examples/s]
Generating train split: 100%|██████████| 20360/20360 [00:00<00:00, 1150486.75 examples/s]


In [75]:
# we will sample from the validation dataset, though this should not matter
validation_data_temp = dataset_temp['validation']
sample_size_temp = 200
# Draw the indices from a dedicated, seeded generator so that re-running this
# cell on a fresh kernel selects the same rows, without touching the state of
# the global `random` module.
sample_seed_temp = 42
random_indices_temp = random.Random(sample_seed_temp).sample(
    range(len(validation_data_temp)), sample_size_temp
)
sampled_data_temp = [validation_data_temp[i] for i in random_indices_temp]
# Sanity check: field names of the first record, then the sample size.
print(sampled_data_temp[0].keys())
print(len(sampled_data_temp))

dict_keys(['question_id', 'question', 'document_title', 'answer', 'label'])
200


In [127]:
# Persist the WikiQA sample to disk as indented JSON.
with open("wiki_data.json", "w") as f:
    json.dump(sampled_data_temp, f, indent=4) 

In [135]:
# Reload the persisted WikiQA sample under the name the evaluation loop
# iterates over (`sampled_data_temp`). Loading it into `sampled_data` would
# overwrite the HotpotQA sample held in that variable and leave the loop
# dependent on whatever in-memory sample the sampling cell last produced.
with open("wiki_data.json", "r") as f:
    sampled_data_temp = json.load(f)

In [None]:
# Main loop
# NOTE(review): each WikiQA record carries a `label` next to its `answer`;
# presumably the label marks whether that sentence really answers the
# question, yet every `answer` is used as ground truth here — confirm.
with open(output_file, "a", encoding="utf-8", newline="") as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    for entry in tqdm(sampled_data_temp[:50]):
        if entry["question_id"] in processed_ids:
            continue

        question = entry["question"]
        ground_truth_answer = entry["answer"]
        entry_id = entry["question_id"]
        # Kept under its own name: assigning to `type` at notebook level would
        # shadow the builtin for the rest of the kernel session.
        label = entry["label"]

        # Query the critic agent for an answer (keyword call, matching the
        # HotpotQA loop above)
        model_answer = query_critic_agent(TASK=question)

        # Evaluate correctness with GPT4o
        binary_success = query_gpt4o_evaluator(question, ground_truth_answer, model_answer)

        # Write result
        writer.writerow({
            "id": entry_id,
            "question": question,
            "model_answer": model_answer,
            "ground_truth_answer": ground_truth_answer,
            "binary_success": int(binary_success),
            "type": label,
        })

        f.flush()  # Ensure we write after each entry

print("Evaluation complete. Results saved to", output_file)

100%|██████████| 200/200 [30:15<00:00,  9.08s/it]

Evaluation complete. Results saved to wikiqa_perplexity_small.csv



