# Evaluation Notebook:
## Author: William Diaz
### Artificial Agents @ JHU
**Evaluation of the Critic Agent using a modified version of the HotpotQA dataset.**
***We drop the context passages that accompany each question, relying instead on the search and reasoning capabilities of each model we evaluate against.***

In [1]:
# import packages
import os
import time
import random
import csv
import json
import requests
from dotenv import load_dotenv
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Populate the process environment (load_dotenv comes from python-dotenv),
# then read the two API credentials; each is None when its variable is unset.
load_dotenv()
PERPLEXITY_API_KEY = os.environ.get("PERPLEXITY_API_KEY")
GPT4O_API_KEY = os.environ.get("GPT4O_API_KEY")

# Service locations. Note the asymmetry: the Perplexity value is only a host
# root, whereas the OpenAI value already contains the full endpoint path.
PERPLEXITY_BASE_URL = "https://api.perplexity.ai"
GPT4O_BASE_URL = "https://api.openai.com/v1/chat/completions"

In [5]:
# Fetch the "fullwiki" configuration of the hotpot_qa dataset
dataset = load_dataset(path="hotpot_qa", name="fullwiki")

Downloading data: 100%|██████████| 566M/566M [01:10<00:00, 8.01MB/s] 
Downloading data: 100%|██████████| 47.5M/47.5M [00:05<00:00, 8.98MB/s]
Downloading data: 100%|██████████| 46.2M/46.2M [00:05<00:00, 8.22MB/s]
Generating train split: 100%|██████████| 90447/90447 [00:11<00:00, 7892.51 examples/s] 
Generating validation split: 100%|██████████| 7405/7405 [00:00<00:00, 8157.17 examples/s] 
Generating test split: 100%|██████████| 7405/7405 [00:00<00:00, 10007.55 examples/s]


In [8]:
# Draw a fixed-size random sample of questions from the validation split
# (the choice of split should not matter for this evaluation).
#
# The draw uses its own seeded generator so that Restart & Run All selects the
# same questions every time; an unseeded draw would pick a different sample on
# each kernel restart, making results irreproducible and leaving the ids stored
# in a partial results file unrelated to the newly drawn sample.
SAMPLE_SEED = 42
validation_data = dataset['validation']
sample_size = 200
sample_rng = random.Random(SAMPLE_SEED)  # private RNG: leaves the global `random` state untouched
random_indices = sample_rng.sample(range(len(validation_data)), sample_size)
sampled_data = [validation_data[i] for i in random_indices]
# Show which fields each sampled record carries
sampled_data[0].keys()


dict_keys(['id', 'question', 'answer', 'type', 'level', 'supporting_facts', 'context'])

In [9]:
# Destination file for the evaluation results
output_file = "results.csv"

# Column order for the results CSV
fieldnames = [
    "id", "question",
    "ground_truth_answer", "perplexity_answer", "binary_success",
    "context", "type", "level", "supporting_facts",
]

In [10]:
# If a results file from an earlier (partial) run exists, collect the ids it
# already contains. Presumably a later cell uses this set to skip finished
# questions -- the consumer is not part of this cell.
processed_ids = set()
if os.path.exists(output_file):
    # newline="" is what the csv module requires of files it reads: without it,
    # line breaks embedded inside quoted fields may be interpreted incorrectly.
    with open(output_file, "r", encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        processed_ids.update(row["id"] for row in reader)

In [11]:
def exponential_backoff_retry(func, max_retries=5, initial_wait=1):
    """Call ``func`` until it succeeds, doubling the pause after each failure.

    Args:
        func: Zero-argument callable to invoke.
        max_retries: Total number of attempts to make (the first call counts
            as one). Must be at least 1.
        initial_wait: Seconds to sleep after the first failed attempt; the
            pause doubles after every subsequent failure.

    Returns:
        Whatever ``func`` returns on its first successful call.

    Raises:
        ValueError: If ``max_retries`` is smaller than 1. Without this check
            the loop would never run and the function would return ``None``
            without ever calling ``func``.
        Exception: The exception raised by the final attempt, re-raised
            unchanged once every attempt has failed.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be >= 1, got {max_retries!r}")

    wait = initial_wait
    for attempt in range(1, max_retries + 1):
        try:
            return func()
        except Exception:
            if attempt == max_retries:
                # Out of attempts: bare `raise` keeps the original traceback intact.
                raise
            time.sleep(wait)
            wait *= 2

In [None]:
def query_perplexity(question, model="llama-3.1-sonar-large-128k-online", timeout=30):
    """Ask the Perplexity chat-completions API a question and return its answer text.

    The question is sent as the single user message, preceded by a fixed
    system prompt. The HTTP call is wrapped in ``exponential_backoff_retry``,
    and the function pauses for one second after a successful call to stay
    clear of rate limits.

    Args:
        question: The question text sent as the user message.
        model: Model identifier placed in the request payload.
            NOTE(review): provider model names change over time -- confirm
            the default is still served.
        timeout: Per-request timeout in seconds passed to ``requests.post``.

    Returns:
        The ``content`` of the first choice's message in the JSON response.

    Raises:
        RuntimeError: If ``PERPLEXITY_API_KEY`` is unset or empty. Failing
            here avoids sending ``Bearer None`` and spending the whole
            backoff schedule on a request that cannot succeed.
        Exception: Whatever the final retry attempt raised (HTTP error,
            timeout, or an unexpected response structure).
    """
    if not PERPLEXITY_API_KEY:
        raise RuntimeError(
            "PERPLEXITY_API_KEY is not set; add it to the environment or .env file"
        )

    # Headers and payload do not change between retries, so build them once.
    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {PERPLEXITY_API_KEY}"
    }
    messages = [
        {
            "role": "system",
            "content": (
                "You are an artificial intelligence assistant and you need to "
                "engage in a helpful, detailed, polite conversation with a user."
            )
        },
        {
            "role": "user",
            "content": question
        }
    ]
    json_payload = {
        "model": model,
        "messages": messages
    }

    def do_request():
        # NOTE(review): Perplexity's API reference appears to document this
        # endpoint as /chat/completions (no /v1 prefix) -- confirm this path
        # is accepted before a long run.
        response = requests.post(
            f"{PERPLEXITY_BASE_URL}/v1/chat/completions",
            headers=headers,
            json=json_payload,
            timeout=timeout
        )
        response.raise_for_status()
        resp_json = response.json()
        # Assumed response structure:
        # {'choices': [{'message': {'content': 'answer'}}], ...}
        return resp_json["choices"][0]["message"]["content"]

    answer = exponential_backoff_retry(do_request)
    # Sleep to avoid rate limits
    time.sleep(1)
    return answer