In [None]:
from gpt4all import GPT4All
from datasets import load_dataset
import random
from tqdm import tqdm


def first_boolean_token(text):
    """Return the first standalone "true"/"false" word in *text* as a bool.

    Matching is case-insensitive and works on whole alphabetic words, so
    "untrue" is not mistaken for "true". When both words occur, the one that
    appears first wins. Returns None if neither word is present.
    """
    # Turn every non-letter into a space so punctuation ("TRUE.", "false,")
    # does not stick to the word we are looking for.
    words = "".join(ch if ch.isalpha() else " " for ch in text.lower()).split()
    for word in words:
        if word == "true":
            return True
        if word == "false":
            return False
    return None


def parse_boolean_answer(response):
    """Extract the model's TRUE/FALSE verdict from a raw *response* string.

    Lines that start with "Answer:" (case-insensitive, leading whitespace
    ignored) are checked first; if none of them yields a verdict, the whole
    response is scanned. Returns True, False, or None when no verdict is found.
    """
    prefix = "answer:"
    for line in response.strip().split("\n"):
        lowered = line.strip().lower()
        if lowered.startswith(prefix):
            # Slice the same stripped string the prefix test was run on.
            verdict = first_boolean_token(lowered[len(prefix):])
            if verdict is not None:
                return verdict
    return first_boolean_token(response)


def build_prompt(question, passage):
    """Build the TRUE/FALSE prompt for one question/passage pair."""
    return f"Question: {question}\n\nPassage: {passage}\n\nPlease answer TRUE or FALSE.\nAnswer:"


# The guard keeps the helpers importable without loading the model; when run as
# a script or notebook cell the names below are still defined at module level.
if __name__ == "__main__":
    model = GPT4All(model_name='Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf', device="gpu")

    dataset = load_dataset('boolq', split='validation')

    # Fixed seed so the same 500 samples are drawn on every run.
    random_seed = 447
    random.seed(random_seed)
    samples = random.sample(list(dataset), 500)

    correct_predictions = 0
    unparsed_responses = 0

    # Wrap the list (not the enumerate object) so tqdm can read its length
    # and show a total / ETA.
    for i, sample in enumerate(tqdm(samples)):
        gold_answer = sample['answer']  # True or False
        prompt = build_prompt(sample['question'], sample['passage'])

        response = model.generate(prompt)
        if i == 0:
            print("Here is an example of the model response:")
            print(f"Model Response:\n{response}\n")
            print(f"Gold Answer: {gold_answer}\n")

        predicted_answer = parse_boolean_answer(response)

        if predicted_answer is None:
            # Still scored as incorrect, but counted so the report can show it.
            unparsed_responses += 1
        elif predicted_answer == gold_answer:
            correct_predictions += 1

    accuracy = correct_predictions / len(samples)
    print(f"The accuracy of the model on the selected samples is: {accuracy * 100:.2f}%")
    if unparsed_responses:
        print(f"Responses with no TRUE/FALSE verdict (counted as incorrect): {unparsed_responses}")
