In [None]:
import random
import re

from datasets import load_dataset
from gpt4all import GPT4All
from tqdm import tqdm

# Matches a standalone TRUE/FALSE verdict. The word boundaries keep words that
# merely contain those letters (e.g. "untrue") from being read as an answer.
_VERDICT_PATTERN = re.compile(r"\b(true|false)\b", re.IGNORECASE)


def _parse_boolean_answer(response):
    """Extract a boolean verdict from the model's free-text response.

    A line that starts with ``Answer:`` takes priority. If no such line holds
    a verdict, the whole response is searched instead. In both cases the
    first standalone ``true``/``false`` word (case-insensitive) wins.

    Args:
        response: Text returned by the model.

    Returns:
        True or False for the verdict that was found, or None when the
        response contains neither word.
    """
    prefix = 'answer:'
    for line in response.strip().split('\n'):
        stripped = line.strip()
        if stripped.lower().startswith(prefix):
            # Slice the stripped line so leading whitespace cannot misalign
            # the text that follows the prefix.
            match = _VERDICT_PATTERN.search(stripped[len(prefix):])
            if match:
                return match.group(1).lower() == 'true'

    # No usable 'Answer:' line: fall back to the first verdict anywhere.
    match = _VERDICT_PATTERN.search(response)
    if match:
        return match.group(1).lower() == 'true'
    return None


# 1. Load the model
model = GPT4All(model_name='Meta-Llama-3.1-8B-Instruct-128k-Q4_0.gguf', device="gpu")

# 2. Load the dataset
dataset = load_dataset('boolq', split='validation')

# 3. Draw a random subset of examples
random_seed = 447  # replace with your own random seed
num_samples = 500
random.seed(random_seed)
samples = random.sample(list(dataset), num_samples)

# 4. Run inference and tally correct predictions
correct_predictions = 0
unparsed_responses = 0

# Wrap the list (not the enumerate generator) so tqdm knows the total and can
# show a percentage and an ETA.
for i, sample in enumerate(tqdm(samples)):
    # Pull out the question, the passage and the reference label
    question = sample['question']
    passage = sample['passage']
    gold_answer = sample['answer']  # True or False

    # Build the prompt
    prompt = f"Question: {question}\n\nPassage: {passage}\n\nPlease answer TRUE or FALSE.\nAnswer:"

    # Query the model; only the prompt is passed, so every generation setting
    # is whatever the library uses by default.
    response = model.generate(prompt)
    if i == 0:
        print("Here is an example of the model response:")
        print(f"Model Response:\n{response}\n")
        print(f"Gold Answer: {gold_answer}\n")

    # Parse the response into True / False / None
    predicted_answer = _parse_boolean_answer(response)

    # Compare with the reference label
    if predicted_answer is None:
        # An unparseable response still counts as wrong (the denominator below
        # is the full sample count), but it is tracked so it can be reported.
        unparsed_responses += 1
    elif predicted_answer == gold_answer:
        correct_predictions += 1

# 5. Report accuracy
accuracy = correct_predictions / len(samples)
print(f"模型在选定样本上的准确率为：{accuracy * 100:.2f}%")
if unparsed_responses:
    print(f"Unparseable responses counted as incorrect: {unparsed_responses}")
