In [7]:
from transformers import pipeline
import json
import evaluate
import numpy as np

In [3]:
# Extractive question-answering pipeline. The checkpoint name is kept in one
# place so a different QA model can be swapped in without touching the call.
QA_MODEL_NAME = "distilbert-base-uncased-distilled-squad"
qa_pipeline = pipeline(task="question-answering", model=QA_MODEL_NAME)


Device set to use cpu


In [4]:
# Load the SQuAD-style QA dataset from disk.
# The encoding is pinned to UTF-8 (the standard encoding for JSON) instead of
# relying on the platform's locale default, which can mis-decode or reject
# non-ASCII characters on some systems (e.g. Windows).
with open("samsung_refrigerator_qa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [5]:
# Run the QA pipeline over every question in the dataset and collect the
# predicted and reference answers as two index-aligned lists.
generated_answers, true_answers = [], []
separator = "-" * 30

for article in data["data"]:
    for paragraph in article["paragraphs"]:
        passage = paragraph["context"]
        for qa_entry in paragraph["qas"]:
            query = qa_entry["question"]

            # Only the first gold answer is kept; a question with an empty
            # answer list gets the literal placeholder "No answer".
            gold_answers = qa_entry["answers"]
            if gold_answers:
                gold_text = gold_answers[0]["text"]
            else:
                gold_text = "No answer"

            prediction = qa_pipeline(question=query, context=passage)
            predicted_text = prediction["answer"]

            print(f"Question: {query}")
            print(f"Predicted answer: {predicted_text}")
            print(f"True answer: {gold_text}")
            print(separator)

            generated_answers.append(predicted_text)
            true_answers.append(gold_text)

Question: Where should the appliance be installed?
Predicted answer: a firm and level floor
True answer: on a firm and level floor
------------------------------
Question: Where should the appliance not be installed?
Predicted answer: a firm and level floor
True answer: in a damp and dusty place
------------------------------
Question: How do you activate Child Lock?
Predicted answer: hold the Lock button
True answer: hold the Lock button for 5 seconds until the icon appears
------------------------------
Question: How do you lock the control panel?
Predicted answer: press the "Lock" button
True answer: press the "Lock" button for 5 seconds until the padlock icon appears
------------------------------
Question: Why should you lock the control panel?
Predicted answer: This prevents accidental changes to settings
True answer: This prevents accidental changes to settings
------------------------------
Question: How often should the air filter be replaced?
Predicted answer: every 6 months


In [6]:
# Score the predictions with the SQuAD metric (exact match and token-level F1).
metric = evaluate.load("squad")

# Predictions and references are matched through a shared string id; the list
# position of each answer is used as that id.
predictions = []
for idx, predicted_text in enumerate(generated_answers):
    predictions.append({"id": str(idx), "prediction_text": predicted_text})

# Each reference carries exactly one gold answer. "answer_start" is filled with
# a constant 0 because the real character offsets were not collected
# (presumably only needed to satisfy the metric's input schema; confirm).
references = []
for idx, gold_text in enumerate(true_answers):
    references.append(
        {"id": str(idx), "answers": {"text": [gold_text], "answer_start": [0]}}
    )

results = metric.compute(predictions=predictions, references=references)

print("Exact Match:", results["exact_match"])
print("F1 Score:", results["f1"])

Exact Match: 33.87096774193548
F1 Score: 67.25538192568553


In [9]:
# Semantic-similarity evaluation: BERTScore between each predicted answer and
# its reference answer, averaged over the whole dataset.
bertscore = evaluate.load("bertscore")

results_bert = bertscore.compute(predictions=generated_answers, references=true_answers, lang="en")

# The result holds one score per example under each key; reduce each list to its mean.
avg_precision, avg_recall, avg_f1 = (
    np.mean(results_bert[score_name]) for score_name in ("precision", "recall", "f1")
)

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1: {avg_f1:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Precision: 0.9529
Average Recall: 0.9204
Average F1: 0.9359
