In [9]:
import json

import evaluate
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import pipeline

In [3]:
# Load the tokenizer and the question-answering model from the local
# checkpoint directory. The path is named once so it can be changed in a
# single place instead of being repeated as a string literal.
MODEL_DIR = "./bert_finetuned"
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_DIR)

In [4]:
# Read the QA dataset used for evaluation (later cells walk
# data["data"] -> "paragraphs" -> "qas").
# The encoding is pinned to UTF-8 so decoding does not depend on the platform's
# locale default (e.g. a legacy code page on Windows), which could mis-decode
# or reject non-ASCII characters in the file.
with open("samsung_refrigerator_qa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [5]:
# Build the question-answering pipeline from the `model` and `tokenizer`
# objects that were already loaded above, instead of reading the same
# checkpoint from disk a second time.
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

# Parallel lists: generated_answers[i] is the prediction for the question whose
# reference answer is true_answers[i]. Later cells rely on this alignment.
generated_answers = []
true_answers = []
for item in data["data"]:
    for para in item["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            question = qa["question"]
            # Only the first reference answer is kept. Questions with an empty
            # answer list get the literal sentinel "No answer".
            # NOTE(review): the pipeline is called without any "no answer"
            # handling, so it presumably always returns some span and can never
            # match this sentinel exactly -- confirm whether the dataset
            # contains unanswerable questions.
            true_answer = qa["answers"][0]["text"] if qa["answers"] else "No answer"

            result = qa_pipeline(question=question, context=context)

            print(f"Question: {question}")
            print(f"Predicted answer: {result['answer']}")
            print(f"True answer: {true_answer}")
            print("-" * 30)
            generated_answers.append(result["answer"])
            true_answers.append(true_answer)


Device set to use cpu


Question: Where should the appliance be installed?
Predicted answer: Install the appliance on a firm and level floor
True answer: on a firm and level floor
------------------------------
Question: Where should the appliance not be installed?
Predicted answer: Do not install the appliance in a damp and dusty place
True answer: in a damp and dusty place
------------------------------
Question: How do you activate Child Lock?
Predicted answer: Lock, hold the Lock button for 5 seconds until the icon appears
True answer: hold the Lock button for 5 seconds until the icon appears
------------------------------
Question: How do you lock the control panel?
Predicted answer: press the "Lock" button for 5 seconds until the padlock icon appears
True answer: press the "Lock" button for 5 seconds until the padlock icon appears
------------------------------
Question: Why should you lock the control panel?
Predicted answer: panel
True answer: This prevents accidental changes to settings
-------------

In [7]:
# Score the predicted answers against the reference answers with the "squad"
# metric and report its exact-match and F1 values.
metric = evaluate.load("squad")

# Each prediction and its reference share the same id: the position of the
# question in the parallel answer lists, rendered as a string.
predictions = [
    {"id": str(idx), "prediction_text": answer}
    for idx, answer in enumerate(generated_answers)
]
# Every reference carries exactly one answer text; "answer_start" is filled
# with a constant 0 for all of them.
# NOTE(review): presumably a placeholder that scoring ignores -- confirm.
references = [
    {"id": str(idx), "answers": {"text": [gold], "answer_start": [0]}}
    for idx, gold in enumerate(true_answers)
]

results = metric.compute(predictions=predictions, references=references)

print("Exact Match:", results["exact_match"])
print("F1 Score:", results["f1"])

Exact Match: 17.741935483870968
F1 Score: 62.64272610721143


In [None]:
# Compute BERTScore for every predicted answer against its reference answer.
# Both lists are passed as-is, so they must be aligned position by position.
bertscore = evaluate.load("bertscore")

results_bert = bertscore.compute(
    predictions=generated_answers,
    references=true_answers,
    lang="en",
)


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BERTScore BERT: {'precision': [0.9222825765609741, 0.8856914043426514, 0.9583338499069214, 1.0, 0.8743570446968079, 0.8290807604789734, 0.8511108160018921, 0.9547451734542847, 0.8781918287277222, 0.8762181997299194, 0.9328922629356384, 0.9406851530075073, 0.9171909093856812, 0.9365199208259583, 0.9803174734115601, 0.9320204257965088, 0.929271936416626, 0.9491510391235352, 0.9242998361587524, 0.9086482524871826, 0.8869919776916504, 0.9999999403953552, 0.9999999403953552, 0.867555558681488, 0.9092748165130615, 0.8228372931480408, 0.8768212795257568, 0.8969036340713501, 0.9823201894760132, 1.0, 0.8757304549217224, 0.9037310481071472, 0.8526987433433533, 0.7951646447181702, 1.0000001192092896, 0.8950529098510742, 0.9331247806549072, 0.9167859554290771, 0.9603961706161499, 0.9084415435791016, 0.8953339457511902, 1.0, 0.8886969089508057, 0.8800479769706726, 0.9030711054801941, 0.9272675514221191, 0.932884156703949, 0.7947285771369934, 0.9633402228355408, 0.9999999403953552, 0.874604761600494

In [10]:
# Reduce the per-example BERTScore lists to one mean value per component,
# in the order precision, recall, F1.
avg_precision, avg_recall, avg_f1 = (
    np.mean(results_bert[component]) for component in ('precision', 'recall', 'f1')
)

# Report each average rounded to four decimal places.
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1: {avg_f1:.4f}")

Average Precision: 0.9167
Average Recall: 0.9200
Average F1: 0.9179
