In [2]:
from transformers import pipeline
import json
import evaluate
import numpy as np
from datasets import Dataset

In [3]:
# Extractive question-answering pipeline backed by the locally saved checkpoint.
qa_pipeline = pipeline(
    task="question-answering",
    model="../models/bert_best",
)


Device set to use cuda:0


In [4]:
# Load the raw QA records from disk.
# The encoding is given explicitly: without it open() uses the platform's
# locale encoding, which can fail on (or silently garble) a UTF-8 JSON file.
with open(
    "../datasets/ChatGPT/extractive/fridge_dataset_v1.0_clean.json",
    "r",
    encoding="utf-8",
) as f:
    data = json.load(f)

In [5]:
# Reshape the raw records into the SQuAD-style layout used for BERT QA
rows = []
for record in data:
    annotated = record["answers"]
    if annotated:
        # Only the first annotated answer of each record is kept.
        first = annotated[0]
        answer_texts = [first["text"]]
        answer_starts = [first["answer_start"]]
    else:
        # No annotated answer: store an empty string at offset 0 as a
        # placeholder so every row has the same structure.
        answer_texts = [""]
        answer_starts = [0]

    rows.append({
        "context": record["context"],
        "question": record["question"],
        "answers": {"text": answer_texts, "answer_start": answer_starts},
        "is_impossible": record["is_impossible"],
    })

# Build the Dataset from the converted rows
dataset = Dataset.from_list(rows)

# Sanity check: show the last converted row
print(dataset[-1])



In [6]:
# 3. Train/validation split: 15% of the rows are held out (stored under the
#    "test" key); the seed is fixed so the split is reproducible.
split_dataset = dataset.train_test_split(
    seed=42,
    test_size=0.15,
)

In [9]:
# Run the QA pipeline over a small sample and collect its predictions next to
# the gold answers so the metric cells below can score them.
EVAL_SPLIT = "train"   # which split of split_dataset to sample from
MAX_EXAMPLES = 5       # number of examples to run through the pipeline

# NOTE(review): the sample is drawn from the "train" split. If the checkpoint
# was fine-tuned on that split, the scores will be optimistic - confirm whether
# "test" was intended.
eval_split = split_dataset[EVAL_SPLIT]

# Clamp the sample size: select() raises when asked for indices past the end
# of the split, which would happen on a split with fewer rows than requested.
sample = eval_split.select(range(min(MAX_EXAMPLES, len(eval_split))))

# Re-initialised on every run so re-executing the cell does not accumulate
# results from a previous execution.
generated_answers = []
true_answers = []
for example in sample:
    context = example["context"]
    question = example["question"]
    answer = qa_pipeline(question=question, context=context)

    predicted_text = answer["answer"]
    # Rows store their answers as a list; the first entry is the gold answer
    # (an empty string for rows without an annotated answer).
    gold_text = example["answers"]["text"][0]

    generated_answers.append(predicted_text)
    true_answers.append(gold_text)
    print(f"Question: {question}")
    print(f"Generated Answer: {predicted_text}")
    print(f"True Answer: {gold_text}")
    print("-" * 50)

You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Question: What is the recommended disposal procedure for business users of this product?
Generated Answer: of
True Answer: Business users should contact their supplier and check the terms and conditions of the purchase contract. This product and its electronic accessories should not be mixed with other commercial wastes for disposal.
--------------------------------------------------
Question: When setting up the appliance, what should be confirmed about the power cord?
Generated Answer: that
True Answer: When positioning the appliance, ensure the supply cord is not trapped or damaged.
--------------------------------------------------
Question: What is the temperature profile of the two-star compartment relative to other compartments in the freezer?
Generated Answer: ,
True Answer: The temperature of two star section (sections) or compartment (compartments) which have two star symbol ( ) is slightly higher than other freezer compartment (compartments).
--------------------------------

In [6]:
# Score the collected predictions with the SQuAD metric (exact match and F1).
metric = evaluate.load("squad")

# Predictions and references are given matching ids; the position in the
# list serves as the id.
predictions = []
for idx, predicted in enumerate(generated_answers):
    predictions.append({"id": str(idx), "prediction_text": predicted})

references = []
for idx, gold in enumerate(true_answers):
    # The real character offset is not tracked here, so 0 is supplied as
    # the answer_start value.
    references.append(
        {"id": str(idx), "answers": {"text": [gold], "answer_start": [0]}}
    )

results = metric.compute(predictions=predictions, references=references)

print("Exact Match:", results["exact_match"])
print("F1 Score:", results["f1"])

Exact Match: 33.87096774193548
F1 Score: 67.25538192568553


In [9]:
# Score the same predictions with BERTScore and report the averaged components.
bertscore = evaluate.load("bertscore")

results_bert = bertscore.compute(
    references=true_answers,
    predictions=generated_answers,
    lang="en",
)

# Average each score component over all evaluated examples.
avg_precision, avg_recall, avg_f1 = (
    np.mean(results_bert[component])
    for component in ("precision", "recall", "f1")
)

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1: {avg_f1:.4f}")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Average Precision: 0.9529
Average Recall: 0.9204
Average F1: 0.9359
