In [7]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import json
print(torch.__file__)
import evaluate
import numpy as np

d:\Projects\LLM\venv\lib\site-packages\torch\__init__.py


In [2]:

# Check for CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device set to use {device}")

Device set to use cpu


In [3]:
model_name = "google/flan-t5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

In [4]:
with open("samsung_refrigerator_qa.json", "r") as f:
    data = json.load(f)

In [5]:
generated_answers = []
true_answers = []
for item in data["data"]:
    for para in item["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            question = qa["question"]
            true_answer = qa["answers"][0]["text"] if qa["answers"] else "No answer"

            input_text = f"question: {question} context: {context}"
            inputs = tokenizer(input_text, return_tensors="pt")

            outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)
            generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

            print(f"Question: {question}")
            print(f"Generated answer: {generated_answer}")
            print(f"True answer: {true_answer}")
            print("-" * 30)
            generated_answers.append(generated_answer)
            true_answers.append(true_answer)

Question: Where should the appliance be installed?
Generated answer: firm and level floor
True answer: on a firm and level floor
------------------------------
Question: Where should the appliance not be installed?
Generated answer: outdoor area
True answer: in a damp and dusty place
------------------------------
Question: How do you activate Child Lock?
Generated answer: Hold the Lock button for 5 seconds until the icon appears
True answer: hold the Lock button for 5 seconds until the icon appears
------------------------------
Question: How do you lock the control panel?
Generated answer: Press the "Lock" button
True answer: press the "Lock" button for 5 seconds until the padlock icon appears
------------------------------
Question: Why should you lock the control panel?
Generated answer: prevents accidental changes to settings
True answer: This prevents accidental changes to settings
------------------------------
Question: How often should the air filter be replaced?
Generated ans

In [16]:
rouge = evaluate.load("rouge")

results = rouge.compute(predictions=generated_answers, references=true_answers)
print(results)


{'rouge1': np.float64(0.5960457044948835), 'rouge2': np.float64(0.4927206041991605), 'rougeL': np.float64(0.5963614451851809), 'rougeLsum': np.float64(0.5945300366932245)}


In [6]:
bertscore = evaluate.load("bertscore")

results_bert = bertscore.compute(predictions=generated_answers, references=true_answers, lang="en")
print("BARTScore BERT:", results_bert)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BARTScore BERT: {'precision': [0.9256720542907715, 0.8608920574188232, 0.9996941089630127, 0.9639863967895508, 0.9446936845779419, 0.9999999403953552, 0.9997774362564087, 0.9713519215583801, 0.9705342650413513, 0.9685173630714417, 0.9836770296096802, 0.9676039218902588, 0.9695708751678467, 0.9752511978149414, 0.9803174734115601, 0.94191974401474, 0.960221529006958, 0.9491510391235352, 1.000000238418579, 0.9176837801933289, 0.9666861891746521, 0.9999999403953552, 0.9692066311836243, 0.8519521951675415, 0.9092748165130615, 1.000000238418579, 0.9624884128570557, 0.9538353085517883, 0.9461208581924438, 0.8573760986328125, 0.8104298114776611, 0.8997682929039001, 0.8856562376022339, 1.0000001192092896, 0.9788779020309448, 0.924802303314209, 0.9084659814834595, 0.9999999403953552, 0.8666191697120667, 0.9252203702926636, 0.836661696434021, 0.9443121552467346, 0.9259721636772156, 0.8522756099700928, 0.9342849254608154, 0.9550272226333618, 0.9532012343406677, 0.8716961741447449, 0.95916903018951

In [8]:
avg_precision = np.mean(results_bert['precision'])
avg_recall = np.mean(results_bert['recall'])
avg_f1 = np.mean(results_bert['f1'])

print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1: {avg_f1:.4f}")

Average Precision: 0.9327
Average Recall: 0.8886
Average F1: 0.9097
