In [None]:
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import torch
import json


In [None]:
# Report whether a CUDA GPU is visible and, if so, which one.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    # Only query the device name when a GPU exists; the lookup raises otherwise.
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device detected")


True
Tesla T4


In [None]:
# Read the raw dataset file as UTF-8 text and parse it as JSON into `data`.
with open("fridge_dataset_v2.4_clean.json", mode="r", encoding="utf-8") as dataset_file:
    data = json.loads(dataset_file.read())

In [None]:
# Convert the raw records into the input/output layout used for BART:
# the instruction becomes a "question: ..." prompt, the response is the target.
rows = [
    {"input": f"question: {record['instruction']}", "output": record["response"]}
    for record in data
]

# Build a Hugging Face Dataset from the list of row dicts
dataset = Dataset.from_list(rows)

# Sanity check: show the first converted example
print(dataset[0])

{'input': 'question: How can I find detailed installation and cleaning instructions for my Samsung appliance?', 'output': 'To access detailed installation and cleaning instructions for your Samsung appliance, visit the Samsung website at http://www.samsung.com. On the homepage, navigate to Support > Support home, and enter the model name of your appliance. The model name can be found on a label enclosed with the product or attached to the product itself.'}


In [None]:
# 3. Train/validation split
# Holds out 15% of the examples (exposed under the "test" key); the fixed seed
# keeps the split the same between runs.
split_dataset = dataset.train_test_split(test_size=0.15, seed=42)

In [None]:
# 4. Tokenizer and model
# Both are loaded from the same pretrained checkpoint so their vocabularies match.
base_checkpoint = "facebook/bart-base"
tokenizer = BartTokenizer.from_pretrained(base_checkpoint)
model = BartForConditionalGeneration.from_pretrained(base_checkpoint)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

In [None]:
def preprocess(example, max_input_length=512, max_target_length=64):
    """Tokenize prompt/target text into encoder inputs and decoder labels.

    Accepts either a single example or a batch (as passed by
    ``Dataset.map(..., batched=True)``).

    Args:
        example: Mapping with an ``"input"`` entry (prompt text) and an
            ``"output"`` entry (target text); each is a string or a list of
            strings.
        max_input_length: Length the prompts are truncated/padded to.
        max_target_length: Length the targets are truncated/padded to.
            Targets longer than this are cut off.

    Returns:
        The tokenizer output for the prompts with an added ``"labels"`` entry
        holding the target token ids, where padding positions are replaced
        by -100.
    """
    model_inputs = tokenizer(
        example["input"], max_length=max_input_length, truncation=True, padding="max_length"
    )
    labels = tokenizer(
        example["output"], max_length=max_target_length, truncation=True, padding="max_length"
    )

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the cross-entropy loss ignores, so padded label
        # positions no longer contribute to the training/validation loss.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels["input_ids"]
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one list of token ids per example.
        model_inputs["labels"] = [_mask_padding(seq) for seq in label_ids]
    else:
        # Single example: a flat list of token ids.
        model_inputs["labels"] = _mask_padding(label_ids)
    return model_inputs


In [None]:
# Tokenize both splits in batched mode with the same preprocessing function.
tokenized_train, tokenized_val = (
    split_dataset[split_name].map(preprocess, batched=True)
    for split_name in ("train", "test")
)

Map:   0%|          | 0/3178 [00:00<?, ? examples/s]

Map:   0%|          | 0/561 [00:00<?, ? examples/s]

In [None]:
# 5. Training arguments
# Settings are grouped in a dict first, then unpacked into the arguments object.
training_config = {
    # Checkpointing: written once per epoch, only the two most recent kept.
    "output_dir": "./bart_finetuned",
    "save_strategy": "epoch",
    "save_total_limit": 2,
    # Evaluation: once per epoch; the checkpoint with the best eval_loss is
    # reloaded when training finishes.
    "eval_strategy": "epoch",
    "predict_with_generate": True,
    "load_best_model_at_end": True,
    "metric_for_best_model": "eval_loss",
    # Optimisation hyper-parameters.
    "learning_rate": 5e-5,
    "weight_decay": 0.01,
    "num_train_epochs": 10,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    # Logging: every 10 optimisation steps.
    "logging_dir": "./logs",
    "logging_strategy": "steps",
    "logging_steps": 10,
}
training_args = Seq2SeqTrainingArguments(**training_config)

In [None]:
# 6. Trainer
# Wires the model, training arguments and tokenized splits together.
# The tokenizer is passed as `processing_class` (the `tokenizer=` keyword is
# deprecated on the trainer) so it is still saved alongside the model.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    processing_class=tokenizer,
)

  trainer = Seq2SeqTrainer(


In [None]:
# 7. Training
# Runs fine-tuning; as the last expression of the cell, the returned
# training summary is displayed as the cell output.
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmir_max[0m ([33mmir_max-ivan-franko-national-university-of-lviv[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss
1,1.0446,0.838761
2,0.6821,0.549958
3,0.4251,0.430502
4,0.3321,0.366793
5,0.2962,0.329805
6,0.2023,0.29748
7,0.2081,0.277527
8,0.1791,0.267254
9,0.1587,0.259236
10,0.1489,0.2555


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].


TrainOutput(global_step=3980, training_loss=0.4742630653944447, metrics={'train_runtime': 2530.3589, 'train_samples_per_second': 12.559, 'train_steps_per_second': 1.573, 'total_flos': 9688712321433600.0, 'train_loss': 0.4742630653944447, 'epoch': 10.0})

In [None]:
# --- Step 7: Save the model ---
# Writes the trainer's current model to ./bart_finetuned, the same directory
# that is configured as the training output_dir.
trainer.save_model("./bart_finetuned")

In [None]:
# Archive the whole output directory recursively.
# Note: the directory also holds the checkpoint-* subfolders (optimizer state,
# scheduler, etc.), so those are packed into the archive as well.
!zip -r bart_finetuned.zip bart_finetuned


  adding: bart_finetuned/ (stored 0%)
  adding: bart_finetuned/special_tokens_map.json (deflated 85%)
  adding: bart_finetuned/tokenizer_config.json (deflated 75%)
  adding: bart_finetuned/checkpoint-3582/ (stored 0%)
  adding: bart_finetuned/checkpoint-3582/special_tokens_map.json (deflated 85%)
  adding: bart_finetuned/checkpoint-3582/trainer_state.json (deflated 82%)
  adding: bart_finetuned/checkpoint-3582/tokenizer_config.json (deflated 75%)
  adding: bart_finetuned/checkpoint-3582/optimizer.pt (deflated 9%)
  adding: bart_finetuned/checkpoint-3582/generation_config.json (deflated 47%)
  adding: bart_finetuned/checkpoint-3582/rng_state.pth (deflated 25%)
  adding: bart_finetuned/checkpoint-3582/model.safetensors (deflated 8%)
  adding: bart_finetuned/checkpoint-3582/vocab.json (deflated 68%)
  adding: bart_finetuned/checkpoint-3582/training_args.bin (deflated 52%)
  adding: bart_finetuned/checkpoint-3582/merges.txt (deflated 53%)
  adding: bart_finetuned/checkpoint-3582/scheduler.

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Copy the model archive into MyDrive on the mounted Google Drive.
!cp bart_finetuned.zip /content/drive/MyDrive/

In [None]:
# Load the tokenizer and the model back from the fine-tuned output directory
finetuned_dir = "./bart_finetuned"
tokenizer = BartTokenizer.from_pretrained(finetuned_dir)
model = BartForConditionalGeneration.from_pretrained(finetuned_dir)

In [None]:
# Generate an answer for every question of a SQuAD-style evaluation set and
# collect the predictions alongside the reference answers.
#
# NOTE(review): this cell indexes `data["data"]`, i.e. it expects a dict shaped
# like {"data": [{"paragraphs": [{"context": ..., "qas": [...]}]}]}. The `data`
# loaded at the top of the notebook is a list of instruction/response records,
# so the evaluation file has to be loaded into `data` before this cell runs —
# TODO add that loading cell so the notebook survives Restart & Run All.
# NOTE(review): the prompt here includes "context: ...", while the training
# prompts in this notebook are "question: ..." only — confirm this is intended.
if not isinstance(data, dict):
    raise TypeError(
        "Expected `data` to be a SQuAD-style dict with a 'data' key, "
        f"got {type(data).__name__}; load the evaluation file first."
    )

# Longest input the model's position embeddings can handle; longer prompts
# are truncated instead of failing inside the model.
max_input_tokens = model.config.max_position_embeddings

generated_answers = []
true_answers = []
for item in data["data"]:
    for para in item["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            question = qa["question"]
            # Use the first reference answer; fall back to a placeholder when none exists.
            true_answer = qa["answers"][0]["text"] if qa["answers"] else "No answer"

            input_text = f"question: {question} context: {context}"
            # Keep the input tensors on the same device as the model.
            inputs = tokenizer(
                input_text,
                return_tensors="pt",
                truncation=True,
                max_length=max_input_tokens,
            ).to(model.device)

            outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)
            generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

            print(f"Question: {question}")
            print(f"Generated answer: {generated_answer}")
            print(f"True answer: {true_answer}")
            print("-" * 30)
            generated_answers.append(generated_answer)
            true_answers.append(true_answer)

Question: Where should the appliance be installed?
Generated answer: in a damp and dusty place
True answer: on a firm and level floor
------------------------------
Question: Where should the appliance not be installed?
Generated answer: in a damp and dusty place
True answer: in a damp and dusty place
------------------------------
Question: How do you activate Child Lock?
Generated answer: hold the Lock button for 5 seconds until the icon appears
True answer: hold the Lock button for 5 seconds until the icon appears
------------------------------
Question: How do you lock the control panel?
Generated answer: press the "Lock" button for 5 seconds until the padlock icon appears
True answer: press the "Lock" button for 5 seconds until the padlock icon appears
------------------------------
Question: Why should you lock the control panel?
Generated answer: press the "Lock" button for 5 seconds until the padlock icon appears
True answer: This prevents accidental changes to settings
-------

In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3


In [None]:
!pip install bert_score

Collecting bert_score
  Downloading bert_score-0.3.13-py3-none-any.whl.metadata (15 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.0.0->bert_score)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch>=1.0.

In [None]:
import evaluate
import numpy as np

In [None]:
# Score the generated answers against the references with BERTScore.
bertscore = evaluate.load("bertscore")

# The result holds per-example 'precision', 'recall' and 'f1' lists.
results_bert = bertscore.compute(predictions=generated_answers, references=true_answers, lang="en")
# The label names the metric actually computed (BERTScore) for the BART model.
print("BERTScore (BART):", results_bert)

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BARTScore BART: {'precision': [0.8829964399337769, 1.000000238418579, 1.0, 1.0, 0.8312719464302063, 0.7919124364852905, 1.0000001192092896, 0.9999999403953552, 0.8724387884140015, 0.9999999403953552, 0.9995234608650208, 0.8158285021781921, 1.0000001192092896, 0.978571891784668, 0.8302618265151978, 1.0, 0.9999999403953552, 0.8249311447143555, 0.7966462969779968, 0.8423172235488892, 0.905639111995697, 0.8008074760437012, 1.0, 0.8601793050765991, 0.9048842787742615, 0.8214879631996155, 0.8448960185050964, 0.8088018894195557, 0.8513212203979492, 1.0, 0.9720464944839478, 0.9584376215934753, 0.855920135974884, 0.7695980072021484, 0.9420162439346313, 0.9991136193275452, 0.9293383955955505, 0.8957834839820862, 0.9424911737442017, 1.0, 0.9999999403953552, 1.0, 0.8648266792297363, 0.9653387069702148, 0.8496404886245728, 0.9095348119735718, 0.985993504524231, 0.8094643950462341, 0.8813915848731995, 0.8649783134460449, 0.8452955484390259, 1.0000001192092896, 0.9413458704948425, 0.965149998664856, 

In [None]:
# Average each per-example BERTScore component over the whole evaluation set.
avg_precision, avg_recall, avg_f1 = (
    np.mean(results_bert[component]) for component in ("precision", "recall", "f1")
)

# Report the three averages with four decimal places.
print(f"Average Precision: {avg_precision:.4f}")
print(f"Average Recall: {avg_recall:.4f}")
print(f"Average F1: {avg_f1:.4f}")

Average Precision: 0.9246
Average Recall: 0.9283
Average F1: 0.9262
