In [1]:
from transformers import BartForConditionalGeneration, BartTokenizer
from datasets import Dataset, DatasetDict
import json
from dotenv import load_dotenv
import os
import torch



  from .autonotebook import tqdm as notebook_tqdm


In [2]:
load_dotenv()
DATASET = os.getenv('OUTPUT_JSON_CLEANED')

In [3]:
with open(DATASET, "r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Перетворення під BART-формат
rows = []
for item in data:
    question = item["instruction"]
    answer = item["response"]
    rows.append({
        "input": f"question: {question}",
        "output": answer
    })

# Створення Dataset
dataset = Dataset.from_list(rows)

print("example:", dataset[0])

example: {'input': 'question: How can I find detailed installation and cleaning instructions for my Samsung appliance?', 'output': 'To access detailed installation and cleaning instructions for your Samsung appliance, visit the Samsung website at http://www.samsung.com. On the homepage, navigate to Support > Support home, and enter the model name of your appliance. The model name can be found on a label enclosed with the product or attached to the product itself.'}


In [5]:
split_dataset = dataset.train_test_split(test_size=0.15, seed=42)

In [9]:
model_path = "./bart_finetuned"

FT = False
if FT:
    tokenizer = BartTokenizer.from_pretrained(model_path)
    model = BartForConditionalGeneration.from_pretrained(model_path)
else:
    tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
    model = BartForConditionalGeneration.from_pretrained("facebook/bart-base")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BartForConditionalGeneration(
  (model): BartModel(
    (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0-5): 6 x BartEncoderLayer(
          (self_attn): BartSdpaAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_lay

In [7]:
def generate_response(instruction):
    input_text = f"question: {instruction}"  # якщо саме так навчав, або просто instruction
    inputs = tokenizer(
    input_text,
    return_tensors="pt",
    truncation=True,
    padding="max_length",
    max_length=512
    )

    # Перекидаємо input_ids та attention_mask на GPU
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output_ids = model.generate(**inputs, max_length=64)
    return tokenizer.decode(output_ids[0], skip_special_tokens=True)


In [11]:
# Вивід перших прикладів
for example in split_dataset["test"].select(range(5)):
    instr = example["input"]
    real = example["output"]
    gen = generate_response(instr)

    print("Instruction:", instr)
    print("Expected:", real)
    print("Generated:", gen)
    print("-" * 50)

Instruction: question: What are the advantages of having a correctly-grounded multiple-socket power strip?
Expected: A properly-grounded multi-socket outlet provides better protection against electrical shocks and ensures the safe operation of your appliances. It helps maintain a stable electrical flow, preventing potential damage from power surges or ground faults.
Generated: question: question: What are the advantages of having a correctly-grounded multiple-socket power strip?
--------------------------------------------------
Instruction: question: What are the positive aspects of employing a well-grounded multi-socket outlet?
Expected:  A multi-socket outlet with proper grounding offers enhanced protection against electrical shocks and guarantees the safe functioning of your appliances. It aids in sustaining a consistent electrical flow, safeguarding against possible harm from power surges or ground faults.
Generated: question: question: What are the positive aspects of employing a

In [12]:
print(generate_response("How can I make a salad?"))
print(generate_response("What is the capital of France?"))
print(generate_response("Explain the theory of relativity in simple terms."))
print(generate_response("What are the benefits of regular exercise?"))
print(generate_response("What is the process of photosynthesis?"))

question: How can I make a salad?
question: What is the capital of France?
question: Explain the theory of relativity in simple terms.
question: What are the benefits of regular exercise?
question: What is the process of photosynthesis?


In [13]:
from bert_score import score

test_samples = split_dataset["test"]

references = [ex["output"] for ex in test_samples]
candidates = [generate_response(ex["input"]) for ex in test_samples]

P, R, F1 = score(
    candidates,
    references,
    lang="en",
    model_type="bert-base-uncased",
    device="cuda",
    batch_size=32
)

print(f"Precision: {P.mean().item():.4f}")
print(f"Recall:    {R.mean().item():.4f}")
print(f"F1 Score:  {F1.mean().item():.4f}")


Precision: 0.5490
Recall:    0.4854
F1 Score:  0.5145
