In [2]:
from datasets import Dataset
from transformers import BartTokenizer, BartForConditionalGeneration
from sentence_transformers import SentenceTransformer, util
from trl import AutoModelForCausalLMWithValueHead, PPOTrainer, PPOConfig
from sklearn.model_selection import train_test_split
import random
import torch
from torch.optim import AdamW
import json
import gc

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# === 1. Налаштування ===
device = "cuda" if torch.cuda.is_available() else "cpu"
model_path = "../models/bart_finetuned_ChatGPT"
print("Using device:", device)

Using device: cuda


In [4]:
# === 2. Завантаження моделей ===
print(model_path)
tokenizer = BartTokenizer.from_pretrained(model_path)
model = BartForConditionalGeneration.from_pretrained(model_path).to(device)
reward_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=device)
optimizer = AdamW(model.parameters(), lr=5e-6)

../models/bart_finetuned_ChatGPT


In [6]:
# === 3. Завантаження та підготовка датасету ===
with open("./grouped_qas.json", "r", encoding="utf-8") as f:
    data = json.load(f)
dataset = Dataset.from_list(data)
print("Dataset loaded with", len(dataset), "examples.")

Dataset loaded with 957 examples.


In [7]:
# === 4. Генерація кількох відповідей ===
def generate_n_responses(prompt, n=4, max_tokens=64):
    model.eval()
    with torch.no_grad():
        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        outputs = model.generate(
            input_ids.repeat(n, 1),
            do_sample=True,
            top_k=50,
            top_p=0.95,
            max_new_tokens=max_tokens,
            num_return_sequences=n
        )
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

In [8]:
# === 5. Reward модель: середнє cosine similarity ===
def score_with_reward_model(gen_response, reference_responses):
    with torch.no_grad():
        all = [gen_response] + reference_responses
        embeddings = reward_model.encode(all, convert_to_tensor=True)
        sim = util.cos_sim(embeddings[0], embeddings[1:])[0]
    return torch.mean(sim).item()

In [None]:
# === 6. GRPO тренувальний цикл ===
def grpo_train_loop(data, n_epochs=1, n_responses=4):
    model.train()
    for epoch in range(n_epochs):
        print(f"\n--- Epoch {epoch+1}/{n_epochs} ---")
        random.shuffle(data)

        for i, example in enumerate(data):
            prompt = f"question: {example['input']}"
            gt_responses = example["output"]

            # Генерація кількох відповідей для кожного питання
            generated = generate_n_responses(prompt, n=n_responses)
            rewards = [score_with_reward_model(gen, gt_responses) for gen in generated]
            avg_reward = sum(rewards) / len(rewards)
            advantages = [r - avg_reward for r in rewards]

            for gen_text, advantage in zip(generated, advantages):
                if advantage <= 0:
                    continue  # лише позитивні відповіді

                input_ids = tokenizer(prompt, return_tensors="pt", truncation=True, padding=True).input_ids.to(device)
                output_ids = tokenizer(gen_text, return_tensors="pt", truncation=True, padding=True).input_ids.to(device)

                outputs = model(input_ids=input_ids, labels=output_ids)
                loss = outputs.loss
                weighted_loss = loss * (-advantage)
                weighted_loss.backward()

            optimizer.step()
            optimizer.zero_grad()
            torch.cuda.empty_cache()
            gc.collect()

            if i % 10 == 0:
                print(f"[{i}/{len(data)}] Avg reward: {avg_reward:.4f} | Sample gen: {generated[0]}")


In [14]:
# === 7. Запуск GRPO тренування ===
grpo_train_loop(dataset.to_list(), n_epochs=3, n_responses=4)


--- Epoch 1/3 ---


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


[0/957] Avg reward: 0.9247 | Sample gen: When storing large quantities of food like soup or stew in the refrigerator, divide them into smaller portions and use shallow containers. In the same way, large meat cuts or whole birds should be portioned or placed in shallow dishes. This practice ensures moisture retention, prevents odor contamination, and allows the food to cool evenly
[2/957] Avg reward: 0.9324 | Sample gen: I apologize, but I am a refrigerator assistant and cannot help with cooking recipes.
[4/957] Avg reward: 0.8929 | Sample gen: Home users are advised to either get in touch with the store where they bought the product or consult their local government office to learn about environmentally responsible recycling options. This approach helps ensure that electronic items are discarded in a way that meets environmental protection guidelines.
[6/957] Avg reward: 0.9198 | Sample gen: Proper grounding ensures electrical safety. The appliance must be connected to a grounded outle

KeyboardInterrupt: 

In [None]:
# === 7. Збереження моделі і оптимізатора ===
def save_model_and_optimizer(model, optimizer, model_path="./bart_GPT_GRPO", optimizer_path="./optimizer.pth"):
    # Зберігаємо модель
    torch.save(model.state_dict(), model_path)
    
    # Зберігаємо оптимізатор
    torch.save(optimizer.state_dict(), optimizer_path)

    print(f"Model and optimizer saved to {model_path} and {optimizer_path}")

In [None]:
save_model_and_optimizer(model, optimizer)

In [None]:
# Завантажуємо токенізатор і модель
tokenizer = BartTokenizer.from_pretrained("./bart_finetuned")
model = BartForConditionalGeneration.from_pretrained("./bart_finetuned")

In [None]:
generated_answers = []
true_answers = []
for item in data["data"]:
    for para in item["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            question = qa["question"]
            true_answer = qa["answers"][0]["text"] if qa["answers"] else "No answer"

            input_text = f"question: {question} context: {context}"
            inputs = tokenizer(input_text, return_tensors="pt")

            outputs = model.generate(**inputs, max_length=50, num_beams=4, early_stopping=True)
            generated_answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

            print(f"Question: {question}")
            print(f"Generated answer: {generated_answer}")
            print(f"True answer: {true_answer}")
            print("-" * 30)
            generated_answers.append(generated_answer)
            true_answers.append(true_answer)

Question: Where should the appliance be installed?
Generated answer: in a damp and dusty place
True answer: on a firm and level floor
------------------------------
Question: Where should the appliance not be installed?
Generated answer: in a damp and dusty place
True answer: in a damp and dusty place
------------------------------
Question: How do you activate Child Lock?
Generated answer: hold the Lock button for 5 seconds until the icon appears
True answer: hold the Lock button for 5 seconds until the icon appears
------------------------------
Question: How do you lock the control panel?
Generated answer: press the "Lock" button for 5 seconds until the padlock icon appears
True answer: press the "Lock" button for 5 seconds until the padlock icon appears
------------------------------
Question: Why should you lock the control panel?
Generated answer: press the "Lock" button for 5 seconds until the padlock icon appears
True answer: This prevents accidental changes to settings
-------