In [1]:
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering,Trainer, TrainingArguments
import torch
import json


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("../datasets/ChatGPT/extractive/fridge_dataset_v1.0_clean.json", "r") as f:
    data = json.load(f)

In [3]:
# Перетворення під BERT-формат
rows = []
for item in data:
    context = item["context"]
    question = item["question"]
    if item["answers"]:
        answer_start = item["answers"][0]["answer_start"]
        answer = item["answers"][0]["text"]
        rows.append({
            "context": context,
            "question": question,
            "answers": {"text": [answer], "answer_start": [answer_start]},
            "is_impossible": item["is_impossible"]
        })
    else:
        # Якщо відповіді немає, можна пропустити запис або додати порожні значення
        rows.append({
            "context": context,
            "question": question,
            "answers": {"text": [""], "answer_start": [0]},
            "is_impossible": item["is_impossible"]
        })

# Створення Dataset
dataset = Dataset.from_list(rows)

# Перевірка
print(dataset[-1])



In [4]:
# 3. Тренувальний/валідаційний спліт
split_dataset = dataset.train_test_split(test_size=0.15, seed=42)

In [5]:
# 4. Токенізатор і модель
tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQue

In [23]:
def preprocess(examples):
    # Tokenize questions and contexts
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation="only_second",
        padding="max_length",
        return_tensors=None  # Remove return_tensors="pt" for batched processing
    )
    
    # Get start positions and answer texts
    start_positions = []
    end_positions = []
    
    # Process each example in the batch
    for i in range(len(examples["question"])):
        answer_start = examples["answers"][i]["answer_start"][0]
        answer_text = examples["answers"][i]["text"][0]
        
        # Get offsets for this example
        tokenized_context = tokenizer(
            examples["context"][i],
            return_offsets_mapping=True,
            max_length=512,
            truncation=True,
            padding="max_length",
        )
        
        offsets = tokenized_context["offset_mapping"]
        
        # Find start/end token indices
        start_idx = None
        end_idx = None
        
        for idx, (start, end) in enumerate(offsets):
            if start <= answer_start < end:
                start_idx = idx
                break
                
        if start_idx is not None:
            for idx, (start, end) in enumerate(offsets[start_idx:], start=start_idx):
                if end >= answer_start + len(answer_text):
                    end_idx = idx
                    break
                    
        if start_idx is None or end_idx is None:
            start_idx = 0
            end_idx = 0
            
        start_positions.append(start_idx)
        end_positions.append(end_idx)
    
    # Add start and end positions to inputs
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    
    return inputs

# Test with a single example
single_example = split_dataset["train"].select([0])
tokenized_example = single_example.map(preprocess, batched=True)

# Print result
print("Приклад токенізації для першого запису:")
print(tokenized_example[0])

Map: 100%|██████████| 1/1 [00:00<00:00, 76.02 examples/s]

Приклад токенізації для першого запису:
{'context': 'Instructions about the WEEE\nCorrect Disposal of This Product (Waste Electrical & Electronic Equipment)\n(Applicable in countries with separate collection systems)\nThis marking on the product, accessories or literature indicates that the product and its electronic accessories (e.g. charger, headset, USB cable) should not be disposed of with other household waste at the end of their working life.\nTo prevent possible harm to the environment or human health from uncontrolled waste disposal, please separate these items from other types of waste and recycle them responsibly to promote the sustainable reuse of material resources.\nHousehold users should contact either the retailer where they purchased this product, or their local government office, for details of where and how they can take these items for environmentally safe recycling.\nBusiness users should contact their supplier and check the terms and conditions of the purchase cont




In [25]:
def check_tokenization(example, start_idx, end_idx):
    # Токенізація контексту для виведення
    tokenized_context = tokenizer(
        example["context"],
        return_offsets_mapping=True,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Отримуємо токени контексту
    tokens = tokenizer.convert_ids_to_tokens(tokenized_context["input_ids"])

    # Виводимо контекст між start і end індексами
    print("Context Tokens:")
    print(tokens[start_idx:end_idx+1])  # Вивести частину токенів відповіді

# Перевірка для першого запису
check_tokenization(single_example[0], 156, 190)


Context Tokens:
['business', 'users', 'should', 'contact', 'their', 'supplier', 'and', 'check', 'the', 'terms', 'and', 'conditions', 'of', 'the', 'purchase', 'contract', '.', 'this', 'product', 'and', 'its', 'electronic', 'accessories', 'should', 'not', 'be', 'mixed', 'with', 'other', 'commercial', 'waste', '##s', 'for', 'disposal', '.']


In [None]:

tokenized_train = split_dataset["train"].map(preprocess, batched=True)
tokenized_val = split_dataset["test"].map(preprocess, batched=True)

In [16]:
# Перевірка перших 1 прикладів
for i in range(1):
    print(tokenized_train[i])


{'context': 'Instructions about the WEEE\nCorrect Disposal of This Product (Waste Electrical & Electronic Equipment)\n(Applicable in countries with separate collection systems)\nThis marking on the product, accessories or literature indicates that the product and its electronic accessories (e.g. charger, headset, USB cable) should not be disposed of with other household waste at the end of their working life.\nTo prevent possible harm to the environment or human health from uncontrolled waste disposal, please separate these items from other types of waste and recycle them responsibly to promote the sustainable reuse of material resources.\nHousehold users should contact either the retailer where they purchased this product, or their local government office, for details of where and how they can take these items for environmentally safe recycling.\nBusiness users should contact their supplier and check the terms and conditions of the purchase contract. This product and its electronic ac

In [9]:
training_args = TrainingArguments(
    output_dir="./bert_finetuned",
    eval_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=7,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=10,
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [12]:
# 6. Тренер
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    tokenizer=tokenizer,
)



  trainer = Trainer(


In [None]:
# 7. Навчання
trainer.train()

In [49]:
# --- Крок 5: Функція для обчислення метрик ---

metric = evaluate.load("squad")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    examples = split_dataset["test"]
    features = tokenized_test

    preds = postprocess_qa_predictions(examples, features, logits)

    formatted_preds = [{"id": k, "prediction_text": v} for k, v in preds.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    return metric.compute(predictions=formatted_preds, references=references)

In [50]:
# --- Крок 6: Параметри тренування ---
training_args = TrainingArguments(
    output_dir="./bert_finetuned",
    eval_strategy="steps",
    eval_steps=1,
    logging_steps=1,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [51]:
# --- Крок 7: Ініціалізація тренера ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [52]:
# --- Крок 8: Запуск тренування ---
trainer.train()

Step,Training Loss,Validation Loss,Exact Match,F1
1,4.2832,5.391767,40.0,64.636364
2,6.382,4.731651,40.0,64.636364
3,4.4938,4.244206,40.0,64.636364
4,4.7686,3.792037,10.0,57.30303
5,4.9722,3.433322,10.0,58.30303
6,3.1617,3.204514,10.0,58.508159
7,4.2071,3.043259,10.0,55.841492
8,3.1566,2.94651,10.0,55.841492
9,3.2254,2.884795,10.0,55.841492
10,3.4426,2.83982,10.0,57.919414


TrainOutput(global_step=14, training_loss=3.766544818878174, metrics={'train_runtime': 57.9011, 'train_samples_per_second': 1.796, 'train_steps_per_second': 0.242, 'total_flos': 10190941802496.0, 'train_loss': 3.766544818878174, 'epoch': 2.0})

In [53]:
print(tokenized_test.column_names)


['id', 'input_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions']


In [54]:
# --- Крок 7: Збереження моделі ---
trainer.save_model("./bert_finetuned")