In [30]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer
import json
import evaluate
import collections
import numpy as np

In [31]:
with open("samsung_refrigerator_qa.json", "r") as f:
    data = json.load(f)

In [32]:
rows = []
for item in data["data"]:
    for para in item["paragraphs"]:
        context = para["context"]
        for qa in para["qas"]:
            row = {
                "id": str(len(rows)),
                "context": context,
                "question": qa["question"],
                "answers": qa["answers"]  # список з текстом і стартом відповіді
            }
            rows.append(row)

# Створюємо Dataset
dataset = Dataset.from_list(rows)

print(dataset[0])  # Перевірка

{'id': '0', 'context': 'Install the appliance on a firm and level floor. Do not install the appliance in a damp and dusty place. Do not install or store the appliance in any outdoor area, or any area that is subject to weathering conditions such as direct sunlight, wind, rain, or temperatures below freezing.', 'question': 'Where should the appliance be installed?', 'answers': [{'answer_start': 24, 'text': 'on a firm and level floor'}]}


In [33]:
# --- Крок 2: Ініціалізація токенізатора та моделі ---
model_name = "distilbert-base-uncased-distilled-squad"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [44]:
def prepare_train_features(examples):
    questions = examples["question"]
    contexts = examples["context"]
    answers = examples["answers"]

    tokenized_examples = tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=384,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")

    # Додаємо example_id, щоб потім зв'язати з оригіналом
    tokenized_examples["example_id"] = []

    # examples['id'] — список списків, розгорнемо в один список
    flat_ids = examples["id"]
    for i in range(len(tokenized_examples["input_ids"])):
        tokenized_examples["example_id"].append(flat_ids[sample_mapping[i]])

        
    offset_mapping = tokenized_examples["offset_mapping"]


    start_positions = []
    end_positions = []

    # flatten answers аналогічно
    flat_answers = [item for sublist in examples["answers"] for item in sublist]

    for i, offsets in enumerate(offset_mapping):
        example_id = tokenized_examples["example_id"][i]
        # example_id — це id прикладу, знаходимо його індекс у flat_ids
        sample_index = flat_ids.index(example_id)
        answer_list = answers[sample_index]  # це список словників

        if len(answer_list) == 0:
            start_positions.append(tokenizer.model_max_length)
            end_positions.append(tokenizer.model_max_length)
        else:
            # беремо першу відповідь (якщо їх кілька)
            answer = answer_list[0]
            start_char = answer["answer_start"]
            end_char = start_char + len(answer["text"])

            sequence_ids = tokenized_examples.sequence_ids(i)

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(offsets) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(tokenizer.model_max_length)
                end_positions.append(tokenizer.model_max_length)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples


In [45]:
# 1. Спочатку робиш розбиття на train/test для оригінального (сирого) датасету:
split_dataset = dataset.train_test_split(test_size=0.15, seed=42)

tokenized_train = split_dataset["train"].map(
    prepare_train_features,
    batched=True,
    remove_columns=[col for col in split_dataset["train"].column_names if col not in ("id", "example_id")]
)

tokenized_test = split_dataset["test"].map(
    prepare_train_features,
    batched=True,
    remove_columns=[col for col in split_dataset["test"].column_names if col not in ("id", "example_id")]
)



Map:   0%|          | 0/52 [00:00<?, ? examples/s]

Map: 100%|██████████| 52/52 [00:00<00:00, 1243.23 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 728.44 examples/s]


In [46]:
# Припустимо, examples і features — це об'єкти datasets.Dataset
examples = split_dataset["test"]          # Оригінальні приклади (мають id)
features = tokenized_test                  # Токенізовані фічі (мають example_id)

# Побудова словника для швидкого пошуку індексів
example_id_to_index = {str(k): i for i, k in enumerate(examples["id"])}

# Перевірка, чи всі example_id з features є в examples
missing_ids = set()
for feature in features:
    eid = str(feature["example_id"])
    if eid not in example_id_to_index:
        missing_ids.add(eid)

if missing_ids:
    print(f"Відсутні example_id у examples: {missing_ids}")
else:
    print("Всі example_id з features присутні у examples")

Всі example_id з features присутні у examples


In [47]:
print(tokenized_test.column_names)


['id', 'input_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions']


In [48]:
# --- Крок 4: Постпроцесинг для обчислення текстових відповідей ---

def postprocess_qa_predictions(examples, features, raw_predictions, n_best_size=20, max_answer_length=30):
    all_start_logits, all_end_logits = raw_predictions

    example_id_to_index = {str(k): i for i, k in enumerate(examples["id"])}  # <- сюди додано str()

    features_per_example = collections.defaultdict(list)
    for i, feature in enumerate(features):
        example_id = str(feature["example_id"])  # теж приводимо до рядка
        if example_id not in example_id_to_index:
            print(f"Missing example_id in dict: {example_id}")
        features_per_example[example_id_to_index[example_id]].append(i)

    predictions = collections.OrderedDict()

    for example_index, example in enumerate(examples):
        context = example["context"]
        feature_indices = features_per_example[example_index]

        prelim_predictions = []

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offset_mapping = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logits)[-1: -n_best_size - 1: -1].tolist()
            end_indexes = np.argsort(end_logits)[-1: -n_best_size - 1: -1].tolist()

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(offset_mapping) or end_index >= len(offset_mapping):
                        continue
                    if offset_mapping[start_index] is None or offset_mapping[end_index] is None:
                        continue
                    if end_index < start_index or (end_index - start_index + 1) > max_answer_length:
                        continue

                    start_char = offset_mapping[start_index][0]
                    end_char = offset_mapping[end_index][1]
                    text = context[start_char:end_char]

                    prelim_predictions.append({
                        "text": text,
                        "start_logit": start_logits[start_index],
                        "end_logit": end_logits[end_index]
                    })

        if prelim_predictions:
            best_pred = max(prelim_predictions, key=lambda x: x["start_logit"] + x["end_logit"])
            predictions[example["id"]] = best_pred["text"]
        else:
            predictions[example["id"]] = ""

    return predictions

In [49]:
# --- Крок 5: Функція для обчислення метрик ---

metric = evaluate.load("squad")

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    examples = split_dataset["test"]
    features = tokenized_test

    preds = postprocess_qa_predictions(examples, features, logits)

    formatted_preds = [{"id": k, "prediction_text": v} for k, v in preds.items()]
    references = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]

    return metric.compute(predictions=formatted_preds, references=references)

In [50]:
# --- Крок 6: Параметри тренування ---
training_args = TrainingArguments(
    output_dir="./bert_finetuned",
    eval_strategy="steps",
    eval_steps=1,
    logging_steps=1,
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

In [51]:
# --- Крок 7: Ініціалізація тренера ---
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_test,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [52]:
# --- Крок 8: Запуск тренування ---
trainer.train()

Step,Training Loss,Validation Loss,Exact Match,F1
1,4.2832,5.391767,40.0,64.636364
2,6.382,4.731651,40.0,64.636364
3,4.4938,4.244206,40.0,64.636364
4,4.7686,3.792037,10.0,57.30303
5,4.9722,3.433322,10.0,58.30303
6,3.1617,3.204514,10.0,58.508159
7,4.2071,3.043259,10.0,55.841492
8,3.1566,2.94651,10.0,55.841492
9,3.2254,2.884795,10.0,55.841492
10,3.4426,2.83982,10.0,57.919414


TrainOutput(global_step=14, training_loss=3.766544818878174, metrics={'train_runtime': 57.9011, 'train_samples_per_second': 1.796, 'train_steps_per_second': 0.242, 'total_flos': 10190941802496.0, 'train_loss': 3.766544818878174, 'epoch': 2.0})

In [53]:
print(tokenized_test.column_names)


['id', 'input_ids', 'attention_mask', 'offset_mapping', 'example_id', 'start_positions', 'end_positions']


In [54]:
# --- Крок 7: Збереження моделі ---
trainer.save_model("./bert_finetuned")