In [1]:
from datasets import Dataset
from transformers import BertTokenizerFast, BertForQuestionAnswering,Trainer, TrainingArguments
import torch
import json
import unicodedata


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))


True
NVIDIA GeForce RTX 4060 Laptop GPU


In [3]:
with open("../datasets/ChatGPT/extractive/fridge_dataset_v1.1_clean.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Перетворення під BERT-формат
rows = []
for item in data:
    context = item["context"]
    question = item["question"]
    if item["answers"]:
        answer_start = item["answers"][0]["answer_start"]
        answer = item["answers"][0]["text"]
        rows.append({
            "context": context,
            "question": question,
            "answers": {"text": [answer], "answer_start": [answer_start]},
            "is_impossible": item["is_impossible"]
        })
    else:
        # Якщо відповіді немає, можна пропустити запис або додати порожні значення
        rows.append({
            "context": context,
            "question": question,
            "answers": {"text": [""], "answer_start": [0]},
            "is_impossible": item["is_impossible"]
        })

# Створення Dataset
dataset = Dataset.from_list(rows)

# Перевірка
print(dataset[-1])



In [5]:
# 3. Тренувальний/валідаційний спліт
split_dataset = dataset.train_test_split(test_size=0.15, seed=42)

In [23]:
tokenizer = BertTokenizerFast.from_pretrained("../models/bert_best_2")
# tokenizer = BertTokenizerFast.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")
model = BertForQuestionAnswering.from_pretrained("../models/bert_best_2")

In [70]:
def preprocess(examples):
    inputs = tokenizer(
        examples["question"],
        examples["context"],
        max_length=512,
        truncation="only_second",
        padding="max_length",
        return_offsets_mapping=True,
        return_tensors=None
    )

    start_positions = []
    end_positions = []

    for i in range(len(examples["question"])):
        answer = examples["answers"][i]
        answer_text = answer["text"][0].strip()
        answer_start = answer["answer_start"][0]
        answer_end = answer_start + len(answer_text)
        context = examples["context"][i]
        
        # Verify exact match
        assert context[answer_start:answer_end] == answer_text, \
            f"Answer mismatch at {answer_start}: '{context[answer_start:answer_end]}' != '{answer_text}'"

        # Get token positions
        offset_mapping = inputs["offset_mapping"][i]
        sequence_ids = inputs.sequence_ids(i)
        
        # Find context span
        context_start = sequence_ids.index(1)
        context_end = len(sequence_ids) - 2
        
        # Initialize positions
        start_idx = end_idx = 0
        
        # Get the exact answer tokens
        answer_tokens = tokenizer.tokenize(answer_text)
        answer_ids = tokenizer.convert_tokens_to_ids(answer_tokens)
        
        # Get context tokens as IDs
        context_ids = inputs["input_ids"][i][context_start:context_end+1]
        
        # Convert to tokens for debugging
        context_tokens = tokenizer.convert_ids_to_tokens(context_ids)
        
        # Search for answer token sequence in context
        for j in range(len(context_ids) - len(answer_ids) + 1):
            if context_ids[j:j+len(answer_ids)] == answer_ids:
                start_idx = context_start + j
                end_idx = start_idx + len(answer_ids) - 1
                break
        
        # If still not found, try character-based matching as fallback
        if start_idx == 0 and end_idx == 0:
            for idx in range(context_start, context_end + 1):
                if offset_mapping[idx][0] <= answer_start < offset_mapping[idx][1]:
                    start_idx = idx
                    break
                    
            for idx in range(start_idx, context_end + 1):
                if offset_mapping[idx][0] < answer_end <= offset_mapping[idx][1]:
                    end_idx = idx
                    break
        
        # Final verification
        if start_idx != 0 or end_idx != 0:
            predicted_answer = tokenizer.decode(inputs["input_ids"][i][start_idx:end_idx+1])
            if predicted_answer.lower() != answer_text.lower():
                print(f"Prediction mismatch: '{predicted_answer}' != '{answer_text}'")
                start_idx = end_idx = 0
        
        start_positions.append(start_idx)
        end_positions.append(end_idx)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [38]:
from transformers import BertTokenizerFast

# Повне перезавантаження токенізатора з новими спеціальними токенами
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
special_tokens = ["appliance doors", "freezer compartment", "storage room", "⚠️", "•", "appliance"]
tokenizer.add_tokens(special_tokens)

model.resize_token_embeddings(len(tokenizer))

# Після ініціалізації токенізатора
test_example = split_dataset["train"].select([3, 6])
tokenized_test = test_example.map(preprocess, batched=True)

for i in range(2):
    print(f"\nResult {i}:")
    print("Original:", test_example[i]["answers"]["text"][0])
    if tokenized_test[i]["start_positions"] > 0:
        answer_tokens = tokenized_test[i]["input_ids"][
            tokenized_test[i]["start_positions"]:tokenized_test[i]["end_positions"]+1
        ]
        print("Tokenized:", tokenizer.decode(answer_tokens))
    else:
        print("Tokenized: [NOT FOUND]")

Map: 100%|██████████| 2/2 [00:00<00:00, 133.89 examples/s]


Result 0:
Original: Do not hang from the appliance doors, storage room, shelf or climb up into it.
Tokenized: do not install the appliance in a damp and dusty place.

Result 1:
Original: Never eat frozen foods immediately after they have been taken out in the freezer compartment.
Tokenized: never start up an appliance showing any signs of damage.





In [71]:
# Initialize tokenizer with special tokens
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
special_tokens = ["appliance doors", "freezer compartment", "storage room"]
tokenizer.add_tokens(special_tokens)

# Test with your problematic examples
test_example = split_dataset["train"].select([3, 6])
tokenized_test = test_example.map(preprocess, batched=True)

for i in range(2):
    print(f"\nExample {i}:")
    print("Original:", test_example[i]["answers"]["text"][0])
    answer_tokens = tokenized_test[i]["input_ids"][
        tokenized_test[i]["start_positions"]:tokenized_test[i]["end_positions"]+1
    ]
    print("Tokenized:", tokenizer.decode(answer_tokens))

Map: 100%|██████████| 2/2 [00:00<00:00, 314.85 examples/s]


Example 0:
Original: Do not hang from the appliance doors, storage room, shelf or climb up into it.
Tokenized: [CLS]

Example 1:
Original: Never eat frozen foods immediately after they have been taken out in the freezer compartment.
Tokenized: [CLS]





In [65]:
# Test with your problematic examples
test_examples = [
    {
        "context": "Installation\n• This appliance should only be transported by two or more people holding the appliance securely.\n• Install the appliance on a firm and level floor.\n• Do not install the appliance in a damp and dusty place. Do not install or store the appliance in any outdoor area, or any area that is subject to weathering conditions such as direct sunlight, wind, rain, or temperatures below freezing.\n• Do not place the appliance in direct sunlight or expose it to the heat from heating appliances such as stoves or heaters.\n• Be careful not to expose the rear of the appliance when installing.\n• Install the appliance in a place where it is easy to unplug the power plug of the appliance.\n• Be careful not to let the appliance door fall during assembly or disassembly.\n• Be careful not to pinch, crush, or damage the power cable during assembly or disassembly of the appliance door.\n• Be careful not to point the power plug up or let the appliance lean against the power plug.\n• Do not connect a plug adapter or other accessories to the power plug.\n• Do not modify or extend the power cable.\n• Ensure that the outlet socket is properly grounded, and that the earth pin on the power cord is not damaged or removed from the power plug. For more details on grounding, inquire at the Samsung Electronics service centre.\n• This appliance is equipped with a power cord having an equipment-grounding conductor and a grounding power plug. The power plug must be plugged into an appropriate outlet socket that is installed and grounded in accordance with all local codes and ordinances.\n• Never start up an appliance showing any signs of damage. If in doubt, consult your dealer.\n• Do not plug the appliance into a multi-socket adapter which does not have a power cable (mounted).\n• The appliance should be connected to a dedicated power line which is separately fused.\n• Do not use a multi-socket outlet which is not properly grounded (portable). In case of using a properly-grounded multi-socket (portable), use the multi-socket outlet with the current capacity of the power cord rating or higher and use the multi-socket outlet only for the appliance.\n• Dispose of all packaging materials (such as plastic bags and styrofoam) away from children. The packaging materials can cause suffocation.\nOperation\n⚠ CAUTION\nTo reduce the risk of minor injury to persons, malfunction, or damage to the product or property when using this product, follow basic precaution, including the following:\n• Be careful of nearby children when you open or close the appliance door, The door may bump the child and cause injury.\n• Avoid the danger of children getting trapped inside the appliance. A child trapped inside the appliance can cause suffocation. Do not touch frozen food or the metal parts in the freezer compartment with wet or damp hands. It may cause frostbite.\n• Do not place glass containers, bottles or cans (especially those containing carbonated drinks) in the freezer compartment, shelves or ice bin that will be exposed to temperatures below freezing.\n• The tempered glass on the front side of the appliance door or the shelves can be damaged by an impact. If it is broken. Do not touch it with hands.\n• Do not hang from the appliance doors, storage room, shelf or climb up into it.\n• Do not store an excessive amount of water bottles or containers for side dishes on the door baskets.\n• Do not open or close the appliance door with excessive force.\n• If the hinge of the appliance door is damaged or operates improperly, stop using the appliance and contact an authorized service centre.\n• Prevent animals from nibbling on the power cable or water hose.\n• Never eat frozen foods immediately after they have been taken out in the freezer compartment.\n• Make sure not to get a hand or foot stuck upon opening or closing the appliance door or door in door.\nMaintenance\n• Do not clean glass shelves or covers with warm water when they are cold. They may shatter if exposed to sudden temperature changes.\n• Do not insert the shelves upside down. The shelves may fall.\n• To remove frost from the appliance, contact the Samsung Electronics service centre.\n• Dispose of the ice inside the ice bin in the freezer compartment during an extended power outage.",
        "question": "About hanging from doors",
        "answers": {
            "text": ["Do not hang from the appliance doors, storage room, shelf or climb up into it."],
            "answer_start": [3198]
        }
    }
]

tokenized_test = Dataset.from_list(test_examples).map(preprocess, batched=True)
print(tokenizer.decode(tokenized_test[0]["input_ids"][
    tokenized_test[0]["start_positions"]:tokenized_test[0]["end_positions"]+1
]))

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

Map: 100%|██████████| 1/1 [00:00<00:00, 67.09 examples/s]

[CLS]





In [47]:
# Run this verification script on your dataset
for i, example in enumerate(dataset):
    context = example["context"]
    answer = example["answers"]["text"][0]
    start = example["answers"]["answer_start"][0]
    end = start + len(answer)
    
    if context[start:end] != answer:
        print(f"\nMismatch in example {i}:")
        print(f"Question: {example['question']}")
        print(f"Expected: '{answer}'")
        print(f"Actual:   '{context[start:end]}'")
        
        # Find the correct position
        correct_pos = context.find(answer)
        if correct_pos != -1:
            print(f"Correct position should be: {correct_pos}")
            # Update the dataset
            dataset[i]["answers"]["answer_start"][0] = correct_pos
        else:
            print("Correct answer not found in context!")
            # Handle this case (either remove example or fix answer text)


Mismatch in example 13:
Question: Can you outline the steps for cleaning the inside and outside of a refrigerator?
Expected: '1. Unplug the power cord. 2. Use a moistened, soft, lint-free cloth or paper towel to clean the refrigerator’s interior and exterior. 3. When done, use a dry cloth or paper towel to dry well. 4. Plug in the power cord.'
Actual:   '1. Unplug the power cord.
2. Use a moistened, soft, lint-free cloth or paper towel to clean the refrigerator’s interior and exterior.
3. When done, use a dry cloth or paper towel to dry well.
4. Plug in the power cord.'
Correct answer not found in context!

Mismatch in example 84:
Question: What is the minimum timeframe during which necessary repair parts for the appliance remain accessible?
Expected: 'The minimum period during which spare parts, necessary for the repair of the appliance, are available - 7 Years thermostats, temperature sensors, printed circuit boards and light sources, door handles, door hinges, trays, baskets (boxes

In [36]:
i = 1  # або 6
inputs = tokenizer(
    test_example[i]["question"],
    test_example[i]["context"],
    return_offsets_mapping=True,
)
for idx, (token, (start, end)) in enumerate(zip(inputs.tokens(), inputs["offset_mapping"])):
    print(f"{idx}: {token} -> {start}-{end}")

Token indices sequence length is longer than the specified maximum sequence length for this model (888 > 512). Running this sequence through the model will result in indexing errors


0: [CLS] -> 0-0
1: should -> 0-6
2: you -> 7-10
3: wait -> 11-15
4: before -> 16-22
5: eating -> 23-29
6: foods -> 30-35
7: that -> 36-40
8: have -> 41-45
9: just -> 46-50
10: been -> 51-55
11: removed -> 56-63
12: from -> 64-68
13: the -> 69-72
14: freeze -> 73-79
15: ##r -> 79-80
16: ? -> 80-81
17: [SEP] -> 0-0
18: installation -> 0-12
19: • -> 13-14
20: this -> 15-19
21: appliance -> 20-29
22: should -> 30-36
23: only -> 37-41
24: be -> 42-44
25: transported -> 45-56
26: by -> 57-59
27: two -> 60-63
28: or -> 64-66
29: more -> 67-71
30: people -> 72-78
31: holding -> 79-86
32: the -> 87-90
33: appliance -> 91-100
34: securely -> 101-109
35: . -> 109-110
36: • -> 111-112
37: install -> 113-120
38: the -> 121-124
39: appliance -> 125-134
40: on -> 135-137
41: a -> 138-139
42: firm -> 140-144
43: and -> 145-148
44: level -> 149-154
45: floor -> 155-160
46: . -> 160-161
47: • -> 162-163
48: do -> 164-166
49: not -> 167-170
50: install -> 171-178
51: the -> 179-182
52: appliance -> 183-1

In [125]:
def check_tokenization(example, start_idx, end_idx):
    # Токенізація контексту для виведення
    tokenized_context = tokenizer(
        example["context"],
        return_offsets_mapping=True,
        max_length=512,
        truncation=True,
        padding="max_length"
    )

    # Отримуємо токени контексту
    tokens = tokenizer.convert_ids_to_tokens(tokenized_context["input_ids"])

    # Виводимо контекст між start і end індексами
    print("Context Tokens:")
    print(tokens[start_idx:end_idx+1])  # Вивести частину токенів відповіді

# Перевірка для першого запису
check_tokenization(single_example[0], 156, 190)
print("--" * 50)
check_tokenization(single_example[9], 129, 142)


Context Tokens:
['business', 'users', 'should', 'contact', 'their', 'supplier', 'and', 'check', 'the', 'terms', 'and', 'conditions', 'of', 'the', 'purchase', 'contract', '.', 'this', 'product', 'and', 'its', 'electronic', 'accessories', 'should', 'not', 'be', 'mixed', 'with', 'other', 'commercial', 'waste', '##s', 'for', 'disposal', '.']
----------------------------------------------------------------------------------------------------
Context Tokens:
['in', 'the', 'uk', '.', 'frequency', 'range', '|', 'transmitter', 'power', '(', 'max', ')', '-', '-']
