## Loading dataset

In [1]:
from datasets import load_dataset

# Load SQuAD; the result is a DatasetDict with `train` and `validation` splits
# (displayed in the next cell). The variable shares its name with the
# `datasets` package, but only `load_dataset` is imported here, so nothing
# is shadowed in this notebook.
datasets = load_dataset("squad")

Found cached dataset squad (C:/Users/Gyanprakash/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Show the available splits with their columns and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
def tokenize_and_truncate(examples):
    """Tokenize a batch of question/context pairs into fixed-length features.

    Only the context (the second sequence) is ever truncated. A context that
    does not fit in ``max_length`` tokens is split into several overlapping
    features, with ``stride=doc_stride`` controlling the overlap, and every
    feature is padded to ``max_length``.

    Relies on the notebook-level names ``tokenizer``, ``max_length`` and
    ``doc_stride`` being defined before the function is called.

    Args:
        examples: Batch mapping with ``"question"`` and ``"context"`` lists of
            strings. The batch itself is left unmodified.

    Returns:
        The tokenizer output for the batch, which includes the
        ``overflow_to_sample_mapping`` and ``offset_mapping`` fields requested
        through ``return_overflowing_tokens`` / ``return_offsets_mapping``.
    """
    # Leading whitespace in a question only wastes token budget, so drop it.
    questions = [question.lstrip() for question in examples["question"]]
    # Contexts are passed through untouched: ``answers["answer_start"]`` is a
    # character index into the ORIGINAL context, so stripping leading
    # whitespace here would shift every offset and mislabel the answer span.
    contexts = list(examples["context"])

    return tokenizer(
        questions,
        contexts,
        truncation="only_second",
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

def extract_mappings(tokenized_examples):
    """Detach the two bookkeeping fields from a tokenized batch.

    Both ``"overflow_to_sample_mapping"`` and ``"offset_mapping"`` are removed
    from ``tokenized_examples`` (in that order) and handed back to the caller.

    Args:
        tokenized_examples: Mutable mapping holding both keys.

    Returns:
        A ``(sample_mapping, offset_mapping)`` tuple.

    Raises:
        KeyError: If either key is missing.
    """
    bookkeeping_keys = ("overflow_to_sample_mapping", "offset_mapping")
    return tuple(tokenized_examples.pop(key) for key in bookkeeping_keys)

def label_start_end_positions(examples, sample_mapping, offset_mapping, tokenized_examples=None):
    """Compute the answer's start/end token index for every tokenized feature.

    Relies on the notebook-level ``tokenizer`` for ``cls_token_id``.

    Args:
        examples: Batch providing ``"answers"``; each entry is a mapping with
            ``"answer_start"`` and ``"text"`` lists. Only the first answer of
            each entry is used.
        sample_mapping: For feature ``i``, the index into ``examples`` of the
            row the feature was produced from.
        offset_mapping: For feature ``i``, the per-token ``(start, end)``
            character offsets.
        tokenized_examples: Object providing ``["input_ids"]`` and
            ``sequence_ids(i)`` for the features. Defaults to ``examples`` so
            that existing three-argument calls behave as before (one object
            must then expose ``answers``, ``input_ids`` and ``sequence_ids``).

    Returns:
        ``(start_positions, end_positions)``: two lists with one token index
        per feature. Features whose row has no answer, or whose window does
        not fully contain the answer, are labelled with the index of the CLS
        token for both positions.
    """
    if tokenized_examples is None:
        tokenized_examples = examples

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        answers = examples["answers"][sample_mapping[i]]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token that belong to the context (sequence id 1).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        answer_in_window = (
            offsets[context_start][0] <= start_char
            and offsets[context_end][1] >= end_char
        )
        if not answer_in_window:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Both scans stay inside the context span. Special and padding tokens
        # carry (0, 0) offsets, which would otherwise satisfy the comparisons
        # and drag the start index past the context when the answer begins on
        # the last context token.
        token_start_index = context_start
        while (
            token_start_index <= context_end
            and offsets[token_start_index][0] <= start_char
        ):
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        token_end_index = context_end
        while (
            token_end_index >= context_start
            and offsets[token_end_index][1] >= end_char
        ):
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    return start_positions, end_positions

# Tokenization, mapping extraction and answer labelling.
#
# The three stages must run inside ONE batched function:
#   * `Dataset.map` only accepts functions that return a dict (or None), so
#     `extract_mappings` and `label_start_end_positions`, which return tuples,
#     cannot be mapped on their own;
#   * `sequence_ids()` is available on the tokenizer's live output only, not
#     on the columns stored by an earlier `map` call;
#   * the raw `answers` column must still be readable when labelling, so it
#     cannot be removed by a previous tokenization-only pass.
def _tokenize_and_label(examples):
    """Run tokenization, mapping extraction and labelling on one raw batch."""
    tokenized_examples = tokenize_and_truncate(examples)
    sample_mapping, offset_mapping = extract_mappings(tokenized_examples)

    # `label_start_end_positions` reads `input_ids`, `sequence_ids()` and
    # `answers` from its first argument, so the raw answers are attached to
    # the tokenizer output for the duration of the call. They are removed
    # afterwards because their length is the number of raw rows, not the
    # number of features.
    tokenized_examples["answers"] = examples["answers"]
    try:
        start_positions, end_positions = label_start_end_positions(
            tokenized_examples, sample_mapping, offset_mapping
        )
    finally:
        tokenized_examples.pop("answers", None)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


tokenized_datasets = datasets.map(
    _tokenize_and_label,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)


In [None]:
def process_example(examples):
    """Turn a batch of raw rows into tokenized features with answer labels.

    Chains ``tokenize_and_truncate``, ``extract_mappings`` and
    ``label_start_end_positions`` so that the whole preprocessing can be run
    through a single batched ``map`` call.

    Args:
        examples: Raw batch with ``"question"``, ``"context"`` and
            ``"answers"`` columns.

    Returns:
        The tokenizer output for the batch, extended with
        ``"start_positions"`` and ``"end_positions"`` (one entry per feature).
    """
    # Step 1: Tokenize and truncate
    tokenized_examples = tokenize_and_truncate(examples)

    # Step 2: Extract mappings (removes them from the tokenizer output)
    sample_mapping, offset_mapping = extract_mappings(tokenized_examples)

    # Step 3: Label start and end positions.
    # The labeller reads `input_ids`, `sequence_ids()` and `answers` from its
    # first argument. The raw batch has no `input_ids`/`sequence_ids`, so the
    # tokenizer output is passed instead, with the raw answers attached only
    # for the duration of the call (their length is the number of raw rows,
    # not the number of features, so they must not be returned as a column).
    tokenized_examples["answers"] = examples["answers"]
    try:
        start_positions, end_positions = label_start_end_positions(
            tokenized_examples, sample_mapping, offset_mapping
        )
    finally:
        tokenized_examples.pop("answers", None)

    # Combine tokenized examples with start and end positions
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Run the combined preprocessing over every split in one batched pass.
# The raw columns are dropped because a single input row can produce several
# output features (overflowing contexts are split into multiple windows).
tokenized_datasets = datasets.map(
    process_example,
    remove_columns=datasets["train"].column_names,
    batched=True,
    num_proc=3,
)
