## Loading dataset

In [1]:
from datasets import load_dataset

# Load the SQuAD dataset through Hugging Face `datasets`; the result is bound to
# the notebook-level name `datasets`, which every later cell reads.
# NOTE(review): this name matches the library's package name, so a later
# `import datasets` would rebind it.
datasets = load_dataset("squad")

Found cached dataset squad (C:/Users/Gyanprakash/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
# Display the loaded splits with their column names and row counts.
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [None]:
def tokenize_and_truncate(examples):
    """Tokenize a batch of question/context pairs into fixed-length features.

    Leading whitespace is stripped from every question and context, and the
    cleaned lists are written back into ``examples``. The pairs are then passed
    to the module-level ``tokenizer``; only the context (second sequence) may
    be truncated, and the overflow is returned as additional features that
    overlap by ``doc_stride`` tokens.

    Relies on the module-level names ``tokenizer``, ``max_length`` and
    ``doc_stride`` being defined before the function is called.

    Args:
        examples: Mapping with ``"question"`` and ``"context"`` lists of strings.

    Returns:
        Whatever ``tokenizer`` returns for the batch, including the overflow
        and offset mappings requested below.
    """
    # Strip leading whitespace in place, question column first, then context.
    for column in ("question", "context"):
        examples[column] = [text.lstrip() for text in examples[column]]

    return tokenizer(
        examples["question"],
        examples["context"],
        truncation="only_second",  # never truncate the question
        max_length=max_length,
        stride=doc_stride,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

def extract_mappings(tokenized_examples):
    """Remove and return the two bookkeeping mappings from a tokenized batch.

    Both keys are popped, so ``tokenized_examples`` no longer contains them
    afterwards.

    Args:
        tokenized_examples: Mutable mapping holding the keys
            ``"overflow_to_sample_mapping"`` and ``"offset_mapping"``.

    Returns:
        Tuple ``(sample_mapping, offset_mapping)``.

    Raises:
        KeyError: If either key is missing.
    """
    return (
        tokenized_examples.pop("overflow_to_sample_mapping"),
        tokenized_examples.pop("offset_mapping"),
    )

def label_start_end_positions(examples, sample_mapping, offset_mapping, tokenized_examples=None):
    """Compute the answer start/end token index for every tokenized feature.

    For each feature the answer's character span is taken from the first entry
    of ``answers["answer_start"]`` / ``answers["text"]`` and converted to token
    indices with the feature's offsets. A feature is labelled with the index of
    the CLS token when the example has no answer or when the answer is not
    fully contained in the feature's context tokens.

    Relies on the module-level ``tokenizer`` for ``cls_token_id``.

    Args:
        examples: Batch providing ``"answers"``. When ``tokenized_examples`` is
            omitted it must also provide ``"input_ids"`` and ``sequence_ids(i)``.
        sample_mapping: For each feature, the index of the example it was cut from.
        offset_mapping: For each feature, a list of ``(start_char, end_char)``
            pairs, one per token.
        tokenized_examples: Optional tokenizer output providing ``"input_ids"``
            and ``sequence_ids(i)``. Defaults to ``examples`` so existing
            three-argument calls behave as before.

    Returns:
        Tuple ``(start_positions, end_positions)`` of two lists with one token
        index per feature.
    """
    if tokenized_examples is None:
        tokenized_examples = examples

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = tokenized_examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = tokenized_examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        # No answer given for this example: label the feature with CLS.
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        start_char = answers["answer_start"][0]
        end_char = start_char + len(answers["text"][0])

        # First and last token whose sequence id is 1 (the second sequence,
        # i.e. the context in a question/context pair).
        context_start = 0
        while sequence_ids[context_start] != 1:
            context_start += 1
        context_end = len(input_ids) - 1
        while sequence_ids[context_end] != 1:
            context_end -= 1

        # Answer not fully inside this feature's context window: label with CLS.
        if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
            continue

        # Advance past every context token that starts at or before the answer
        # start; the token just before the stop point is the answer's first
        # token. The scan is bounded by context_end: tokens after the context
        # can carry (0, 0) offsets, which would otherwise satisfy the condition
        # and push the label to the end of the feature.
        token_start_index = context_start
        while token_start_index <= context_end and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # Mirror image for the end, bounded by context_start so the scan can
        # never leave the context tokens.
        token_end_index = context_end
        while token_end_index >= context_start and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    return start_positions, end_positions

# Tokenization, mapping extraction and answer labelling run inside ONE batched
# map call. They cannot be three separate map calls:
#   * map returns a dataset object, not a tuple that can be unpacked into lists;
#   * a function passed to map has to return a dict of columns, while
#     extract_mappings returns a tuple;
#   * once the first map has finished, the "answers" column has been removed and
#     the tokenizer output has been stored as plain columns, so the
#     sequence_ids() lookup used by label_start_end_positions is gone.
def _tokenize_and_label(examples):
    """Tokenize one batch and attach ``start_positions`` / ``end_positions``."""
    tokenized_examples = tokenize_and_truncate(examples)
    sample_mapping, offset_mapping = extract_mappings(tokenized_examples)

    # label_start_end_positions reads "input_ids", sequence_ids() and "answers"
    # from the same object, so expose the raw answers on the tokenizer output
    # for the duration of the call only.
    tokenized_examples["answers"] = examples["answers"]
    try:
        start_positions, end_positions = label_start_end_positions(
            tokenized_examples, sample_mapping, offset_mapping
        )
    finally:
        # "answers" has one entry per example, not per feature; it must not be
        # returned as a column.
        tokenized_examples.pop("answers", None)

    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions
    return tokenized_examples


# The labels end up as the "start_positions" / "end_positions" columns of
# tokenized_datasets.
tokenized_datasets = datasets.map(
    _tokenize_and_label,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)


In [None]:
def process_example(examples):
    """Tokenize a batch of examples and attach answer start/end token labels.

    Args:
        examples: Batch with ``"question"``, ``"context"`` and ``"answers"`` lists.

    Returns:
        The tokenizer output for the batch with ``"start_positions"`` and
        ``"end_positions"`` added. The overflow and offset mappings are removed
        from it by ``extract_mappings``.
    """
    # Step 1: Tokenize and truncate
    tokenized_examples = tokenize_and_truncate(examples)

    # Step 2: Extract mappings (this pops them out of tokenized_examples)
    sample_mappings, offset_mappings = extract_mappings(tokenized_examples)

    # Step 3: Label start and end positions.
    # label_start_end_positions reads "input_ids" and sequence_ids() as well as
    # "answers" from the object it is given. The raw batch has no "input_ids",
    # so passing `examples` fails; pass the tokenizer output instead, with the
    # raw answers exposed on it for the duration of the call.
    tokenized_examples["answers"] = examples["answers"]
    try:
        start_positions, end_positions = label_start_end_positions(
            tokenized_examples, sample_mappings, offset_mappings
        )
    finally:
        # "answers" has one entry per example, not per feature; drop it so the
        # returned batch only holds per-feature columns.
        tokenized_examples.pop("answers", None)

    # Combine tokenized examples with start and end positions
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Apply the combined processing function using map.
# batched=True hands process_example a batch of rows per call, and the original
# columns are dropped via remove_columns. Presumably this is needed because
# overflow handling can yield more features than input rows - confirm.
# num_proc=3 requests three worker processes.
tokenized_datasets = datasets.map(
    process_example,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)


# Experiment: class-based tokenization

In [3]:
from transformers import AutoTokenizer

In [4]:
# Name of the pretrained checkpoint whose tokenizer is loaded below.
model_checkpoint = "distilbert-base-cased"

# Notebook-level tokenizer; later cells pass it to the tokenization helper.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [5]:
class tokeniz_and_truncate_class:
    """Callable-holder that tokenizes question/context batches into fixed-length features.

    Args:
        tokenizer: Callable tokenizer used for every batch.
        max_length: Optional override for the maximum feature length. ``None``
            keeps the class default.
        doc_stride: Optional override for the overlap between consecutive
            features of one context. ``None`` keeps the class default.
    """

    # Class-level defaults; instances only shadow them when an override is
    # passed to the constructor, so reading them from the class still works.
    max_length = 384  # The maximum length of a feature (question and context)
    doc_stride = 128  # The authorized overlap between two parts of the context when splitting

    def __init__(self, tokenizer, max_length=None, doc_stride=None):
        self.tokenizer = tokenizer
        if max_length is not None:
            self.max_length = max_length
        if doc_stride is not None:
            self.doc_stride = doc_stride

    def tokenize_and_truncate(self, examples):
        """Tokenize a batch of question/context pairs.

        Leading whitespace is stripped from every question and context and the
        cleaned lists are written back into ``examples``. Only the context
        (second sequence) may be truncated; the overflow is returned as extra
        features overlapping by ``self.doc_stride`` tokens.

        Args:
            examples: Mapping with ``"question"`` and ``"context"`` lists of strings.

        Returns:
            Whatever ``self.tokenizer`` returns for the batch, including the
            overflow and offset mappings requested below.
        """
        # Strip leading whitespace from both text columns.
        examples["question"] = [q.lstrip() for q in examples["question"]]
        examples["context"] = [c.lstrip() for c in examples["context"]]
        # Tokenize the question/context pairs.
        tokenized_examples = self.tokenizer(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=self.max_length,
            stride=self.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        return tokenized_examples

In [6]:

# Wrap the notebook-level tokenizer and run the bound method over every split.
# This pass only tokenizes; no start/end position labels are added here.
tokenize_and_truncate_cls = tokeniz_and_truncate_class(tokenizer=tokenizer)
tokenized_datasets = datasets.map(
    tokenize_and_truncate_cls.tokenize_and_truncate,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)


Loading cached processed dataset at C:\Users\Gyanprakash\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-1ec38edb82d40a9d_*_of_00003.arrow
Loading cached processed dataset at C:\Users\Gyanprakash\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-279acc2c04df8249_*_of_00003.arrow


In [12]:
from pprint import pprint

In [13]:
# Pretty-print the tokenized training feature at index 2.
# NOTE(review): every list is padded to max_length, so this output is very long.
pprint(tokenized_datasets['train'][2])

{'attention_mask': [1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
                    1,
           

In [14]:
# Plain print of the same feature (index 2) as a single-line dict.
print(tokenized_datasets['train'][2])

{'input_ids': [101, 1109, 19349, 1104, 1103, 11373, 1762, 1120, 10360, 8022, 1110, 3148, 1106, 1134, 2401, 136, 102, 22182, 1193, 117, 1103, 1278, 1144, 170, 2336, 1959, 119, 1335, 4184, 1103, 4304, 4334, 112, 188, 2284, 10945, 1110, 170, 5404, 5921, 1104, 1103, 6567, 2090, 119, 13301, 1107, 1524, 1104, 1103, 4304, 4334, 1105, 4749, 1122, 117, 1110, 170, 7335, 5921, 1104, 4028, 1114, 1739, 1146, 14089, 5591, 1114, 1103, 7051, 107, 159, 21462, 1566, 24930, 2508, 152, 1306, 3965, 107, 119, 5893, 1106, 1103, 4304, 4334, 1110, 1103, 19349, 1104, 1103, 11373, 4641, 119, 13301, 1481, 1103, 171, 17506, 9538, 1110, 1103, 144, 10595, 2430, 117, 170, 14789, 1282, 1104, 8070, 1105, 9284, 119, 1135, 1110, 170, 16498, 1104, 1103, 176, 10595, 2430, 1120, 10111, 20500, 117, 1699, 1187, 1103, 6567, 2090, 25153, 1193, 1691, 1106, 2216, 17666, 6397, 3786, 1573, 25422, 13149, 1107, 8109, 119, 1335, 1103, 1322, 1104, 1103, 1514, 2797, 113, 1105, 1107, 170, 2904, 1413, 1115, 8200, 1194, 124, 11739, 1105, 1