## Loading dataset

In [1]:
from pprint import pprint

In [2]:
from datasets import load_dataset

datasets = load_dataset("squad")

Found cached dataset squad (C:/Users/Gyanprakash/.cache/huggingface/datasets/squad/plain_text/1.0.0/d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453)


  0%|          | 0/2 [00:00<?, ?it/s]

In [3]:
datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

## Preprocessing the training data


In [4]:
from transformers import AutoTokenizer

In [5]:
pprint(datasets)

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})


In [6]:
model_checkpoint = "distilbert-base-cased"

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [7]:
pprint(tokenizer)

DistilBertTokenizerFast(name_or_path='distilbert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [8]:
max_length = 384  # The maximum length of a feature (question and context)
doc_stride = (
    128  # The authorized overlap between two part of the context when splitting
) # it is needed.


In [9]:
max_length

384

In [12]:
class tokeniz_and_truncate_class:
    def __init__(self,tokenizer):
        self.tokenizer = tokenizer

    max_length = 384  # The maximum length of a feature (question and context)
    doc_stride = (
        128  # The authorized overlap between two part of the context when splitting
    ) # it is needed.


    def tokenize_and_truncate(self,examples):
        # removing whitespace from this.
        examples["question"] = [q.lstrip() for q in examples["question"]]
        examples["context"] = [c.lstrip() for c in examples["context"]]
        # tokenizing the word's.......
        tokenized_examples = self.tokenizer(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=self.max_length,
            stride=self.doc_stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length",
        )
        return tokenized_examples

In [13]:

tokenize_and_truncate_cls = tokeniz_and_truncate_class(tokenizer=tokenizer)
tokenized_datasets = datasets.map(
    tokenize_and_truncate_cls.tokenize_and_truncate,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)


Loading cached processed dataset at C:\Users\Gyanprakash\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-1ec38edb82d40a9d_*_of_00003.arrow
Loading cached processed dataset at C:\Users\Gyanprakash\.cache\huggingface\datasets\squad\plain_text\1.0.0\d6ec3ceb99ca480ce37cdd35555d6cb2511d223b9150cce08a837ef62ffea453\cache-279acc2c04df8249_*_of_00003.arrow


In [16]:
def extract_mappings(tokenized_examples):
    sample_mapping = tokenized_examples.pop("overflow_to_sample_mapping")
    offset_mapping = tokenized_examples.pop("offset_mapping")
    return {"sample_mapping": sample_mapping, "offset_mapping": offset_mapping}



In [17]:
# Extract mappings
sample_mappings, offset_mappings = tokenized_datasets.map(
    extract_mappings,
    batched=True,
    num_proc=3,
)


Map (num_proc=3):   0%|          | 0/88729 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/10822 [00:00<?, ? examples/s]

In [20]:
offset_mappings

'validation'

In [21]:
sample_mappings

'train'

In [35]:

def label_start_end_positions(self,examples, sample_mapping, offset_mapping):
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = examples["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)
        sequence_ids = examples.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]

        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])
            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1
            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (
                offsets[token_start_index][0] <= start_char
                and offsets[token_end_index][1] >= end_char
            ):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                while (
                    token_start_index < len(offsets)
                    and offsets[token_start_index][0] <= start_char
                ):
                    token_start_index += 1
                start_positions.append(token_start_index - 1)
                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    return start_positions, end_positions


In [36]:
# Label start and end positions
start_positions, end_positions = tokenized_datasets.map(
    lambda examples: label_start_end_positions(examples, sample_mappings, offset_mappings),
    batched=True,
    num_proc=3,
)


Map (num_proc=3):   0%|          | 0/88729 [00:00<?, ? examples/s]

NameError: name 'label_start_end_positions' is not defined

In [28]:
def process_example(examples):
    # Step 1: Tokenize and truncate
    tokenized_examples = tokenize_and_truncate(examples)

    # Step 2: Extract mappings
    sample_mappings, offset_mappings = extract_mappings(tokenized_examples)

    # Step 3: Label start and end positions
    start_positions, end_positions = label_start_end_positions(examples, sample_mappings, offset_mappings)

    # Combine tokenized examples with start and end positions
    tokenized_examples["start_positions"] = start_positions
    tokenized_examples["end_positions"] = end_positions

    return tokenized_examples

# Apply the combined processing function using map
tokenized_datasets = datasets.map(
    process_example,
    batched=True,
    remove_columns=datasets["train"].column_names,
    num_proc=3,
)


Map (num_proc=3):   0%|          | 0/87599 [00:00<?, ? examples/s]

NameError: name 'tokenize_and_truncate' is not defined