# **Reader: DistilBERT**

## **1. Install and import bibraries**

In [None]:
!pip install -qq datasets==2.16.1 evaluate==0.4.1 transformers
!pip install -q accelerate==0.26.1
!pip install git-lfs

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
import os
import numpy as np
from tqdm.auto import tqdm
import collections

import torch

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, TrainingArguments, Trainer, pipeline
import evaluate

In [8]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

## **2. Setup config**

In [6]:
model_name = "distilbert-base-uncased"
max_length = 384
stride = 128

## **3. Setup Dataset**

### **3.1. Download dataset**

In [7]:
dataset_name = "squad_v2"
raw_datasets = load_dataset(dataset_name)

# Load tokenizer and run some examples
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


### **3.2. EDA dataset**

In [8]:
# Print các thông tin Context, Question, vaf Answer trong dataset
print("Context: ", raw_datasets["train"][0]["context"])
print("Question: ", raw_datasets["train"][0]["question"])
print("Answer: ", raw_datasets["train"][0]["answers"])

Context:  Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny's Child. Managed by her father, Mathew Knowles, the group became one of the world's best-selling girl groups of all time. Their hiatus saw the release of Beyoncé's debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "Crazy in Love" and "Baby Boy".
Question:  When did Beyonce start becoming popular?
Answer:  {'text': ['in the late 1990s'], 'answer_start': [269]}


In [9]:
non_answers = raw_datasets["train"].filter(
    lambda x: len(x['answers']['text']) > 0
)

## **4. Tokenize dataset**

### **4.1. Tokenize train set**

In [10]:
def preprocess_training_examples(examples):
    # Strip whitespace from questions
    questions = [q.strip() for q in examples["question"]]

    # Tokenize input text (questions + context) with truncation and padding
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length, # Set max length for tokenized input
        truncation="only_second", # Truncate only the context if too long
        stride=stride, # Overlapping tokens to handle truncation
        return_overflowing_tokens=True, # Allow multiple chunks for long contexts
        return_offsets_mapping=True, # Track character-level token offsets
        padding="max_length" # Pad shorter sequences to max length
    )

    # Extract offset mappings (maps token positions to original text positions)
    offset_mapping = inputs.pop("offset_mapping")

    # Mapping from tokenized samples to original examples
    sample_map = inputs.pop("overflow_to_sample_mapping")

    # Get answer spans from the dataset
    answers = examples["answers"]

    # Initialize lists to store start and end positions of answers
    start_positions, end_positions = [], []

    # Iterate through each tokenized sample
    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i] # Get the index of the original example
        sequence_ids = inputs.sequence_ids(i) # Identify which tokens belong to the context

        # Find the start and end indices of the context within the tokenized sequence
        idx = 0
        while sequence_ids[idx] != 1: # Find first token of the context
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1: # Find last token of the context
            idx += 1
        context_end = idx - 1 # Last token of the context

        answer = answers[sample_idx] # Retrieve the corresponding answer

        # If no answer is provided, set start and end positions to 0
        if len(answer["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Get the start and end character positions of the answer
            start_char = answer["answer_start"][0]
            end_char = answer["answer_start"][0] + len(answer["text"][0])

            # Check if answer is outside the tokenized context window
            if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
                start_positions.append(0)
                end_positions.append(0)
            else:
                # Find the start token index corresponding to the answer start character
                idx = context_start
                while idx <= context_end and offset[idx][0] <= start_char:
                    idx += 1
                start_positions.append(idx - 1)

                # Find the end token index corresponding to the answer end character
                idx = context_end
                while idx >= context_start and offset[idx][1] >= end_char:
                    idx -= 1
                end_positions.append(idx + 1)

    # Add the computed start and end positions to the inputs dictionary
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [11]:
train_dataset = raw_datasets["train"].map(
    preprocess_training_examples,
    batched=True,
    remove_columns=raw_datasets["train"].column_names
)

print(f"Number of raw examples : {len(raw_datasets['train'])}")
print(f"Number of training examples: {len(train_dataset)}")

Number of raw examples : 130319
Number of training examples: 131754


### **4.2. Tokenize Val set**

In [12]:
def preprocess_validation_examples(examples):
    # Strip whitespace from each question
    questions = [q.strip() for q in examples["question"]]

    # Tokenize the input text (questions + context) with truncation and padding
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=max_length, # Set max length for tokenized input
        truncation="only_second",# Overlapping tokens to handle truncation
        stride=stride, # Overlapping tokens to handle truncation
        return_overflowing_tokens=True, # Allow multiple chunks for long contexts
        return_offsets_mapping=True, # Track character-level token offsets
        padding="max_length" # Pad shorter sequences to max length
    )

    # Retrieve mapping from tokenized samples to original examples
    sample_map = inputs.pop("overflow_to_sample_mapping")
    # Initialize a list to store example IDs
    example_ids = []

    # Iterate through tokenized samples
    for i in range(len(inputs["input_ids"])):
        sample_idx = sample_map[i] # Get original example index
        example_ids.append(examples["id"][sample_idx])

        sequence_ids = inputs.sequence_ids(i) # Identify tokens belonging to the context
        offset = inputs["offset_mapping"][i] # Get offset mapping for this tokenized input

        # Modify offset mapping to retain only context offsets and set others to None
        inputs["offset_mapping"][i] = [
            o if sequence_ids[k] == 1 else None for k, o in enumerate(offset)
        ]
    inputs["example_id"] = example_ids

    return inputs

In [13]:
validation_dataset = raw_datasets["validation"].map(
    preprocess_validation_examples,
    batched=True,
    remove_columns=raw_datasets["validation"].column_names
)

print(f"Number of raw examples : {len(raw_datasets['train'])}")
print(f"Number of training examples: {len(train_dataset)}")

Number of raw examples : 130319
Number of training examples: 131754


In [14]:
os.environ["WANDB_DISABLED"] = "true"

## **5. Train model**

In [16]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

args = TrainingArguments(
    output_dir="distilbert-finetuned-squadv2",
    evaluation_strategy="no",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
)

trainer.train()

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  trainer = Trainer(


Step,Training Loss
500,3.1149
1000,2.2941
1500,2.0856
2000,1.9513
2500,1.8301
3000,1.7608
3500,1.6692
4000,1.6135
4500,1.5806
5000,1.5842


TrainOutput(global_step=49410, training_loss=1.0561441637296662, metrics={'train_runtime': 4709.5394, 'train_samples_per_second': 83.928, 'train_steps_per_second': 10.491, 'total_flos': 3.873165421863629e+16, 'train_loss': 1.0561441637296662, 'epoch': 3.0})

In [17]:
trainer.push_to_hub(commit_message="Training complete")

events.out.tfevents.1741159848.b8f16f2db5c4.17788.0:   0%|          | 0.00/26.3k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/nhutan410/distilbert-finetuned-squadv2/commit/8c623662ce90f80452bd1dfc355f22fcebc0ca31', commit_message='Training complete', commit_description='', oid='8c623662ce90f80452bd1dfc355f22fcebc0ca31', pr_url=None, repo_url=RepoUrl('https://huggingface.co/nhutan410/distilbert-finetuned-squadv2', endpoint='https://huggingface.co', repo_type='model', repo_id='nhutan410/distilbert-finetuned-squadv2'), pr_revision=None, pr_num=None)

## **6. Evaluate model**

In [None]:
metric = evaluate.load("squad_v2")

In [3]:
def compute_metrics(start_logits, end_logits, features, examples):
    example_to_features = collections.defaultdict(list)
    for idx, feature in enumerate(features):
        example_to_features[feature["example_id"]].append(idx)

    predicted_answers = []

    # Iterate through original examples to generate predictions
    for example in tqdm(examples):
        example_id = example["id"]
        context = example["context"]
        answers = []  # Store possible answers for this example

        # Iterate over features corresponding to this example
        for feature_index in example_to_features[example_id]:
            # Ensure feature_index is within bounds
            if feature_index >= len(features):
                continue

            # Get start and end logits
            start_logit = start_logits[feature_index]
            end_logit = end_logits[feature_index]

            # Check for "offset_mapping" in the feature
            if "offset_mapping" not in features[feature_index]:
                continue

            offsets = features[feature_index]["offset_mapping"]

            # Get top `n_best` start and end indices based on logit scores
            start_indexes = np.argsort(start_logit)[-1: -n_best-1: -1].tolist()
            end_indexes = np.argsort(end_logit)[-1: -n_best-1: -1].tolist()

            # Generate possible answers based on top start/end positions
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # Skip if offset mapping is None (padding tokens)
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    # Ensure valid answer span and length
                    if end_index < start_index + 1 or end_index - start_index > max_ans_length:
                        continue

                    # Extract predicted text from context using offset positions
                    text = context[offsets[start_index][0]: offsets[end_index][1]]
                    logit_score = start_logit[start_index] + end_logit[end_index]

                    # Store answer candidate with its score
                    answer = {"text": text, "logit_score": logit_score}
                    answers.append(answer)

        # Choose the best answer or default to no-answer prediction
        if len(answers) > 0:
            # Select answer with highest logit score
            best_answer = max(answers, key=lambda x: x["logit_score"])
            answer_dict = {
                "id": example_id,
                "prediction_text": best_answer["text"],
                "no_answer_probability": 1 - best_answer["logit_score"]
            }
        else:
            # No valid answer found
            answer_dict = {
                "id": example_id,
                "prediction_text": "",
                "no_answer_probability": 1.0
            }
        predicted_answers.append(answer_dict)

    # Prepare ground truth answers for evaluation
    theoretical_answers = [{"id": ex["id"], "answers": ex["answers"]} for ex in examples]
    return metric.compute(predictions=predicted_answers, references=theoretical_answers)

In [None]:
predictions, _, _ = trainer.predict(validation_dataset)

start_logits, end_logits = predictions

results = compute_metrics(
    start_logits,
    end_logits,
    validation_dataset,
    raw_datasets["validation"]
)

results

## **7. Load model from Hub**

In [10]:
# Load model from hub

pipeline_name = "question-answering"
model_name = "nhutan410/distilbert-finetuned-squadv2"

pipe = pipeline(pipeline_name, model=model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/561 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.23k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Device set to use cuda:0


In [11]:
input_question = "What is my name?"
input_context = "My name is An and i live in Tay Ninh"

pipe(question=input_question, context=input_context)

{'score': 0.9484549760818481, 'start': 11, 'end': 13, 'answer': 'An'}

In [14]:
input_question = "Where does An live?"
input_context = "My name is An and i live in Tay Ninh"

pipe(question=input_question, context=input_context)

{'score': 0.96587073802948, 'start': 28, 'end': 36, 'answer': 'Tay Ninh'}