<a href="https://colab.research.google.com/github/Mahnoor036/Fine-tune-BERT-for-Question-Answering/blob/main/QUESTION_ANSWERING_WITH_BERT_ON_SQuAD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# QUESTION ANSWERING WITH BERT ON SQuAD

from datasets import load_dataset
from transformers import (
    BertTokenizerFast,
    BertForQuestionAnswering,
    Trainer,
    TrainingArguments,
    default_data_collator
)
import torch
import numpy as np
import os

# 1. Load dataset (with error handling)
# `dataset` is indexed by split name below; the "train" and "validation"
# splits are the two used by the rest of this notebook.
try:
    dataset = load_dataset("squad")
except Exception as e:
    print(f"❌ Error loading dataset: {e}")
    # Re-raise rather than calling exit(): in a notebook kernel exit() ends
    # the session and hides the traceback, whereas a raised exception stops
    # this cell (and "Run All") while keeping the full error visible.
    raise
else:
    print("✅ SQuAD dataset loaded successfully!")
    print(f"Train samples: {len(dataset['train'])}, Validation samples: {len(dataset['validation'])}")

# 2. Initialize model and tokenizer
# Single source of truth for the checkpoint so tokenizer and model cannot drift apart.
MODEL_CHECKPOINT = "bert-base-uncased"

try:
    tokenizer = BertTokenizerFast.from_pretrained(MODEL_CHECKPOINT)
    model = BertForQuestionAnswering.from_pretrained(MODEL_CHECKPOINT)

    # Print model info
    print("✅ Model and tokenizer loaded successfully!")
    print(f"Model architecture: {model.__class__.__name__}")
    print(f"Tokenizer vocab size: {tokenizer.vocab_size}")

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    print(f"Using device: {device}")

except Exception as e:
    print(f"❌ Error loading model/tokenizer: {e}")
    # Re-raise rather than calling exit(): exit() would end the notebook
    # kernel session and drop the traceback; raising halts execution here
    # and leaves the cause visible.
    raise

# 3. Tokenization and alignment
def _locate_answer_tokens(offsets, sequence_ids, start_char, end_char):
    """Map a character-level answer span onto token indices of one feature.

    Args:
        offsets: Per-token ``(char_start, char_end)`` pairs for the feature.
        sequence_ids: Per-token sequence id; tokens whose id is ``1`` belong
            to the context, everything else is skipped.
        start_char: Character index where the answer begins in the context.
        end_char: Character index just past the end of the answer.

    Returns:
        ``(start_token, end_token)``. ``(0, 0)`` is returned when the feature
        has no context tokens or when the answer is not fully contained in
        this feature's context window.
    """
    n_tokens = len(sequence_ids)

    # Find the first and last token that belong to the context (sequence id 1).
    idx = 0
    while idx < n_tokens and sequence_ids[idx] != 1:
        idx += 1
    if idx == n_tokens:
        # No context tokens at all: nothing to align against.
        return 0, 0
    context_start = idx
    while idx < n_tokens and sequence_ids[idx] == 1:
        idx += 1
    context_end = idx - 1

    # Answer falls (partly) outside this window -> label position 0.
    if offsets[context_start][0] > start_char or offsets[context_end][1] < end_char:
        return 0, 0

    # Walk forward to the last token starting at or before the answer start.
    idx = context_start
    while idx <= context_end and offsets[idx][0] <= start_char:
        idx += 1
    start_token = idx - 1

    # Walk backward to the first token ending at or after the answer end.
    idx = context_end
    while idx >= context_start and offsets[idx][1] >= end_char:
        idx -= 1
    end_token = idx + 1

    return start_token, end_token


def tokenize_and_align(examples, max_length=384, stride=128):
    """Tokenize a batch of question/context pairs and attach answer labels.

    Uses the module-level ``tokenizer``. Overflowing tokens are returned, so
    one example can yield several features; each feature is traced back to
    its source example through ``overflow_to_sample_mapping``.

    Args:
        examples: Batch mapping with ``"question"``, ``"context"`` and
            ``"answers"`` entries. Each answer is a mapping with ``"text"``
            and ``"answer_start"`` lists, of which only the first element
            is used.
        max_length: Maximum feature length passed to the tokenizer.
        stride: Token overlap between consecutive windows of one context.

    Returns:
        The tokenizer output with ``"overflow_to_sample_mapping"`` and
        ``"offset_mapping"`` removed and ``"start_positions"`` /
        ``"end_positions"`` added (one entry per feature). Features without
        an answer, or whose window does not contain the answer, get ``0``
        for both positions.

    Raises:
        Exception: Any error raised during tokenization or alignment is
            logged and re-raised. Returning ``None`` instead would hide the
            failed batch from the caller.
    """
    try:
        tokenized = tokenizer(
            examples["question"],
            examples["context"],
            truncation="only_second",
            max_length=max_length,
            stride=stride,
            return_overflowing_tokens=True,
            return_offsets_mapping=True,
            padding="max_length"
        )

        sample_map = tokenized.pop("overflow_to_sample_mapping")
        offset_mapping = tokenized.pop("offset_mapping")
        start_positions = []
        end_positions = []

        for i, offsets in enumerate(offset_mapping):
            answer = examples["answers"][sample_map[i]]

            if len(answer["answer_start"]) == 0:
                # Handle no-answer cases
                start_token, end_token = 0, 0
            else:
                start_char = answer["answer_start"][0]
                end_char = start_char + len(answer["text"][0])
                start_token, end_token = _locate_answer_tokens(
                    offsets, tokenized.sequence_ids(i), start_char, end_char
                )

            start_positions.append(start_token)
            end_positions.append(end_token)

        tokenized["start_positions"] = start_positions
        tokenized["end_positions"] = end_positions
        return tokenized

    except Exception as e:
        print(f"❌ Error in tokenization: {e}")
        raise

# 4. Process dataset
# Every split is mapped in batches; the original columns are dropped so only
# the fields returned by tokenize_and_align remain.
try:
    tokenized_dataset = dataset.map(
        tokenize_and_align,
        batched=True,
        remove_columns=dataset["train"].column_names,
        # Use multiple processes for faster tokenization, but never more
        # workers than the machine reports cores for.
        num_proc=min(4, os.cpu_count() or 1)
    )
    print("✅ Dataset tokenized successfully!")
    print(f"Tokenized train samples: {len(tokenized_dataset['train'])}, Validation samples: {len(tokenized_dataset['validation'])}")
except Exception as e:
    print(f"❌ Error tokenizing dataset: {e}")
    # Re-raise rather than calling exit(): exit() would end the notebook
    # kernel session and drop the traceback.
    raise

# 5. Training setup
try:
    training_args = TrainingArguments(
        output_dir="./bert-squad-finetuned",
        eval_strategy="steps",
        eval_steps=500,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=2,
        # load_best_model_at_end needs checkpoints that line up with
        # evaluations: save_steps is kept a multiple of eval_steps.
        save_steps=1000,
        save_total_limit=2,
        logging_steps=100,
        learning_rate=3e-5,
        weight_decay=0.01,
        fp16=torch.cuda.is_available(),  # mixed precision only when CUDA is present
        report_to="none",
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        greater_is_better=False
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_dataset["train"],
        eval_dataset=tokenized_dataset["validation"],
        # `processing_class` supersedes the deprecated `tokenizer=` keyword
        # in recent transformers releases.
        processing_class=tokenizer,
        data_collator=default_data_collator,
    )

    print("✅ Starting training...")
    train_result = trainer.train()
    print("🎉 Training completed successfully!")

    # Save the model
    trainer.save_model("./bert-squad-finetuned/final_model")
    tokenizer.save_pretrained("./bert-squad-finetuned/final_model")
    print("💾 Model saved successfully!")

    # Print training metrics
    metrics = train_result.metrics
    print(f"Training metrics: {metrics}")

except Exception as e:
    print(f"❌ Training error: {e}")
    # Do not swallow the failure: later cells use `model`, and continuing
    # after a failed run would evaluate or export an untrained or partly
    # trained network without any visible error.
    raise

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

✅ SQuAD dataset loaded successfully!
Train samples: 87599, Validation samples: 10570


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✅ Model and tokenizer loaded successfully!
Model architecture: BertForQuestionAnswering
Tokenizer vocab size: 30522
Using device: cuda


Map (num_proc=4):   0%|          | 0/87599 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/10570 [00:00<?, ? examples/s]

✅ Dataset tokenized successfully!
Tokenized train samples: 88524, Validation samples: 10784
✅ Starting training...


  trainer = Trainer(


Step,Training Loss,Validation Loss
500,1.8364,1.645565
1000,1.5433,1.429482
1500,1.5148,1.308256
2000,1.2686,1.28953
2500,1.3302,1.237966
3000,1.241,1.167221
3500,1.2964,1.168284
4000,1.2252,1.1449
4500,1.3118,1.127252
5000,1.164,1.124609


Step,Training Loss,Validation Loss
500,1.8364,1.645565
1000,1.5433,1.429482
1500,1.5148,1.308256
2000,1.2686,1.28953
2500,1.3302,1.237966
3000,1.241,1.167221
3500,1.2964,1.168284
4000,1.2252,1.1449
4500,1.3118,1.127252
5000,1.164,1.124609


🎉 Training completed successfully!
💾 Model saved successfully!
Training metrics: {'train_runtime': 7616.3143, 'train_samples_per_second': 23.246, 'train_steps_per_second': 2.906, 'total_flos': 3.4696551139946496e+16, 'train_loss': 0.9696348660164148, 'epoch': 2.0}


## Results

Quick sanity check of the fine-tuned model on a single question/context pair.

In [None]:
from transformers import pipeline

# Wrap the in-memory model and tokenizer in a question-answering pipeline.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
)

# One question/context pair; the call is the cell's last expression so its
# result is displayed as the cell output.
qa_pipeline(
    dict(
        context="Paris is the capital of France",
        question="What is the capital of France?",
    )
)

Device set to use cuda:0


{'score': 0.9969263076782227, 'start': 0, 'end': 5, 'answer': 'Paris'}

In [None]:
# Enable these in your training args:
# Memory-saving options for small GPUs. Note that this rebinds the
# `training_args` name only; a Trainer that was already constructed keeps
# the arguments object it was given.
training_args = TrainingArguments(
    # Same output location as the main run; older transformers releases
    # require output_dir to be passed explicitly.
    output_dir="./bert-squad-finetuned",
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    # Guarded like the main configuration: requesting fp16 without a CUDA
    # device makes TrainingArguments raise.
    fp16=torch.cuda.is_available(),
)

In [None]:
# Export the model weights and the tokenizer files side by side in one
# directory. The tokenizer call stays last so its return value is shown as
# the cell output.
EXPORT_DIR = "my_bert_squad"
model.save_pretrained(EXPORT_DIR)
tokenizer.save_pretrained(EXPORT_DIR)

('my_bert_squad/tokenizer_config.json',
 'my_bert_squad/special_tokens_map.json',
 'my_bert_squad/vocab.txt',
 'my_bert_squad/added_tokens.json',
 'my_bert_squad/tokenizer.json')