In [None]:
!pip install transformers datasets accelerate -U
!pip install evaluate

Collecting datasets
  Downloading datasets-4.4.1-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=21.0.0 (from datasets)
  Downloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (3.2 kB)
Downloading datasets-4.4.1-py3-none-any.whl (511 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m511.6/511.6 kB[0m [31m21.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-22.0.0-cp312-cp312-manylinux_2_28_x86_64.whl (47.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m47.7/47.7 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyarrow, datasets
  Attempting uninstall: pyarrow
    Found existing installation: pyarrow 18.1.0
    Uninstalling pyarrow-18.1.0:
      Successfully uninstalled pyarrow-18.1.0
  Attempting uninstall: datasets
    Found existing installation: datasets 4.0.0
    Uninstalling datasets-4.0.0:
      Successfully uninstalled datasets-4.0.0
Successfully installed datasets-4.4.1 pya

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
import evaluate
import numpy as np

# Base checkpoint. Its question-answering head is newly initialised, so the
# model must be fine-tuned before its predictions are meaningful.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

# Take the first 5,000 SQuAD training examples and hold out 10% for evaluation.
# The seed pins the shuffle so the train/test split is the same on every run.
SPLIT_SEED = 42
raw_datasets = load_dataset("squad", split="train[:5000]").train_test_split(
    test_size=0.1,
    seed=SPLIT_SEED,
)

print(f"Loaded {len(raw_datasets['train'])} training examples.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Loaded 4500 training examples.


In [None]:
# Maximum number of tokens per (question, context) feature.
MAX_LENGTH = 384
# Number of context tokens shared between consecutive overflow windows.
STRIDE = 128

def preprocess_function(examples):
    """Tokenize a batch of SQuAD-style examples and attach answer token labels.

    Each (question, context) pair is tokenized with the module-level
    ``tokenizer``. Contexts that do not fit in ``MAX_LENGTH`` tokens are split
    into several overlapping features (overlap of ``STRIDE`` tokens), so the
    returned batch can contain more rows than ``examples``.

    Args:
        examples: Batched mapping with the keys ``"question"``, ``"context"``
            and ``"answers"``; every answer is a mapping with the parallel
            lists ``"text"`` and ``"answer_start"`` (character offsets into the
            context). Only the first answer of each example is used.

    Returns:
        The tokenizer output with ``overflow_to_sample_mapping`` and
        ``offset_mapping`` removed and two lists added, ``start_positions``
        and ``end_positions``: the token indices of the first and last answer
        token in each feature. Features whose example has no answer, or whose
        context window does not fully contain the answer, are labelled (0, 0).
    """
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=MAX_LENGTH,
        truncation="only_second",
        stride=STRIDE,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    # One entry per produced feature: the index of the example it came from.
    sample_map = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")
    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        answer = examples["answers"][sample_map[i]]

        # Example without any answer: label position 0.
        if len(answer["text"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer["answer_start"][0]
        end_char = start_char + len(answer["text"][0])

        # sequence_ids is 1 for context tokens; question, special and padding
        # tokens carry other values. The context therefore spans from the
        # first to the last index holding 1 -- NOT to the end of the padded
        # sequence, whose trailing tokens all have offset (0, 0).
        sequence_ids = inputs.sequence_ids(i)
        context_start_token = sequence_ids.index(1)
        context_end_token = len(sequence_ids) - 1 - sequence_ids[::-1].index(1)

        # The answer is not fully inside this window (it lives in another
        # overflow feature of the same example): label position 0.
        if (
            offsets[context_start_token][0] > start_char
            or offsets[context_end_token][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Walk forward to the last context token starting at or before start_char.
        start_token = context_start_token
        while start_token <= context_end_token and offsets[start_token][0] <= start_char:
            start_token += 1
        start_positions.append(start_token - 1)

        # Walk backward to the first context token ending at or after end_char.
        end_token = context_end_token
        while end_token >= context_start_token and offsets[end_token][1] >= end_char:
            end_token -= 1
        end_positions.append(end_token + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Run the preprocessing over both splits in batches. The original text columns
# are dropped because one example may expand into several tokenized features.
tokenized_datasets = raw_datasets.map(
    preprocess_function,
    remove_columns=raw_datasets["train"].column_names,
    batched=True,
)

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

In [None]:
from transformers import TrainingArguments, Trainer, default_data_collator
import torch

# Mixed-precision training is switched off for this run.
use_fp16 = False

# Directory that receives the final model and tokenizer files.
FINAL_MODEL_DIR = "./final_bert_qa_model"

training_args = TrainingArguments(
    output_dir="./qa_bert_results",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=500,
    weight_decay=0.01,
    logging_steps=100,
    # Evaluate and checkpoint once per epoch so the best checkpoint can be
    # reloaded when training finishes.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    report_to="none",
    fp16=use_fp16,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    # Evaluation is capped at the first 500 tokenized features.
    eval_dataset=tokenized_datasets["test"].select(range(500)),
    # `processing_class` replaces the deprecated `tokenizer=` argument.
    processing_class=tokenizer,
    data_collator=default_data_collator,
)

print("\n" + "="*50)
print("  Starting BERT Q&A Fine-Tuning")
print("="*50 + "\n")

trainer.train()

# Persist model and tokenizer first; only then report success, so the message
# is never shown if saving fails.
trainer.save_model(FINAL_MODEL_DIR)
tokenizer.save_pretrained(FINAL_MODEL_DIR)
print("\nFine-Tuning Complete! Model is now saved.")

  trainer = Trainer(



  Starting BERT Q&A Fine-Tuning (Attempting to fix NaN/Zero Loss)...



Epoch,Training Loss,Validation Loss
1,0.0,
2,0.0,
3,0.0,



Fine-Tuning Complete! Model is now saved.


('./final_bert_qa_model/tokenizer_config.json',
 './final_bert_qa_model/special_tokens_map.json',
 './final_bert_qa_model/vocab.txt',
 './final_bert_qa_model/added_tokens.json',
 './final_bert_qa_model/tokenizer.json')

In [None]:
from transformers import pipeline, AutoModelForQuestionAnswering, AutoTokenizer

# Location of the fine-tuned model and tokenizer files on disk.
MODEL_PATH = "./final_bert_qa_model"

# Build an extractive question-answering pipeline from the saved artefacts.
qa_pipeline = pipeline(
    task="question-answering",
    tokenizer=AutoTokenizer.from_pretrained(MODEL_PATH),
    model=AutoModelForQuestionAnswering.from_pretrained(MODEL_PATH),
)

def answer_question_from_pdf(question, context):
    """Answer ``question`` from ``context`` with the module-level ``qa_pipeline``.

    Prints the question, the predicted answer and its confidence score.

    Args:
        question: The question to ask.
        context: The text to extract the answer from (e.g. text read from a PDF).

    Returns:
        The predicted answer string, or the message
        ``"Error: Context (PDF text) is empty."`` when ``context`` is missing,
        empty, or contains only whitespace.
    """
    # Whitespace-only text carries no answer either, so treat it as empty.
    if not context or not context.strip():
        return "Error: Context (PDF text) is empty."

    # Keyword arguments are the supported calling convention of the QA
    # pipeline; passing a single positional dict is deprecated.
    result = qa_pipeline(question=question, context=context)

    print("\n--- Q&A Result ---")
    print(f"Question: {question}")
    print(f"Predicted Answer: {result['answer']}")
    print(f"Confidence Score: {result['score']:.4f}")
    print("------------------")

    return result['answer']

# Small smoke test: a hand-written passage and a question it can answer.
sample_context = (
    "\n"
    "BERT was invented by Google and uses a Transformer architecture. The model\n"
    "fine-tuned here was only trained for a short period of 3 epochs to conserve\n"
    "time, but is now ready for demonstration.\n"
)
q_test = "How long was the model trained for?"

# The bare call on the last line also displays the returned answer.
answer_question_from_pdf(q_test, sample_context)

Device set to use cuda:0



--- Q&A Result ---
Question: How long was the model trained for?
Predicted Answer: 3
Confidence Score: 0.0386
------------------


'3'