In [1]:
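# Scratch notes, kept commented out (nothing in this cell runs):
#   shell command to query GPU power readings:
#     nvidia-smi -q -d POWER
#   resume an interrupted run (needs the `trainer` built in the training cell below):
#     trainer.train(resume_from_checkpoint="./results/checkpoint-500")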

In [2]:
from datasets import load_dataset

# SQuAD v2 question-answering dataset, addressed by its repository id
squad_v2_repo = "rajpurkar/squad_v2"
ds = load_dataset(squad_v2_repo)

In [3]:
# Summary of the splits (names, columns, row counts)
print(ds)

# First record of the training split, to see what one example looks like
first_train_record = ds['train'][0]
print(first_train_record)


DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 130319
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11873
    })
})
{'id': '56be85543aeaaa14008c9063', 'title': 'Beyoncé', 'context': 'Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ bee-YON-say) (born September 4, 1981) is an American singer, songwriter, record producer and actress. Born and raised in Houston, Texas, she performed in various singing and dancing competitions as a child, and rose to fame in the late 1990s as lead singer of R&B girl-group Destiny\'s Child. Managed by her father, Mathew Knowles, the group became one of the world\'s best-selling girl groups of all time. Their hiatus saw the release of Beyoncé\'s debut album, Dangerously in Love (2003), which established her as a solo artist worldwide, earned five Grammy Awards and featured the Billboard Hot 100 number-one singles "C

In [None]:
from transformers import BertForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import BertTokenizerFast
from datasets import load_dataset
import torch

# Choose the compute device up front: CUDA when present, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Tokenizer and QA model are both built from the same pretrained BERT weights
pretrained_name = "bert-base-uncased"
tokenizer = BertTokenizerFast.from_pretrained(pretrained_name)
model = BertForQuestionAnswering.from_pretrained(pretrained_name)
model.to(device)

# SQuAD v2 again (same dataset id as the earlier loading cell)
ds = load_dataset("rajpurkar/squad_v2")

# Preprocessing function (as defined before)
def preprocess_data(examples):
    """Tokenize a batch of question/context pairs and label the answer span.

    Contexts longer than the window are split into overlapping features
    (``max_length=384``, ``stride=128``), so one example can yield several
    features. Each feature is labelled against the example it was cut from,
    and only context tokens are considered when locating the answer.

    Args:
        examples: Batch dict with parallel lists under ``"question"``,
            ``"context"`` and ``"answers"``. Each answers entry is a dict
            holding ``"text"`` and ``"answer_start"`` lists; only the first
            answer of each example is used.

    Returns:
        The tokenizer output (without ``offset_mapping``) extended with
        ``start_positions`` and ``end_positions``, one entry per feature.
        A feature is labelled ``(0, 0)`` (index 0 is the first token, i.e.
        ``[CLS]`` for the BERT tokenizer used here) when its example has no
        answer or its window does not fully contain the answer.
    """
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )

    offset_mapping = inputs.pop("offset_mapping")
    # Feature i was cut from example sample_mapping[i]. With overflowing
    # windows the feature index is NOT the example index, so answers must be
    # looked up through this mapping. It is read, not popped, so the returned
    # columns stay the same as before.
    sample_mapping = inputs["overflow_to_sample_mapping"]
    answers = examples["answers"]

    start_positions, end_positions = [], []

    for i, offsets in enumerate(offset_mapping):
        answer = answers[sample_mapping[i]]
        answer_starts = answer["answer_start"] if "answer_start" in answer else []
        answer_texts = answer["text"] if "text" in answer else []

        # Sequence id 1 marks context tokens. Question tokens (id 0) carry
        # offsets relative to the question string and must never be matched
        # against the answer's character position in the context.
        sequence_ids = inputs.sequence_ids(i)
        context_tokens = [idx for idx, seq_id in enumerate(sequence_ids) if seq_id == 1]

        # Unanswerable example, or a window without any context tokens
        if len(answer_starts) == 0 or len(answer_texts) == 0 or not context_tokens:
            start_positions.append(0)
            end_positions.append(0)
            continue

        start_char = answer_starts[0]
        end_char = start_char + len(answer_texts[0])
        ctx_first, ctx_last = context_tokens[0], context_tokens[-1]

        # The answer must lie entirely inside this window; a partial overlap
        # would otherwise produce an inconsistent (start, end) pair.
        if offsets[ctx_first][0] > start_char or offsets[ctx_last][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
            continue

        # Last context token that begins at or before the answer start
        token_start_index = ctx_first
        while token_start_index <= ctx_last and offsets[token_start_index][0] <= start_char:
            token_start_index += 1
        start_positions.append(token_start_index - 1)

        # First context token (scanning backwards) that ends at or after the answer end
        token_end_index = ctx_last
        while token_end_index >= ctx_first and offsets[token_end_index][1] >= end_char:
            token_end_index -= 1
        end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Run the preprocessing over every split in batches; the raw columns are
# dropped so each split keeps only what preprocess_data returns
raw_columns = ds["train"].column_names
tokenized_ds = ds.map(preprocess_data, batched=True, remove_columns=raw_columns)

# Collator that assembles batches through the tokenizer's padding logic
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Fine-tuning hyperparameters; outputs (including checkpoints) go to ./results
# NOTE(review): newer transformers releases spell `evaluation_strategy` as
# `eval_strategy` — confirm against the installed version.
training_config = {
    "output_dir": "./results",
    "evaluation_strategy": "epoch",
    "learning_rate": 2e-5,
    "per_device_train_batch_size": 8,
    "per_device_eval_batch_size": 8,
    "num_train_epochs": 3,
    "weight_decay": 0.01,
}
training_args = TrainingArguments(**training_config)

# Wire model, arguments, both dataset splits and the collator into a Trainer
trainer_config = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_ds["train"],
    "eval_dataset": tokenized_ds["validation"],
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_config)

# Kick off fine-tuning
trainer.train()

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/49410 [00:00<?, ?it/s]

In [None]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline
import os

# Path to the directory containing checkpoints
checkpoints_dir = "./results"

# Collect only "checkpoint-<step>" entries and order them by training step.
# Entries without a numeric step suffix are skipped instead of crashing int().
checkpoints = [
    os.path.join(checkpoints_dir, d)
    for d in os.listdir(checkpoints_dir)
    if d.startswith("checkpoint-") and d.rsplit("-", 1)[-1].isdigit()
]
checkpoints.sort(key=lambda x: int(x.rsplit('-', 1)[-1]))

# Fail loudly when there is nothing to evaluate, rather than falling through
# to whatever `model`/`tokenizer` happen to be left in the kernel.
if not checkpoints:
    raise FileNotFoundError(f"No checkpoint-<step> directories found in {checkpoints_dir}")

# Example validation data
# NOTE(review): ranking checkpoints by the confidence on one hand-written
# example is a very weak criterion — consider scoring on the validation split.
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

# The tokenizer comes from the original base model and is identical for every
# checkpoint, so it is loaded once outside the loop.
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

best_checkpoint = None
best_model = None
best_score = float("-inf")

print("Evaluating all checkpoints to find the best one...")
for checkpoint in checkpoints:
    print(f"Evaluating checkpoint: {checkpoint}")
    # Load model for this checkpoint
    model = BertForQuestionAnswering.from_pretrained(checkpoint)

    # Create a pipeline for question answering
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)

    # Perform QA inference and evaluate the result
    result = qa_pipeline(question=question, context=context)
    print(f"Checkpoint: {checkpoint}, Answer: {result['answer']}, Score: {result['score']}")

    # Track the best checkpoint based on the score, keeping a reference to its
    # model: the loop variable `model` is overwritten on the next iteration.
    if result['score'] > best_score:
        best_score = result['score']
        best_checkpoint = checkpoint
        best_model = model

# Output the best checkpoint
print(f"Best checkpoint: {best_checkpoint} with score {best_score}")

# No checkpoint produced a usable score (e.g. every score was NaN): stop
# instead of calling save_pretrained on None.
if best_model is None:
    raise RuntimeError("No checkpoint produced a comparable score")

# Save the best checkpoint to a permanent location.
# `best_model` is saved rather than `model`, which at this point holds the
# LAST checkpoint evaluated, not necessarily the best one.
final_model_path = "./final_model"
print(f"Saving the best checkpoint to: {final_model_path}")
tokenizer.save_pretrained(final_model_path)
best_model.save_pretrained(final_model_path)

# Reload the saved model for verification
print("Reloading the saved model for verification...")
tokenizer = BertTokenizerFast.from_pretrained(final_model_path)
model = BertForQuestionAnswering.from_pretrained(final_model_path)

# Use the saved model for inference
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer)
result = qa_pipeline(question=question, context=context)
print(f"Final Answer: {result['answer']}, Score: {result['score']}")
