In [None]:
# nvidia-smi -q -d POWER
# trainer.train(resume_from_checkpoint="./results/checkpoint-500")

In [None]:
from datasets import load_dataset

# Load SQuAD v2 dataset
ds = load_dataset("rajpurkar/squad_v2")

In [None]:
# Check available splits
print(ds)

# Inspect a sample from the training set
print(ds['train'][0])


In [None]:
from datasets import load_dataset
from transformers import BertTokenizerFast

# Load SQuAD v2 dataset
ds = load_dataset("rajpurkar/squad_v2")

# Initialize a fast tokenizer
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

# Preprocessing function for the dataset
def preprocess_data(examples):
    # Tokenize context and question with truncation and padding
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",  # Truncate the context if it's too long
        max_length=384,  # Typically used max length for QA
        stride=128,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    
    # Map answer to the tokenized input
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    
    start_positions, end_positions = [], []
    
    for i, offsets in enumerate(offset_mapping):
        # Check if the answer data is available for the current example
        if i >= len(answers) or "answer_start" not in answers[i] or len(answers[i]["answer_start"]) == 0:
            # No answer available, set default start and end positions
            start_positions.append(0)
            end_positions.append(0)
        else:
            # There is an answer, retrieve start and end characters
            start_char = answers[i]["answer_start"][0]
            end_char = start_char + len(answers[i]["text"][0])
            
            # Find the start and end of the answer in tokens
            token_start_index = 0
            token_end_index = 0
            for idx, (start, end) in enumerate(offsets):
                if start <= start_char < end:
                    token_start_index = idx
                if start < end_char <= end:
                    token_end_index = idx
                    break
            
            start_positions.append(token_start_index)
            end_positions.append(token_end_index)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply to the dataset
tokenized_ds = ds.map(preprocess_data, batched=True, remove_columns=ds["train"].column_names)


In [None]:
from transformers import BertForQuestionAnswering, TrainingArguments, Trainer, DataCollatorWithPadding
from transformers import BertTokenizerFast
from datasets import load_dataset
import torch

# Load the tokenizer and model
tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
model = BertForQuestionAnswering.from_pretrained("bert-base-uncased")

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load and preprocess dataset as before
ds = load_dataset("rajpurkar/squad_v2")

# Preprocessing function (as defined before)
def preprocess_data(examples):
    inputs = tokenizer(
        examples['question'],
        examples['context'],
        truncation="only_second",
        max_length=384,
        stride=128,
        padding="max_length",
        return_overflowing_tokens=True,
        return_offsets_mapping=True
    )
    
    offset_mapping = inputs.pop("offset_mapping")
    answers = examples["answers"]
    
    start_positions, end_positions = [], []
    
    for i, offsets in enumerate(offset_mapping):
        if i >= len(answers) or "answer_start" not in answers[i] or len(answers[i]["answer_start"]) == 0:
            start_positions.append(0)
            end_positions.append(0)
        else:
            start_char = answers[i]["answer_start"][0]
            end_char = start_char + len(answers[i]["text"][0])
            token_start_index = 0
            token_end_index = 0
            for idx, (start, end) in enumerate(offsets):
                if start <= start_char < end:
                    token_start_index = idx
                if start < end_char <= end:
                    token_end_index = idx
                    break
            start_positions.append(token_start_index)
            end_positions.append(token_end_index)
    
    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

# Apply preprocessing
tokenized_ds = ds.map(preprocess_data, batched=True, remove_columns=ds["train"].column_names)

# Define data collator for dynamic padding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    eval_dataset=tokenized_ds["validation"],
    data_collator=data_collator,
)

# Start training
trainer.train()
# trainer.train(resume_from_checkpoint="./results/checkpoint-33500")

In [2]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline
import os

# Path to the directory containing checkpoints
checkpoints_dir = "./results"

# List all checkpoints and sort them by training step number
checkpoints = [os.path.join(checkpoints_dir, d) for d in os.listdir(checkpoints_dir) if d.startswith("checkpoint")]
checkpoints.sort(key=lambda x: int(x.split('-')[-1]))

# Example validation data
question = "What is the capital of France?"
context = "France is a country in Europe. The capital of France is Paris."

best_checkpoint = None
best_score = float("-inf")

# Ensure device selection
device = 0  # GPU 0

print("Evaluating all checkpoints to find the best one...")
for checkpoint in checkpoints:
    print(f"Evaluating checkpoint: {checkpoint}")
    # Load model for this checkpoint
    model = BertForQuestionAnswering.from_pretrained(checkpoint)
    # Load tokenizer from original base model
    tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")

    # Create a pipeline for question answering with GPU
    qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)

    # Perform QA inference and evaluate the result
    result = qa_pipeline(question=question, context=context)
    print(f"Checkpoint: {checkpoint}, Answer: {result['answer']}, Score: {result['score']}")

    # Track the best checkpoint based on the score
    if result['score'] > best_score:
        best_score = result['score']
        best_checkpoint = checkpoint

# Output the best checkpoint
print(f"Best checkpoint: {best_checkpoint} with score {best_score}")

# Save the best checkpoint to a permanent location
final_model_path = "./final_model"
print(f"Saving the best checkpoint to: {final_model_path}")
tokenizer.save_pretrained(final_model_path)
model.save_pretrained(final_model_path)

# Reload the saved model for verification
print("Reloading the saved model for verification...")
tokenizer = BertTokenizerFast.from_pretrained(final_model_path)
model = BertForQuestionAnswering.from_pretrained(final_model_path)

# Use the saved model for inference
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=device)
result = qa_pipeline(question=question, context=context)
print(f"Final Answer: {result['answer']}, Score: {result['score']}")


Evaluating all checkpoints to find the best one...
Evaluating checkpoint: ./results\checkpoint-500
Checkpoint: ./results\checkpoint-500, Answer: Paris, Score: 0.15826915204524994
Evaluating checkpoint: ./results\checkpoint-1000
Checkpoint: ./results\checkpoint-1000, Answer: Paris, Score: 0.2653363049030304
Evaluating checkpoint: ./results\checkpoint-1500
Checkpoint: ./results\checkpoint-1500, Answer: Paris, Score: 0.3415960669517517
Evaluating checkpoint: ./results\checkpoint-2000
Checkpoint: ./results\checkpoint-2000, Answer: Paris, Score: 0.22488351166248322
Evaluating checkpoint: ./results\checkpoint-2500
Checkpoint: ./results\checkpoint-2500, Answer: Paris, Score: 0.4180004596710205
Evaluating checkpoint: ./results\checkpoint-3000
Checkpoint: ./results\checkpoint-3000, Answer: Paris, Score: 0.538913369178772
Evaluating checkpoint: ./results\checkpoint-3500
Checkpoint: ./results\checkpoint-3500, Answer: Paris, Score: 0.3645639419555664
Evaluating checkpoint: ./results\checkpoint-400

In [22]:
from transformers import BertForQuestionAnswering, BertTokenizerFast, pipeline

# Load the fine-tuned model
model_path = "./final_model"
tokenizer = BertTokenizerFast.from_pretrained(model_path)
model = BertForQuestionAnswering.from_pretrained(model_path)

# Create a pipeline for question answering
qa_pipeline = pipeline("question-answering", model=model, tokenizer=tokenizer, device=0)  # Ensure GPU usage (device=0)

# Example usage
question = "what is safe state"
context = "The Banker's algorithm is a resource allocation and deadlock avoidance algorithm. This was developed by Edsger Dijkstra. This tests for safety by simulating the allocation of predetermined maximum possible amounts of all resources, and then makes a safe-state check to test for possible deadlock conditions for all other pending activities, before deciding whether allocation should be allowed to continue."

result = qa_pipeline(question=question, context=context)
print(f"Answer: {result['answer']}, Score: {result['score']}")


Answer: check to test for possible deadlock conditions, Score: 0.17190447449684143
