In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install -q transformers datasets accelerate evaluate

In [2]:
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForQuestionAnswering,
    TrainingArguments,
    Trainer,
    DefaultDataCollator
)
from datasets import load_dataset
import numpy as np
from tqdm.auto import tqdm
import json
import random
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW

# Fix random seed for reproducibility
def same_seeds(seed):
	torch.manual_seed(seed)
	if torch.cuda.is_available():
			torch.cuda.manual_seed(seed)
			torch.cuda.manual_seed_all(seed)
	np.random.seed(seed)
	random.seed(seed)
	torch.backends.cudnn.benchmark = False
	torch.backends.cudnn.deterministic = True
same_seeds(2)

In [3]:
# Check GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")


Using device: cuda
GPU: Tesla T4
Memory: 15.83 GB


In [4]:
# Configuration
MODEL_NAME = "bert-base-uncased"  # Can also try distilbert-base-uncased for faster training
MAX_LENGTH = 384
STRIDE = 128
BATCH_SIZE = 8  # Adjust based on GPU memory
LEARNING_RATE = 1e-5
EPOCHS = 1
OUTPUT_DIR = "./qa_model"

In [None]:
print("\n=== Loading Tokenizer and Model ===")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForQuestionAnswering.from_pretrained(MODEL_NAME)
model.to(device)

In [5]:
from transformers import BertTokenizerFast, BertForQuestionAnswering

tokenizer = BertTokenizerFast.from_pretrained("deepset/bert-base-cased-squad2")
model = BertForQuestionAnswering.from_pretrained("deepset/bert-base-cased-squad2")
model.to(device)

Some weights of the model checkpoint at deepset/bert-base-cased-squad2 were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [6]:
print("\n=== Loading Dataset ===")
# Load SQuAD dataset (using subset for faster training on free GPU)
dataset = load_dataset("squad", split="train")  # Use full dataset: split="train"
eval_dataset = load_dataset("squad", split="validation[500:2500]")  # Use split="validation" for full

print(f"Training samples: {len(dataset)}")
print(f"Validation samples: {len(eval_dataset)}")
print(f"\nExample: {dataset[0]}")


=== Loading Dataset ===


README.md: 0.00B [00:00, ?B/s]

plain_text/train-00000-of-00001.parquet:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

plain_text/validation-00000-of-00001.par(…):   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/87599 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/10570 [00:00<?, ? examples/s]

Training samples: 87599
Validation samples: 2000

Example: {'id': '5733be284776f41900661182', 'title': 'University_of_Notre_Dame', 'context': 'Architecturally, the school has a Catholic character. Atop the Main Building\'s gold dome is a golden statue of the Virgin Mary. Immediately in front of the Main Building and facing it, is a copper statue of Christ with arms upraised with the legend "Venite Ad Me Omnes". Next to the Main Building is the Basilica of the Sacred Heart. Immediately behind the basilica is the Grotto, a Marian place of prayer and reflection. It is a replica of the grotto at Lourdes, France where the Virgin Mary reputedly appeared to Saint Bernadette Soubirous in 1858. At the end of the main drive (and in a direct line that connects through 3 statues and the Gold Dome), is a simple, modern stone statue of Mary.', 'question': 'To whom did the Virgin Mary allegedly appear in 1858 in Lourdes France?', 'answers': {'text': ['Saint Bernadette Soubirous'], 'answer_start': [51

In [7]:
class SquadDataset(torch.utils.data.Dataset):
    """
    Dataset wrapper for SQuAD format
    SQuAD format: {'id', 'title', 'context', 'question', 'answers': {'text': [...], 'answer_start': [...]}}
    """
    
    def __init__(self, squad_data, tokenizer, max_length=384, is_training=True):
        self.data = squad_data
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.is_training = is_training
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        item = self.data[idx]
        
        question = item['question']
        context = item['context']
        
        # Tokenize with offsets to map tokens back to character positions
        encoding = self.tokenizer(
            question,
            context,
            truncation=True,
            max_length=self.max_length,
            stride=128,  # For handling long contexts
            padding="max_length",
            return_overflowing_tokens=False,  # Keep only first chunk for simplicity
            return_offsets_mapping=True,
            return_token_type_ids=True
        )
        
        input_ids = torch.tensor(encoding["input_ids"])
        attention_mask = torch.tensor(encoding["attention_mask"])
        token_type_ids = torch.tensor(encoding["token_type_ids"])
        offset_mapping = encoding["offset_mapping"]
        
        result = {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "token_type_ids": token_type_ids,
        }
        
        # ALWAYS add start and end positions (even for evaluation)
        # This ensures compute_metrics works properly
        answers = item['answers']
        
        # SQuAD can have multiple answers, take the first one
        if len(answers['text']) > 0:
            answer_text = answers['text'][0]
            answer_start_char = answers['answer_start'][0]
            answer_end_char = answer_start_char + len(answer_text)
            
            # Find token positions
            start_position, end_position = self.char_to_token_position(
                offset_mapping,
                answer_start_char,
                answer_end_char,
                encoding.get("sequence_ids", None)
            )
        else:
            # Unanswerable question (SQuAD v2)
            start_position = 0
            end_position = 0
        
        result["start_positions"] = torch.tensor(start_position, dtype=torch.long)
        result["end_positions"] = torch.tensor(end_position, dtype=torch.long)
        
        # Keep original data for text-level evaluation
        result["context"] = context
        result["question"] = question
        result["ground_truth"] = item['answers']['text'][0] if len(item['answers']['text']) > 0 else ""
        result["example_id"] = item['id']
        
        return result
    
    def char_to_token_position(self, offset_mapping, start_char, end_char, sequence_ids=None):
        """
        Convert character positions to token positions
        """
        start_position = 0
        end_position = 0
        
        for idx, (start, end) in enumerate(offset_mapping):
            # Skip special tokens and question tokens
            if start == 0 and end == 0:
                continue
            
            # If we have sequence_ids, only look in context (sequence_id == 1)
            if sequence_ids is not None and sequence_ids[idx] == 0:
                continue  # This is in the question part
            
            # Find start position
            if start <= start_char < end:
                start_position = idx
            
            # Find end position
            if start < end_char <= end:
                end_position = idx
        
        # Ensure valid span
        if end_position < start_position:
            end_position = start_position
        
        return start_position, end_position

In [8]:
print("\n=== Creating Dataset Objects ===")

train_dataset = SquadDataset(
    squad_data=dataset,
    tokenizer=tokenizer,
    max_length=384,  # Standard for BERT-based models
    is_training=True
)

validation_dataset = SquadDataset(
    squad_data=eval_dataset,
    tokenizer=tokenizer,
    max_length=384,
    is_training=False
)

print(f"✓ Train dataset: {len(train_dataset)} samples")
print(f"✓ Validation dataset: {len(validation_dataset)} samples")


=== Creating Dataset Objects ===
✓ Train dataset: 87599 samples
✓ Validation dataset: 2000 samples


In [None]:
from sklearn.model_selection import KFold

shuffled_dataset = train_dataset.shuffle(seed=42)

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

# Convert train_data to a list if not already
train_data_list = list(dataset)

for fold, (train_idx, val_idx) in enumerate(kf.split(train_data_list)):
    train_fold = [train_data_list[i] for i in train_idx]
    val_fold = [train_data_list[i] for i in val_idx]
    
    print(f"Fold {fold+1}: Train={len(train_fold)}, Val={len(val_fold)}")

In [None]:
# Verify a sample
print("\n=== Verifying Dataset Format ===")
sample = train_dataset[0]
print(f"Sample keys: {list(sample.keys())}")
print(f"✓ input_ids shape: {sample['input_ids'].shape}")
print(f"✓ attention_mask shape: {sample['attention_mask'].shape}")
print(f"✓ token_type_ids shape: {sample['token_type_ids'].shape}")
if 'start_positions' in sample:
    print(f"✓ start_positions: {sample['start_positions'].item()}")
    print(f"✓ end_positions: {sample['end_positions'].item()}")
print(f"✓ ground_truth: {sample['ground_truth'][:50]}...")

In [None]:
# Verify a sample
print("\n=== Verifying Dataset Format ===")
sample = validation_dataset[0]
print(f"Sample keys: {list(sample.keys())}")
print(f"✓ input_ids shape: {sample['input_ids'].shape}")
print(f"✓ attention_mask shape: {sample['attention_mask'].shape}")
print(f"✓ token_type_ids shape: {sample['token_type_ids'].shape}")
if 'start_positions' in sample:
    print(f"✓ start_positions: {sample['start_positions'].item()}")
    print(f"✓ end_positions: {sample['end_positions'].item()}")
print(f"✓ ground_truth: {sample['ground_truth'][:50]}...")

In [9]:
class QADataCollator:
    """
    Custom data collator for QA that handles both training and evaluation
    """
    
    def __call__(self, features):
        # Separate tensor fields from non-tensor fields
        batch = {
            "input_ids": torch.stack([f["input_ids"] for f in features]),
            "attention_mask": torch.stack([f["attention_mask"] for f in features]),
        }
        
        # Add token_type_ids if present
        if "token_type_ids" in features[0]:
            batch["token_type_ids"] = torch.stack([f["token_type_ids"] for f in features])
        
        # Add position labels if present (training mode)
        if "start_positions" in features[0]:
            batch["start_positions"] = torch.stack([f["start_positions"] for f in features])
            batch["end_positions"] = torch.stack([f["end_positions"] for f in features])
        
        # Keep non-tensor fields separate (for evaluation)
        if "context" in features[0]:
            batch["contexts"] = [f["context"] for f in features]
            batch["questions"] = [f["question"] for f in features]
            batch["ground_truths"] = [f["ground_truth"] for f in features]
        
        return batch

In [10]:
def postprocess_qa_predictions(
    examples,
    features,
    raw_predictions,
    n_best_size=20,
    max_answer_length=30,
):
    """
    Convert start/end logits into final text answers.
    """
    all_start_logits, all_end_logits = raw_predictions

    example_id_to_index = {ex["id"]: i for i, ex in enumerate(examples)}
    features_per_example = collections.defaultdict(list)

    for i, feature in enumerate(features):
        features_per_example[example_id_to_index[feature["example_id"]]].append(i)

    predictions = collections.OrderedDict()

    for example_index, example in enumerate(examples):
        feature_indices = features_per_example[example_index]
        context = example["context"]

        best_score = -1e9
        best_answer = ""

        for feature_index in feature_indices:
            start_logits = all_start_logits[feature_index]
            end_logits = all_end_logits[feature_index]
            offsets = features[feature_index]["offset_mapping"]

            start_indexes = np.argsort(start_logits)[-n_best_size:]
            end_indexes = np.argsort(end_logits)[-n_best_size:]

            for start_index in start_indexes:
                for end_index in end_indexes:
                    if start_index >= len(offsets) or end_index >= len(offsets):
                        continue
                    if offsets[start_index] is None or offsets[end_index] is None:
                        continue
                    if end_index < start_index:
                        continue
                    if end_index - start_index + 1 > max_answer_length:
                        continue

                    score = start_logits[start_index] + end_logits[end_index]
                    if score > best_score:
                        start_char = offsets[start_index][0]
                        end_char = offsets[end_index][1]
                        best_answer = context[start_char:end_char]
                        best_score = score

        predictions[example["id"]] = best_answer

    return predictions


In [11]:
!pip install -q evaluate

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [12]:
import evaluate
metric = evaluate.load("squad")

Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [13]:
def compute_metrics(eval_pred):
    start_logits, end_logits = eval_pred.predictions
    predictions = []
    references = []

    for i in range(len(validation_dataset)):
        example = validation_dataset[i]
        input_ids = example["input_ids"]
        context = example["context"]
        offsets = tokenizer(
            example["question"],
            context,
            return_offsets_mapping=True,
            truncation=True,
            max_length=384
        )["offset_mapping"]

        start = int(np.argmax(start_logits[i]))
        end = int(np.argmax(end_logits[i]))

        if start <= end and start < len(offsets) and end < len(offsets):
            start_char = offsets[start][0]
            end_char = offsets[end][1]
            pred_text = context[start_char:end_char]
        else:
            pred_text = ""

        predictions.append({
            "id": example["example_id"],
            "prediction_text": pred_text
        })

        references.append({
            "id": example["example_id"],
            "answers": {
                "text": [example["ground_truth"]],
                "answer_start": [0]
            }
        })

    return metric.compute(predictions=predictions, references=references)


In [14]:
from transformers import get_linear_schedule_with_warmup

optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)

num_training_steps = len(train_dataset) / BATCH_SIZE / 8 * EPOCHS
num_warmup_steps = int(0.2 * num_training_steps)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=num_warmup_steps,
    num_training_steps=num_training_steps
)

In [15]:
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    #eval_strategy="epoch",
    #save_strategy="epoch",
    learning_rate=LEARNING_RATE,
    eval_strategy="steps",
    save_strategy="steps",
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.05,
    fp16=True,
    logging_steps=100,
    gradient_accumulation_steps=4,
    warmup_ratio=0.2,
    #load_best_model_at_end=True,
    #metric_for_best_model="eval_span_exact_match",  # Use text-level EM
    #greater_is_better=True,  # Higher EM is better
    push_to_hub=False,
    report_to="none",
    save_total_limit=2,
    label_names=[]
)

# Create data collator
data_collator = QADataCollator()
#data_collator = DefaultDataCollator()

# Create trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=validation_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, scheduler),
)

  trainer = Trainer(


In [16]:
trainer.can_return_loss = True

In [17]:
print("\n=== Starting Training ===")
trainer.train()


=== Starting Training ===




Step,Training Loss,Validation Loss,Exact Match,F1
100,0.6713,1.210883,64.45,78.383069
200,0.5065,1.210193,65.2,79.302727
300,0.5003,1.185845,65.4,79.291602
400,0.4813,1.201964,66.0,79.690336
500,0.505,1.189681,65.85,79.458609
600,0.492,1.188847,65.35,79.362426
700,0.4954,1.209417,65.4,79.554896
800,0.4996,1.190094,65.65,79.555728
900,0.5009,1.178088,66.0,79.865284
1000,0.4681,1.195574,66.05,79.754233




TrainOutput(global_step=1369, training_loss=0.5007549533885791, metrics={'train_runtime': 4415.5491, 'train_samples_per_second': 19.839, 'train_steps_per_second': 0.31, 'total_flos': 1.7167000944987648e+16, 'train_loss': 0.5007549533885791, 'epoch': 1.0})

In [18]:
print("\n=== Saving Model ===")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")


=== Saving Model ===
Model saved to ./qa_model


In [19]:
print("\n=== Testing the Model ===")
# Load the fine-tuned model
qa_model = AutoModelForQuestionAnswering.from_pretrained(OUTPUT_DIR)
qa_model.to(device)
qa_model.eval()


=== Testing the Model ===


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, 

In [20]:
import re

def normalize_text(s):
    """Normalize text for comparison"""
    s = s.lower()
    s = re.sub(r'[^\w\s]', '', s)
    return ' '.join(s.split())

def compute_exact_match(prediction, ground_truth):
    """Calculate exact match"""
    return max([float(normalize_text(prediction).replace(" ", "") == normalize_text(ground_truth[i]).replace(" ", "").strip(".")) for i in range(len(ground_truth))])

def compute_f1(prediction, ground_truth):
    f1_arr = [0] * len(ground_truth)
    for i in range(len(ground_truth)):
        """Calculate F1 score"""
        pred_tokens = normalize_text(prediction).split()
        truth_tokens = normalize_text(ground_truth[i]).split()
    
        if not pred_tokens or not truth_tokens:
            f1_arr[i] = float(pred_tokens == truth_tokens)
            continue
    
        common = set(pred_tokens) & set(truth_tokens)
        if not common:
            f1_arr[i] = 0.0
            continue
    
        precision = len(common) / len(pred_tokens)
        recall = len(common) / len(truth_tokens)
        f1_arr[i] = 2 * (precision * recall) / (precision + recall)
    return max(f1_arr)

def compute_qa_metrics(eval_pred):
    """
    Compute metrics for extractive QA
    Note: This computes span-level accuracy, not text-level EM/F1
    """
    predictions, labels = eval_pred
    start_logits, end_logits = predictions
    start_positions, end_positions = labels
    
    # Get predicted positions
    pred_start = np.argmax(start_logits, axis=-1)
    pred_end = np.argmax(end_logits, axis=-1)
    
    # Calculate exact match at span level
    start_match = (pred_start == start_positions).astype(float)
    end_match = (pred_end == end_positions).astype(float)
    exact_match = (start_match * end_match).mean()
    
    # Calculate partial match (either start or end correct)
    partial_match = np.maximum(start_match, end_match).mean()
    
    return {
        "span_exact_match": float(exact_match),
        "span_partial_match": float(partial_match),
    }

In [21]:
test_data = load_dataset("squad", split="validation[:500]")
test_dataset = SquadDataset(
        squad_data=test_data,
        tokenizer=tokenizer,
        max_length=384,
        is_training=False
    )
print(f"✓ Loaded {len(test_dataset)} test examples")

✓ Loaded 500 test examples


In [22]:
def answer_question(question, context):
    inputs = tokenizer(
        question,
        context,
        max_length=MAX_LENGTH,
        truncation=True,
        return_offsets_mapping=True,
        return_tensors="pt"
    )

    offset_mapping = inputs.pop("offset_mapping")[0]
    inputs = inputs.to(device)

    with torch.no_grad():
        outputs = qa_model(**inputs)
        #outputs = model(**inputs)

    start_idx = torch.argmax(outputs.start_logits).item()
    end_idx = torch.argmax(outputs.end_logits).item()

    start_char = offset_mapping[start_idx][0]
    end_char = offset_mapping[end_idx][1]

    answer = context[start_char:end_char]
    return answer


In [23]:
def answer_question_test(question, text):
    device = model.device
    
    inputs = tokenizer(question, text, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = qa_model(**inputs)
    
    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()
    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    
    answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    return answer

In [24]:
import pandas as pd

num_test_samples = min(500, len(test_data))  # Adjust as needed

exact_matches = []
f1_scores = []
rows = []

for i in range(num_test_samples):
    example = test_data[i]
    context = example['context']
    question = example['question']
    ground_truth = example['answers']['text']

    # Generate prediction
    prediction = answer_question_test(question, context).strip()

    # Compute metrics
    em = compute_exact_match(prediction, ground_truth)
    f1 = compute_f1(prediction, ground_truth)

    exact_matches.append(em)
    f1_scores.append(f1)

    rows.append({
        "prediction": prediction,
        "ground_truth": ground_truth,
        "exact_match": em,
        "f1": f1
    })

    # Show some examples
    if i < 5:
        print(f"\nExample {i+1}:")
        print(f"Question: {question}")
        print(f"Ground Truth: {ground_truth}")
        print(f"Prediction: {prediction}")
        print(f"Exact Match: {em} | F1: {f1:.3f}")

# Print overall metrics
print("\n" + "="*60)
print("OVERALL TEST RESULTS")
print("="*60)
print(f"Exact Match Accuracy: {sum(exact_matches)/len(exact_matches)*100:.2f}%")
print(f"Average F1 Score: {sum(f1_scores)/len(f1_scores)*100:.2f}%")
print(f"Samples Evaluated: {num_test_samples}")


Example 1:
Question: Which NFL team represented the AFC at Super Bowl 50?
Ground Truth: ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
Prediction: Denver Broncos
Exact Match: 1.0 | F1: 1.000

Example 2:
Question: Which NFL team represented the NFC at Super Bowl 50?
Ground Truth: ['Carolina Panthers', 'Carolina Panthers', 'Carolina Panthers']
Prediction: Carolina Panthers
Exact Match: 1.0 | F1: 1.000

Example 3:
Question: Where did Super Bowl 50 take place?
Ground Truth: ['Santa Clara, California', "Levi's Stadium", "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."]
Prediction: Levi's Stadium
Exact Match: 1.0 | F1: 1.000

Example 4:
Question: Which NFL team won Super Bowl 50?
Ground Truth: ['Denver Broncos', 'Denver Broncos', 'Denver Broncos']
Prediction: Denver Broncos
Exact Match: 1.0 | F1: 1.000

Example 5:
Question: What color was used to emphasize the 50th anniversary of the Super Bowl?
Ground Truth: ['gold', 'gold', 'gold']
Prediction: golden
Exac

In [25]:
df = pd.DataFrame(rows)

output_path = "squad_predictions.xlsx"
df.to_excel(output_path, index=False)

print(f"\nSaved prediction results to {output_path}")


Saved prediction results to squad_predictions.xlsx


In [None]:
def compute_exact_match(prediction, ground_truth):
    """Check if prediction exactly matches ground truth"""
    return prediction == ground_truth

def compute_f1(prediction, ground_truth):
    """Compute F1 score between prediction and ground truth"""
    pred_tokens = prediction.split()
    truth_tokens = ground_truth.split()

    common = set(pred_tokens) & set(truth_tokens)
    num_same = len(common)

    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens)

    if num_same == 0:
        return 0

    precision = num_same / len(pred_tokens)
    recall = num_same / len(truth_tokens)
    f1 = (2 * precision * recall) / (precision + recall)

    return f1

In [None]:
def decode_prediction(example, start_logit, end_logit):
    encoding = tokenizer(
        example["question"],
        example["context"],
        return_offsets_mapping=True,
        truncation=True,
        max_length=384
    )

    offsets = encoding["offset_mapping"]

    start_idx = int(start_logit.argmax())
    end_idx = int(end_logit.argmax())

    if start_idx >= len(offsets) or end_idx >= len(offsets) or start_idx > end_idx:
        return ""

    start_char = offsets[start_idx][0]
    end_char = offsets[end_idx][1]

    return example["context"][start_char:end_char]


In [None]:
from torch.utils.data import DataLoader

loader = DataLoader(
    test_dataset,
    batch_size=8,
    shuffle=False
)

predictions = []
references = []

with torch.no_grad():
    for batch in loader:
        input_ids = batch["input_ids"].cuda()
        attention_mask = batch["attention_mask"].cuda()
        token_type_ids = batch["token_type_ids"].cuda()

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids
        )

        for i in range(len(input_ids)):
            example_id = batch["example_id"][i]
            context = batch["context"][i]
            question = batch["question"][i]
            gt = batch["ground_truth"][i]

            example = {
                "context": context,
                "question": question
            }

            pred_text = decode_prediction(
                example,
                outputs.start_logits[i],
                outputs.end_logits[i]
            )

            predictions.append({
                "id": example_id,
                "prediction_text": pred_text
            })

            references.append({
                "id": example_id,
                "answers": {
                    "text": [gt],
                    "answer_start": [0]
                }
            })


In [None]:
results = squad_metric.compute(
    predictions=predictions,
    references=references
)

print("Exact Match:", results["exact_match"])
print("F1:", results["f1"])

In [None]:
for i in range(5):
    print("Q:", references[i]["id"])
    print("GT:", references[i]["answers"]["text"][0])
    print("Pred:", predictions[i]["prediction_text"])
    print("-" * 50)
