In [1]:
import numpy as np
import pandas as pd
import transformers
from datasets import Dataset
import json



In [2]:
# Load training data
train_path = '/kaggle/input/stanford-question-answering-dataset/train-v1.1.json'
with open(train_path, 'r') as f_train:
    train_data = json.load(f_train)

# Load development data
dev_path = '/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json'
with open(dev_path, 'r') as f_dev:
    dev_data = json.load(f_dev)

In [3]:
def process_dataset(input_data):
    context_list = []
    question_list = []
    answer_list = []

    for doc in input_data['data']:
        for para in doc['paragraphs']:
            context_text = para['context']
            for q_a in para['qas']:
                query = q_a['question']
                answer_info = q_a['answers'][0]  # Always selecting the first answer
                extracted_answer = {
                    'text': answer_info['text'],
                    'answer_start': answer_info['answer_start']
                }

                context_list.append(context_text)
                question_list.append(query)
                answer_list.append(extracted_answer)

    dataset_format = {'context': context_list, 'question': question_list, 'answers': answer_list}
    return Dataset.from_dict(dataset_format)


In [4]:
# Prepare the training and development datasets
training_dataset = process_dataset(train_data)
validation_dataset = process_dataset(dev_data)


In [5]:
from transformers import AutoTokenizer

model='distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [6]:
def transform_examples(sample_batch):
    clean_questions = [query.strip() for query in sample_batch['question']]
    
    tokenized_inputs = tokenizer(
        clean_questions,
        sample_batch['context'],
        max_length=384,
        truncation=True,
        padding="max_length",
        return_offsets_mapping=True,  # For fast tokenizers
        return_tensors="pt"
    )
    
    start_pos_list = []
    end_pos_list = []
    
    for idx, ans in enumerate(sample_batch['answers']):
        start_pos_list.append(ans['answer_start'])
        end_pos_list.append(ans['answer_start'] + len(ans['text']))
    
    tokenized_inputs.update({
        "start_positions": start_pos_list,
        "end_positions": end_pos_list,
    })
    
    return tokenized_inputs


In [7]:
# Apply preprocessing to the training and validation datasets
encoded_train_dataset = training_dataset.map(transform_examples, batched=True)
encoded_dev_dataset = validation_dataset.map(transform_examples, batched=True)


Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [8]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

qa_model = AutoModelForQuestionAnswering.from_pretrained(model_name)


model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
train_args = TrainingArguments(
    output_dir="./outputs",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
)

qa_trainer = Trainer(
    model=qa_model,
    args=train_args,
    train_dataset=encoded_train_dataset,
    eval_dataset=encoded_dev_dataset
)

qa_trainer.train()


[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,5.1942,5.069376
2,4.8438,4.813616


TrainOutput(global_step=10950, training_loss=5.1704162207477165, metrics={'train_runtime': 3848.8393, 'train_samples_per_second': 45.52, 'train_steps_per_second': 2.845, 'total_flos': 1.7167621364554752e+16, 'train_loss': 5.1704162207477165, 'epoch': 2.0})

In [13]:
from transformers import pipeline
qa_system = pipeline("question-answering", model=qa_model, tokenizer=tokenizer, device=0)


In [49]:
context_text = "The NMIMS Uni is quite big "
query = 'What uni is quite big?'

result = qa_system({
    'context': context_text,
    'question': query
})


In [50]:
print("Prediction:", result)

Prediction: {'score': 0.009663555771112442, 'start': 28, 'end': 33, 'answer': 'NMIMS'}


In [51]:
predicted_start_index = result['start']
predicted_end_index = result['end']

actual_answer = "NMIMS"
actual_start_index = context_text.find(actual_answer)
actual_end_index = actual_start_index + len(actual_answer)



In [52]:
def calculate_iou(prediction, reference):
    predicted_tokens = set(range(prediction['start_positions'], prediction['end_positions']))
    reference_tokens = set(range(reference['start_positions'], reference['end_positions']))
    
    intersection_count = len(predicted_tokens & reference_tokens)
    union_count = len(predicted_tokens | reference_tokens)
    
    return intersection_count / union_count if union_count != 0 else 0


In [53]:
prediction = {'start_positions': predicted_start_index, 'end_positions': predicted_end_index}
reference = {'start_positions': actual_start_index, 'end_positions': actual_end_index}

# Calculate token-level IoU
iou_score = calculate_iou(prediction, reference)


In [54]:
print("Prediction:", result['answer'])
print("True Answer:", true_answer)
print("Token-level IoU:", iou_score)

Prediction: NMIMS
True Answer: NMIMS
Token-level IoU: 1.0
