In [None]:
## Speech and Natural Language Processing (SNLP)

## Lab06
## Question Answering and NER (Part 1 - QnA)

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import os

import pandas as pd

# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/stanford-question-answering-dataset/train-v1.1.json
/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json


In [None]:
import transformers
import json
import os
import warnings
from datasets import Dataset
# Install a global filter that ignores every warning raised after this point
# (this also hides deprecation notices from the libraries imported above).
warnings.filterwarnings("ignore")

In [None]:
# Read both SQuAD v1.1 splits into plain Python dicts.
# The encoding is given explicitly so that parsing does not depend on the
# platform's locale default.
with open('/kaggle/input/stanford-question-answering-dataset/train-v1.1.json', encoding='utf-8') as train_file:
    train_data = json.load(train_file)

with open('/kaggle/input/stanford-question-answering-dataset/dev-v1.1.json', encoding='utf-8') as dev_file:
    dev_data = json.load(dev_file)

In [None]:
def prepare_dataset(data):
    """Flatten a SQuAD-style dict into a Dataset with one row per question.

    Args:
        data: Parsed SQuAD JSON, laid out as
            data['data'][*]['paragraphs'][*]['qas'][*]['answers'][*].

    Returns:
        A Dataset with the columns 'context', 'question' and 'answers'.
        Only the first listed answer of each question is kept; it is stored
        as a new dict with the keys 'text' and 'answer_start', so the rows
        do not alias the dicts inside `data`.
    """
    contexts = []
    questions = []
    answers = []

    for article in data['data']:
        for paragraph in article['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                first_answer = qa['answers'][0]  # Take the first answer

                contexts.append(context)
                questions.append(qa['question'])
                # Copy just the two fields used downstream instead of
                # re-using (and potentially mutating) the raw answer dict.
                answers.append({
                    'text': first_answer['text'],
                    'answer_start': first_answer['answer_start'],
                })

    return Dataset.from_dict({'context': contexts, 'question': questions, 'answers': answers})

In [None]:
# Flatten each raw SQuAD split into its own Dataset (train first, then dev).
train_dataset, dev_dataset = (
    prepare_dataset(raw_split) for raw_split in (train_data, dev_data)
)

In [None]:
from transformers import AutoTokenizer

# Checkpoint identifier; the same name is reused when the QA model is loaded.
model_name = "distilbert-base-uncased"

# Tokenizer that matches the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def preprocess_function(examples):
    """Tokenize question/context pairs and attach token-level answer labels.

    The answers in `examples` are located by *character* offset inside the
    context, whereas a question-answering head is trained on *token* indices.
    This function converts one into the other using the tokenizer's offset
    mapping.

    Args:
        examples: A batch (dict of lists) with the columns 'question',
            'context' and 'answers'. Each answer is a dict with the answer
            'text' and its character offset 'answer_start' in the context.

    Returns:
        The tokenizer output for the batch (including 'offset_mapping'),
        extended with 'start_positions' and 'end_positions': the indices of
        the first and last token of the answer. If the answer does not lie
        completely inside the (possibly truncated) context tokens, both
        labels are set to 0, i.e. they point at the first token of the
        sequence.
    """
    questions = [q.strip() for q in examples['question']]
    inputs = tokenizer(
        questions,
        examples['context'],
        max_length=384,
        truncation="only_second",  # shorten the context, never the question
        padding="max_length",
        return_offsets_mapping=True,  # requires a fast tokenizer
        # No return_tensors: plain lists are all Dataset.map needs, and they
        # keep the offset comparisons below on ordinary Python ints.
    )

    start_positions = []
    end_positions = []

    for i, answer in enumerate(examples['answers']):
        start_char = answer['answer_start']
        end_char = start_char + len(answer['text'])
        offsets = inputs['offset_mapping'][i]

        # Indices of the tokens that belong to the context (second sequence).
        context_tokens = [
            idx for idx, seq_id in enumerate(inputs.sequence_ids(i)) if seq_id == 1
        ]

        # Answer missing or cut off by truncation: label the first token.
        if (
            not context_tokens
            or offsets[context_tokens[0]][0] > start_char
            or offsets[context_tokens[-1]][1] < end_char
        ):
            start_positions.append(0)
            end_positions.append(0)
            continue

        first, last = context_tokens[0], context_tokens[-1]

        # Last context token that starts at or before the answer start.
        token_start = first
        while token_start <= last and offsets[token_start][0] <= start_char:
            token_start += 1
        start_positions.append(token_start - 1)

        # First context token (scanning backwards) that ends at or after the
        # answer end.
        token_end = last
        while token_end >= first and offsets[token_end][1] >= end_char:
            token_end -= 1
        end_positions.append(token_end + 1)

    inputs.update({
        "start_positions": start_positions,
        "end_positions": end_positions,
    })

    return inputs

In [None]:
# Tokenize both splits batch-wise with the preprocessing function.
tokenized_train_dataset = train_dataset.map(function=preprocess_function, batched=True)
tokenized_dev_dataset = dev_dataset.map(function=preprocess_function, batched=True)

Map:   0%|          | 0/87599 [00:00<?, ? examples/s]

Map:   0%|          | 0/10570 [00:00<?, ? examples/s]

In [None]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer

# Load the pretrained checkpoint with a question-answering head. As the load
# message below reports, the head weights (qa_outputs.*) are newly
# initialised, so the model must be fine-tuned before it gives useful answers.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Fine-tuning configuration: outputs are written to ./results and the
# evaluation set is scored once per epoch.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

# Train on the tokenized train split, evaluate on the tokenized dev split.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=tokenized_dev_dataset,
    train_dataset=tokenized_train_dataset,
)

trainer.train()

In [None]:
from transformers import pipeline
# Run inference on whichever device the model already lives on instead of
# hard-coding GPU 0, which fails on CPU-only sessions.
qa_pipeline = pipeline(
    "question-answering",
    model=model,
    tokenizer=tokenizer,
    device=model.device,
)

In [None]:
context = "I am Pranay Vuppala. I am born and bought up at Mumbai, Maharashtra. I am pursuing B.Tech Data Science at SVKM's NMIMS MPSTME."
# NOTE(review): the question names "Amulya" while the context describes
# "Pranay" - confirm that this mismatch is intentional.
question = 'Where does Amulya Stay ?'

# Pass the inputs as keyword arguments, the documented call form for the
# question-answering pipeline, rather than as a positional dict.
result = qa_pipeline(question=question, context=context)

In [None]:
# Show the raw pipeline output: confidence score, start/end offsets of the
# answer within the context, and the extracted answer text.
print("Prediction:", result)

Prediction: {'score': 0.023877525702118874, 'start': 23, 'end': 41, 'answer': 'He stays in Mumbai'}


In [None]:
# Character offsets of the predicted answer inside `context`.
predicted_start = result['start']
predicted_end = result['end']

# The reference answer must be a verbatim substring of `context`. Otherwise
# str.find returns -1 and the reference span silently becomes
# (-1, len(true_answer) - 1), which makes the overlap score meaningless.
# NOTE(review): the earlier literal "He stays in Mumbai" does not occur in
# `context`; confirm that this is the intended gold answer.
true_answer = "Mumbai, Maharashtra"
true_start = context.find(true_answer)
if true_start == -1:
    raise ValueError(f"Reference answer {true_answer!r} not found in context")
true_end = true_start + len(true_answer)

In [None]:
def compute_iou(pred, ref):
    """Return the intersection-over-union of two half-open index spans.

    Args:
        pred: Dict with integer 'start_positions' and 'end_positions'.
        ref: Dict with the same two keys, describing the reference span.

    Returns:
        |pred & ref| / |pred | ref| over the positions covered by
        [start, end) of each span, or 0 when both spans are empty.
    """
    def _positions(span):
        # Every index covered by the half-open interval [start, end).
        return set(range(span['start_positions'], span['end_positions']))

    predicted = _positions(pred)
    reference = _positions(ref)

    combined = predicted.union(reference)
    if not combined:
        return 0
    return len(predicted.intersection(reference)) / len(combined)

In [None]:
# Package both spans in the dict layout that compute_iou reads.
pred = dict(start_positions=predicted_start, end_positions=predicted_end)
ref = dict(start_positions=true_start, end_positions=true_end)

# Overlap of predicted vs. reference span. The offsets come from the pipeline
# result and from str.find, so they are character positions, not tokens.
iou_score = compute_iou(pred, ref)

In [None]:
# Report the predicted answer, the reference answer and their overlap score.
# NOTE(review): the score is computed over character offsets even though the
# label says "Token-level".
for label, value in (
    ("Prediction:", result['answer']),
    ("True Answer:", true_answer),
    ("Token-level IoU:", iou_score),
):
    print(label, value)

Prediction: He stays in Mumbai
True Answer: He stays in Mumbai
Token-level IoU: 1.0
