#### Name: Sai Sriharsha Griddaluru
CUID: C15358926

In [43]:
import json
import torch
from transformers import DistilBertForQuestionAnswering, DistilBertTokenizer
import numpy as np
from tqdm import tqdm

In [44]:
# :: function to read the dataset :: 
def read_squad(filepath):
    with open(filepath, 'r') as f:
        data = json.load(f)

    contexts = []
    questions = []
    answers = []
    
    for item in data['data']:
        for paragraph in item['paragraphs']:
            context = paragraph['context']
            for qa in paragraph['qas']:
                question = qa['question']
                answer = qa['answers'][0]  # Assuming one answer per question
                contexts.append(context)
                questions.append(question)
                answers.append(answer)
    
    return contexts, questions, answers

In [45]:
# :: load pre-trained model and tokenizer ::
def load_pretrained_model(model_path, device):
    model = DistilBertForQuestionAnswering.from_pretrained(model_path)
    tokenizer = DistilBertTokenizer.from_pretrained(model_path)
    model.to(device)
    model.eval()
    return model, tokenizer


In [46]:
# :: function to get predictions from the model :: 
# Import necessary logging module
from transformers import logging

# Set the verbosity to only show errors (this will suppress warnings and info messages)
logging.set_verbosity_error()

def get_answer(model, tokenizer, context, question, device):
    # Apply truncation and handle padding correctly, avoiding overflow tokens
    
    inputs = tokenizer(question, context, return_tensors="pt", truncation=True, padding='max_length', max_length=512, return_overflowing_tokens=False)
    
    # Move inputs to the device (GPU or CPU)
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    with torch.no_grad():
        outputs = model(**inputs)
        start_scores, end_scores = outputs.start_logits, outputs.end_logits
        start_idx = torch.argmax(start_scores)
        end_idx = torch.argmax(end_scores) + 1
        answer = tokenizer.decode(inputs['input_ids'][0][start_idx:end_idx], skip_special_tokens=True)
        
    return answer


In [20]:
# :: training model :: 
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm


device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)
model.train()
optim = AdamW(model.parameters(), lr=2e-6)

# initialize data loader for training data
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(5):
    model.train()
    loop = tqdm(train_loader, leave=True)
    for batch in loop:
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids, attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]
        loss.backward()
        optim.step()
        loop.set_description(f'Epoch {epoch}')
        loop.set_postfix(loss=loss.item())


Epoch 0: 100%|██████████| 2320/2320 [06:57<00:00,  5.55it/s, loss=2.64]
Epoch 1: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.97]
Epoch 2: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=2.94]
Epoch 3: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.67] 
Epoch 4: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.73] 
Epoch 5: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.62] 
Epoch 6: 100%|██████████| 2320/2320 [06:59<00:00,  5.53it/s, loss=1.69] 
Epoch 7: 100%|██████████| 2320/2320 [07:00<00:00,  5.52it/s, loss=0.956]
Epoch 8: 100%|██████████| 2320/2320 [07:00<00:00,  5.52it/s, loss=2.02] 
Epoch 9: 100%|██████████| 2320/2320 [07:01<00:00,  5.51it/s, loss=1.2]



In [47]:
# :: function to calculate evaluation metrics ::
def calculate_metrics(predictions, references):
    em_score = np.mean([1 if pred.strip() == ref.strip() else 0 for pred, ref in zip(predictions, references)])
    
    f1_scores = []
    for pred, ref in zip(predictions, references):
        pred_tokens = pred.strip().split()
        ref_tokens = ref.strip().split()
        common_tokens = set(pred_tokens) & set(ref_tokens)
        
        if len(common_tokens) == 0:
            f1_scores.append(0)
        else:
            precision = len(common_tokens) / len(pred_tokens)
            recall = len(common_tokens) / len(ref_tokens)
            f1_scores.append(2 * (precision * recall) / (precision + recall))
    
    f1_score_mean = np.mean(f1_scores)
    return em_score * 100, f1_score_mean * 100

In [48]:
# :: main script ::
# load data
train_contexts, train_questions, train_answers = read_squad('./spoken_data/spoken_train-v1.1.json')
val_contexts, val_questions, val_answers = read_squad('./spoken_data/spoken_test-v1.1.json')


# load pre-trained model and tokenizer
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model_path = './improved_model'  # Path where your trained model is saved
model, tokenizer = load_pretrained_model(model_path, device)

# collect predictions and references
predictions = []
references = [answer['text'] for answer in val_answers]  # Assuming val_answers is a list of answer dicts

# generate answers and print three examples
print("SAMPLE PREDICTIONS:")
for idx, (context, question) in enumerate(tqdm(zip(val_contexts, val_questions), total=len(val_questions))):
    predicted_answer = get_answer(model, tokenizer, context, question, device)
    predictions.append(predicted_answer)

    # Print 3 example outputs
    if idx < 3:
        print(f"CONTEXT: {context}")
        print(f"QUESTION: {question}")
        print(f"PREDICTED ANSWER: {predicted_answer}")
        print(f"ACTUAL ANSWER: {references[idx]}")
        print("-----")




SAMPLE PREDICTIONS:


  0%|          | 15/5351 [00:00<00:37, 143.93it/s]

CONTEXT: super bowl fifty was an american football game to determine the champion of the national football league nfl for the twenty fifteen season. the american football conference a f c c champion denver broncos defeated the national football conference n f c c champion carolina panthers twenty four to ten to earn their third super bowl title. the game was played on february seventh twenty sixteen and levis stadium in the san francisco bay area santa clara california. as this was the fiftieth super bowl the league emphasized the golden anniversary with various goldsteins initiatives as well as temporarily suspending the tradition of naming each super bowl game with roman numerals under which they gain would have been known as super bowl l sell that the logo could prominently featured the arabic numerals fifty.
QUESTION: Which NFL team represented the AFC at Super Bowl 50?
PREDICTED ANSWER: denver broncos
ACTUAL ANSWER: denver broncos
-----
CONTEXT: super bowl fifty was an american fo

100%|██████████| 5351/5351 [00:37<00:00, 143.29it/s]


In [49]:
# calculate and display evaluation metrics
em, f1 = calculate_metrics(predictions, references)
print(f"Exact Match (EM): {em:.2f}%")
print(f"F1 Score: {f1:.2f}%")

Exact Match (EM): 13.10%
F1 Score: 22.71%
