In [2]:
# Import necessary libraries for data handling, model operations, and visualization
import os
import json
import requests
import numpy as np
import torch
from torch import nn
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

# Import specific tools from the transformers library
import transformers
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW

In [2]:
# Set the computing device based on GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print device status for verification
if torch.cuda.is_available():
    print("Running on GPU.")
else:
    print("GPU not available; running on CPU.")

Running on GPU.


In [3]:
# Define the base path for data storage on Palmetto
BASE_DIR = "/home/jrajend/HW3"

# Specify the filenames for the various dataset files
train_file = "spoken_train-v1.1.json"
test_file = "spoken_test-v1.1.json"
test_file_WER44 = "spoken_test-v1.1_WER44.json"
test_file_WER54 = "spoken_test-v1.1_WER54.json"

# Construct full paths
train_file_path = os.path.join(BASE_DIR, train_file)
test_file_path = os.path.join(BASE_DIR, test_file)
test_file_WER44_path = os.path.join(BASE_DIR, test_file_WER44)
test_file_WER54_path = os.path.join(BASE_DIR, test_file_WER54)


In [4]:
import json

with open(train_file_path, 'r') as f:
    train_data = json.load(f)

with open(test_file_path, 'r') as f:
    test_data = json.load(f)

In [5]:
import os

# Load the path for the training dataset file
train_file_path = os.path.join(BASE_DIR, train_file)
print("Training file path:", train_file_path)

# Repeat similar steps for other dataset files if required

Training file path: /home/jrajend/HW3/spoken_train-v1.1.json


In [6]:
# Function to extract contexts, questions, and answers from a JSON file
def load_data_from_json(path):
    # Initialize lists to store contexts, questions, and answers
    data_contexts, data_questions, data_answers = [], [], []

    # Open the JSON file and load its content
    with open(path, 'r') as file:
        file_content = json.load(file)

    # Process each section within the data file
    for entry in file_content.get('data', []):
        paragraphs = entry.get('paragraphs', [])

        # Extract context and question-answer pairs from each paragraph
        for paragraph in paragraphs:
            context_text = paragraph.get('context', "").lower()

            qas = paragraph.get('qas', [])
            for qa_pair in qas:
                question_text = qa_pair.get('question', "").lower()

                # Append each answer related to the question and context
                for answer in qa_pair.get('answers', []):
                    data_contexts.append(context_text)
                    data_questions.append(question_text)
                    data_answers.append(answer)

    # Print the first few entries to verify the output
    print("Sample Context:", data_contexts[:1])
    print("Sample Question:", data_questions[:1])
    print("Sample Answer:", data_answers[:1])

    return data_contexts, data_questions, data_answers

# Example usage with a file path
contexts, questions, answers = load_data_from_json(train_file_path)


Sample Context: ['architecturally the school has a catholic character. atop the main building school dome is the golden statue of the virgin mary. immediately in front of the main building in facing it is a copper statue of christ with arms appraised with the legend and the bad meow names. next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto im mary in place of prayer and reflection. it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to st bernadette still burning eighteen fifty eight. at the end of the main drive and in a direct line that connects through three statues in the gold dome is as simple modern stone statue of mary.']
Sample Question: ['what is in front of the notre dame main building?']
Sample Answer: [{'answer_start': 187, 'text': 'a copper statue of christ'}]


In [7]:
# Define paths for each dataset type
train_data_path = os.path.join(BASE_DIR, train_file)
test_data_path = os.path.join(BASE_DIR, test_file)
test_data_path_WER44 = os.path.join(BASE_DIR, test_file_WER44)
test_data_path_WER54 = os.path.join(BASE_DIR, test_file_WER54)

# Load and display training data sample
train_contexts, train_questions, train_answers = load_data_from_json(train_data_path)
print(f"Sample from Training Data:\nQuestion: {train_questions[0]}\nAnswer: {train_answers[0]}")

# Load and display testing data sample
test_contexts, test_questions, test_answers = load_data_from_json(test_data_path)
print(f"Sample from Testing Data:\nQuestion: {test_questions[0]}\nAnswer: {test_answers[0]}")

# Load and display WER 44 testing data sample
test_contexts_44, test_questions_44, test_answers_44 = load_data_from_json(test_data_path_WER44)
print(f"Sample from Testing Data WER 44:\nQuestion: {test_questions_44[0]}\nAnswer: {test_answers_44[0]}")

# Load and display WER 54 testing data sample
test_contexts_54, test_questions_54, test_answers_54 = load_data_from_json(test_data_path_WER54)
print(f"Sample from Testing Data WER 54:\nQuestion: {test_questions_54[0]}\nAnswer: {test_answers_54[0]}")

Sample Context: ['architecturally the school has a catholic character. atop the main building school dome is the golden statue of the virgin mary. immediately in front of the main building in facing it is a copper statue of christ with arms appraised with the legend and the bad meow names. next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto im mary in place of prayer and reflection. it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to st bernadette still burning eighteen fifty eight. at the end of the main drive and in a direct line that connects through three statues in the gold dome is as simple modern stone statue of mary.']
Sample Question: ['what is in front of the notre dame main building?']
Sample Answer: [{'answer_start': 187, 'text': 'a copper statue of christ'}]
Sample from Training Data:
Question: what is in front of the notre dame main building?
Answer: {'answer_start': 187, 'tex

In [8]:
# Function to add the 'answer_end' index for each answer in the dataset
def set_answer_end_indices(answers, contexts):
    for ans, ctx in zip(answers, contexts):
        ans_text = ans['text'].lower()
        ans_start = ans['answer_start']
        ans_end = ans_start + len(ans_text)

        # Check if the text matches at the expected location
        if ctx[ans_start:ans_end] == ans_text:
            ans['answer_end'] = ans_end
        else:
            # Adjust start and end indices if there is a mismatch
            for adjustment in [1, 2]:
                shifted_start = ans_start - adjustment
                shifted_end = ans_end - adjustment
                if ctx[shifted_start:shifted_end] == ans_text:
                    ans['answer_start'] = shifted_start
                    ans['answer_end'] = shifted_end
                    break  # Stop adjustment once a match is found


In [9]:
# Set answer end indices for each dataset by calling the function
set_answer_end_indices(train_answers, train_contexts)
set_answer_end_indices(test_answers, test_contexts)
set_answer_end_indices(test_answers_44, test_contexts_44)
set_answer_end_indices(test_answers_54, test_contexts_54)

In [10]:
from transformers import AutoTokenizer  # Ensure AutoTokenizer is imported

# Define parameters for tokenization
MAX_CONTEXT_LENGTH = 512
MODEL_NAME = "deepset/bert-base-uncased-squad2"

# Initialize the tokenizer for the specified model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Tokenize each dataset with the specified maximum length, truncation, and padding
train_encodings = tokenizer(train_questions, train_contexts, max_length=MAX_CONTEXT_LENGTH, truncation=True, padding=True)
test_encodings = tokenizer(test_questions, test_contexts, max_length=MAX_CONTEXT_LENGTH, truncation=True, padding=True)
test_encodings_44 = tokenizer(test_questions_44, test_contexts_44, max_length=MAX_CONTEXT_LENGTH, truncation=True, padding=True)
test_encodings_54 = tokenizer(test_questions_54, test_contexts_54, max_length=MAX_CONTEXT_LENGTH, truncation=True, padding=True)




In [11]:
# Function to locate start and end positions of answers within tokenized encodings
def locate_answer_positions(encodings, answers, tokenizer):
    start_positions, end_positions = [], []

    # Iterate over each encoding-answer pair
    for idx in range(len(encodings['input_ids'])):
        answer_text = answers[idx]['text']
        
        # Tokenize the answer text independently
        answer_tokens = tokenizer(answer_text, max_length=MAX_CONTEXT_LENGTH, truncation=True, padding=True)

        # Initialize position tracking variables
        answer_start, answer_end = 0, 0
        answer_found = False

        # Search for matching token sequence within the context
        context_tokens = encodings['input_ids'][idx]
        for j in range(len(context_tokens) - len(answer_tokens['input_ids'])):
            if context_tokens[j + 1:j + len(answer_tokens['input_ids']) - 1] == answer_tokens['input_ids'][1:-1]:
                answer_start = j
                answer_end = j + len(answer_tokens['input_ids']) - 1
                answer_found = True
                break

        # Append positions or default values if no match was found
        start_positions.append(answer_start if answer_found else 0)
        end_positions.append(answer_end if answer_found else 0)

    return start_positions, end_positions

# Generate and add start/end positions for each dataset encoding
# Ensure this block of code is run after tokenizing each dataset

# For training data
train_start_positions, train_end_positions = locate_answer_positions(train_encodings, train_answers, tokenizer)
train_encodings.update({'start_positions': train_start_positions, 'end_positions': train_end_positions})

# For test data
test_start_positions, test_end_positions = locate_answer_positions(test_encodings, test_answers, tokenizer)
test_encodings.update({'start_positions': test_start_positions, 'end_positions': test_end_positions})

# For WER 44 test data
test_start_positions_44, test_end_positions_44 = locate_answer_positions(test_encodings_44, test_answers_44, tokenizer)
test_encodings_44.update({'start_positions': test_start_positions_44, 'end_positions': test_end_positions_44})

# For WER 54 test data
test_start_positions_54, test_end_positions_54 = locate_answer_positions(test_encodings_54, test_answers_54, tokenizer)
test_encodings_54.update({'start_positions': test_start_positions_54, 'end_positions': test_end_positions_54})


In [17]:
class QADataset(Dataset):
    def __init__(self, encodings):
        # Convert each encoding to a torch tensor
        self.data = {
            'input_ids': torch.tensor(encodings['input_ids']),
            'token_type_ids': torch.tensor(encodings['token_type_ids']),
            'attention_mask': torch.tensor(encodings['attention_mask']),
            'start_positions': torch.tensor(encodings['start_positions']),
            'end_positions': torch.tensor(encodings['end_positions'])
        }
        # Check for matching lengths in all encoding tensors
        assert len(self.data['input_ids']) == len(self.data['start_positions']) == len(self.data['end_positions']), \
            "Mismatch in data lengths among input_ids, start_positions, and end_positions"

    def __getitem__(self, idx):
        # Check if idx exists to prevent KeyErrors
        if idx >= len(self.data['input_ids']):
            raise IndexError(f"Index {idx} out of range for dataset size {len(self.data['input_ids'])}")
        
        return {key: value[idx] for key, value in self.data.items()}

    def __len__(self):
        return len(self.data['input_ids'])


In [18]:
# Create dataset instances for each set of encodings
train_dataset = QADataset(train_encodings)
test_dataset = QADataset(test_encodings)
test_dataset_44 = QADataset(test_encodings_44)
test_dataset_54 = QADataset(test_encodings_54)


In [19]:
from transformers import AutoModelForQuestionAnswering  # Ensure the model class is imported

# Load the pre-trained question-answering model
qa_model = AutoModelForQuestionAnswering.from_pretrained('deepset/bert-base-uncased-squad2')
print("Loaded QA model architecture:\n", qa_model)


Loaded QA model architecture:
 BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [20]:
from transformers import AdamW
import numpy as np
from tqdm import tqdm

# Function to train the QA model with specified number of epochs and dataloader
def train_question_answering_model(qa_model, train_loader, epochs=1, learning_rate=2e-5, decay=2e-2):
    qa_model.to(device)
    qa_model.train()

    # Set up optimizer with specified learning rate and weight decay
    optimizer = AdamW(qa_model.parameters(), lr=learning_rate, weight_decay=decay)

    # Lists to track loss and accuracy per epoch
    loss_history, accuracy_history = [], []

    # Iterate over epochs
    for epoch in range(epochs):
        batch_losses, batch_accuracies = [], []
        with tqdm(train_loader, desc=f'Epoch {epoch + 1}') as progress_bar:

            # Process each batch within the loader
            for batch in progress_bar:
                optimizer.zero_grad()

                # Transfer each batch component to the device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)

                # Forward pass through the model
                outputs = qa_model(
                    input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    token_type_ids=token_type_ids,
                    end_positions=end_positions
                )

                # Compute loss and perform backpropagation
                loss = outputs[0]
                loss.backward()
                optimizer.step()

                # Calculate batch accuracy for start and end positions
                start_logits, end_logits = outputs[1], outputs[2]
                start_preds = start_logits.argmax(dim=1)
                end_preds = end_logits.argmax(dim=1)
                batch_accuracy = ((start_preds == start_positions).float().mean() +
                                  (end_preds == end_positions).float().mean()) / 2

                # Record loss and accuracy for this batch
                batch_losses.append(loss.item())
                batch_accuracies.append(batch_accuracy.item())

                # Update progress bar with current loss and accuracy
                progress_bar.set_postfix_str(f'Loss: {loss.item():.4f}, Acc: {batch_accuracy.item():.4f}')

        # Track average loss and accuracy per epoch
        loss_history.append(np.mean(batch_losses))
        accuracy_history.append(np.mean(batch_accuracies))

    return loss_history, accuracy_history


In [22]:
from torch.utils.data import DataLoader

# Set the batch size for DataLoader
BATCH_SIZE = 16

# Initialize DataLoader for training dataset with specified batch size and shuffling enabled
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

# Add this test to inspect the first batch and ensure DataLoader is working
for batch in train_data_loader:
    print(batch)
    break  # Only print the first batch to inspect its structure

# Call the training function with matching function name
train_loss_history, train_accuracy_history = train_question_answering_model(qa_model, train_data_loader, epochs=1)


{'input_ids': tensor([[ 101, 2013, 2029,  ...,    0,    0,    0],
        [ 101, 2029, 2535,  ...,    0,    0,    0],
        [ 101, 2054, 4111,  ..., 1037, 3618,  102],
        ...,
        [ 101, 2054, 2024,  ...,    0,    0,    0],
        [ 101, 1999, 2054,  ...,    0,    0,    0],
        [ 101, 2073, 1999,  ...,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'start_positions': tensor([ 17, 127,  40,  44,  64,  16,  48, 134, 101, 106,  35,  58,  78, 263,
         15, 175]), 'end_positions': tensor([ 19, 133,  42,  49,  69,  19,  53, 139, 103

Epoch 1: 100%|██████████| 2320/2320 [05:42<00:00,  6.78it/s, Loss: 1.0228, Acc: 0.7143]


In [23]:
from collections import Counter

# Function to compute the average F1 score between predictions and references
def compute_f1_score(predictions, references):
    f1_scores = []

    # Calculate F1 for each prediction-reference pair
    for prediction, reference in zip(predictions, references):
        # Count common elements between prediction and reference
        overlap = Counter(prediction) & Counter(reference)
        matches = sum(overlap.values())

        # Calculate precision and recall
        if matches == 0:
            precision, recall = 0, 0
        else:
            precision = matches / len(prediction)
            recall = matches / len(reference)

        # Calculate F1 score for current pair
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = (2 * precision * recall) / (precision + recall)

        f1_scores.append(f1)

    # Compute the average F1 score across all pairs
    average_f1 = sum(f1_scores) / len(f1_scores)
    return average_f1

In [30]:
from torch.utils.data import DataLoader

# Define the evaluation function to assess model performance on test datasets
def evaluate_qa_model(qa_model, dataloader, tokenizer):
    qa_model.eval()
    predictions, references = [], []

    with torch.no_grad():
        for batch in dataloader:
            # Move batch data to the specified device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Perform forward pass
            outputs = qa_model(input_ids, attention_mask=attention_mask,
                               start_positions=start_positions,
                               token_type_ids=token_type_ids,
                               end_positions=end_positions)

            # Extract start and end logits, and determine predicted positions
            start_logits, end_logits = outputs[1], outputs[2]
            start_pred = start_logits.argmax(dim=1)
            end_pred = end_logits.argmax(dim=1)

            # Decode predicted answers and reference answers for each example
            for i in range(len(start_pred)):
                start = start_pred[i].item()
                end = end_pred[i].item()
                predictions.append(tokenizer.decode(batch['input_ids'][i][start:end+1], skip_special_tokens=True))
                references.append(tokenizer.decode(batch['input_ids'][i][batch['start_positions'][i]:batch['end_positions'][i]+1], skip_special_tokens=True))

    # Calculate and return the average F1 score using the function defined earlier
    avg_f1_score = compute_f1_score(predictions, references)
    return avg_f1_score

# Initialize DataLoaders for each test dataset with a batch size of 16
test_data_loader = DataLoader(test_dataset, batch_size=16)
test_data_loader_44 = DataLoader(test_dataset_44, batch_size=16)
test_data_loader_54 = DataLoader(test_dataset_54, batch_size=16)
# Print training loss history
print("Training Loss History:", train_loss_history)

# Evaluate the model on each test dataset and print F1 scores
f1_score_test = evaluate_qa_model(qa_model, test_data_loader, tokenizer)
print(f"F1 Score on Test Data: {f1_score_test}")

f1_score_test_44 = evaluate_qa_model(qa_model, test_data_loader_44, tokenizer)
print(f"F1 Score on Test Data WER 44: {f1_score_test_44}")

f1_score_test_54 = evaluate_qa_model(qa_model, test_data_loader_54, tokenizer)
print(f"F1 Score on Test Data WER 54: {f1_score_test_54}")

Training Loss History: [1.0592439389305897]
F1 Score on Test Data: 0.8194211220474285
F1 Score on Test Data WER 44: 0.45354643780059883
F1 Score on Test Data WER 54: 0.3210323865450928
