In [2]:
# Import necessary libraries for data handling, model operations, and visualization
import os
import json
import requests
import numpy as np
import torch
from torch import nn
from torch.optim.lr_scheduler import ExponentialLR
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
from collections import Counter
from tqdm import tqdm

# Import specific tools from the transformers library
import transformers
from transformers import BertTokenizerFast, BertForQuestionAnswering, AdamW

In [3]:
# Define the base path for data storage on Palmetto
BASE_DIR = "/home/jrajend/HW3"

# Specify the filenames for the various dataset files
train_file = "spoken_train-v1.1.json"
test_file = "spoken_test-v1.1.json"
test_file_WER44 = "spoken_test-v1.1_WER44.json"
test_file_WER54 = "spoken_test-v1.1_WER54.json"

# Construct full paths
train_file_path = os.path.join(BASE_DIR, train_file)
test_file_path = os.path.join(BASE_DIR, test_file)
test_file_WER44_path = os.path.join(BASE_DIR, test_file_WER44)
test_file_WER54_path = os.path.join(BASE_DIR, test_file_WER54)


In [4]:
# Set the computing device based on GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Print device status for verification
if torch.cuda.is_available():
    print("Running on GPU.")
else:
    print("GPU not available; running on CPU.")

Running on GPU.


In [5]:
import json

with open(train_file_path, 'r') as f:
    train_data = json.load(f)

with open(test_file_path, 'r') as f:
    test_data = json.load(f)


In [6]:
import os

# Load the path for the training dataset file
train_file_path = os.path.join(BASE_DIR, train_file)
print("Training file path:", train_file_path)

# Repeat similar steps for other dataset files if required

Training file path: /home/jrajend/HW3/spoken_train-v1.1.json


In [7]:
# Function to extract contexts, questions, and answers from a JSON file
def load_data_from_json(path):
    # Initialize lists to store contexts, questions, and answers
    data_contexts, data_questions, data_answers = [], [], []

    # Open the JSON file and load its content
    with open(path, 'r') as file:
        file_content = json.load(file)

    # Process each section within the data file
    for entry in file_content.get('data', []):
        paragraphs = entry.get('paragraphs', [])

        # Extract context and question-answer pairs from each paragraph
        for paragraph in paragraphs:
            context_text = paragraph.get('context', "").lower()

            qas = paragraph.get('qas', [])
            for qa_pair in qas:
                question_text = qa_pair.get('question', "").lower()

                # Append each answer related to the question and context
                for answer in qa_pair.get('answers', []):
                    data_contexts.append(context_text)
                    data_questions.append(question_text)
                    data_answers.append(answer)

    # Print the first few entries to verify the output
    print("Sample Context:", data_contexts[:1])
    print("Sample Question:", data_questions[:1])
    print("Sample Answer:", data_answers[:1])

    return data_contexts, data_questions, data_answers

# Example usage with a file path
contexts, questions, answers = load_data_from_json(train_file_path)


Sample Context: ['architecturally the school has a catholic character. atop the main building school dome is the golden statue of the virgin mary. immediately in front of the main building in facing it is a copper statue of christ with arms appraised with the legend and the bad meow names. next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto im mary in place of prayer and reflection. it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to st bernadette still burning eighteen fifty eight. at the end of the main drive and in a direct line that connects through three statues in the gold dome is as simple modern stone statue of mary.']
Sample Question: ['what is in front of the notre dame main building?']
Sample Answer: [{'answer_start': 187, 'text': 'a copper statue of christ'}]


In [8]:
# Define paths for each dataset type
train_data_path = os.path.join(BASE_DIR, train_file)
test_data_path = os.path.join(BASE_DIR, test_file)
test_data_path_WER44 = os.path.join(BASE_DIR, test_file_WER44)
test_data_path_WER54 = os.path.join(BASE_DIR, test_file_WER54)

# Load and display training data sample
train_contexts, train_questions, train_answers = load_data_from_json(train_data_path)
print(f"Sample from Training Data:\nQuestion: {train_questions[0]}\nAnswer: {train_answers[0]}")

# Load and display testing data sample
test_contexts, test_questions, test_answers = load_data_from_json(test_data_path)
print(f"Sample from Testing Data:\nQuestion: {test_questions[0]}\nAnswer: {test_answers[0]}")

# Load and display WER 44 testing data sample
test_contexts_44, test_questions_44, test_answers_44 = load_data_from_json(test_data_path_WER44)
print(f"Sample from Testing Data WER 44:\nQuestion: {test_questions_44[0]}\nAnswer: {test_answers_44[0]}")

# Load and display WER 54 testing data sample
test_contexts_54, test_questions_54, test_answers_54 = load_data_from_json(test_data_path_WER54)
print(f"Sample from Testing Data WER 54:\nQuestion: {test_questions_54[0]}\nAnswer: {test_answers_54[0]}")

Sample Context: ['architecturally the school has a catholic character. atop the main building school dome is the golden statue of the virgin mary. immediately in front of the main building in facing it is a copper statue of christ with arms appraised with the legend and the bad meow names. next to the main building is the basilica of the sacred heart. immediately behind the basilica is the grotto im mary in place of prayer and reflection. it is a replica of the grotto at lourdes france where the virgin mary reputedly appeared to st bernadette still burning eighteen fifty eight. at the end of the main drive and in a direct line that connects through three statues in the gold dome is as simple modern stone statue of mary.']
Sample Question: ['what is in front of the notre dame main building?']
Sample Answer: [{'answer_start': 187, 'text': 'a copper statue of christ'}]
Sample from Training Data:
Question: what is in front of the notre dame main building?
Answer: {'answer_start': 187, 'tex

In [9]:
# Function to add the 'answer_end' index for each answer in the dataset
def set_answer_end_indices(answers, contexts):
    for ans, ctx in zip(answers, contexts):
        ans_text = ans['text'].lower()
        ans_start = ans['answer_start']
        ans_end = ans_start + len(ans_text)

        # Check if the text matches at the expected location
        if ctx[ans_start:ans_end] == ans_text:
            ans['answer_end'] = ans_end
        else:
            # Adjust start and end indices if there is a mismatch
            for adjustment in [1, 2]:
                shifted_start = ans_start - adjustment
                shifted_end = ans_end - adjustment
                if ctx[shifted_start:shifted_end] == ans_text:
                    ans['answer_start'] = shifted_start
                    ans['answer_end'] = shifted_end
                    break  # Stop adjustment once a match is found

In [10]:
# Set answer end indices for each dataset by calling the function
set_answer_end_indices(train_answers, train_contexts)
set_answer_end_indices(test_answers, test_contexts)
set_answer_end_indices(test_answers_44, test_contexts_44)
set_answer_end_indices(test_answers_54, test_contexts_54)

In [15]:
# Set model parameters and initialize tokenizer
MAX_TOKENS = 512
MODEL_NAME = "bert-base-uncased"

# Initialize the tokenizer for the specified BERT model
tokenizer = BertTokenizerFast.from_pretrained(MODEL_NAME)

# Tokenize each dataset
encoded_train = tokenizer(train_questions, train_contexts, max_length=MAX_TOKENS, truncation=True, padding=True)
encoded_test = tokenizer(test_questions, test_contexts, max_length=MAX_TOKENS, truncation=True, padding=True)
encoded_test_44 = tokenizer(test_questions_44, test_contexts_44, max_length=MAX_TOKENS, truncation=True, padding=True)
encoded_test_54 = tokenizer(test_questions_54, test_contexts_54, max_length=MAX_TOKENS, truncation=True, padding=True)


In [37]:
# Function to locate start and end positions of answers within tokenized encodings
def locate_answer_positions(encodings, answers, tokenizer):
    start_positions, end_positions = [], []

    # Iterate over each encoding-answer pair
    for idx in range(len(encodings['input_ids'])):
        answer_text = answers[idx]['text']
        # Tokenize the answer text independently
        answer_tokens = tokenizer(answer_text, max_length=MAX_TOKENS, truncation=True, padding=True)

        # Initialize position tracking variables
        answer_start, answer_end = 0, 0
        answer_found = False

        # Search for matching token sequence within the context
        context_tokens = encodings['input_ids'][idx]
        for j in range(len(context_tokens) - len(answer_tokens['input_ids'])):
            if context_tokens[j + 1:j + len(answer_tokens['input_ids']) - 1] == answer_tokens['input_ids'][1:-1]:
                answer_start = j
                answer_end = j + len(answer_tokens['input_ids']) - 1
                answer_found = True
                break

        # Append positions or default values if no match was found
        start_positions.append(answer_start if answer_found else 0)
        end_positions.append(answer_end if answer_found else 0)

    return start_positions, end_positions

# Generate and add start/end positions for each dataset encoding
# Ensure this block of code is run after tokenizing each dataset

# For training data
train_start_positions, train_end_positions = locate_answer_positions(encoded_train, train_answers, tokenizer)
encoded_train.update({'start_positions': train_start_positions, 'end_positions': train_end_positions})

# For test data
test_start_positions, test_end_positions = locate_answer_positions(encoded_test, test_answers, tokenizer)
encoded_test.update({'start_positions': test_start_positions, 'end_positions': test_end_positions})

# For WER 44 test data
test_start_positions_44, test_end_positions_44 = locate_answer_positions(encoded_test_44, test_answers_44, tokenizer)
encoded_test_44.update({'start_positions': test_start_positions_44, 'end_positions': test_end_positions_44})

# For WER 54 test data
test_start_positions_54, test_end_positions_54 = locate_answer_positions(encoded_test_54, test_answers_54, tokenizer)
encoded_test_54.update({'start_positions': test_start_positions_54, 'end_positions': test_end_positions_54})


In [38]:
class QADataset(Dataset):
    def __init__(self, encodings):
        # Store all encoding tensors in a dictionary for efficient access
        self.data = {
            'input_ids': torch.tensor(encodings['input_ids']),
            'token_type_ids': torch.tensor(encodings['token_type_ids']),
            'attention_mask': torch.tensor(encodings['attention_mask']),
            'start_positions': torch.tensor(encodings['start_positions']),
            'end_positions': torch.tensor(encodings['end_positions'])
        }

    def __getitem__(self, idx):
        # Retrieve each tensor directly from the data dictionary
        return {key: value[idx] for key, value in self.data.items()}

    def __len__(self):
        # Return the total number of samples based on input IDs
        return len(self.data['input_ids'])

In [40]:
# Create dataset instances for each set of encodings
train_data = QADataset(encoded_train)
test_data = QADataset(encoded_test)
test_data_44 = QADataset(encoded_test_44)
test_data_54 = QADataset(encoded_test_54)

In [41]:
from transformers import BertForQuestionAnswering

# Initialize and load the pre-trained BERT model for question answering
model_name = "bert-base-uncased"
qa_model = BertForQuestionAnswering.from_pretrained(model_name)

# Confirm successful loading by printing the model architecture
print("Loaded BERT model for QA:", qa_model)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded BERT model for QA: BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNo

In [42]:
from transformers import AdamW
import numpy as np
from tqdm import tqdm

# Function to train the model with specified epochs and dataloader
def train_qa_model(qa_model, dataloader, num_epochs=1, lr=2e-5, weight_decay=2e-2):
    qa_model.to(device)
    qa_model.train()

    # Set up optimizer with learning rate and weight decay
    optimizer = AdamW(qa_model.parameters(), lr=lr, weight_decay=weight_decay)

    # Arrays to store loss and accuracy history
    loss_history, accuracy_history = [], []

    # Loop over epochs
    for epoch in range(num_epochs):
        epoch_losses, epoch_accuracies = [], []
        with tqdm(dataloader, desc=f'Epoch {epoch + 1}') as progress_bar:

            # Process each batch in the dataloader
            for batch in progress_bar:
                optimizer.zero_grad()

                # Move batch data to the specified device
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                token_type_ids = batch['token_type_ids'].to(device)
                start_positions = batch['start_positions'].to(device)
                end_positions = batch['end_positions'].to(device)

                # Forward pass
                outputs = qa_model(
                    input_ids,
                    attention_mask=attention_mask,
                    start_positions=start_positions,
                    token_type_ids=token_type_ids,
                    end_positions=end_positions
                )

                # Backpropagation and optimization
                loss = outputs[0]
                loss.backward()
                optimizer.step()

                # Calculate accuracy for start and end positions
                start_logits, end_logits = outputs[1], outputs[2]
                start_predictions = start_logits.argmax(dim=1)
                end_predictions = end_logits.argmax(dim=1)
                accuracy = ((start_predictions == start_positions).float().mean() +
                            (end_predictions == end_positions).float().mean()) / 2

                # Append loss and accuracy for this batch
                epoch_losses.append(loss.item())
                epoch_accuracies.append(accuracy.item())

                # Update the progress bar with loss and accuracy
                progress_bar.set_postfix_str(f'Loss: {loss.item():.4f}, Acc: {accuracy.item():.4f}')

        # Record average loss and accuracy for the epoch
        loss_history.append(np.mean(epoch_losses))
        accuracy_history.append(np.mean(epoch_accuracies))

    return loss_history, accuracy_history


In [43]:
from torch.utils.data import DataLoader

# Initialize the DataLoader for training with batch size and shuffling
batch_size = 16
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# Train the model and capture the loss and accuracy history
loss_history, accuracy_history = train_qa_model(qa_model, train_loader, num_epochs=1)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Epoch 1: 100%|██████████| 2320/2320 [17:51<00:00,  2.16it/s, Loss: 1.4784, Acc: 0.7143]


In [44]:
from collections import Counter

# Function to compute the average F1 score between predictions and references
def compute_f1_score(predictions, references):
    f1_scores = []

    # Calculate F1 for each prediction-reference pair
    for prediction, reference in zip(predictions, references):
        # Count common elements between prediction and reference
        overlap = Counter(prediction) & Counter(reference)
        matches = sum(overlap.values())

        # Calculate precision and recall
        if matches == 0:
            precision, recall = 0, 0
        else:
            precision = matches / len(prediction)
            recall = matches / len(reference)

        # Calculate F1 score for current pair
        if precision + recall == 0:
            f1 = 0
        else:
            f1 = (2 * precision * recall) / (precision + recall)

        f1_scores.append(f1)

    # Compute the average F1 score across all pairs
    average_f1 = sum(f1_scores) / len(f1_scores)
    return average_f1

In [45]:
from torch.utils.data import DataLoader

# Function to evaluate the model and calculate average F1 score
def assess_model_performance(qa_model, dataloader, tokenizer):
    qa_model.eval()
    all_predictions, all_references = [], []

    with torch.no_grad():
        for batch in dataloader:
            # Move batch data to the specified device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            start_positions = batch['start_positions'].to(device)
            end_positions = batch['end_positions'].to(device)

            # Model forward pass
            outputs = qa_model(
                input_ids,
                attention_mask=attention_mask,
                start_positions=start_positions,
                token_type_ids=token_type_ids,
                end_positions=end_positions
            )

            # Obtain predictions
            start_logits, end_logits = outputs[1], outputs[2]
            start_preds = start_logits.argmax(dim=1)
            end_preds = end_logits.argmax(dim=1)

            # Decode predictions and references
            for idx in range(len(start_preds)):
                start_idx = start_preds[idx].item()
                end_idx = end_preds[idx].item()
                prediction = tokenizer.decode(batch['input_ids'][idx][start_idx:end_idx + 1])
                reference = tokenizer.decode(batch['input_ids'][idx][batch['start_positions'][idx]:batch['end_positions'][idx] + 1])

                all_predictions.append(prediction)
                all_references.append(reference)

    # Calculate F1 score over all predictions
    avg_f1_score = compute_f1_score(all_predictions, all_references)
    return avg_f1_score

# DataLoaders for each test dataset
test_loader = DataLoader(test_data, batch_size=16)
test_loader_44 = DataLoader(test_data_44, batch_size=16)
test_loader_54 = DataLoader(test_data_54, batch_size=16)

# Display training loss history if available
if 'loss_history' in locals():
    print("Training Loss History:", loss_history)

# Evaluate model on each dataset and print F1 scores
f1_test = assess_model_performance(qa_model, test_loader, tokenizer)
print(f"F1 Score on Test Data: {f1_test}")

f1_test_44 = assess_model_performance(qa_model, test_loader_44, tokenizer)
print(f"F1 Score on Test Data WER 44: {f1_test_44}")

f1_test_54 = assess_model_performance(qa_model, test_loader_54, tokenizer)
print(f"F1 Score on Test Data WER 54: {f1_test_54}")


Training Loss History: [1.9935726115672754]
F1 Score on Test Data: 0.7513018275331491
F1 Score on Test Data WER 44: 0.42437608514459235
F1 Score on Test Data WER 54: 0.30395857412143634
