## Q1 - Dataset and Pre-Processing


Start by downloading the train, validation, and test splits of a version of the WikiQA corpus with the link above. Load the data into your Notebook and answer the following questions about it. Note that the data’s accompanying


In [1]:
import json
import spacy

# Load the data
def load_dataset(file_path):
    with open(file_path, 'r') as file:
        dataset = json.load(file)
    return dataset

# Load datasets
training_set = load_dataset('/content/coursework_dataset/train.json')
validation_set = load_dataset('/content/coursework_dataset/val.json')
testing_set = load_dataset('/content/coursework_dataset/test.json')

# Initialize SpaCy
nlp = spacy.load("en_core_web_sm")

# Tokenization function
def spacy_tokenize(text):
    doc = nlp(text)
    return [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]



In [25]:
import pandas as pd

# Function to count questions and options
def count_items(data):
    num_questions = len(data)
    num_options = sum(len(item["options"]) for item in data)
    return num_questions, num_options

# Initialize lists to store the counts
dataset_names = ["Training", "Validation", "Test"]
q_counts = []
opt_counts = []

# Calculate and store the counts
for dataset_name, dataset in zip(dataset_names, [training_set, validation_set, testing_set]):
    q_count, opt_count = count_items(dataset)
    q_counts.append(q_count)
    opt_counts.append(opt_count)

# Create a DataFrame
data = {
    "Dataset": dataset_names,
    "Questions": q_counts,
    "Options": opt_counts
}
df = pd.DataFrame(data)

# Format the DataFrame
styled_df = df.style.set_caption("Dataset Summary").set_table_styles([{
    'selector': 'caption',
    'props': [('font-size', '16px'), ('font-weight', 'bold')]
}]).applymap(lambda x: 'font-weight: bold' if x == 'Dataset' else '')

# Display the styled DataFrame
styled_df


Unnamed: 0,Dataset,Questions,Options
0,Training,741,2964
1,Validation,103,412
2,Test,202,808


###  (1.2) What is the average number of tokens per question in the training set? [1 mark]


In [26]:
# Function to calculate average tokens in questions

def avg_tokens_questions(data):
    total_tokens = sum(len(spacy_tokenize(item["question"])) for item in data)
    avg_tokens = total_tokens / len(data)
    return avg_tokens

# Calculate and print the average number of tokens per question in the training set
average_tokens_training_questions = avg_tokens_questions(training_set)
print(f"Average number of tokens per question in the training set: {average_tokens_training_questions:.2f}")


Average number of tokens per question in the training set: 6.27


###  (1.3) What is the average number of tokens per choice in the training set?

In [27]:
# Function to calculate average tokens in options

def avg_tokens_options(data):
    total_tokens = sum(len(spacy_tokenize(option)) for item in data for option in item["options"])
    total_options = sum(len(item["options"]) for item in data)
    avg_tokens = total_tokens / total_options
    return avg_tokens

# Calculate and print the average number of tokens per choice in the training set
average_tokens_training_options = avg_tokens_options(training_set)
print(f"Average number of tokens per choice in the training set: {average_tokens_training_options:.2f}")


Average number of tokens per choice in the training set: 22.34


###  (1.4) What is the average number of tokens per correct choice in the training set?

In [28]:
# Function to calculate average tokens in correct options
def avg_tokens_correct_options(data):
    correct_indices = (item["correct_index"] for item in data if "correct_index" in item)
    total_tokens = sum(len(spacy_tokenize(data[i]["options"][index])) for i, index in enumerate(correct_indices))
    avg_tokens = total_tokens / len(data)
    return avg_tokens

# Calculate and print the average number of tokens per correct choice in the training set
average_tokens_correct_option = avg_tokens_correct_options(training_set)
print(f"Average number of tokens per correct choice in the training set: {average_tokens_correct_option:.2f}")


Average number of tokens per correct choice in the training set: 26.03


### 1.5 Perform any additional exploration of the data that you feel would be helpful for this multiple-choice question-answering task. Briefly describe what you found.


In [29]:
from collections import Counter
import numpy as np

# SpaCy for vectorization
nlp = spacy.load('en_core_web_sm')



In [30]:
# 1. Word Frequency Analysis
def word_frequency(data):
    all_tokens = [token for item in data for token in spacy_tokenize(item["question"] + ' ' + ' '.join(item["options"]))]
    return Counter(all_tokens)

word_freq_training = word_frequency(training_set)
print(f"Most common words in the training set: {word_freq_training.most_common(10)}")


Most common words in the training set: [('the', 5274), ('be', 2972), ('of', 2671), ('and', 2097), ('in', 1884), ('a', 1796), ('to', 1113), ('as', 722), ('by', 618), ('or', 500)]


In [31]:
# 2. Correct Option Length vs. Incorrect Options
def option_length_comparison(data):
    correct_lengths = []
    incorrect_lengths = []
    for item in data:
        if "correct_index" in item:
            correct_lengths.append(len(spacy_tokenize(item["options"][item["correct_index"]])))
            incorrect_lengths.extend(len(spacy_tokenize(option)) for idx, option in enumerate(item["options"]) if idx != item["correct_index"])
    return correct_lengths, incorrect_lengths

correct_lengths, incorrect_lengths = option_length_comparison(training_set)
print(f"Average length of correct options: {np.mean(correct_lengths):.2f}")
print(f"Average length of incorrect options: {np.mean(incorrect_lengths):.2f}")


Average length of correct options: 26.03
Average length of incorrect options: 21.11


In [32]:
# 3. Overlap Between Questions and Correct Options
def overlap_question_correct_option(data):
    overlaps = []
    for item in data:
        if "correct_index" in item:
            question_tokens = set(spacy_tokenize(item["question"]))
            correct_option_tokens = set(spacy_tokenize(item["options"][item["correct_index"]]))
            overlap = question_tokens.intersection(correct_option_tokens)
            overlaps.append(len(overlap))
    return overlaps

overlaps_training = overlap_question_correct_option(training_set)
print(f"Average number of overlapping tokens between questions and correct options: {np.mean(overlaps_training):.2f}")


Average number of overlapping tokens between questions and correct options: 2.92


In [33]:
# 4. Semantic Similarity

# Now define the semantic similarity function
def semantic_similarity(data):
    similarities = []
    for item in data:
        if "correct_index" in item:
            question = nlp(item["question"])
            correct_option = nlp(item["options"][item["correct_index"]])
            similarity = question.similarity(correct_option)
            similarities.append(similarity)
    return similarities

# Calculate semantic similarities for the training set
similarities_training = semantic_similarity(training_set)

# Compute the average similarity score
average_similarity = sum(similarities_training) / len(similarities_training)
print(f"Average semantic similarity between questions and correct options in the training set: {average_similarity:.2f}")

  similarity = question.similarity(correct_option)


Average semantic similarity between questions and correct options in the training set: 0.32


##  Q2-Set Similarity Measures


(2.1) Report the performance of each similarity measure (overlap coefficient, Sorensen-Dice & Jaccard) on the training and validation sets by measuring accuracy.

(2.2) For each similarity measure, how many times was the score of the most similar answer tied with another answer? When there was a tied score among the top answers, how did you choose which to select? Why?


In [34]:

def overlap_coefficient(set1, set2):
    return len(set1.intersection(set2)) / min(len(set1), len(set2))

def sorensen_dice_coefficient(set1, set2):
    return 2 * len(set1.intersection(set2)) / (len(set1) + len(set2))

def jaccard_similarity(set1, set2):
    return len(set1.intersection(set2)) / len(set1.union(set2))

def calculate_accuracy(data, similarity_function):
    correct = 0
    ties = 0
    for item in data:
        question_tokens = set(spacy_tokenize(item["question"]))
        scores = [similarity_function(question_tokens, set(spacy_tokenize(option))) for option in item["options"]]
        max_score = max(scores)
        if scores.count(max_score) > 1:  # Check if there's a tie
            ties += 1
            # Implement tie-breaking strategy here. For now, we pick the first option with the highest score.
            chosen_index = scores.index(max_score)
        else:
            chosen_index = scores.index(max_score)
        if chosen_index == item["correct_index"]:
            correct += 1
    accuracy = correct / len(data)
    return accuracy, ties



In [37]:
import pandas as pd

# Accuracy and tie information for training set
training_data = {
    "Similarity Measure": ["Overlap", "Sorensen-Dice", "Jaccard"],
    "Accuracy": [train_accuracy_overlap, train_accuracy_sorensen, train_accuracy_jaccard],
    "Ties": [train_ties_overlap, train_ties_sorensen, train_ties_jaccard]
}

# Accuracy and tie information for validation set
validation_data = {
    "Similarity Measure": ["Overlap", "Sorensen-Dice", "Jaccard"],
    "Accuracy": [val_accuracy_overlap, val_accuracy_sorensen, val_accuracy_jaccard],
    "Ties": [val_ties_overlap, val_ties_sorensen, val_ties_jaccard]
}

# Create DataFrames
df_training = pd.DataFrame(training_data)
df_validation = pd.DataFrame(validation_data)

# Apply styling
df_training_styled = df_training.style.set_caption("Training Set").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Accuracy', 'Ties'])

df_validation_styled = df_validation.style.set_caption("Validation Set").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Accuracy', 'Ties'])

# Display styled DataFrames
display(df_training_styled)
display(df_validation_styled)


Unnamed: 0,Similarity Measure,Accuracy,Ties
0,Overlap,0.527665,258
1,Sorensen-Dice,0.433198,18
2,Jaccard,0.433198,18


Unnamed: 0,Similarity Measure,Accuracy,Ties
0,Overlap,0.504854,33
1,Sorensen-Dice,0.368932,2
2,Jaccard,0.368932,2


## Q3-Cosine similarity of TF vectors

 Generate term frequency (TF) vectors of each question as well as the four possible answers. You should use the CountVectorizer with default settings (but use the same tokenizer as in Q1 and Q2). For each question, pick the answer with the highest cosine similarity between its TF vector and the question's TF vector.

 (3.1) Report the performance of the training and validation sets by measuring accuracy. Discuss how they compare
 with the set similarity measures from Q2. [6 marks]

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [39]:
import spacy

# Load SpaCy English tokenizer
nlp = spacy.load("en_core_web_sm")

# Custom tokenizer function
def text_pipeline_spacy_special(text):
    # Tokenize the text using SpaCy
    doc = nlp(text)
    # Extract lemmatized tokens and remove punctuation and whitespace
    tokens = [token.lemma_.lower() for token in doc if not token.is_punct and not token.is_space]
    return tokens




In [40]:

# Code for calculating cosine similarity and best answer
def calculate_cosine_similarity_and_best_answer(data, tf_question_vectors, tf_answer_vectors):
    correct_predictions = 0
    for i, item in enumerate(data):
        question_vector = tf_question_vectors[i:i+1]  # This slices the question vector correctly for cosine_similarity
        answers_start = i * 4
        answers_end = answers_start + 4
        answer_vectors = tf_answer_vectors[answers_start:answers_end]  # This slices the answer vectors correctly
        similarity_scores = cosine_similarity(question_vector, answer_vectors)
        selected_answer_index = similarity_scores.argmax()
        if selected_answer_index == item['correct_index']:
            correct_predictions += 1
    accuracy = correct_predictions / len(data)
    return accuracy


In [41]:
import pandas as pd

# Evaluation with TF-IDF
train_question_vectors_tfidf, train_answer_vectors_tfidf = generate_tfidf_vectors(training_set)
valid_question_vectors_tfidf, valid_answer_vectors_tfidf = generate_tfidf_vectors(validation_set)
train_cosine_similarity_accuracy_tfidf = calculate_cosine_similarity_and_best_answer(training_set, train_question_vectors_tfidf, train_answer_vectors_tfidf)
valid_cosine_similarity_accuracy_tfidf = calculate_cosine_similarity_and_best_answer(validation_set, valid_question_vectors_tfidf, valid_answer_vectors_tfidf)

# Create DataFrame for results
results_data = {
    "Dataset": ["Training", "Validation"],
    "Size": [len(training_set), len(validation_set)],
    "Cosine Similarity Accuracy": [train_cosine_similarity_accuracy_tfidf, valid_cosine_similarity_accuracy_tfidf]
}
df_results = pd.DataFrame(results_data)

# Apply styling
df_results_styled = df_results.style.set_caption("Results for TF-IDF Weighting").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Cosine Similarity Accuracy'])

# Display styled DataFrame
display(df_results_styled)




Unnamed: 0,Dataset,Size,Cosine Similarity Accuracy
0,Training,741,0.431849
1,Validation,103,0.407767


In [42]:
import pandas as pd

# Generating TF vectors and calculate cosine similarity and accuracy for training and validation sets
train_question_vectors_tf, train_answer_vectors_tf = generate_tf_vectors(training_set)
valid_question_vectors_tf, valid_answer_vectors_tf = generate_tf_vectors(validation_set)
cosine_similar_accuracy_train = calculate_cosine_similarity_and_best_answer(training_set, train_question_vectors_tf, train_answer_vectors_tf)
cosine_similar_accuracy_val = calculate_cosine_similarity_and_best_answer(validation_set, valid_question_vectors_tf, valid_answer_vectors_tf)

# Create DataFrame for results
results_data_tf = {
    "Dataset": ["Training", "Validation"],
    "Size": [len(training_set), len(validation_set)],
    "Cosine Similarity Accuracy": [cosine_similar_accuracy_train, cosine_similar_accuracy_val]
}
df_results_tf = pd.DataFrame(results_data_tf)

# Apply styling
df_results_tf_styled = df_results_tf.style.set_caption("Results for TF Vector").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Cosine Similarity Accuracy'])

# Display styled DataFrame
display(df_results_tf_styled)




Unnamed: 0,Dataset,Size,Cosine Similarity Accuracy
0,Training,741,0.449393
1,Validation,103,0.436893


## Q4-Cosine similarity of vectors from bert-base-uncased


 Use the feature-extraction pipeline with a bert-based-uncased model to create context vectors from the bert-based-uncased model for the text of each question and its four answers separately. You should use the context vector that represents the [CLS] token, which will be the first vector. For each question, pick the answer with the highest cosine similarity between its vector and the question’s vector.

 (4.1) Report the performance of the training and validation sets by measuring accuracy.

 (4.2) What are the limitations of the set similarity and cosine similarity methods used in Q2, Q3 and Q4?

In [43]:
from transformers import BertModel, BertTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd

# Load pre-trained BERT model and tokenizer
model_name = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Function to extract context vectors for text
def extract_context_vectors(text):
    inputs = tokenizer.encode_plus(text, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embeddings = outputs[0][:, 0, :]  # Extract CLS token embeddings
    return cls_embeddings

# Function to calculate cosine similarity and select the most similar answer
def calculate_cosine_similarity_and_select_answer(data):
    correct_predictions = 0
    for item in data:
        question_vector = extract_context_vectors(item['question'])
        max_similarity_score = -1
        selected_answer_index = -1
        for idx, option in enumerate(item['options']):
            answer_vector = extract_context_vectors(option)
            similarity_score = cosine_similarity(question_vector, answer_vector)[0][0]
            if similarity_score > max_similarity_score:
                max_similarity_score = similarity_score
                selected_answer_index = idx
        # Check if the selected answer is correct
        if selected_answer_index == item['correct_index']:
            correct_predictions += 1
    accuracy = correct_predictions / len(data)
    return accuracy

# Calculate cosine similarity and accuracy for training and validation sets
cosine_similarity_accuracy_train = calculate_cosine_similarity_and_select_answer(training_set)
cosine_similarity_accuracy_val = calculate_cosine_similarity_and_select_answer(validation_set)

# Create DataFrame for results
results_data_bert = {
    "Dataset": ["Training", "Validation"],
    "Cosine Similarity Accuracy": [cosine_similarity_accuracy_train, cosine_similarity_accuracy_val]
}
df_results_bert = pd.DataFrame(results_data_bert)

# Apply styling
df_results_bert_styled = df_results_bert.style.set_caption("Results for BERT Model").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Cosine Similarity Accuracy'])

# Display styled DataFrame
display(df_results_bert_styled)


Unnamed: 0,Dataset,Cosine Similarity Accuracy
0,Training,0.14305
1,Validation,0.203883


## Q5 - Fine-tuning a transformer model [18 marks]




### (5.1) Report the accuracy, precision, recall and F1 score of the predictions on the question-option pairs representation of the training and validation sets.

In [44]:
from transformers import BertTokenizer, BertForSequenceClassification, AutoModelForSequenceClassification
from torch.utils.data import Dataset, DataLoader
import torch
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd

# Load tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define dataset class
class QuestionOptionDataset(Dataset):
    def __init__(self, dataset):
        self.data = []
        for item in dataset:
            question = item['question']
            options = item['options']
            correct_index = item['correct_index']
            for i, option in enumerate(options):
                input_text = f"{question} [SEP] {option}"
                label = 1 if i == correct_index else 0
                self.data.append((input_text, label))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx]

# Create datasets and dataloaders
train_dataset = QuestionOptionDataset(training_set)
valid_dataset = QuestionOptionDataset(validation_set)
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=8)

# Define training function
def train(model, train_loader, optimizer, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        inputs = tokenizer(batch[0], padding=True, truncation=True, return_tensors="pt").to(device)
        labels = torch.tensor(batch[1]).to(device)
        optimizer.zero_grad()
        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
    return total_loss / len(train_loader)

# Define evaluation function
def evaluate(model, valid_loader, device):
    model.eval()
    all_labels = []
    all_preds = []
    with torch.no_grad():
        for batch in valid_loader:
            inputs = tokenizer(batch[0], padding=True, truncation=True, return_tensors="pt").to(device)
            labels = torch.tensor(batch[1]).to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            all_labels.extend(labels.cpu().numpy())
            all_preds.extend(preds.cpu().numpy())
    accuracy = accuracy_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    return accuracy, precision, recall, f1

# Define hyperparameters
learning_rate = 1e-5
epochs = 4
weight_decay = 0

# Define optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

# Train the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
for epoch in range(epochs):
    train_loss = train(model, train_loader, optimizer, device)
    print(f"Epoch {epoch + 1}, Train Loss: {train_loss:.4f}")

# Evaluate on the validation set
accuracy, precision, recall, f1 = evaluate(model, valid_loader, device)

# Create DataFrame for results
results_data_bert_classifier = {
    "Metric": ["Accuracy", "Precision", "Recall", "F1 Score"],
    "Validation Set": [accuracy, precision, recall, f1]
}
df_results_bert_classifier = pd.DataFrame(results_data_bert_classifier)

# Apply styling
df_results_bert_classifier_styled = df_results_bert_classifier.style.set_caption("Results for BERT Classifier Model").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Validation Set'])

# Display styled DataFrame
display(df_results_bert_classifier_styled)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  labels = torch.tensor(batch[1]).to(device)


Epoch 1, Train Loss: 0.5268
Epoch 2, Train Loss: 0.4373
Epoch 3, Train Loss: 0.3250
Epoch 4, Train Loss: 0.2056


  labels = torch.tensor(batch[1]).to(device)


Unnamed: 0,Metric,Validation Set
0,Accuracy,0.791262
1,Precision,0.570248
2,Recall,0.669903
3,F1 Score,0.616071


(5.2) Report the accuracy for this method for selecting the correct answer on the training and validation sets of this model. Note this is different from the value in part (a). To enable this, select the option for each question with the highest output logit value for the positive class of the model.

In [45]:
import pandas as pd

def select_correct_answer(model, loader, device):
    model.eval()
    correct_predictions = 0
    total_questions = 0
    with torch.no_grad():
        for batch in loader:
            inputs = tokenizer(batch[0], padding=True, truncation=True, return_tensors="pt").to(device)
            labels = torch.tensor(batch[1]).to(device)
            outputs = model(**inputs)
            logits = outputs.logits
            preds = torch.argmax(logits, dim=1)
            correct_predictions += torch.sum(preds == labels).item()
            total_questions += labels.size(0)
    accuracy = correct_predictions / total_questions
    return accuracy

# Calculate accuracy for selecting the correct answer on the training set
train_accuracy_selecting_correct_answer = select_correct_answer(model, train_loader, device)

# Calculate accuracy for selecting the correct answer on the validation set
valid_accuracy_selecting_correct_answer = select_correct_answer(model, valid_loader, device)

# Create DataFrame for results
results_data_selecting_correct_answer = {
    "Dataset": ["Training", "Validation"],
    "Accuracy for Selecting Correct Answer": [train_accuracy_selecting_correct_answer, valid_accuracy_selecting_correct_answer]
}
df_results_selecting_correct_answer = pd.DataFrame(results_data_selecting_correct_answer)

# Apply styling
df_results_selecting_correct_answer_styled = df_results_selecting_correct_answer.style.set_caption("Accuracy for Selecting the Correct Answer").set_table_styles([{
    'selector': 'caption',
    'props': [
        ('color', 'blue'),
        ('font-size', '16px')
    ]
}]).background_gradient(cmap='viridis', subset=['Accuracy for Selecting Correct Answer'])

# Display styled DataFrame
display(df_results_selecting_correct_answer_styled)


  labels = torch.tensor(batch[1]).to(device)


Unnamed: 0,Dataset,Accuracy for Selecting Correct Answer
0,Training,0.958165
1,Validation,0.791262


*Thank you*