In [None]:
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Dataset folder on the mounted Drive: make it the working directory and
# list its contents (the listing is the value displayed by this cell).
path="/content/drive/MyDrive/coursework_dataset"
os.chdir(path)
os.listdir(path)

['val.json', 'test.json', 'train.json']

In [None]:
!pip install transformers[torch] datasets

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate>=0.21.0 (from transformers[torch])
  Downloading accelerate-0.27.2-py3-none-any.whl (279 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.0/280.0 kB[0m [31m26.5 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: dill, multiprocess, accelerate, datasets
Successfully installed accelerate-0.27.2 datas

## **Dataset and Pre-Processing**

In [None]:
import json

train_data_path = '/content/drive/MyDrive/coursework_dataset/train.json'
val_data_path = '/content/drive/MyDrive/coursework_dataset/val.json'
test_data_path = '/content/drive/MyDrive/coursework_dataset/test.json'

def load_data(file_path):
    """Parse the UTF-8 encoded JSON file at ``file_path`` and return its content."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        parsed = json.load(handle)
    return parsed

# Load the data
train_data = load_data(train_data_path)
val_data = load_data(val_data_path)
test_data = load_data(test_data_path)

In [None]:
import spacy

# Load the small english model.
# Disable the advanced NLP features in the pipeline for efficiency:
# 'ner' is disabled at load time, then the tagger, parser and lemmatizer
# components are removed. The tokenisation function defined below only reads
# each token's text and its punctuation / whitespace flags.
nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')
nlp.remove_pipe('lemmatizer')

('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x7b3940102380>)

In [None]:
def text_pipeline_spacy_special(text):
    """Tokenise ``text`` with the module-level spaCy pipeline ``nlp``.

    Punctuation and whitespace tokens are dropped and the surface form of every
    remaining token is lower-cased. Stop words are kept (no ``is_stop`` filter)
    and the raw token text is used rather than the lemma.

    Returns:
        list of lower-cased token strings, in document order.
    """
    return [
        tok.text.lower()
        for tok in nlp(text)
        if not (tok.is_punct or tok.is_space)
    ]

In [None]:
train_data[0]

{'question': 'how are glacier caves formed?',
 'options': ['The ice facade is approximately 60 m high',
  'A partly submerged glacier cave on Perito Moreno Glacier .',
  'Ice formations in the Titlis glacier cave',
  'A glacier cave is a cave formed within the ice of a glacier .'],
 'correct_index': 3}

In [None]:
train_data[0].keys()

dict_keys(['question', 'options', 'correct_index'])

In [None]:
from tqdm import tqdm # This provides a nice progress bar
def tokenisation(train_data):
    """Tokenise every post of ``train_data`` in place, showing a progress bar.

    Each post (a dict with 'question' and 'options') gains two keys:
    'question_tokens' (token list of the question) and 'options_tokens'
    (one token list per option, in the original option order).
    Nothing is returned; the input dicts are mutated.
    """
    for entry in tqdm(train_data):
        entry['question_tokens'] = text_pipeline_spacy_special(entry['question'])
        entry['options_tokens'] = [
            text_pipeline_spacy_special(choice) for choice in entry['options']
        ]

In [None]:
tokenisation(train_data)
tokenisation(val_data)
tokenisation(test_data)

100%|██████████| 741/741 [00:12<00:00, 58.31it/s]
100%|██████████| 103/103 [00:01<00:00, 56.10it/s]
100%|██████████| 202/202 [00:03<00:00, 64.65it/s]


In [None]:
train_data[0]

{'question': 'how are glacier caves formed?',
 'options': ['The ice facade is approximately 60 m high',
  'A partly submerged glacier cave on Perito Moreno Glacier .',
  'Ice formations in the Titlis glacier cave',
  'A glacier cave is a cave formed within the ice of a glacier .'],
 'correct_index': 3,
 'question_tokens': ['how', 'are', 'glacier', 'caves', 'formed'],
 'options_tokens': [['the',
   'ice',
   'facade',
   'is',
   'approximately',
   '60',
   'm',
   'high'],
  ['a',
   'partly',
   'submerged',
   'glacier',
   'cave',
   'on',
   'perito',
   'moreno',
   'glacier'],
  ['ice', 'formations', 'in', 'the', 'titlis', 'glacier', 'cave'],
  ['a',
   'glacier',
   'cave',
   'is',
   'a',
   'cave',
   'formed',
   'within',
   'the',
   'ice',
   'of',
   'a',
   'glacier']]}

In [None]:
# Size of every split: number of questions, then total number of answer options.
num_questions_train = len(train_data)
num_questions_val = len(val_data)
num_questions_test = len(test_data)

num_options_train = sum(map(len, (entry['options'] for entry in train_data)))
num_options_val = sum(map(len, (entry['options'] for entry in val_data)))
num_options_test = sum(map(len, (entry['options'] for entry in test_data)))

print("train data length",(num_questions_train, num_options_train))
print("val data length",(num_questions_val, num_options_val))
print("test data length", (num_questions_test, num_options_test))

train data length (741, 2964)
val data length (103, 412)
test data length (202, 808)


In [None]:
# Token lists of all training questions and their mean length.
train_questions_tokenized = [post['question_tokens'] for post in train_data]
average_tokens_per_question_train = sum(map(len, train_questions_tokenized)) / len(train_questions_tokenized)
print("average_tokens_per_question_train",average_tokens_per_question_train)

average_tokens_per_question_train 6.272604588394062


In [None]:
# Per-question lists of option token lists, then every option as one flat list.
train_options_tokenized = [post['options_tokens'] for post in train_data]
all_options_tokenized = []
for options_of_question in train_options_tokenized:
    all_options_tokenized.extend(options_of_question)

# Mean number of tokens per option over the whole training split.
average_tokens_per_option_train = sum(map(len, all_options_tokenized)) / num_options_train
print("average_tokens_per_option_train", average_tokens_per_option_train)

average_tokens_per_option_train 22.338056680161944


In [None]:
# Token list of the correct option of every training question, and its mean length.
train_correct_options_tokenized = [
    post['options_tokens'][post['correct_index']] for post in train_data
]
average_tokens_per_correct_options_train = (
    sum(map(len, train_correct_options_tokenized)) / len(train_correct_options_tokenized)
)

print("average_tokens_per_correct_options_train", average_tokens_per_correct_options_train)

average_tokens_per_correct_options_train 26.032388663967613


We can treat the whole collection of posts (here, the questions) as a single long list of tokens. We'll build one flat list in which each question's tokens are preceded by the special token <START>.

In [None]:
# Flat token stream over all training questions; '<START>' marks where each
# question begins. The cell displays the total stream length.
posts_flattened_question_tokens = []
for post in train_data:
    posts_flattened_question_tokens.append('<START>')
    posts_flattened_question_tokens.extend(post['question_tokens'])
len(posts_flattened_question_tokens)

5389

Similarly, for each question we'll build one flattened list of all of its options' tokens, with the special token <START> placed before each option.

In [None]:
# Build, for every training question, one flat token list covering all of its
# options, with '<START>' marking where each option begins.
# The tokens cached under 'options_tokens' by tokenisation() are reused, so the
# spaCy pipeline is not run a second time on every option.
posts_flattened_options = []
for post in train_data:
    posts_flattened_option = []
    for option_tokens in post['options_tokens']:
        posts_flattened_option += ['<START>'] + option_tokens
    posts_flattened_options.append(posts_flattened_option)
print(len(posts_flattened_options))
print(posts_flattened_options[0])

741
['<START>', 'the', 'ice', 'facade', 'is', 'approximately', '60', 'm', 'high', '<START>', 'a', 'partly', 'submerged', 'glacier', 'cave', 'on', 'perito', 'moreno', 'glacier', '<START>', 'ice', 'formations', 'in', 'the', 'titlis', 'glacier', 'cave', '<START>', 'a', 'glacier', 'cave', 'is', 'a', 'cave', 'formed', 'within', 'the', 'ice', 'of', 'a', 'glacier']


In [None]:
from collections import Counter
import numpy as np

# 1. Distribution of the number of tokens in questions and in options
question_lengths = [len(question) for question in train_questions_tokenized]
options_lengths = [len(option) for sublist in train_options_tokenized for option in sublist]

# 2. Frequency of the correct answer being the longest or shortest option.
#    Lengths are taken from the tokens cached under 'options_tokens' by
#    tokenisation(), so spaCy is not re-run on every option. A correct option
#    that ties for longest (or shortest) is counted.
correct_longest = 0
correct_shortest = 0
for item in train_data:
    lengths = [len(option_tokens) for option_tokens in item['options_tokens']]
    correct_length = lengths[item['correct_index']]
    if correct_length == max(lengths):
        correct_longest += 1
    if correct_length == min(lengths):
        correct_shortest += 1

# 3. Number of unique words across all questions and options
unique_words = set(word for question in train_questions_tokenized for word in question)
for options in train_options_tokenized:
    for option in options:
        unique_words.update(option)

# 4. Distribution of question types, keyed by the first token of each question.
#    Questions that tokenise to an empty list are skipped instead of raising IndexError.
question_starts = Counter(question[0] for question in train_questions_tokenized if question)

# Mode = most frequent length (list.count is used as the ranking key).
question_lengths_stats = {
    'Mean': np.mean(question_lengths),
    'Median': np.median(question_lengths),
    'Mode': max(set(question_lengths), key=question_lengths.count)
}

options_lengths_stats = {
    'Mean': np.mean(options_lengths),
    'Median': np.median(options_lengths),
    'Mode': max(set(options_lengths), key=options_lengths.count)
}

# Every token of every flattened option list (the '<START>' markers are included).
all_option_words = [option for sublist in posts_flattened_options for option in sublist]

# Count the occurrences of each word
word_counts = Counter(all_option_words)

# Find the 10 most common words
most_common_words = word_counts.most_common(10)

# Same for the flattened question token stream.
counts_words_question = Counter(posts_flattened_question_tokens)

most_common_words_question = counts_words_question.most_common(10)

additional_explorations = {
    'Question Lengths Stats': question_lengths_stats,
    'Options Lengths Stats': options_lengths_stats,
    'Correct Longest': correct_longest,
    'Correct Shortest': correct_shortest,
    'Unique Words': len(unique_words),
    'Question Starts': question_starts,
    'most common word in flattened options': most_common_words,
    'most common word in flattened question': most_common_words_question
}

additional_explorations

{'Question Lengths Stats': {'Mean': 6.272604588394062,
  'Median': 6.0,
  'Mode': 5},
 'Options Lengths Stats': {'Mean': 22.338056680161944,
  'Median': 21.0,
  'Mode': 18},
 'Correct Longest': 306,
 'Correct Shortest': 112,
 'Unique Words': 11676,
 'Question Starts': Counter({'how': 116,
          'what': 390,
          'where': 56,
          'who': 99,
          'when': 80}),
 'most common word in flattened options': [('the', 5020),
  ('<START>', 2964),
  ('of', 2577),
  ('and', 2078),
  ('in', 1754),
  ('a', 1705),
  ('is', 1331),
  ('to', 1086),
  ('as', 715),
  ('by', 614)],
 'most common word in flattened question': [('<START>', 741),
  ('what', 392),
  ('is', 304),
  ('the', 254),
  ('in', 130),
  ('how', 116),
  ('who', 101),
  ('of', 94),
  ('a', 91),
  ('when', 81)]}

## **Set Similarity Measures**

In [None]:
def count_overlapping_tokens_with_sets(tokens_X, tokens_Y):
    """Return how many distinct tokens occur in both ``tokens_X`` and ``tokens_Y``.

    Both arguments are converted to sets first, so repeated tokens count once.
    """
    shared_tokens = set(tokens_X) & set(tokens_Y)
    return len(shared_tokens)

# Define the set similarity measures: overlap coefficient, Sorensen-Dice & Jaccard
def overlap_coefficient(tokens_X,tokens_Y):
    """Overlap coefficient of two token collections.

    Both arguments are converted to sets (duplicates are ignored) and the score
    is ``|X & Y| / min(|X|, |Y|)``.

    Args:
        tokens_X: iterable of hashable tokens.
        tokens_Y: iterable of hashable tokens.

    Returns:
        float in [0, 1]. 0.0 is returned when either collection is empty, since
        the ratio is undefined there and an empty text shares no tokens.
    """
    set_tokens1 = set(tokens_X)
    set_tokens2 = set(tokens_Y)

    smaller_size = min(len(set_tokens1), len(set_tokens2))
    if smaller_size == 0:
        # Guard against ZeroDivisionError for a question/option without tokens.
        return 0.0

    return len(set_tokens1 & set_tokens2) / smaller_size

def sorenson_dice(tokens_X,tokens_Y):
    """Sorensen-Dice coefficient of two token collections.

    Both arguments are converted to sets (duplicates are ignored) and the score
    is ``2 * |X & Y| / (|X| + |Y|)``.

    Args:
        tokens_X: iterable of hashable tokens.
        tokens_Y: iterable of hashable tokens.

    Returns:
        float in [0, 1]. 0.0 is returned when both collections are empty, where
        the ratio is undefined.
    """
    set_tokens1 = set(tokens_X)
    set_tokens2 = set(tokens_Y)

    total_size = len(set_tokens1) + len(set_tokens2)
    if total_size == 0:
        # Guard against ZeroDivisionError when neither text has any token.
        return 0.0

    return 2 * len(set_tokens1 & set_tokens2) / total_size

def jaccard_similarity(tokens_X,tokens_Y):
    """Jaccard similarity of two token collections.

    Both arguments are converted to sets (duplicates are ignored) and the score
    is ``|X & Y| / |X | Y|``.

    Args:
        tokens_X: iterable of hashable tokens.
        tokens_Y: iterable of hashable tokens.

    Returns:
        float in [0, 1]. 0.0 is returned when both collections are empty, where
        the ratio is undefined.
    """
    set_tokens1 = set(tokens_X)
    set_tokens2 = set(tokens_Y)

    union_size = len(set_tokens1 | set_tokens2)
    if union_size == 0:
        # Guard against ZeroDivisionError when neither text has any token.
        return 0.0

    return len(set_tokens1 & set_tokens2) / union_size

def evaluate_similarity(data):
    """Accuracy of picking the option most similar to the question, per measure.

    For every item the question tokens are compared with each option's tokens
    using the overlap coefficient, Sorensen-Dice and Jaccard. For each measure
    the option with the highest score is chosen; on ties the earliest option
    wins, and an item without options yields the index -1.

    Args:
        data: iterable of dicts with 'question_tokens', 'options_tokens'
            and 'correct_index'.

    Returns:
        dict mapping 'overlap', 'sorensen_dice' and 'jaccard' to the fraction
        of items whose chosen option equals 'correct_index'.
    """
    measures = (
        ('overlap', overlap_coefficient),
        ('sorensen_dice', sorenson_dice),
        ('jaccard', jaccard_similarity),
    )
    hits = {label: [] for label, _ in measures}

    for entry in data:
        question_set = set(entry['question_tokens'])
        candidate_sets = [set(tokens) for tokens in entry['options_tokens']]

        for label, measure in measures:
            scores = [measure(question_set, candidate) for candidate in candidate_sets]
            # max() returns the first index among equal best scores.
            picked = max(range(len(scores)), key=scores.__getitem__, default=-1)
            hits[label].append(picked == entry['correct_index'])

    return {label: sum(flags) / len(flags) for label, flags in hits.items()}

# Evaluate on the training set
train_performance = evaluate_similarity(train_data)

# Evaluate on the validation set
val_performance = evaluate_similarity(val_data)

train_performance, val_performance


({'overlap': 0.5236167341430499,
  'sorensen_dice': 0.4291497975708502,
  'jaccard': 0.4291497975708502},
 {'overlap': 0.46601941747572817,
  'sorensen_dice': 0.3592233009708738,
  'jaccard': 0.3592233009708738})

In [None]:
def record_similarity_scores(data):
    """Collect per-option similarity scores and count ties for the top score.

    Args:
        data: iterable of dicts with 'question_tokens' and 'options_tokens'.

    Returns:
        (similarity_scores, tie_counters) where
        - similarity_scores maps each measure name to a list holding, per item,
          the list of scores of its options;
        - tie_counters maps each measure name to the number of items in which
          the highest score is shared by more than one option.
    """
    measure_functions = {
        'overlap': overlap_coefficient,
        'sorensen_dice': sorenson_dice,
        'jaccard': jaccard_similarity,
    }
    similarity_scores = {name: [] for name in measure_functions}
    tie_counters = dict.fromkeys(measure_functions, 0)

    for entry in data:
        question_set = set(entry['question_tokens'])
        candidate_sets = [set(tokens) for tokens in entry['options_tokens']]

        for name, measure in measure_functions.items():
            per_option = [measure(question_set, candidate) for candidate in candidate_sets]
            similarity_scores[name].append(per_option)

            # A tie exists when the best score occurs more than once.
            if per_option.count(max(per_option)) > 1:
                tie_counters[name] += 1

    return similarity_scores, tie_counters

# Recording similarity scores for both training and validation datasets
train_similarity_scores,train_ties_counts = record_similarity_scores(train_data)
val_similarity_scores, val_ties_counts = record_similarity_scores(val_data)

print(train_similarity_scores['overlap'][:1], train_similarity_scores['sorensen_dice'][:1], train_similarity_scores['jaccard'][:1])
train_ties_counts,val_ties_counts

[[0.0, 0.2, 0.2, 0.4]] [[0.0, 0.15384615384615385, 0.16666666666666666, 0.2857142857142857]] [[0.0, 0.08333333333333333, 0.09090909090909091, 0.16666666666666666]]


({'overlap': 246, 'sorensen_dice': 20, 'jaccard': 20},
 {'overlap': 29, 'sorensen_dice': 4, 'jaccard': 4})

### Break ties with a random choice when several options share the highest similarity

In [None]:
import random

def evaluate_similarity_with_random_choice(data):
    """Accuracy of the three set-similarity measures with random tie-breaking.

    For every item, each option is scored against the question with the overlap
    coefficient, Sorensen-Dice and Jaccard. Per measure, the option with the
    highest score is chosen; when several options share that score one of them
    is drawn with ``random.choice``.

    The random generator is only consumed when a tie exists, and always in the
    order overlap -> sorensen_dice -> jaccard within an item, so results for a
    given seed depend on this exact call order.

    Args:
        data: iterable of dicts with 'question_tokens', 'options_tokens'
            and 'correct_index'.

    Returns:
        dict mapping 'overlap', 'sorensen_dice' and 'jaccard' to the fraction
        of items whose chosen option equals 'correct_index'.
    """
    # One list of booleans (chosen option is correct?) per measure.
    results = {
        'overlap': [],
        'sorensen_dice': [],
        'jaccard': []
    }

    for item in data:
        question_set = set(item['question_tokens'])
        scores_overlap = []
        scores_sorensen_dice = []
        scores_jaccard = []

        # Score every option with all three measures.
        for option_tokens in item['options_tokens']:
            option_set = set(option_tokens)
            scores_overlap.append(overlap_coefficient(question_set, option_set))
            scores_sorensen_dice.append(sorenson_dice(question_set, option_set))
            scores_jaccard.append(jaccard_similarity(question_set, option_set))

        # Handling ties: If the max score is found in multiple indices, randomly select one.
        # random.choice is skipped when there is a single best index, so no
        # random number is drawn for items without a tie.
        max_overlap = max(scores_overlap)
        indices_overlap = [i for i, score in enumerate(scores_overlap) if score == max_overlap]
        chosen_overlap = random.choice(indices_overlap) if len(indices_overlap) > 1 else indices_overlap[0]

        max_sorensen_dice = max(scores_sorensen_dice)
        indices_sorensen_dice = [i for i, score in enumerate(scores_sorensen_dice) if score == max_sorensen_dice]
        chosen_sorensen_dice = random.choice(indices_sorensen_dice) if len(indices_sorensen_dice) > 1 else indices_sorensen_dice[0]

        max_jaccard = max(scores_jaccard)
        indices_jaccard = [i for i, score in enumerate(scores_jaccard) if score == max_jaccard]
        chosen_jaccard = random.choice(indices_jaccard) if len(indices_jaccard) > 1 else indices_jaccard[0]

        # Append the result of whether the chosen answer is correct
        results['overlap'].append(chosen_overlap == item['correct_index'])
        results['sorensen_dice'].append(chosen_sorensen_dice == item['correct_index'])
        results['jaccard'].append(chosen_jaccard == item['correct_index'])

    # Calculate accuracy for each measure
    return {measure: sum(correct) / len(correct) for measure, correct in results.items()}

# Set a fixed seed for random number generator for reproducibility
random.seed(0)

# Evaluate on the training set
train_performance_with_random = evaluate_similarity_with_random_choice(train_data)

# Evaluate on the validation set
val_performance_with_random = evaluate_similarity_with_random_choice(val_data)

train_performance_with_random, val_performance_with_random


({'overlap': 0.5020242914979757,
  'sorensen_dice': 0.4318488529014845,
  'jaccard': 0.4331983805668016},
 {'overlap': 0.4854368932038835,
  'sorensen_dice': 0.3592233009708738,
  'jaccard': 0.3592233009708738})

## **Cosine similarity of TF vectors**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings
import numpy as np

# Initialize the CountVectorizer with the default as q1 q2
vectorizer = CountVectorizer(tokenizer=text_pipeline_spacy_special)

def cosine_similarity_tf(data):
    """Accuracy of choosing the option whose term-frequency vector is closest
    (by cosine similarity) to the question's vector.

    The module-level ``vectorizer`` is re-fitted on each item's question and
    options, so the vocabulary is local to that item. On ties ``np.argmax``
    keeps the first best option.

    Args:
        data: iterable of dicts with 'question', 'options' and 'correct_index'.

    Returns:
        Mean of the per-item correctness flags.
    """
    hits = []

    for entry in data:
        # Row 0 is the question, the remaining rows are its options.
        corpus = [entry['question'], *entry['options']]
        count_rows = vectorizer.fit_transform(corpus).toarray()

        question_row, option_rows = count_rows[:1], count_rows[1:]
        similarities = cosine_similarity(question_row, option_rows).ravel()

        hits.append(np.argmax(similarities) == entry['correct_index'])

    return np.mean(hits)

# Silence scikit-learn's notice that `token_pattern` is ignored when a custom
# tokenizer is supplied. The filter has to be installed BEFORE the vectorizer is
# fitted, and `message` is a regular expression matched against the start of
# the warning text itself; the category is passed separately, so the text must
# not be prefixed with "UserWarning: ".
warnings.filterwarnings(
    "ignore",
    message="The parameter 'token_pattern' will not be used",
    category=UserWarning,
)

# Calculate cosine similarity for the training and validation sets
training_accuracy_cosine = cosine_similarity_tf(train_data)
validation_accuracy_cosine = cosine_similarity_tf(val_data)
training_accuracy_cosine, validation_accuracy_cosine


(0.446693657219973, 0.4563106796116505)

In [None]:
# Initialize the TfidfVectorizer with the default as q1 q2
tfidf_vectorizer = TfidfVectorizer(tokenizer=text_pipeline_spacy_special)

def cosine_similarity_tfidf(data):
    """Accuracy of choosing the option whose TF-IDF vector is closest
    (by cosine similarity) to the question's vector.

    The module-level ``tfidf_vectorizer`` is re-fitted on each item's question
    and options, so both vocabulary and IDF weights are local to that item.

    Args:
        data: iterable of dicts with 'question', 'options' and 'correct_index'.

    Returns:
        Mean of the per-item correctness flags.
    """
    def _top_option_is_correct(item):
        # Row 0 of the matrix is the question, rows 1.. are the options.
        rows = tfidf_vectorizer.fit_transform([item['question']] + item['options']).toarray()
        scores = cosine_similarity(rows[0].reshape(1, -1), rows[1:]).flatten()
        return np.argmax(scores) == item['correct_index']

    return np.mean([_top_option_is_correct(item) for item in data])

# Calculate cosine similarity for the training and validation sets
training_accuracy_cosine = cosine_similarity_tfidf(train_data)
validation_accuracy_cosine = cosine_similarity_tfidf(val_data)

training_accuracy_cosine, validation_accuracy_cosine

(0.47638326585695007, 0.4854368932038835)

## **Cosine similarity of vectors from bert-base-uncased**

In [None]:
from transformers import pipeline
import torch
import torch.nn.functional as F

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = pipeline('feature-extraction', model="bert-base-uncased",device = device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def get_cls_vector(texts):
    """Embed each text with the module-level feature-extraction ``pipe``.

    For every text the vector at index [0, 0, :] of the pipeline output is kept,
    i.e. the first position of the first sequence (presumably the [CLS] token,
    as the function name suggests).

    Returns:
        Tensor with one row per input text, stacked in input order.
    """
    first_position_vectors = []
    for sentence in texts:
        features = pipe(sentence, return_tensors='pt')
        first_position_vectors.append(features[0, 0, :])

    return torch.stack(first_position_vectors)

def evaluate_bert_similarity(data):
    """Accuracy of choosing the option whose embedding is closest (by cosine
    similarity) to the question's embedding from ``get_cls_vector``.

    Args:
        data: iterable of dicts with 'question', 'options' and 'correct_index'.

    Returns:
        Fraction of items whose chosen option equals 'correct_index', as a float.
    """
    hits = []

    for entry in data:
        embedded = get_cls_vector([entry['question']] + entry['options'])

        # Row 0 belongs to the question, the remaining rows to the options.
        question_vector = embedded[0]
        answer_vectors = embedded[1:]

        # Tile the question vector so it lines up row-by-row with the options.
        tiled_question = question_vector.repeat(answer_vectors.size(0), 1)
        similarities = F.cosine_similarity(tiled_question, answer_vectors, dim=1)

        best_index = torch.argmax(similarities).item()
        hits.append(best_index == entry['correct_index'])

    return torch.tensor(hits).float().mean().item()
training_accuracy = evaluate_bert_similarity(train_data)
validation_accuracy = evaluate_bert_similarity(val_data)

print(f"Training Accuracy: {training_accuracy}")
print(f"Validation Accuracy: {validation_accuracy}")



Training Accuracy: 0.1430499255657196
Validation Accuracy: 0.20388349890708923


## **Fine-tuning a transformer model**

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer
import torch

# Initialize the tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

def transform_data_to_pairs(data):
    """Turn every question into one encoded (question, option) example per option.

    The question and the option are joined with the literal text " [SEP] " and
    encoded with the module-level ``tokenizer``, padded to the maximum length
    and truncated if longer.

    Args:
        data: iterable of dicts with 'question', 'options' and 'correct_index'.

    Returns:
        list of dicts with 'input_ids' and 'attention_mask' (plain lists of ints)
        and 'label' (1 for the correct option, 0 otherwise), in question order
        and, within a question, in option order.
    """
    pairs = []
    for item in data:
        for position, option in enumerate(item['options']):
            joined_text = item['question'] + " [SEP] " + option
            # Without return_tensors the tokenizer already yields flat Python
            # lists for a single text, so no tensor round-trip is needed.
            encoded = tokenizer(joined_text, padding='max_length', truncation=True)
            is_correct = position == item['correct_index']
            pairs.append({
                'input_ids': list(encoded['input_ids']),
                'attention_mask': list(encoded['attention_mask']),
                'label': 1 if is_correct else 0
            })
    return pairs

# Apply the transformation to the data
# (one encoded example per question/option pair).
transformed_data_train = transform_data_to_pairs(train_data)
transformed_data_val = transform_data_to_pairs(val_data)

# Now create a Dataset object
# The pair dicts are pivoted into columns; note that the per-pair 'label' key is
# exposed as the column 'labels' (presumably the name the Trainer expects - confirm).
train_dataset = Dataset.from_dict({
    'input_ids': [item['input_ids'] for item in transformed_data_train],
    'attention_mask': [item['attention_mask'] for item in transformed_data_train],
    'labels': [item['label'] for item in transformed_data_train]
})
val_dataset = Dataset.from_dict({
    'input_ids': [item['input_ids'] for item in transformed_data_val],
    'attention_mask': [item['attention_mask'] for item in transformed_data_val],
    'labels': [item['label'] for item in transformed_data_val]
})

# View the Dataset structure and the first example
print(train_dataset)
print(train_dataset[3])

print(val_dataset)
print(val_dataset[0])


Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 2964
})
{'input_ids': [101, 2129, 2024, 10046, 10614, 2719, 1029, 102, 1037, 10046, 5430, 2003, 1037, 5430, 2719, 2306, 1996, 3256, 1997, 1037, 10046, 1012, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

In [None]:
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
learning_rate = 1e-5
batch_size = 8
epochs = 4
weight_decay = 0

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning configuration; the numeric values come from the hyper-parameter
# variables (learning_rate, batch_size, epochs, weight_decay) set earlier.
# The same batch size is used for training and evaluation.
# NOTE(review): newer transformers releases appear to rename
# `evaluation_strategy` to `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="my_awesome_model",
    learning_rate=learning_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=weight_decay,
    evaluation_strategy="epoch"
)


In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
def compute_metrics(pred):
    """Pair-level classification metrics for the Trainer.

    Args:
        pred: object exposing ``label_ids`` (gold labels) and ``predictions``
            (scores whose last axis is arg-maxed to obtain the predicted label).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'; precision, recall
        and F1 are computed with ``average='binary'``.
    """
    gold_labels = pred.label_ids
    predicted_labels = pred.predictions.argmax(-1)

    prf_scores = precision_recall_fscore_support(gold_labels, predicted_labels, average='binary')
    accuracy = accuracy_score(gold_labels, predicted_labels)

    return {
        'accuracy': accuracy,
        'f1': prf_scores[2],
        'precision': prf_scores[0],
        'recall': prf_scores[1]
    }


In [None]:
train_dataset.shape,val_dataset.shape

((2964, 3), (412, 3))

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.454074,0.820388,0.569767,0.710145,0.475728
2,0.510700,0.454759,0.839806,0.60241,0.793651,0.485437
3,0.411600,0.476351,0.791262,0.590476,0.579439,0.601942
4,0.411600,0.501674,0.803398,0.597015,0.612245,0.582524


TrainOutput(global_step=1484, training_loss=0.41330760122952115, metrics={'train_runtime': 1164.9384, 'train_samples_per_second': 10.177, 'train_steps_per_second': 1.274, 'total_flos': 3119444672348160.0, 'train_loss': 0.41330760122952115, 'epoch': 4.0})

In [None]:
def acc(pred,data, num_options=4):
    """Question-level accuracy from pair-level logits.

    ``pred`` holds two logits (label 0, label 1) for every question/option pair,
    ordered question by question. For each question the option with the highest
    label-1 logit is taken as the prediction (first one on ties) and compared
    with the question's 'correct_index'.

    Args:
        pred: array-like of logits whose size is a multiple of ``num_options * 2``.
        data: iterable of dicts with 'correct_index', one per question, in the
            same order as the predictions.
        num_options: number of options per question (default 4, the value that
            was previously hard-coded).

    Returns:
        Fraction of questions whose predicted option is the correct one.

    Raises:
        ValueError: if the number of questions implied by ``pred`` differs from
            ``len(data)``.
        ZeroDivisionError: if ``data`` is empty.
    """
    # (questions, options, labels): group the pair-level rows per question.
    reshaped_logits = np.asarray(pred).reshape(-1, num_options, 2)
    pred_indices = reshaped_logits[:, :, 1].argmax(axis=1)
    true_indices = np.array([item['correct_index'] for item in data])

    if len(pred_indices) != len(true_indices):
        # Misaligned inputs would otherwise be scored against the wrong questions.
        raise ValueError(
            f"predictions cover {len(pred_indices)} questions but data has {len(true_indices)}"
        )

    count = int(np.sum(pred_indices == true_indices))
    return count/len(true_indices)

In [None]:
predictions, label_ids, metrics = trainer.predict(train_dataset)

In [None]:
metrics

{'test_loss': 0.2582426965236664,
 'test_accuracy': 0.8984480431848852,
 'test_f1': 0.8064308681672026,
 'test_precision': 0.7702702702702703,
 'test_recall': 0.8461538461538461,
 'test_runtime': 96.1027,
 'test_samples_per_second': 30.842,
 'test_steps_per_second': 3.86}

In [None]:
print("Question accuracy in train data is ",acc(predictions, train_data))

Question accuracy in train data is  0.844804318488529


In [None]:
predictions, label_ids, metrics = trainer.predict(val_dataset)

In [None]:
metrics

{'test_loss': 0.5016736388206482,
 'test_accuracy': 0.8033980582524272,
 'test_f1': 0.5970149253731343,
 'test_precision': 0.6122448979591837,
 'test_recall': 0.5825242718446602,
 'test_runtime': 13.3352,
 'test_samples_per_second': 30.896,
 'test_steps_per_second': 3.899}

In [None]:
print("Question accuracy in val data is ",acc(predictions, val_data))

Question accuracy in val data is  0.6213592233009708


## **Test set performance**

In [None]:
transformed_data_test =  transform_data_to_pairs(test_data)
test_dataset = Dataset.from_dict({
    'input_ids': [item['input_ids'] for item in transformed_data_test],
    'attention_mask': [item['attention_mask'] for item in transformed_data_test],
    'labels': [item['label'] for item in transformed_data_test]
})

In [None]:
predictions, label_ids, metrics = trainer.predict(test_dataset)

In [None]:
metrics

{'test_loss': 0.5231308341026306,
 'test_accuracy': 0.7957920792079208,
 'test_f1': 0.60431654676259,
 'test_precision': 0.586046511627907,
 'test_recall': 0.6237623762376238,
 'test_runtime': 26.2741,
 'test_samples_per_second': 30.753,
 'test_steps_per_second': 3.844}

In [None]:
print("Question accuracy in test data is ",acc(predictions, test_data))

Question accuracy in test data is  0.6336633663366337
