# Install packages

In [None]:
# !pip install --force-reinstall numpy==1.22 # 1.23.4


In [None]:
!pip install datasets==2.14.6
!pip install transformers
!pip install evaluate
!pip install --no-cache-dir transformers sentencepiece

In [None]:
!pip install accelerate -U

# Imports

In [None]:
import torch
from transformers import AutoTokenizer, get_scheduler, TrainingArguments, Trainer, DataCollatorWithPadding, AutoModelForSequenceClassification

# from string import Template
# from pathlib import Path

import os

import warnings
warnings.simplefilter("ignore")

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict

from torch.utils.data import DataLoader


from IPython.display import Markdown, display

# Prepare training data

To access certain Large Language Models (LLMs) through the Hugging Face library, you may need an access token. You can obtain one by signing up on the Hugging Face website and requesting permission to use the specific model you are interested in.

The following cell demonstrates how to pass your access token in order to download the model and tokenizer. Put your access token in the `YOUR_HUGGING_FACE_TOKEN` variable.

In [None]:
import os

from huggingface_hub import login

# Prefer a token supplied through the HF_TOKEN environment variable so the
# secret does not have to be stored in the notebook. When HF_TOKEN is not set,
# the placeholder below is used and must be replaced by hand.
login(token=os.environ.get('HF_TOKEN', 'YOUR_HUGGING_FACE_TOKEN'))

Here we determine the model we are using and the sub-task we are solving (Sentence Puzzle or Word Puzzle).

In [None]:
# Sub-task identifier; it is used below to build the data file names and the run directory name.
task = "SP"
# Hugging Face model id handed to the tokenizer and model loaders further down.
model_name = "FacebookAI/roberta-large"

### Importing into Colab

Here we demonstrate how to import data into Colab. We have uploaded the data folder of the repository to a private Google Drive folder. Our folder is called `sem-dataset`.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# os.chdir('/content/drive/My Drive/sem-dataset')


In [None]:
# train_data = np.load('./data/'+task+'-train.npy', allow_pickle=True)

# test_data = np.load('./data/'+task+'_test_labeled.npy', allow_pickle=True)

### Importing into Kaggle

Here we demonstrate how to import data into Kaggle. We have uploaded the data folder of the repository to a private Kaggle dataset. Our dataset is called `sem-dataset`.

In [None]:
# Print the full path of every file found under the mounted Kaggle dataset.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input/sem-dataset'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))

Here we import train and test data from the dataset.

In [None]:
# Common prefix of the .npy files inside the Kaggle dataset.
_kaggle_data_prefix = '/kaggle/input/sem-dataset/'

# NOTE(review): allow_pickle=True unpickles Python objects while loading, which
# can execute arbitrary code -- only load files from a trusted source.
train_data = np.load(f"{_kaggle_data_prefix}{task}-train.npy", allow_pickle=True)
test_data = np.load(f"{_kaggle_data_prefix}{task}_test_labeled.npy", allow_pickle=True)

### Make directory for our output

In [None]:
# Timestamp (minute resolution) that makes every run directory unique.
date_of_run = pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")

# Derive a filesystem-friendly suffix from the model name.
parts = model_name.split("/")
if len(parts) > 5:
    # Long, path-like names: keep the components at indices 3 and 5.
    # Requiring more than 5 parts guarantees that parts[5] exists.
    model_suffix = parts[3] + "_" + parts[5]
else:
    # Short names such as "org/model", and names without any "/", are used
    # whole with the slashes replaced, so model_suffix is always defined.
    model_suffix = model_name.replace('/', '_')

run_dir = "./small_TxtCls_" + task + "_" + model_suffix + "_" + date_of_run
print(run_dir)

# Create the run directory (no error if it already exists) and make it the
# working directory. run_dir is relative, so re-running this cell after the
# chdir creates a nested directory.
os.makedirs(run_dir, exist_ok=True)
os.chdir(run_dir)

# Basic preprocessing

### Train dataset

In [None]:
def convert_from_numpy_to_dataset_type(numpy_array, split):
    """Convert the raw training array into a Hugging Face ``Dataset``.

    Args:
        numpy_array: Array whose ``tolist()`` output pandas can turn into a
            frame containing at least the columns ``id``, ``distractor1``,
            ``distractor2``, ``distractor(unsure)`` and ``label``.
        split: Split name forwarded to ``Dataset.from_pandas``.

    Returns:
        The converted ``Dataset``. Its first row and its features are
        displayed as a side effect.
    """
    # Build the frame once (the original constructed it twice).
    df = pd.DataFrame(numpy_array.tolist())

    # Normalise the column types: string-like columns to str, label to int.
    df = df.astype({
        'id': str,
        'distractor1': str,
        'distractor2': str,
        'distractor(unsure)': str,
        'label': int,
    })

    dataset = Dataset.from_pandas(df, split=split)

    display(dataset[0])
    display(dataset.features)  # just to check the type of the features

    return dataset

In [None]:
# Convert the loaded training array into a Dataset tagged with the "train" split.
train_dataset = convert_from_numpy_to_dataset_type(train_data, "train")

### Test dataset

In [None]:
def convert_from_numpy_to_dataset_test_type(numpy_array):
    """Convert the raw test array into a Hugging Face ``Dataset``.

    Args:
        numpy_array: Array whose ``tolist()`` output pandas can turn into a
            frame containing at least the columns ``id`` and ``label``.

    Returns:
        The converted ``Dataset``. Its first row and its features are
        displayed as a side effect.
    """
    # Build the frame once (the original constructed it twice).
    df = pd.DataFrame(numpy_array.tolist())

    # Only id and label are normalised here; the other columns keep the
    # types pandas inferred.
    df = df.astype({'id': str, 'label': int})

    dataset = Dataset.from_pandas(df)

    display(dataset[0])
    display(dataset.features)  # just to check the type of the features

    return dataset

In [None]:
# Convert the loaded labeled test array into a Dataset.
test_dataset = convert_from_numpy_to_dataset_test_type(test_data)


Importing the tokenizer in order to tokenize the data.


In [None]:
# Load the tokenizer that matches the checkpoint selected in `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)


### Splitting the dataset

- We start by preprocessing the data, dividing it into three categories: Original, Semantic Reconstruction, and Context Reconstruction.
  
- Next, we split the training dataset into three subsets: train, validation, and test, for each of the three data categories. We perform this split before shuffling to ensure that the same identifiers (ids) are consistent across the training, validation, and test sets within each category.

- After splitting, we concatenate and shuffle the data within each of the three categories (Original, Semantic, Context).

- This approach is necessary because the dataset initially lacked a separate test set at the beginning of the competition.

- Following the data preparation, we transform the multiple-choice task into a binary classification problem. For each unique identifier (id), we create four binary classification tasks based on the four multiple-choice options. Since the fourth option ("None of the above") is consistently irrelevant, we focus on the first three options to form our binary labels. We add a new column called `label`, assigning a value of 1 if the answer is correct and 0 otherwise. This allows us to train the model to predict correctness for each of the three binary classification tasks per unique identifier.

- These preprocessing and transformation steps are applied not only to the training dataset but also to the test dataset used in the competition.

#### Train dataset

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
def _is_semantic(data):
    """True for Semantic Reconstruction rows (id contains "_SR")."""
    return "_SR" in data["id"]


def _is_context(data):
    """True for Context Reconstruction rows (id contains "_CR")."""
    return "_CR" in data["id"]


def _is_original(data):
    """True for rows that are neither Semantic nor Context reconstructions."""
    return not (_is_semantic(data) or _is_context(data))


# Split the training data into its three categories based on the id suffix.
ori_original_dataset = train_dataset.filter(_is_original)
ori_scemantic_dataset = train_dataset.filter(_is_semantic)
ori_context_dataset = train_dataset.filter(_is_context)

for _category, _subset in (
    ("Original", ori_original_dataset),
    ("Semantic", ori_scemantic_dataset),
    ("Context", ori_context_dataset),
):
    print(f"{_category} dataset size: {len(_subset)}")

In [None]:
def splitting_dataset(dataset, split_size):
    """Split ``dataset`` into train / valid / test parts without shuffling.

    Args:
        dataset: Object exposing ``train_test_split`` (a Hugging Face Dataset).
        split_size: Share of ``dataset`` held out for validation and test together.

    Returns:
        DatasetDict with the keys ``"train"``, ``"test"`` and ``"valid"``.
    """
    # First cut: hold out `split_size` of the data, keeping the row order.
    first_cut = dataset.train_test_split(test_size=split_size, shuffle=False)

    # Second cut: divide the held-out part in half, again keeping the order.
    held_out_halves = first_cut["test"].train_test_split(test_size=0.5, shuffle=False)

    parts = {
        "train": first_cut["train"],
        "test": held_out_halves["test"],
        "valid": held_out_halves["train"],
    }
    return DatasetDict(parts)


Here we are splitting the dataset into train, validation and test sets. **A good rule of thumb is to use 70% of the data for training, 15% for validation and 15% for testing.**

<u>**WE DO NOT WANT TO SHUFFLE THE DATASET BEFORE SPLITTING IT TO KEEP THE ORDER OF THE SENTENCES!!!**</u>

In [None]:
# Hold out 30% of each category; splitting_dataset halves that share into validation and test.
original_dataset = splitting_dataset(ori_original_dataset, 0.3)
scemantic_dataset = splitting_dataset(ori_scemantic_dataset, 0.3)
context_dataset = splitting_dataset(ori_context_dataset, 0.3)


Now we will make the only dataset that we will use for training and validation.
The testing will be done on several datasets.

In [None]:
from datasets import concatenate_datasets

# Merge the three categories split by split, after checking that their
# feature types agree with those of the original category.
_merged_splits = {}
for _split_name in ("train", "valid"):
    _original_part = original_dataset[_split_name]
    _semantic_part = scemantic_dataset[_split_name]
    _context_part = context_dataset[_split_name]

    assert _original_part.features.type == _semantic_part.features.type
    assert _original_part.features.type == _context_part.features.type

    _merged_splits[_split_name] = concatenate_datasets(
        [_original_part, _semantic_part, _context_part]
    )

training_dataset = _merged_splits["train"]
valid_dataset = _merged_splits["valid"]

In [None]:
# Shuffle the merged splits with a fixed seed (the split itself was done without shuffling).
training_dataset = training_dataset.shuffle(seed=42)
valid_dataset = valid_dataset.shuffle(seed=42)


# Bundle the two splits used for training and validation.
my_dataset = DatasetDict({
    "train": training_dataset,
    "valid": valid_dataset})

print(my_dataset)

#### Test dataset

In [None]:
# Split the competition test set into the same three categories as the training data.
test_original_dataset = test_dataset.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
test_scemantic_dataset = test_dataset.filter(lambda data: "_SR" in data["id"]) # SR => Semantic Reconstruction
test_context_dataset = test_dataset.filter(lambda data: "_CR" in data["id"]) # CR => Context Reconstruction

# Report the sizes of the test subsets built above (the original printed the
# sizes of the training subsets by mistake).
print(f"Original dataset size: {len(test_original_dataset)}")
print(f"Semantic dataset size: {len(test_scemantic_dataset)}")
print(f"Context dataset size: {len(test_context_dataset)}")

### Tokenize after splitting

`create_binary_pairs` is a function that takes a row of our dataset and creates the binary pairs. It returns a list of the new rows.

In [None]:
def create_binary_pairs(row):
    """Expand one multiple-choice row into binary (question + choice) examples.

    Args:
        row: Mapping with the keys ``id``, ``question``, ``choice_list`` and
            ``label``, where ``label`` is the index of the correct entry in
            ``choice_list``.

    Returns:
        list[dict]: One ``{'id', 'question', 'label'}`` dict per kept choice.
        ``id`` is ``"<row id>_<choice index>"``, ``question`` is the question
        followed by the choice, and ``label`` is 1 when the choice text equals
        the correct answer text and 0 otherwise. Choices containing
        "none of above" (case-insensitive) are skipped, so a row whose correct
        answer is that option yields only 0 labels.
    """
    row_id = row['id']  # not named `id`, to avoid shadowing the builtin
    choices = row['choice_list']
    correct_answer = choices[row['label']]

    # Make sure the question ends with '?'. endswith() also copes with an
    # empty question, where indexing [-1] would raise IndexError.
    question = row['question'].strip()
    if not question.endswith('?'):
        question += '?'

    binary_pairs = []
    for i, choice in enumerate(choices):
        # Skip the "None of above" option.
        if "none of above" in choice.lower():
            continue

        # Make sure the choice ends with '.' (safe for empty choices too).
        formatted_choice = choice.strip()
        if not formatted_choice.endswith('.'):
            formatted_choice += '.'

        binary_pairs.append({
            # The choice index is appended so the pairs of one question can be regrouped later.
            'id': f"{row_id}_{i}",
            'question': f"{question} {formatted_choice}",
            'label': 1 if choice == correct_answer else 0,
        })

    return binary_pairs


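`create_binary_dataset` is a function that takes one example of the dataset, creates its binary pairs and appends them to the global `binary_dataset` list. It does not return anything; the binary dataset is collected in that list.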

In [None]:
# Module-level accumulator that create_binary_dataset appends to.
binary_dataset = []


def create_binary_dataset(example):
    """Append the binary pairs of ``example`` to the global ``binary_dataset``.

    The global name is looked up on every call, so rebinding
    ``binary_dataset`` to a fresh list redirects the output. Returns None.
    """
    binary_dataset.extend(create_binary_pairs(example))


In [None]:
# Run create_binary_dataset on every training row; it fills the global `binary_dataset` list.
my_dataset["train"].map(create_binary_dataset)

# Show how many binary pairs were collected and preview the first three.
print("Length of binary dataset: ", len(binary_dataset))
display(binary_dataset[:3])

Now we will create a binary pair dataset for the train, validation and test sets.

In [None]:
list_of_datasets = [my_dataset["train"], my_dataset["valid"], original_dataset["test"], scemantic_dataset["test"], context_dataset["test"], test_original_dataset, test_scemantic_dataset, test_context_dataset]

# One list of binary pairs per dataset, in the order of list_of_datasets.
all_data = []

for dataset in list_of_datasets:
    # Build the pairs directly from the rows. The original relied on
    # Dataset.map calling a function that mutated a rebound global, which
    # silently produces nothing if map ever skips the calls.
    binary_dataset = [pair for row in dataset for pair in create_binary_pairs(row)]
    print("Length of binary dataset: ", len(binary_dataset))

    all_data.append(binary_dataset)
    

In [None]:
# Turn each list of binary pairs back into a Dataset. The order matches
# list_of_datasets: train, valid, the three held-out test parts of the
# training data, then the three parts of the competition test set.
(
    my_train_dataset,
    my_valid_dataset,
    my_original_test_dataset,
    my_scemantic_test_dataset,
    my_context_test_dataset,
    testset_original_test_dataset,
    testset_scemantic_test_dataset,
    testset_context_test_dataset,
) = [Dataset.from_list(binary_pairs) for binary_pairs in all_data[:8]]


# Print the resulting dataset
# print(my_train_dataset)
# print(my_valid_dataset)

# print(my_original_test_dataset)
# print(my_scemantic_test_dataset)
# print(my_context_test_dataset)

# print(testset_original_test_dataset)
# print(testset_scemantic_test_dataset)
# print(testset_context_test_dataset)

In [None]:
# Rebind my_dataset to the binary-pair versions of the train and validation splits.
my_dataset = DatasetDict({
    "train": my_train_dataset,
    "valid": my_valid_dataset})

# print(my_dataset)

Create the preprocessing function that will tokenize the data.

In [None]:
def preprocess_function(examples):
    """Tokenize the ``question`` field of ``examples`` with the global ``tokenizer``.

    Truncation is enabled; padding is not applied here.
    """
    questions = examples["question"]
    encoded = tokenizer(questions, truncation=True)
    return encoded

In [None]:
# Shuffle the binary-pair splits with a fixed seed so the pairs of one question are no longer adjacent.
my_dataset["train"] = my_dataset["train"].shuffle(seed=42) 
my_dataset["valid"] = my_dataset["valid"].shuffle(seed=42) 


In [None]:
# Binary-pair datasets to tokenize; the positions are relied on when the results are unpacked below.
list_of_datasets = [my_dataset["train"], my_dataset["valid"], my_original_test_dataset, my_scemantic_test_dataset, my_context_test_dataset, testset_original_test_dataset, testset_scemantic_test_dataset, testset_context_test_dataset]

Now we tokenize all the datasets.

In [None]:
# Tokenize every dataset in batches, keeping the order of list_of_datasets.
tokenized_datasets = [
    untokenized.map(preprocess_function, batched=True)
    for untokenized in list_of_datasets
]

In [None]:
# Rebuild each tokenized dataset and rebind the per-dataset names (same order
# as list_of_datasets).
# NOTE(review): the entries come from `.map(...)` above, so this from_list
# round-trip looks redundant -- confirm before removing it.
(
    my_train_dataset,
    my_valid_dataset,
    my_original_test_dataset,
    my_scemantic_test_dataset,
    my_context_test_dataset,
    testset_original_test_dataset,
    testset_scemantic_test_dataset,
    testset_context_test_dataset,
) = [Dataset.from_list(tokenized) for tokenized in tokenized_datasets[:8]]

In [None]:
# Rebind my_dataset to the tokenized train and validation splits.
my_dataset = DatasetDict({
    "train": my_train_dataset,
    "valid": my_valid_dataset})

# print(my_dataset)

Here we are using `.map()` to apply `preprocess_function` to the training split and check its size.

In [None]:
# Apply preprocess_function to the training split once more and report its size.
# NOTE(review): tokenized_train is not referenced again in this part of the notebook.
tokenized_train = my_dataset["train"].map(preprocess_function, batched=True)
print(f"Training set size: {len(tokenized_train)}")
# 396*3 = 1188

## Fine-tuning model

Data collator that will dynamically pad the inputs received, as well as the labels.

In [None]:
# Collator built on the notebook's tokenizer; it pads each batch when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. For this task, we load the accuracy metric.

In [None]:
import evaluate

# Accuracy metric used by compute_metrics during training.
accuracy = evaluate.load("accuracy")

In [None]:
def compute_metrics(eval_pred):
    """Compute accuracy for a ``(predictions, labels)`` pair.

    The predicted class is the argmax over axis 1 of the predictions; the
    result of the global ``accuracy`` metric is returned unchanged.
    """
    raw_scores, references = eval_pred
    predicted_classes = np.argmax(raw_scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## Train

First we need to preprocess the data for the trainer.

The `get_final_dataset` function prepares a dataset for the trainer: it renames the column "label" to "labels", removes the `id` and `question` columns, and sets the output format to PyTorch tensors.


In [None]:
def get_final_dataset(dataset):
    """Return ``dataset`` in the shape expected for training and evaluation.

    The ``label`` column is renamed to ``labels``, the ``id`` and ``question``
    columns are dropped, and the output format is set to ``"torch"``.
    """
    final_dataset = (
        dataset
        .rename_column("label", "labels")
        .remove_columns(['id', 'question'])
    )
    final_dataset.set_format("torch")
    return final_dataset

In [None]:
# NOTE: `tokenized_datasets` was a list earlier; from here on it is the train/valid DatasetDict prepared for the Trainer.
tokenized_datasets = get_final_dataset(my_dataset)

# Held-out test parts of the training data, prepared the same way.
original_datasets = get_final_dataset(my_original_test_dataset)
scemantic_datasets = get_final_dataset(my_scemantic_test_dataset)
context_datasets = get_final_dataset(my_context_test_dataset)

Before we start training our model, we create a map of the expected ids to their labels with id2label and label2id:



In [None]:
# Class index -> name, and the inverse mapping derived from it.
id2label = dict(enumerate(("FALSE", "TRUE")))
label2id = {name: index for index, name in id2label.items()}

We disable Weights & Biases. You'll need to apply an API key when prompted if you use it for tracking the training metrics.

In [None]:
import os
# Turn off Weights & Biases through its environment switch so no API key is requested.
os.environ["WANDB_DISABLED"] = "true"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"


### Here we are loading the model we are using for the task.

In [None]:
# Load the checkpoint for 2-class sequence classification with the label maps defined above.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2, id2label=id2label, ignore_mismatched_sizes=True, label2id=label2id)

We check for the availability of a CUDA-enabled GPU and assign the appropriate device and then we move our model to that device for computation.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model.to(device)

Here we are passing basic arguments to the `Trainer` class.
- **`batch_size`**: This parameter determines the number of examples (data points) processed in each iteration (or batch) during training.

- **`lr` (learning rate)**: This is the rate at which the model weights are updated during training.

- **`num_epochs`**: Specifies the number of times the training dataset will be iterated over by the model during training.

- **`num_training_steps`**: This calculates the total number of training steps that will be performed over the specified number of epochs.

- **`batches_per_epoch`**: This represents the number of batches (or iterations) that will be processed in each epoch.

In [None]:
batch_size = 4

lr = 3e-5

num_epochs = 3
# max_steps = 100

# Batches per epoch, rounded up: the last, smaller batch of an epoch is still
# an optimizer step. Floor division undercounted it, which made the linear
# schedule reach zero before training finished whenever the dataset size is
# not a multiple of batch_size.
# NOTE(review): assumes a single device and no gradient accumulation.
batches_per_epoch = -(-len(my_dataset["train"]) // batch_size)
num_training_steps = batches_per_epoch * num_epochs
# print(batches_per_epoch)

We are initializing optimizer and scheduler here.

In [None]:
# Optimizer initialization
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Learning rate scheduler initialization
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Several of the arguments that we define are the following:

- `output_dir`: The directory where model checkpoints and outputs will be saved.
- `logging_steps`: Log metrics every specified number of training steps.
- `logging_strategy`: Specify whether logging is done by "steps" or "epoch".
- `save_strategy`: Strategy for saving model checkpoints, either by "epoch" or "steps".
- `save_steps`: Save a model checkpoint every specified number of steps.
- `save_total_limit`: Maximum number of checkpoints to keep.
- `evaluation_strategy`: Strategy for evaluating the model during training.
- `eval_steps`: Evaluate the model every specified number of training steps.
- `report_to`: Where to report evaluation results, set to "none" to disable reporting.


In [None]:
import accelerate

training_args = TrainingArguments(
    output_dir="./output",

    # Evaluate on the validation split every `eval_steps` training steps.
    evaluation_strategy="steps",
    eval_steps=20,

    # Log training statistics at the same cadence.
    logging_strategy="steps",
    logging_steps=20,

    learning_rate=lr,
    num_train_epochs=num_epochs,
    # max_steps=100,

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # warmup_steps=0,
    # weight_decay=0.01,
    # logging_dir="./logs",

    # The string "none" disables all reporting integrations. Passing None
    # does not: it falls back to the library default.
    report_to="none",

    # Save a checkpoint every 100 training steps.
    save_strategy="steps",
    save_steps=100,
)

In [None]:
# Assemble the Trainer from the model, arguments, tokenized splits, metric function
# and the optimizer/scheduler pair created above.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    optimizers=(optimizer, lr_scheduler),  # Pass both optimizer and scheduler
    compute_metrics=compute_metrics
)

Now we are ready to train our model!

In [None]:
# Training loop using Trainer API
print('training model {}...'.format(model_name))

train_result = trainer.train()

In [None]:
# Persist the training metrics and the trainer state through the Trainer helpers.
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

#### DataLoader
Create a DataLoader for our test datasets so we can iterate over batches of data

In [None]:

from torch.utils.data import DataLoader

batch_size_dataloader = 2


def _build_test_dataloader(test_split):
    """Return an unshuffled DataLoader over ``test_split`` that pads each batch."""
    return DataLoader(
        test_split,
        batch_size=batch_size_dataloader,
        shuffle=False,
        collate_fn=DataCollatorWithPadding(tokenizer=tokenizer),
    )


original_test_dataloader = _build_test_dataloader(original_datasets)
scemantic_test_dataloader = _build_test_dataloader(scemantic_datasets)
context_test_dataloader = _build_test_dataloader(context_datasets)


### Evaluate
accumulate all the batches with add_batch and calculate the metric at the very end

##### Accuracy on each dataset (original, semantic, context) by itself


In the following function we are calculating the accuracy of the model by each binary question, not for every triplet of questions.

In [None]:
def compute_accuracy(dataloader, model):
    """Print the accuracy of ``model`` over all examples yielded by ``dataloader``.

    Every example is scored individually (per binary question, not per
    group of questions). The batches are moved to the notebook-level
    ``device`` and must contain a ``"labels"`` entry.

    Args:
        dataloader: Iterable of batches (mappings of tensors).
        model: Model whose output exposes ``.logits``.

    Returns:
        None. The accuracy, rounded to three decimals, is printed.
    """
    metric = evaluate.load("accuracy")
    model.eval()

    for batch in dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model(**batch)

        # Predicted class = index of the largest logit.
        predictions = torch.argmax(outputs.logits, dim=-1)

        # Accumulate batch by batch; the metric is computed once at the end.
        metric.add_batch(predictions=predictions, references=batch["labels"])

    temp_acc = round(metric.compute()["accuracy"], 3)

    print(f"test accuracy: {temp_acc}")

In [None]:
# Per-question accuracy on the held-out original, semantic and context test parts, in that order.
compute_accuracy(original_test_dataloader, model)
print("##################################")
compute_accuracy(scemantic_test_dataloader, model)
print("##################################")
compute_accuracy(context_test_dataloader, model)


#### Computing Initial Accuracy for Multiple-Choice Tasks

Here, we are determining the accuracy of the original multiple-choice task by predicting binary classifications based on ids.

We need to group the binary pairs created from the original dataset by the id of the same question.

- The following function groups the binary pairs by the id of the same question. 
- It returns a dictionary where the keys are the ids of the questions and the values are the binary pairs of the same question.

In [None]:
def group_same_dataset(dataset):
    """Group binary-pair ids by the question they belong to.

    An id is split on ``"_"``. With more than two parts the group key is the
    first two parts joined by ``"_"`` (e.g. ``"a_SR_1"`` -> ``"a_SR"``);
    otherwise it is the first part (e.g. ``"a_1"`` -> ``"a"``).

    Args:
        dataset: Object whose ``dataset['id']`` yields the pair ids
            (ids are expected to be unique).

    Returns:
        dict[str, list[str]]: Group key -> ids of that group. As before, the
        last id of a group comes first, followed by the others in dataset
        order. Groups that do not have exactly three ids are printed.
    """
    # Single pass over the id column (read once) instead of comparing every
    # id against every other id.
    members_by_group = {}
    for pair_id in dataset['id']:
        parts = pair_id.split('_')
        group_key = parts[0] + '_' + parts[1] if len(parts) > 2 else parts[0]
        members_by_group.setdefault(group_key, []).append(pair_id)

    # Keep the ordering callers have always received: last member first.
    grouped_pairs = {
        group_key: [members[-1]] + members[:-1]
        for group_key, members in members_by_group.items()
    }

    # Report groups that do not contain the expected three binary pairs.
    for key in grouped_pairs:
        if len(grouped_pairs[key]) != 3:
            print(key)
            print(grouped_pairs[key])
            print()

    return grouped_pairs

In [None]:
# Group the binary-pair ids of each held-out test part by question.
grouped_pairs_original = group_same_dataset(my_original_test_dataset)
grouped_pairs_scemantic = group_same_dataset(my_scemantic_test_dataset)
grouped_pairs_context = group_same_dataset(my_context_test_dataset)

The function below will take row of dataset and model and return all the information needed to calculate the accuracy of the model on that row.

In [None]:
def dataset_compute(row, model):
    """Run ``model`` on the first example of ``row`` and return its prediction.

    Args:
        row: Column-indexable container (e.g. a one-row Dataset) where
            ``row['question'][0]`` is the prompt and ``row['label'][0]`` its
            true label.
        model: Classification model whose output exposes ``.logits``.

    Returns:
        tuple: ``(prompt, true_label, predicted_class)`` where the prompt is
        stripped and the predicted class is the argmax of the logits.
    """
    prompt = row['question'][0].strip()
    true_label_original = row['label'][0]

    # Put the inputs on the model's own device instead of hard-coding
    # "cuda", so the function also works on CPU-only machines.
    model_device = next(model.parameters()).device
    inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(model_device)

    # Pass the input through the model to obtain predictions
    with torch.no_grad():
        logits = model(**inputs).logits

    predicted_class = logits.argmax().item()

    return prompt, true_label_original, predicted_class

The function below takes a dataset of binary pairs and the dictionary of grouped ids (it uses the notebook-level `model`) and returns, among other details, the accuracy of the model, where a question counts as correct only if all of its binary pairs are predicted correctly.

In [None]:
def compute_triplets_acc(dataset, group_pairs):
    """Score the model per question: a group is correct only if all its pairs are.

    Uses the notebook-level ``model`` and ``dataset_compute``.

    Args:
        dataset: Binary-pair Dataset with ``id``, ``question`` and ``label``
            columns.
        group_pairs: Mapping of group id -> list of binary-pair ids, as
            produced by ``group_same_dataset``.

    Returns:
        tuple ``(every_id, none_of_above, accuracy, group_acc)``:
            every_id: group id -> {last character of the pair id:
                [prompt, true label, predicted label]} for every
                mispredicted pair; groups without mistakes are omitted.
            none_of_above: group id -> its pair ids, for groups whose true
                labels are all 0.
            accuracy: share of fully correct groups (0.0 if there are none).
            group_acc: group id -> 1 if the whole group is correct, else 0.
    """
    # Index the ids once (first occurrence wins) so each pair is fetched with
    # a cheap select() instead of a full-dataset filter() per id.
    row_of_id = {}
    for position, pair_id in enumerate(dataset['id']):
        row_of_id.setdefault(pair_id, position)

    none_of_above = {}
    every_id = {}
    group_acc = {}
    total_correct_groups = 0.0
    total_groups = len(group_pairs)

    for group_id, group_ids_list in group_pairs.items():
        mispredicted = {}
        correct_label = []

        for single_id in group_ids_list:
            pair_row = dataset.select([row_of_id[single_id]])
            prompt, true_label_original, predicted_class = dataset_compute(pair_row, model)

            correct_label.append(true_label_original)

            if predicted_class != true_label_original:
                # Keyed by the last character of the id, i.e. the choice index.
                mispredicted[single_id[-1]] = [prompt, true_label_original, predicted_class]

        # No pair is labelled correct -> the answer was "None of above".
        # all() also copes with groups that have fewer than three pairs.
        if correct_label and all(label == 0 for label in correct_label):
            none_of_above[group_id] = group_ids_list

        if mispredicted:
            every_id[group_id] = mispredicted
            group_acc[group_id] = 0
        else:
            group_acc[group_id] = 1
            total_correct_groups += 1

    # Guard against an empty grouping instead of dividing by zero.
    accuracy = total_correct_groups / total_groups if total_groups else 0.0

    print(f"Accuracy: {accuracy * 100:.2f}%")

    return every_id, none_of_above, accuracy, group_acc

In [None]:
# Per-question results for each held-out test part.
# NOTE: the fourth value (`*_wrong_ids`) is the per-group 1/0 correctness dict returned by compute_triplets_acc.
original_ids, original_none_of_above, original_acc, original_wrong_ids = compute_triplets_acc(my_original_test_dataset, grouped_pairs_original)
scemantic_ids, scemantic_none_of_above, scemantic_acc, scemantic_wrong_ids = compute_triplets_acc(my_scemantic_test_dataset, grouped_pairs_scemantic)
context_ids, context_none_of_above, context_acc, context_wrong_ids = compute_triplets_acc(my_context_test_dataset, grouped_pairs_context)

In [None]:
# Report the per-question accuracy of each dataset, rounded to three decimals.
for _dataset_label, _dataset_acc in (
    ("original", original_acc),
    ("scemantic", scemantic_acc),
    ("context", context_acc),
):
    print(f"Accuracy of {_dataset_label} dataset:")
    print(round(_dataset_acc, 3))


Here based on the `group` number we will calculate the accuracy of the model on that group.

In the following function we are creating a detailed output of the predictions of the model on each group.
The function takes as input:
- the key (id) of the dataset row for which mispredictions were made
- the details of these predictions
- the dataset name
- the dataset that contains the original multiple-choice rows.

In [None]:
def output_details(output_key, triplet_details, dataset_name, dataset):
    """Build a text report of the mispredicted choices of one question.

    Args:
        output_key: Id of the question; used to look up both the row in
            ``dataset`` and the entry in ``triplet_details``.
        triplet_details: Mapping of id -> {choice index: details}, where
            ``details[1]`` is the true binary label of that choice.
        dataset_name: Name shown in the report title.
        dataset: Dataset with ``id``, ``question``, ``label`` and
            ``choice_list``; filtered by id, first match used.

    Returns:
        str: The report, terminated by an empty line.
    """
    report_lines = ["  {} dataset:\n".format(dataset_name)]

    # Original multiple-choice row of this question.
    dataset_entry = dataset.filter(lambda example: example['id'] == output_key)[0]
    gold_index = dataset_entry['label']
    choices = dataset_entry['choice_list']
    report_lines.append("    Prompt: {}\n".format(dataset_entry['question']))
    report_lines.append("    True Label: {} -> {}\n".format(gold_index, choices[gold_index].strip()))

    # One line per mispredicted binary pair.
    infos = triplet_details[output_key]
    for raw_index in infos:
        binary_label = infos[raw_index][1]
        choice_index = int(raw_index)
        is_gold_choice = gold_index == choice_index

        if is_gold_choice and binary_label == 0:
            line_format = "    Predicted Label as correct: {} -> {}\n"
        elif is_gold_choice and binary_label == 1:
            line_format = "    Predicted Label as wrong: {} -> {}\n"
        elif binary_label == 0:
            line_format = "    Mispredicted Label as correct also: {} -> {}\n"
        else:
            line_format = "    Mispredicted Label as wrong also: {} -> {}\n"

        report_lines.append(line_format.format(choice_index, choices[choice_index].strip()))

    report_lines.append("\n")
    return "".join(report_lines)



In [None]:
def group_accuracy(dataset, original_triplet_res, original_triplet_details,  scemantic_triplet_res, scemantic_triplet_details, context_triplet_res, context_triplet_details, num_groups=2):
    """Compute group-based accuracy and collect a report for every failed group.

    A group is one key of ``original_triplet_res`` together with its
    ``<key>_SR`` entry in ``scemantic_triplet_res`` and, when ``num_groups == 3``,
    its ``<key>_CR`` entry in ``context_triplet_res``. A group counts as correct
    only if every variant that belongs to it is flagged truthy.

    Args:
        dataset: forwarded unchanged to ``output_details`` to look up rows.
        original_triplet_res: mapping ``id -> flag`` (truthy = correct).
        original_triplet_details: forwarded to ``output_details`` for ``id``.
        scemantic_triplet_res: mapping ``id + '_SR' -> flag``.
        scemantic_triplet_details: forwarded to ``output_details`` for ``id + '_SR'``.
        context_triplet_res: mapping ``id + '_CR' -> flag``; only read when
            ``num_groups == 3``.
        context_triplet_details: forwarded to ``output_details`` for ``id + '_CR'``.
        num_groups: 2 evaluates original + semantic; 3 additionally requires the
            context variant to be correct. Any other value is evaluated like 2
            but printed with the "Ori & Sem & Con" label, as before.

    Returns:
        ``(wrong_predictions, accuracy)`` where ``wrong_predictions`` maps each
        original id with at least one wrong variant to the concatenated
        ``output_details`` reports, and ``accuracy`` is the fraction of fully
        correct groups rounded to 3 decimals (``0.0`` when there are no groups).

    Raises:
        KeyError: if a ``_SR`` / ``_CR`` counterpart of an original id is missing.
    """
    # NOTE(review): no inference happens in this function; this call is kept
    # only to preserve the original side effect on the notebook-global `model`.
    model.eval()

    wrong_predictions = {}
    total_correct = 0
    use_context = num_groups == 3

    for key in original_triplet_res:
        # (id in that variant, correctness flags, prediction details, report title)
        variants = [
            (key, original_triplet_res, original_triplet_details, "Original"),
            (key + '_SR', scemantic_triplet_res, scemantic_triplet_details, "Semantic"),
        ]
        if use_context:
            variants.append((key + '_CR', context_triplet_res, context_triplet_details, "Context"))

        # Resolve every flag first so a missing id fails before any report is built.
        flags = [results[variant_key] for variant_key, results, _, _ in variants]

        # The context flag is part of `flags` only when num_groups == 3, so the
        # 3-group metric now really requires all three variants to be correct.
        if all(flags):
            total_correct += 1

        for (variant_key, _, details, title), is_correct in zip(variants, flags):
            if not is_correct:
                wrong_predictions[key] = wrong_predictions.get(key, "") + output_details(variant_key, details, title, dataset)

    total_instances = len(original_triplet_res)
    # Guard against an empty result set instead of failing on an unbound counter.
    accuracy = round(total_correct / total_instances, 3) if total_instances else 0.0

    if num_groups == 2:
        print("Accuracy Ori & Sem: {} -> {}/{}".format(accuracy, total_correct, total_instances))
    else:
        print("Accuracy Ori & Sem & Con: {} -> {}/{}".format(accuracy, total_correct, total_instances))

    return wrong_predictions, accuracy


### Ori & Sem Accuracy


In [None]:
# Group accuracy over the original + semantic variants (num_groups=2).
# NOTE(review): rows are looked up in `train_dataset`; confirm this is the split
# the *_ids / *_wrong_ids results were computed on.
wrong_preds, ori_sem_accuracy = group_accuracy(
    dataset=train_dataset,
    original_triplet_res=original_wrong_ids,
    original_triplet_details=original_ids,
    scemantic_triplet_res=scemantic_wrong_ids,
    scemantic_triplet_details=scemantic_ids,
    context_triplet_res=context_wrong_ids,
    context_triplet_details=context_ids,
    num_groups=2,
)

In [None]:
print("Accuracy is: ", ori_sem_accuracy)

# Assemble the text report: an accuracy header followed by one
# "<id>\n<details>\n" section per group with at least one wrong variant.
ori_sem_sections = ["Accuracy: " + str(ori_sem_accuracy) + '\n\n']
for key, details in wrong_preds.items():
    ori_sem_sections.append(key + '\n')
    ori_sem_sections.append(details + '\n')
ori_sem_details = "".join(ori_sem_sections)

### Ori & Sem & Con Accuracy

In [None]:
# Group accuracy over the original + semantic + context variants (num_groups=3).
# Overwrites `wrong_preds` from the 2-group run above.
wrong_preds, ori_sem_con_accuracy = group_accuracy(
    dataset=train_dataset,
    original_triplet_res=original_wrong_ids,
    original_triplet_details=original_ids,
    scemantic_triplet_res=scemantic_wrong_ids,
    scemantic_triplet_details=scemantic_ids,
    context_triplet_res=context_wrong_ids,
    context_triplet_details=context_ids,
    num_groups=3,
)

In [None]:
print("Accuracy is: ", ori_sem_con_accuracy)

# Assemble the text report: an accuracy header followed by one
# "<id>\n<details>\n" section per group with at least one wrong variant.
ori_sem_con_sections = ["Accuracy is: " + str(ori_sem_con_accuracy) + '\n\n']
for key, details in wrong_preds.items():
    ori_sem_con_sections.append(key + '\n')
    ori_sem_con_sections.append(details + '\n')
ori_sem_con_details = "".join(ori_sem_con_sections)

## For the competition: Try the Trained Model!

Here we handle the test set that is provided by the competition. We are following the same logic as above.

### Prepare test dataset

In [None]:
# Run the same final preprocessing on each of the three test-set variants,
# in the order original -> semantic -> context.
testset_original_datasets, testset_scemantic_datasets, testset_context_datasets = (
    get_final_dataset(variant)
    for variant in (
        testset_original_test_dataset,
        testset_scemantic_test_dataset,
        testset_context_test_dataset,
    )
)

In [None]:
# Group the rows of each test-set variant, in the order
# original -> semantic -> context.
grouped_pairs_testset_original, grouped_pairs_testset_scemantic, grouped_pairs_testset_context = (
    group_same_dataset(variant)
    for variant in (
        testset_original_test_dataset,
        testset_scemantic_test_dataset,
        testset_context_test_dataset,
    )
)

### Predict with fine-tuned model

##### Accuracy on each dataset (original, semantic, context) by itself


The cell below runs `compute_triplets_acc` on each test-set variant (original, semantic, context) and collects all the information needed to calculate the accuracy of the model on that variant.

In [None]:
# Each call yields, in order: per-id details, the "none of above" result,
# the accuracy, and the per-id flags later passed to group_accuracy.
(
    test_set_original_ids,
    test_set_original_none_of_above,
    test_set_original_acc,
    test_set_original_wrong_ids,
) = compute_triplets_acc(testset_original_test_dataset, grouped_pairs_testset_original)

(
    test_set_scemantic_ids,
    test_set_scemantic_none_of_above,
    test_set_scemantic_acc,
    test_set_scemantic_wrong_ids,
) = compute_triplets_acc(testset_scemantic_test_dataset, grouped_pairs_testset_scemantic)

(
    test_set_context_ids,
    test_set_context_none_of_above,
    test_set_context_acc,
    test_set_context_wrong_ids,
) = compute_triplets_acc(testset_context_test_dataset, grouped_pairs_testset_context)

In [None]:
# One heading line followed by the accuracy rounded to 3 decimals, per variant.
print("Accuracy of original dataset:", round(test_set_original_acc, 3), sep="\n")
print("Accuracy of scemantic dataset:", round(test_set_scemantic_acc, 3), sep="\n")
print("Accuracy of context dataset:", round(test_set_context_acc, 3), sep="\n")


### Ori & Sem Accuracy


In [None]:
# Group accuracy on the competition test set over original + semantic (num_groups=2).
test_set_wrong_preds, test_set_ori_sem_accuracy = group_accuracy(
    dataset=test_dataset,
    original_triplet_res=test_set_original_wrong_ids,
    original_triplet_details=test_set_original_ids,
    scemantic_triplet_res=test_set_scemantic_wrong_ids,
    scemantic_triplet_details=test_set_scemantic_ids,
    context_triplet_res=test_set_context_wrong_ids,
    context_triplet_details=test_set_context_ids,
    num_groups=2,
)

In [None]:
print("Accuracy is: ", test_set_ori_sem_accuracy)

# Assemble the text report: an accuracy header followed by one
# "<id>\n<details>\n" section per group with at least one wrong variant.
test_set_ori_sem_sections = ["Accuracy is: " + str(test_set_ori_sem_accuracy) + "\n\n"]
for key, details in test_set_wrong_preds.items():
    test_set_ori_sem_sections.append(key + '\n')
    test_set_ori_sem_sections.append(details + '\n')
test_set_ori_sem_details = "".join(test_set_ori_sem_sections)

### Ori & Sem & Con Accuracy

In [None]:
# Group accuracy on the competition test set over original + semantic + context
# (num_groups=3). Overwrites `test_set_wrong_preds` from the 2-group run above.
test_set_wrong_preds, test_set_ori_sem_con_accuracy = group_accuracy(
    dataset=test_dataset,
    original_triplet_res=test_set_original_wrong_ids,
    original_triplet_details=test_set_original_ids,
    scemantic_triplet_res=test_set_scemantic_wrong_ids,
    scemantic_triplet_details=test_set_scemantic_ids,
    context_triplet_res=test_set_context_wrong_ids,
    context_triplet_details=test_set_context_ids,
    num_groups=3,
)

In [None]:
print("Accuracy is: ", test_set_ori_sem_con_accuracy)

# Assemble the text report: an accuracy header followed by one
# "<id>\n<details>\n" section per group with at least one wrong variant.
test_set_ori_sem_con_sections = ["Accuracy is: " + str(test_set_ori_sem_con_accuracy) + "\n\n"]
for key, details in test_set_wrong_preds.items():
    test_set_ori_sem_con_sections.append(key + '\n')
    test_set_ori_sem_con_sections.append(details + '\n')
test_set_ori_sem_con_details = "".join(test_set_ori_sem_con_sections)

Save information of mispredictions regarding group-based metric

In [None]:
def save_to_text_file(content, filename, encoding="utf-8"):
    """Write ``content`` to ``filename``, replacing any existing file.

    Args:
        content: text to write.
        filename: path of the file to create or overwrite.
        encoding: text encoding used for the file. Defaults to UTF-8 so the
            result does not depend on the platform locale and non-ASCII
            characters in the reports cannot raise ``UnicodeEncodeError``.
    """
    with open(filename, 'w', encoding=encoding) as file:
        file.write(content)

In [None]:
# Reports for the first evaluation (group metric with 2 and 3 variants).
save_to_text_file(content=ori_sem_details, filename='./ori_sem_wrong.txt')
save_to_text_file(content=ori_sem_con_details, filename='./ori_sem_con_wrong.txt')

# Same two reports for the competition test set.
save_to_text_file(content=test_set_ori_sem_details, filename='./test_set_ori_sem_wrong.txt')
save_to_text_file(content=test_set_ori_sem_con_details, filename='./test_set_ori_sem_con_wrong.txt')

Gathering the results into a CSV file

In [None]:
# One timestamp for both rows, so the two result rows of a run always match.
run_timestamp = pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")

# Result row for the first evaluation (non-competition data).
new_row_data = {
    'checkpoint': [model_name],
    'task': [task+"__TxtCls"],
    'lr': [lr],
    'batch_size': [batch_size],
    'num_epochs': [num_epochs],
    'original_acc': [original_acc],
    'semantic_acc': [scemantic_acc],
    'context_acc': [context_acc],
    'ori_sem_acc': [ori_sem_accuracy],
    'ori_sem_con_acc': [ori_sem_con_accuracy],
    'date_of_run': [run_timestamp],
}
df_train = pd.DataFrame(new_row_data)

# Result row for the competition test set.
new_row_test_set_data = {
    'checkpoint': [model_name],
    'task': [task+"__TxtCls_test_set"],
    'lr': [lr],
    'batch_size': [batch_size],
    'num_epochs': [num_epochs],
    'original_acc': [test_set_original_acc],
    'semantic_acc': [test_set_scemantic_acc],
    'context_acc': [test_set_context_acc],
    'ori_sem_acc': [test_set_ori_sem_accuracy],
    'ori_sem_con_acc': [test_set_ori_sem_con_accuracy],
    'date_of_run': [run_timestamp],
}
df_test = pd.DataFrame(new_row_test_set_data)

# Stack both rows with the public pandas API (the private `DataFrame._append`
# is not a supported interface) and renumber the index so rows are 0 and 1.
df_res = pd.concat([df_train, df_test], ignore_index=True)
display(df_res)
df_res.to_csv('./results.csv', index=False)

# # df_res.to_csv('/kaggle/input/results/results.csv', index=True)
# df_res.to_csv('../results/results.csv', index=False)

##### Save model

In [None]:
# Prefix of the checkpoint id before the first '/' (e.g. the organisation).
# `partition` returns the whole name when there is no '/', whereas slicing with
# `find` (which returns -1 in that case) would silently drop the last character.
check = model_name.partition('/')[0]

# Save the fine-tuned model under ./models/<task>_<prefix>_<timestamp>.
model.save_pretrained('./models/{}_{}_{}'.format(task, check, pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")))

## Logic to export the results when running in Kaggle

* The following logic produces a zip file of the results so that it can be downloaded. The zip file name can be changed through the `NAME_OF_ZIP_FILE` variable.

In [None]:
# Show what is currently stored in the Kaggle working directory.
kaggle_working_entries = os.listdir("/kaggle/working/")
print(kaggle_working_entries)

In [None]:
# Show the contents of the current working directory.
cwd_entries = os.listdir()
print(cwd_entries)

In [None]:
from zipfile import ZipFile
from IPython.display import FileLink

# Base name of the archive (without ".zip"); change it to rename the download.
NAME_OF_ZIP_FILE = run_dir

# Folder whose contents are archived, and the archive written to the current directory.
directory_to_zip = '/kaggle/working/' + run_dir
zip_file_name = '{}.zip'.format(NAME_OF_ZIP_FILE)

with ZipFile(zip_file_name, 'w') as zip_obj:
    # Walk the whole tree; every file is stored under its path relative to
    # `directory_to_zip`, so the archive does not contain the absolute prefix.
    for root, _subdirs, files in os.walk(directory_to_zip):
        for file_path in (os.path.join(root, file_name) for file_name in files):
            archive_name = os.path.relpath(file_path, directory_to_zip)
            zip_obj.write(file_path, archive_name)

# Last expression of the cell: renders a download link for the archive.
FileLink(zip_file_name)


The following code is used to check the contents of the zip file.

In [None]:

# Path to the ZIP file: reuse the archive name built in the previous cell
# instead of the literal placeholder 'NAME_OF_ZIP_FILE.zip', which does not
# exist on disk and made this cell fail under "Run All".
zip_file_path = zip_file_name

# Open the ZIP file in read mode
with ZipFile(zip_file_path, 'r') as zip_file:
    # Print the list of elements (files and directories) inside the ZIP file
    print("Elements inside the ZIP file:")
    for element in zip_file.namelist():
        print(element)
