# Install packages

In [None]:
# !pip install --force-reinstall numpy==1.22 # 1.23.4


In [None]:
!pip install datasets==2.14.6
!pip install transformers
!pip install evaluate
!pip install --no-cache-dir transformers sentencepiece

In [None]:
!pip install accelerate -U

# Imports

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForMultipleChoice, get_scheduler, TrainingArguments, Trainer

# from string import Template
# from pathlib import Path

import os

import warnings
warnings.simplefilter("ignore")

from tqdm.notebook import tqdm

import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict

from torch.utils.data import DataLoader


from IPython.display import Markdown, display

# Prepare training data

To access certain Large Language Models (LLMs) through the Hugging Face library, you may need an access token. You can obtain one by signing up on the Hugging Face website and requesting permission to use the specific model you are interested in.

The following cell demonstrates how to pass your access token in order to download the model and tokenizer. Put your access token in the `YOUR_HUGGING_FACE_TOKEN` variable.

In [None]:
from huggingface_hub import login

# Prefer a token supplied through the HF_TOKEN environment variable so the secret never
# has to be typed into (and accidentally shared with) the notebook. If the variable is
# not set, the placeholder below is used and must be replaced by hand.
login(token=os.environ.get('HF_TOKEN', 'YOUR_HUGGING_FACE_TOKEN'))

Here we determine the model we are using and the sub-task we are solving (Sentence Puzzle or Word Puzzle).

In [None]:
# Sub-task identifier; it is interpolated into the data file names and the run directory name.
task = "SP"
# Model identifier; used below to build the name of the run directory.
model_name = "FacebookAI/roberta-large"

### Importing into Colab

Here we demonstrate how to import data into Colab. We have uploaded the data folder of the repository to a private Google Drive folder. Our folder is called `sem-dataset`.

In [4]:
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
# os.chdir('/content/drive/My Drive/sem-dataset')


In [6]:
# train_data = np.load('./data/'+task+'-train.npy', allow_pickle=True)

# test_data = np.load('./data/'+task+'_test_labeled.npy', allow_pickle=True)

### Importing into Kaggle

Here we demonstrate how to import data into Kaggle. We have uploaded the data folder of the repository to a private Kaggle dataset. Our dataset is called `sem-dataset`.

In [None]:
# Print the full path of every file found under the attached Kaggle dataset.
for folder, _subfolders, names in os.walk('/kaggle/input/sem-dataset'):
    for full_path in (os.path.join(folder, name) for name in names):
        print(full_path)

Here we import train and test data from the dataset.

In [8]:
# NOTE(review): allow_pickle=True unpickles arbitrary objects; only load files you trust.
kaggle_data_dir = '/kaggle/input/sem-dataset/'

train_data = np.load(f'{kaggle_data_dir}{task}-train.npy', allow_pickle=True)
test_data = np.load(f'{kaggle_data_dir}{task}_test_labeled.npy', allow_pickle=True)

### Make directory for our output

In [None]:
# Minute-resolution timestamp that keeps the output folders of different runs apart.
date_of_run = pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")

# Derive a filesystem-safe suffix from the model name.
parts = model_name.split("/")
if len(parts) >= 6:
    # Long (path-like) model names: keep the two components the notebook has always
    # selected (indices 3 and 5). Index 5 needs at least 6 parts; the previous
    # `>= 5` guard raised IndexError for names with exactly 5 parts.
    model_suffix = parts[3] + "_" + parts[5]
else:
    # Hub ids ("org/model") and bare names: flatten any "/" so the run directory is a
    # single folder. This also defines model_suffix when the name has no "/" at all.
    model_suffix = model_name.replace("/", "_")

run_dir = "./sftt_Mlt_" + task + "_" + model_suffix + "_" + date_of_run
print(run_dir)

# Create the run directory (no error if it already exists) and make it the working
# directory, so every relative path used below resolves inside it.
# NOTE: re-running this cell without restarting the kernel nests a new run directory
# inside the current one, because the path is relative to the current directory.
os.makedirs(run_dir, exist_ok=True)

os.chdir(run_dir)

# Basic preprocessing

* Here we preprocess the data by splitting it into three variants: Original, Semantic Reconstruction and Context Reconstruction.

* We then split the data into train, validation and test sets for each of the three types of data. This is done before shuffling in order to retain the same ids in the training, validation and test sets regarding the three types of data.

After that we concatenate the three variants (Original, Semantic, Context) within each split and shuffle the result.


We create a test split of the given training data to evaluate the model on unseen data. This is done because of the absence of a test set in the dataset in the beginning of the competition.

In [13]:
def convert_from_numpy_to_dataset_type (numpy_array, split):
    """Turn the loaded numpy array of records into a Hugging Face `Dataset`.

    Args:
        numpy_array: array whose `.tolist()` yields records accepted by `pd.DataFrame`.
        split: split name forwarded to `Dataset.from_pandas`; when it equals "train"
            the id/distractor columns are cast to `str` and `label` to `int` first.

    Returns:
        The created dataset. Its first row and its features are displayed as a side effect.
    """
    frame = pd.DataFrame(numpy_array.tolist())

    if split == "train":
        # Normalise column dtypes before the conversion.
        for text_column in ('id', 'distractor1', 'distractor2', 'distractor(unsure)'):
            frame[text_column] = frame[text_column].astype(str)
        frame['label'] = frame['label'].astype(int)

    converted = Dataset.from_pandas(frame, split=split)

    # Quick visual sanity check: one example plus the inferred feature types.
    display(converted[0])
    display(converted.features)

    return converted

Importing the tokenizer in order to tokenize the data.


In [None]:
# Reuse the model chosen in the configuration cell instead of repeating the literal, so
# the tokenizer, the model and the run-directory name can never drift apart.
checkpoint = model_name

tokenizer = AutoTokenizer.from_pretrained(checkpoint)

## `preprocess_function` Overview

The preprocessing function described below takes input rows of our multiple-choice dataset.

### Steps:

1. **Combine Sentences:**
   - Replicate each `question` four times to create sentence starts for pairing with each `choice`.

2. **Extract Sentence Endings (`second_sentences`)**:
   - Flattens the list of choice lists to extract all possible sentence endings.

3. **Flatten and Tokenize:**
   - Tokenize the flattened sentences to obtain `input_ids` and `attention_mask`.

4. **Unflatten and Assign Labels:**
   - Unflatten the tokenized sequences into pairs of `(input_ids, attention_mask)` corresponding to `questions` and `choices`.


In [15]:
def preprocess_function(examples):
    """Tokenize a batch of multiple-choice examples as (question, choice) pairs.

    Args:
        examples: batch mapping with a "question" column (one string per example) and
            a "choice_list" column (one list of answer strings per example).

    Returns:
        dict mapping every tokenizer output key to a list with one entry per example;
        each entry is the list of per-choice values for that example.

    Raises:
        ValueError: if the examples of the batch do not all have the same number of
            choices (the flat tokenizer output could not be regrouped correctly).
    """
    questions = examples["question"]
    choice_lists = examples["choice_list"]

    # Number of choices is taken from the data instead of being hard-coded; 4 is kept
    # as the fallback for an empty batch.
    num_choices = len(choice_lists[0]) if len(choice_lists) > 0 else 4

    # Build the two flat, aligned lists in a single linear pass (repeated list
    # concatenation via sum(..., []) is quadratic in the batch size).
    first_sentences = []
    second_sentences = []
    for question, choices in zip(questions, choice_lists):
        if len(choices) != num_choices:
            raise ValueError(
                "Expected {} choices per example, got {}".format(num_choices, len(choices))
            )
        first_sentences.extend([question] * num_choices)
        second_sentences.extend(choices)

    tokenized_examples = tokenizer(first_sentences, second_sentences, truncation=True)

    # Regroup the flat outputs: num_choices consecutive entries belong to one example.
    return {
        k: [v[i : i + num_choices] for i in range(0, len(v), num_choices)]
        for k, v in tokenized_examples.items()
    }

In [None]:
# display(train_data[3])

### Train data

In [None]:
# Convert the raw training array; "train" enables the dtype casts inside the converter.
train_dataset = convert_from_numpy_to_dataset_type(train_data, "train")

Here we are using `.map()` to apply the `preprocess` function to the dataset.

In [18]:
# Apply preprocess_function to the training data batch by batch (batched=True).
tokenized_train = train_dataset.map(preprocess_function, batched=True)
# print(f"Training set size: {len(tokenized_train)}")


Map:   0%|          | 0/507 [00:00<?, ? examples/s]

Training set size: 507


### Test data

In [19]:
def convert_from_numpy_to_dataset_test_type (numpy_array):
    """Turn the labelled test array into a Hugging Face `Dataset`.

    Args:
        numpy_array: array whose `.tolist()` yields records accepted by `pd.DataFrame`
            and that contain at least the `id` and `label` fields.

    Returns:
        The created dataset. Its first row and its features are displayed as a side effect.
    """
    data_list = numpy_array.tolist()
    # Build the frame once (it used to be constructed twice from the same list).
    df = pd.DataFrame(data_list)

    df['id'] = df['id'].astype(str)
    df['label'] = df['label'].astype(int)

    dataset = Dataset.from_pandas(df)

    display(dataset[0])

    display(dataset.features) # just to check the type of the features

    return dataset

In [None]:
# Convert the labelled test array loaded above into a Dataset.
test_dataset = convert_from_numpy_to_dataset_test_type(test_data)

In [None]:
# Apply the same batched tokenization to the test data.
tokenized_test = test_dataset.map(preprocess_function, batched=True)
# print(f"Test set size: {len(tokenized_test)}")

## Splitting the dataset

### Train data

In [23]:
from sklearn.model_selection import train_test_split

In [None]:
# Partition the tokenized training data by the variant marker carried in each id:
# "_SR" => Semantic Reconstruction, "_CR" => Context Reconstruction, neither => original.
ori_original_dataset = tokenized_train.filter(
    lambda example: not any(marker in example["id"] for marker in ("_SR", "_CR"))
)
ori_scemantic_dataset = tokenized_train.filter(lambda example: "_SR" in example["id"])
ori_context_dataset = tokenized_train.filter(lambda example: "_CR" in example["id"])

# print(f"Original dataset size: {len(ori_original_dataset)}")
# print(f"Semantic dataset size: {len(ori_scemantic_dataset)}")
# print(f"Context dataset size: {len(ori_context_dataset)}")

In [26]:
def splitting_dataset(dataset, split_size):
    """Split `dataset`, in its stored order, into train / valid / test parts.

    Args:
        dataset: object exposing `train_test_split(test_size=..., shuffle=...)`.
        split_size: value passed as `test_size` for the first split; the part held out
            there is then split in half into validation and test.

    Returns:
        DatasetDict with the keys "train", "test" and "valid".
    """
    # First cut: training part versus everything that is held out (no shuffling).
    first_cut = dataset.train_test_split(test_size=split_size, shuffle=False)

    # Second cut: divide the held-out part 50/50, again without shuffling.
    second_cut = first_cut["test"].train_test_split(test_size=0.5, shuffle=False)

    splits = {
        "train": first_cut["train"],
        "test": second_cut["test"],
        "valid": second_cut["train"],
    }
    return DatasetDict(splits)


Here we are splitting the dataset into train, validation and test sets. **A good rule of thumb is to use 70% of the data for training, 15% for validation and 15% for testing.**

<u>**WE DO NOT WANT TO SHUFFLE THE DATASET BEFORE SPLITTING IT TO KEEP THE ORDER OF THE SENTENCES!!!**</u>

In [27]:
# Apply the same ordered split (30% held out) to each of the three variants.
held_out_fraction = 0.3
original_dataset, scemantic_dataset, context_dataset = (
    splitting_dataset(variant, held_out_fraction)
    for variant in (ori_original_dataset, ori_scemantic_dataset, ori_context_dataset)
)


In [29]:
from datasets import concatenate_datasets

# For the train and valid splits: check that the three variants share the same feature
# type, then stack them in the order original -> semantic -> context.
merged_splits = {}
for split_name in ("train", "valid"):
    reference_type = original_dataset[split_name].features.type
    for variant in (scemantic_dataset, context_dataset):
        assert reference_type == variant[split_name].features.type
    merged_splits[split_name] = concatenate_datasets(
        [original_dataset[split_name], scemantic_dataset[split_name], context_dataset[split_name]]
    )

train_dataset = merged_splits["train"]
valid_dataset = merged_splits["valid"]
# print(f"Training set size: {len(train_dataset)}")
# print(f"Validation set size: {len(valid_dataset)}")

In [None]:
# Shuffle both merged splits with a fixed seed, then bundle them in one DatasetDict.
shuffle_seed = 42
train_dataset, valid_dataset = (
    split.shuffle(seed=shuffle_seed) for split in (train_dataset, valid_dataset)
)

my_dataset = DatasetDict({"train": train_dataset, "valid": valid_dataset})

# print(my_dataset)

## Fine-tuning model

Transformers doesn’t have a data collator for multiple choice, so you’ll need to adapt the `DataCollatorWithPadding` to create a batch of examples. It’s more efficient to dynamically pad the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [31]:
from dataclasses import dataclass
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from typing import Optional, Union


@dataclass
class DataCollatorForMultipleChoice:
    """
    Data collator that will dynamically pad the inputs for multiple choice received.

    Calling an instance with a list of feature dicts (each holding per-choice lists such
    as `input_ids` plus a `label`/`labels` entry) returns a dict of tensors reshaped to
    (batch_size, num_choices, -1), with the targets stored under "labels".
    The incoming feature dicts are left unmodified.
    """

    # Tokenizer whose `pad` method performs the padding.
    tokenizer: PreTrainedTokenizerBase
    # The three settings below are forwarded unchanged to `tokenizer.pad`.
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        # The target column may be called "label" or "labels".
        label_name = "label" if "label" in features[0].keys() else "labels"
        # Read the labels without popping them, so the caller's dicts stay intact and
        # the same features can be collated more than once.
        labels = [feature[label_name] for feature in features]
        batch_size = len(features)
        num_choices = len(features[0]["input_ids"])

        # One flat entry per (example, choice) pair, built in a single pass; the label
        # is excluded because it is not a per-choice sequence.
        flattened_features = [
            {k: v[i] for k, v in feature.items() if k != label_name}
            for feature in features
            for i in range(num_choices)
        ]

        batch = self.tokenizer.pad(
            flattened_features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors="pt",
        )

        # Un-flatten back to one row of num_choices sequences per example.
        batch = {k: v.view(batch_size, num_choices, -1) for k, v in batch.items()}
        batch["labels"] = torch.tensor(labels, dtype=torch.int64)
        return batch

When called on a list of examples, it will flatten all the inputs/attentions masks etc. in big lists that it will pass to the `tokenizer.pad` method. This will return a dictionary with big tensors (of shape `(batch_size * 4) x seq_length`) that we then unflatten.

We can check this data collator works on a list of features, we just have to make sure to remove all features that are not inputs accepted by our model:

In [32]:
# Smoke-test the collator on the first ten training examples, keeping only the keys it
# can handle.
accepted_keys = ["input_ids", "attention_mask", "label", "labels"]
features = []
for i in range(10):
    example = my_dataset["train"][i]
    features.append({k: example[k] for k in example if k in accepted_keys})
batch = DataCollatorForMultipleChoice(tokenizer)(features)

You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


## Evaluate

Including a metric during training is often helpful for evaluating your model’s performance. For this task, we load the accuracy metric.

In [None]:
import evaluate

accuracy = evaluate.load("accuracy")

In [34]:
def compute_metrics(eval_pred):
    """Compute accuracy for the (predictions, labels) pair passed in `eval_pred`.

    The predicted class of each example is the argmax of its predictions along axis 1.
    Returns whatever `accuracy.compute` returns.
    """
    scores, references = eval_pred
    predicted_classes = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_classes, references=references)

## Train

First we need to preprocess the data for the trainer.


The `get_final_dataset` function modifies the input `dataset` by renaming the column "label" to "labels". 

* Depending on the value of `i` (either 1 or not 1), it removes specific columns from the dataset. It then sets the format of the dataset to "torch" and returns the modified dataset.

In [36]:
def get_final_dataset(dataset, i):
    """Prepare a tokenized dataset for the model.

    Renames the "label" column to "labels", drops the columns that are not model
    inputs and sets the output format to "torch".

    Args:
        dataset: object exposing `rename_column`, `remove_columns` and `set_format`.
        i: 1 selects the longer list of columns to drop (with the distractor and
            choice_order columns); any other value selects the shorter list.

    Returns:
        The renamed, pruned and torch-formatted dataset.
    """
    columns_to_drop = (
        ['id', 'question', 'answer', 'distractor1', 'distractor2', 'distractor(unsure)', 'choice_list', 'choice_order']
        if i == 1
        else ['id', 'question', 'choice_list', 'answer']
    )
    prepared = dataset.rename_column("label", "labels").remove_columns(columns_to_drop)
    prepared.set_format("torch")
    return prepared

In [37]:
# Train/valid splits used for fine-tuning (i=1 drops the longer list of raw columns).
tokenized_datasets = get_final_dataset(my_dataset, 1)

# Held-out "test" part of each variant, prepared the same way for the per-variant evaluation.
original_datasets = get_final_dataset(original_dataset["test"], 1)
scemantic_datasets = get_final_dataset(scemantic_dataset["test"], 1)
context_datasets = get_final_dataset(context_dataset["test"], 1)



We disable Weights & Biases. If you do want to use it for tracking the training metrics, you will need to provide an API key when prompted.

In [41]:
os.environ["WANDB_DISABLED"] = "true"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:64"


### Here we are loading the model we are using for the task.

In [44]:
# Load the multiple-choice model from `checkpoint`.
# NOTE(review): ignore_mismatched_sizes=True presumably tolerates shape mismatches between
# checkpoint and model weights instead of raising — confirm it is needed here.
model = AutoModelForMultipleChoice.from_pretrained(checkpoint, ignore_mismatched_sizes=True)

We check for the availability of a CUDA-enabled GPU and assign the appropriate device and then we move our model to that device for computation.

In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# print(device)
model.to(device)

Here we are passing basic arguments to the `Trainer` class.
- **`batch_size`**: This parameter determines the number of examples (data points) processed in each iteration (or batch) during training.

- **`lr` (learning rate)**: This is the rate at which the model weights are updated during training.

- **`num_epochs`**: Specifies the number of times the training dataset will be iterated over by the model during training.

- **`num_training_steps`**: This calculates the total number of training steps that will be performed over the specified number of epochs.

- **`batches_per_epoch`**: This represents the number of batches (or iterations) that will be processed in each epoch.

In [46]:
batch_size = 4

lr = 3e-5

num_epochs = 3

# Ceiling division: the last, smaller batch of an epoch is still processed (the Trainer
# does not drop it by default), so floor division undercounted the steps per epoch and
# therefore the horizon handed to the learning-rate scheduler.
batches_per_epoch = -(-len(my_dataset["train"]) // batch_size)
num_training_steps = batches_per_epoch * num_epochs
# print(batches_per_epoch)


88


We are initializing optimizer and scheduler here.

In [47]:
# Optimizer initialization
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)

# Learning rate scheduler initialization
lr_scheduler = get_scheduler(
    name="linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps
)

Several of the arguments that we define are the following:

- `output_dir`: The directory where model checkpoints and outputs will be saved.
- `logging_steps`: Log metrics every specified number of training steps.
- `logging_strategy`: Specify whether logging is done by "steps" or "epoch".
- `save_strategy`: Strategy for saving model checkpoints, either by "epoch" or "steps".
- `save_steps`: Save a model checkpoint every specified number of steps.
- `save_total_limit`: Maximum number of checkpoints to keep.
- `evaluation_strategy`: Strategy for evaluating the model during training.
- `eval_steps`: Evaluate the model every specified number of training steps.
- `report_to`: Where to report evaluation results, set to "none" to disable reporting.


In [None]:
import accelerate

training_args = TrainingArguments(
    output_dir="./output",

    # Evaluate and log every 20 optimizer steps.
    # (Per-epoch alternative: evaluation_strategy="epoch", logging_strategy="epoch".)
    evaluation_strategy="steps",
    eval_steps=20,
    logging_strategy="steps",
    logging_steps=20,

    learning_rate=lr,
    num_train_epochs=num_epochs,
    # NOTE: a positive max_steps takes precedence over num_train_epochs, so training
    # stops after 100 optimizer steps regardless of num_epochs, while the scheduler
    # built above is sized with num_training_steps.
    max_steps=100,

    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=1,
    # warmup_steps=0,
    # weight_decay=0.01,
    # logging_dir="./logs",

    # The string "none" disables all reporting integrations; Python None does not
    # (it falls back to the library default).
    report_to="none",

    # Write a checkpoint every 100 steps (use save_strategy="no" to skip checkpoints).
    save_strategy="steps",
    save_steps=100,
)

In [None]:
# Assemble the Trainer: model, arguments, the multiple-choice collator, the train/valid
# splits, the manually created optimizer/scheduler pair and the accuracy callback.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["valid"],
    optimizers=(optimizer, lr_scheduler),  # Pass both optimizer and scheduler
    compute_metrics=compute_metrics
)

Now we are ready to train our model!

In [None]:
# Training loop using Trainer API
# Announce which checkpoint is being fine-tuned, then run training and keep its result object.
print('training model {}...'.format(checkpoint))

train_result = trainer.train()

In [52]:
# Persist the training metrics under the "train" name and save the trainer state.
metrics = train_result.metrics
trainer.save_metrics("train", metrics)
trainer.save_state()

## Predict with fine-tuned model

To accelerate iteration over our test split, we will utilize a `DataLoader` to process the data in batches during testing. This approach improves efficiency by enabling faster iteration and batch-wise handling of the test dataset.

In [53]:
batch_size = 4

# A single collator instance is shared by every loader below.
multiple_choice_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

train_dataloader = DataLoader(
    tokenized_datasets["train"], shuffle=True, batch_size=batch_size, collate_fn=multiple_choice_collator
)
val_dataloader = DataLoader(
    tokenized_datasets["valid"], batch_size=batch_size, collate_fn=multiple_choice_collator
)

# One unshuffled loader per variant of the held-out split.
original_test_dataloader = DataLoader(
    original_datasets, batch_size=batch_size, shuffle=False, collate_fn=multiple_choice_collator
)
scemantic_test_dataloader = DataLoader(
    scemantic_datasets, batch_size=batch_size, shuffle=False, collate_fn=multiple_choice_collator
)
context_test_dataloader = DataLoader(
    context_datasets, batch_size=batch_size, shuffle=False, collate_fn=multiple_choice_collator
)


##### Accuracy on each dataset (original, semantic, context) by itself

We accumulate all the batches with add_batch and calculate the accuracy metric based on predicted labels compared to ground truth labels.

In [57]:
def instance_acc(dataloader):
    """Compute and print the accuracy of the global `model` over `dataloader`.

    Args:
        dataloader: iterable of batches (dicts of tensors that include "labels").

    Returns:
        The accuracy rounded to three decimals.
    """
    scorer = evaluate.load("accuracy")
    model.eval()

    with torch.no_grad():
        for raw_batch in dataloader:
            # Move every tensor of the batch to the device the model lives on.
            on_device = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            predicted = torch.argmax(model(**on_device).logits, dim=-1)
            scorer.add_batch(predictions=predicted, references=on_device["labels"])

    result = round(scorer.compute()["accuracy"], 3)

    print(f"Accuracy: {result}")

    return result
    

In [None]:
# Per-variant accuracy on the held-out part of the training data.
original_acc = instance_acc(original_test_dataloader)
scemantic_acc = instance_acc(scemantic_test_dataloader)
context_acc = instance_acc(context_test_dataloader)

Here we implement the logic to calculate group-based accuracy. 

First we check that we have the same ids in all the datasets.

In [None]:
# Initialize a dictionary to store the results
id_is_substring = {}

for id1 in original_dataset["test"]['id']:
    # print(id1)
    id_is_substring[id1] = []
    for id2 in scemantic_dataset["test"]['id']:
        if str(id1) in str(id2):
            id_is_substring[id1].append(id2)
            
    for id3 in context_dataset["test"]['id']:
        if str(id1) in str(id3):
            id_is_substring[id1].append(id3)
    
# print(id_is_substring)

assert len(id_is_substring) == len(original_dataset["test"]['id'])

for key, value in id_is_substring.items():
    assert len(value) == 2

The function below will take a row of the given dataset and model and return all the information needed to calculate the accuracy of the model on that row.

In [60]:
def dataset_compute (row, model):
    """Run the model on one multiple-choice example and return the data needed to score it.

    Args:
        row: one-row selection whose columns are read as `row[column][0]`
            ("question", "choice_list" and "label").
        model: multiple-choice model; the inputs are moved to the device of its parameters.

    Returns:
        Tuple `(prompt, candidates, true_label, predicted_class)` where `prompt` is the
        stripped question, `candidates` the choice list as stored, `true_label` the
        stored label and `predicted_class` the argmax of the logits.
    """
    prompt = row['question'][0].strip()
    candidates = row['choice_list'][0]
    true_label_original = row['label'][0]

    # Follow the model's device instead of hard-coding "cuda", so this also runs on CPU.
    target_device = next(model.parameters()).device

    # One (prompt, candidate) pair per choice, for however many choices there are.
    pairs = [[prompt, candidate.strip()] for candidate in candidates]
    inputs = tokenizer(pairs, return_tensors="pt", padding=True).to(target_device)

    labels = torch.tensor(true_label_original).unsqueeze(0).to(target_device)  # Batch size 1

    # Add the batch dimension and run the forward pass without tracking gradients.
    with torch.no_grad():
        outputs = model(**{k: v.unsqueeze(0) for k, v in inputs.items()}, labels=labels)

    predicted_class = outputs.logits.argmax().item()

    return prompt, candidates, true_label_original, predicted_class

Here based on the `group` number we will calculate the accuracy of the model on that group (Ori&Scem / Ori&Scem&Cont).

In [61]:
def group_accuracy(original_dataset, scemantic_dataset, context_dataset, model, num_groups=2):
    """Compute group-based accuracy over an original example and its reconstructions.

    A group counts as correct only when every member is predicted correctly: original
    and semantic for `num_groups == 2`; original, semantic and context for
    `num_groups == 3`.

    Args:
        original_dataset: dataset with the original examples (needs an 'id' column).
        scemantic_dataset: dataset whose ids are `<original id>_SR`.
        context_dataset: dataset whose ids are `<original id>_CR` (read only when
            `num_groups == 3`).
        model: model passed on to `dataset_compute`.
        num_groups: 2 or 3, see above.

    Returns:
        `(correct_predictions, wrong_predictions, accuracy)`: two dicts mapping the
        original id to the tuple of `dataset_compute` results of its group, and the
        fraction of correct groups rounded to three decimals (0.0 for an empty dataset).

    Raises:
        KeyError: if a required variant of an original id is missing.
    """

    def _first_row_by_id(dataset):
        # One pass over the id column; keeps the first row when an id repeats.
        index = {}
        for position, example_id in enumerate(dataset['id']):
            index.setdefault(example_id, position)
        return index

    def _compute(dataset, index, example_id):
        # Single-row selection via the prebuilt index instead of filtering the whole
        # dataset for every id.
        if example_id not in index:
            raise KeyError("id {!r} not found in dataset".format(example_id))
        return dataset_compute(dataset.select([index[example_id]]), model)

    correct_predictions = {}
    wrong_predictions = {}
    total_correct = 0
    model.eval()  # Set the model to evaluation mode

    original_index = _first_row_by_id(original_dataset)
    semantic_index = _first_row_by_id(scemantic_dataset)
    context_index = _first_row_by_id(context_dataset) if num_groups == 3 else {}

    original_ids = list(original_dataset['id'])
    for key in original_ids:
        # Each result is a tuple (prompt, candidates, true_label, predicted_class).
        group_results = (
            _compute(original_dataset, original_index, key),
            _compute(scemantic_dataset, semantic_index, key + '_SR'),
        )
        if num_groups == 3:
            group_results += (_compute(context_dataset, context_index, key + '_CR'),)

        # The whole group is correct only if every member's prediction equals its label.
        group_is_correct = num_groups in (2, 3) and all(
            result[2] == result[3] for result in group_results
        )

        if group_is_correct:
            correct_predictions[key] = group_results
            total_correct += 1
        else:
            wrong_predictions[key] = group_results

    total_instances = len(original_ids)
    # Guard the division so that an empty dataset yields 0.0 instead of an error.
    accuracy = round(total_correct / total_instances, 3) if total_instances else 0.0
    if num_groups ==2:
        print("Accuracy Ori & Sem: {} -> {}/{}".format(accuracy, total_correct, total_instances))
    else:
        print("Accuracy Ori & Sem & Con: {} -> {}/{}".format(accuracy, total_correct, total_instances))

    return correct_predictions, wrong_predictions, accuracy


Function to display detailed results of the model on a specific group

In [62]:
# wrong_preds has num_groups tuples of (prompt, candidates, true_label, predicted_class)
def display_group_predictions(predictions):
    """Print a readable report of grouped predictions.

    Args:
        predictions: dict mapping a key to a sequence of results, each result being
            indexable as (prompt, candidates, true_label, predicted_class).
    """
    print("Number of predictions: {} \n".format(len(predictions)))
    for key in predictions:
        print(f"Key: {key}")
        for position, result in enumerate(predictions[key]):
            candidates = result[1]
            gold, guess = result[2], result[3]
            # print("    Candidates: {}".format(candidates))
            report_lines = [
                "  Dataset {}:".format(position + 1),
                "    Prompt: {}".format(result[0]),
                "    True Label: {} -> {}".format(gold, candidates[gold].strip()),
                "    Predicted Class: {} -> {}".format(guess, candidates[guess].strip()),
            ]
            for line in report_lines:
                print(line)


### Ori & Sem Accuracy


In [None]:
# Group accuracy over (original, semantic) pairs of the held-out split, then show the failed groups.
correct_preds, wrong_preds, ori_sem_accuracy = group_accuracy(original_dataset["test"], scemantic_dataset["test"], context_dataset["test"], model, num_groups=2)
display_group_predictions(wrong_preds)

### Ori & Sem & Con Accuracy

In [None]:
# Group accuracy over (original, semantic, context) triples of the held-out split, then show the failed groups.
correct_preds, wrong_preds, ori_sem_con_accuracy = group_accuracy(original_dataset["test"], scemantic_dataset["test"], context_dataset["test"], model, num_groups=3)
display_group_predictions(wrong_preds)

## For the competition: try the trained model!

Here we handle the test set that is provided by the competition. We are following the same logic as above.

### Prepare test dataset

In [None]:
# Partition the tokenized competition test set by the variant marker in each id:
# "_SR" => Semantic Reconstruction, "_CR" => Context Reconstruction, neither => original.
# NOTE: these names previously held the train-derived splits and are rebound here.
original_dataset = tokenized_test.filter(
    lambda example: not any(marker in example["id"] for marker in ("_SR", "_CR"))
)
scemantic_dataset = tokenized_test.filter(lambda example: "_SR" in example["id"])
context_dataset = tokenized_test.filter(lambda example: "_CR" in example["id"])

for report_line in (
    f"Original dataset size: {len(original_dataset)}",
    f"Semantic dataset size: {len(scemantic_dataset)}",
    f"Context dataset size: {len(context_dataset)}",
):
    print(report_line)

In [66]:
# check that every id in original_dataset is also in scemantic_dataset and context_dataset
# Compare the id sets of the three variants; reconstructed ids are reduced to the part
# before the first "_" so they can be matched against the original ids.
original_ids = list(original_dataset["id"])
scemantic_ids = [variant_id.split("_")[0] for variant_id in scemantic_dataset["id"]]
context_ids = [variant_id.split("_")[0] for variant_id in context_dataset["id"]]

original_id_set, scemantic_id_set, context_id_set = set(original_ids), set(scemantic_ids), set(context_ids)

print("Difference between original and context datasets {}".format(len(original_id_set - context_id_set)))
print("Difference between scemantic and context datasets {}".format(len(scemantic_id_set - context_id_set)))
print("Difference between original and scemantic datasets {}".format(len(original_id_set - scemantic_id_set)))

Difference between original and context datasets 0
Difference between scemantic and context datasets 0
Difference between original and scemantic datasets 0


In [67]:
# Prepare the three test-set variants for the model; i=0 selects the shorter column list in get_final_dataset.
original_datasets = get_final_dataset(original_dataset, 0)
scemantic_datasets = get_final_dataset(scemantic_dataset, 0)
context_datasets = get_final_dataset(context_dataset, 0)


In [70]:

batch_size = 4

# One unshuffled loader per test-set variant, all sharing a single collator instance.
test_collator = DataCollatorForMultipleChoice(tokenizer=tokenizer)

original_test_dataloader = DataLoader(
    original_datasets, batch_size=batch_size, shuffle=False, collate_fn=test_collator
)
scemantic_test_dataloader = DataLoader(
    scemantic_datasets, batch_size=batch_size, shuffle=False, collate_fn=test_collator
)
context_test_dataloader = DataLoader(
    context_datasets, batch_size=batch_size, shuffle=False, collate_fn=test_collator
)

### Predict with fine-tuned model

##### Accuracy on each dataset (original, semantic, context) by itself


In [71]:
# Per-variant accuracy on the competition test set.
test_set_original_acc = instance_acc(original_test_dataloader)
test_set_scemantic_acc = instance_acc(scemantic_test_dataloader)
test_set_context_acc = instance_acc(context_test_dataloader)

Accuracy: 0.875
Accuracy: 0.875
Accuracy: 0.875


In [72]:
# For every original test id collect the variant ids that contain "<id>_":
# semantic matches first, context matches second.
id_is_substring = {}

for id1 in original_dataset['id']:
    marker = str(id1 +"_")
    matches = [id2 for id2 in scemantic_dataset['id'] if marker in str(id2)]
    matches += [id3 for id3 in context_dataset['id'] if marker in str(id3)]
    id_is_substring[id1] = matches

# print(id_is_substring)

# No original id may be duplicated ...
assert len(id_is_substring) == len(original_dataset['id'])

# ... and each one needs exactly two variants.
for matched_ids in id_is_substring.values():
    assert len(matched_ids) == 2

{'SP-123': ['SP-123_SR', 'SP-123_CR'], 'SP-184': ['SP-184_SR', 'SP-184_CR'], 'SP-73': ['SP-73_SR', 'SP-73_CR'], 'SP-186': ['SP-186_SR', 'SP-186_CR'], 'SP-166': ['SP-166_SR', 'SP-166_CR'], 'SP-146': ['SP-146_SR', 'SP-146_CR'], 'SP-156': ['SP-156_SR', 'SP-156_CR'], 'SP-190': ['SP-190_SR', 'SP-190_CR'], 'SP-4': ['SP-4_SR', 'SP-4_CR'], 'SP-120': ['SP-120_SR', 'SP-120_CR'], 'SP-205': ['SP-205_SR', 'SP-205_CR'], 'SP-19': ['SP-19_SR', 'SP-19_CR'], 'SP-157': ['SP-157_SR', 'SP-157_CR'], 'SP-90': ['SP-90_SR', 'SP-90_CR'], 'SP-191': ['SP-191_SR', 'SP-191_CR'], 'SP-16': ['SP-16_SR', 'SP-16_CR'], 'SP-47': ['SP-47_SR', 'SP-47_CR'], 'SP-30': ['SP-30_SR', 'SP-30_CR'], 'SP-122': ['SP-122_SR', 'SP-122_CR'], 'SP-147': ['SP-147_SR', 'SP-147_CR'], 'SP-189': ['SP-189_SR', 'SP-189_CR'], 'SP-79': ['SP-79_SR', 'SP-79_CR'], 'SP-121': ['SP-121_SR', 'SP-121_CR'], 'SP-14': ['SP-14_SR', 'SP-14_CR'], 'SP-72': ['SP-72_SR', 'SP-72_CR'], 'SP-60': ['SP-60_SR', 'SP-60_CR'], 'SP-183': ['SP-183_SR', 'SP-183_CR'], 'SP-142':

Function to produce detailed results of the model on a specific group

In [74]:
def display_group_predictions(predictions):
    """Build a text report of grouped predictions (returned, not printed).

    Args:
        predictions: dict mapping a key to a sequence of results, each result being
            indexable as (prompt, candidates, true_label, predicted_class).

    Returns:
        The report as one string: a header, one separator-delimited section per key,
        and a closing separator line.
    """
    separator = "#"*120 + "\n"
    pieces = ["Number of predictions: {} \n\n".format(len(predictions))]

    for key in predictions:
        pieces.append(separator)
        pieces.append("Key: {}\n".format(key))
        for position, result in enumerate(predictions[key]):
            candidates = result[1]
            gold, guess = result[2], result[3]
            pieces.append("  Dataset {}:\n".format(position + 1))
            pieces.append("    Prompt: {}\n".format(result[0]))
            pieces.append("    True Label: {} -> {}\n".format(gold, candidates[gold].strip()))
            pieces.append("    Predicted Class: {} -> {}\n".format(guess, candidates[guess].strip()))

    pieces.append(separator)

    return "".join(pieces)

### Ori & Sem Accuracy


In [None]:
# Group accuracy over (original, semantic) pairs of the competition test set.
correct_preds, wrong_preds, test_set_ori_sem_accuracy = group_accuracy(original_dataset, scemantic_dataset, context_dataset, model, num_groups=2)

In [76]:
# Text report of the (original, semantic) groups that were not fully correct.
test_set_ori_sem_wrong_answers = display_group_predictions(wrong_preds)

### Ori & Sem & Con Accuracy

In [None]:
# Group accuracy over (original, semantic, context) triples of the competition test set.
correct_preds, wrong_preds, test_set_ori_sem_con_accuracy = group_accuracy(original_dataset, scemantic_dataset, context_dataset, model, num_groups=3)

In [78]:
# Text report of the (original, semantic, context) groups that were not fully correct.
test_set_ori_sem_con_wrong_answers = display_group_predictions(wrong_preds)

Save information of mispredictions regarding group-based metric

In [79]:
# Define the directory path
# Directory (relative to the current working directory) that receives the result files.
results_dir = './results/'

# Create it if needed; exist_ok avoids the separate check-then-create step.
os.makedirs(results_dir, exist_ok=True)

In [80]:
def save_to_text_file(content, filename):
    """Write `content` to `filename`, replacing any existing file.

    The file is always encoded as UTF-8 so that non-ASCII characters in the content are
    written identically on every platform instead of depending on the locale default.

    Args:
        content: text to write.
        filename: path of the file to create or overwrite.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(content)

In [82]:
# Each report goes to the file named after its own metric (the two names were swapped).
save_to_text_file(test_set_ori_sem_wrong_answers, './results/test_set_ori_sem_wrong.txt')
save_to_text_file(test_set_ori_sem_con_wrong_answers, './results/test_set_ori_sem_con_wrong.txt')

Gathering the results into a CSV file

In [83]:

# One timestamp shared by both result rows.
results_timestamp = pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")

# Row for the held-out split carved out of the training data.
df_temp = pd.DataFrame({
    'checkpoint': [checkpoint],
    'task': [task],
    'lr': [lr],
    'batch_size': [batch_size],
    'num_epochs': [num_epochs],
    'original_acc': [original_acc],
    'scemantic_acc': [scemantic_acc],
    'context_acc': [context_acc],
    'ori_sem_acc': [ori_sem_accuracy],
    'ori_sem_con_acc': [ori_sem_con_accuracy],
    'date_of_run': [results_timestamp],
})

# Row for the competition test set.
df_test_set = pd.DataFrame({
    'checkpoint': [checkpoint],
    'task': [task+"_test_set"],
    'lr': [lr],
    'batch_size': [batch_size],
    'num_epochs': [num_epochs],
    'original_acc': [test_set_original_acc],
    'scemantic_acc': [test_set_scemantic_acc],
    'context_acc': [test_set_context_acc],
    'ori_sem_acc': [test_set_ori_sem_accuracy],
    'ori_sem_con_acc': [test_set_ori_sem_con_accuracy],
    'date_of_run': [results_timestamp],
})

# Test-set row first, held-out row second. pd.concat is the public replacement for the
# private DataFrame._append.
df_res = pd.concat([df_test_set, df_temp], ignore_index=True)
display(df_res)

# Save the combined table; previously only the held-out row (df_temp) was written, so the
# test-set results never reached the CSV file.
csv_path = os.path.join(results_dir, 'results.csv')
df_res.to_csv(csv_path, index=False)


Unnamed: 0,checkpoint,task,lr,batch_size,num_epochs,original_acc,scemantic_acc,context_acc,ori_sem_acc,ori_sem_con_acc,date_of_run
0,FacebookAI/roberta-large,SP_test_set,3e-05,4,3,0.875,0.875,0.875,0.85,0.775,2024_04_01_09:40
0,FacebookAI/roberta-large,SP,3e-05,4,3,0.692,0.692,0.692,0.654,0.538,2024_04_01_09:40


##### Save model

In [82]:
# Part of the checkpoint name before the first "/". split() returns the whole name when
# there is no "/", whereas slicing with find() == -1 silently dropped the last character.
check = checkpoint.split('/')[0]

# Save the fine-tuned weights under ./models/<task>_<check>_<timestamp>.
model.save_pretrained('./models/{}_{}_{}'.format(task, check, pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")))

## Logic to export the results when running in Kaggle

* The following logic produces a zip file of the results so that it can be downloaded. The zip file name can be changed through the `NAME_OF_ZIP_FILE` variable.

In [None]:
print(os.listdir("/kaggle/working/"))

In [None]:
print(os.listdir())

In [None]:
from zipfile import ZipFile
from IPython.display import FileLink

kaggle_working_dir = '/kaggle/working'

# Archive name derived from the run directory; path separators are flattened so the
# name is always a plain file name, even when run_dir contains sub-directories.
NAME_OF_ZIP_FILE = os.path.normpath(run_dir).replace(os.sep, '_')

# Directory to be zipped
directory_to_zip = os.path.normpath(os.path.join(kaggle_working_dir, run_dir))

# Zip file name: written next to the run directory, not inside it. The working
# directory was changed to the run directory earlier, so a relative name would put the
# archive inside the folder being zipped, or fail if the name contains a "/".
zip_file_name = os.path.join(kaggle_working_dir, '{}.zip'.format(NAME_OF_ZIP_FILE))

# Create a ZipFile object
with ZipFile(zip_file_name, 'w') as zip_obj:
    # Add every file below the run directory, stored under its path relative to it.
    for root, dirs, files in os.walk(directory_to_zip):
        for file in files:
            file_path = os.path.join(root, file)
            zip_obj.write(file_path, os.path.relpath(file_path, directory_to_zip))

# Generate FileLink for the zipped file (path relative to the current working directory)
FileLink(os.path.relpath(zip_file_name))


The following code is used to check the contents of the zip file.

In [None]:

# Path to the ZIP file
# Path to the ZIP file: reuse the archive created in the previous cell instead of a
# literal placeholder name that does not exist on disk.
zip_file_path = zip_file_name

# Open the ZIP file in read mode
with ZipFile(zip_file_path, 'r') as zip_file:
    # Print the list of elements (files and directories) inside the ZIP file
    print("Elements inside the ZIP file:")
    for element in zip_file.namelist():
        print(element)
