# Install packages

In [None]:
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.1

!pip install torch

!pip install scipy
!pip install -U sentence-transformers


# Imports

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

import os

import warnings
warnings.simplefilter("ignore")

from tqdm.notebook import tqdm

from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM


import bitsandbytes as bnb
import numpy as np
import pandas as pd

from datasets import Dataset, DatasetDict

from IPython.display import Markdown, display

# Load model and tokenizer

To access certain Language Model Models (LLMs) through the Hugging Face library, you may need to obtain an access token. You can acquire a token by signing up on the Hugging Face website and gaining permission to use the specific model you're interested in. 

The following cell demonstrates how to pass your access token in order to download the model and tokenizer. Put your access token in the `YOUR_HUGGING_FACE_TOKEN` variable.

In [None]:
from huggingface_hub import login

login(token='YOUR_HUGGING_FACE_TOKEN')

Here we determine the model we are using, the sub-task we are solving (Sentence Puzzle or Word Puzzle), and the maximum steps of our training.

In [4]:
model_name = "microsoft/phi-2"

task = "SP"
max_steps = 250

### Creating Bitsandbytes Configuration

Before loading the model, we will define a function `create_bnb_config` to define the `bitsandbytes` configuration. The `bitsandbytes` library allows model quantization. Quantization is a technique used to compress deep learning models by reducing the number of bits used to represent their weights and activations. This compression allows for faster inference and reduced memory consumption, making it possible to deploy these models on edge devices with limited resources.

By using 4-bit transformer language models, we can achieve impressive results while significantly reducing memory and computational requirements.

Hugging Face Transformers (`transformers`) is closely integrated with `bitsandbytes`. The `BitsAndBytesConfig` class from the `transformers` library allows configuring the model quantization method.

Parameters:

`load_in_4bit`: Load the model in 4-bit precision, i.e., divide memory usage by 4.

`bnb_4bit_use_double_quant`: Use nested quantization techniques for more memory-efficient inference at no additional cost.

`bnb_4bit_quant_type`: Set quantization data type. The options are either FP4 (4-bit precision), which is the default quantization data type, or NF4 (Normal Float 4), a new 4-bit data type adapted for weights that have been initialized using a normal distribution.

`bnb_4bit_compute_dtype`: Set the computational data type for 4-bit models. Default value: torch.float32

In [None]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False, #True
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtyp=torch.float16  #torch.bfloat16,
)

compute_dtype = getattr(torch, "float16")

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map = "auto", 
    trust_remote_code=True,
)

# this should be set as False for finetuning
model.config.use_cache = False

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"

# Prepare training data

### Importing into Kaggle

Here we demonstrate how to import data into Kaggle. We have uploaded the data folder of the repository to a private Kaggle dataset. Our dataset is called `sem-dataset`.

In [None]:
for dirname, _, filenames in os.walk('/kaggle/input/sem-dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

Here we import train and test data from the dataset.

In [9]:
train_data = np.load('/kaggle/input/sem-dataset/'+task+'-train.npy', allow_pickle=True)

test_data = np.load('/kaggle/input/sem-dataset/'+task+'_test_labeled.npy', allow_pickle=True)

### Importing into Colab

Here we demonstrate how to import data into Colab. We have uploaded the data folder of the repository to a private Google Drive folder. Our folder is called `sem-dataset`.

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# os.chdir('/content/drive/My Drive/sem-dataset')


In [None]:
# train_data = np.load('./data/'+task+'-train.npy', allow_pickle=True)

# test_data = np.load('./data/'+task+'_test_labeled.npy', allow_pickle=True)

### Make directory for our output

In [10]:
date_of_run = pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")

if '/' in model_name:
    # Split the model_name by "/"
    parts = model_name.split("/")
    
    # Check if there are at least 4 parts
    if len(parts) >= 5:
        # Concatenate the 3rd and 4th parts with an underscore
        model_suffix = parts[3] + "_" + parts[5]
    else:
        model_suffix = model_name

run_dir = "./sftt_Mlt_" + task + "_" + model_suffix + "_" + date_of_run
# print(run_dir)

# Create the directory if it does not exist
if not os.path.exists(run_dir):
    os.makedirs(run_dir)

os.chdir(run_dir)

# Basic preprocessing

### Train dataset

In [13]:
def convert_from_numpy_to_dataset_type (numpy_array, split):
    data_list = numpy_array.tolist()
    df = pd.DataFrame(data_list)

    # display(df.head(1))
    if split == "train":
      df['id'] = df['id'].astype(str)
      df['distractor1'] = df['distractor1'].astype(str)
      df['distractor2'] = df['distractor2'].astype(str)
      df['distractor(unsure)'] = df['distractor(unsure)'].astype(str)
      df['label'] = df['label'].astype(int)

    # dataset = Dataset.from_pandas(df,  split=split)
    dataset = Dataset.from_pandas(df)


    # display(dataset[0])

    # display(dataset.features) # just to check the type of the features

    return dataset

In [14]:
train_dataset = convert_from_numpy_to_dataset_type(train_data, "train")

### Test dataset

In [15]:
def convert_from_numpy_to_dataset_test_type (numpy_array):
    data_list = numpy_array.tolist()
    df = pd.DataFrame(data_list)

    df = pd.DataFrame(data_list)
    df['id'] = df['id'].astype(str)      
    df['label'] = df['label'].astype(int)

    dataset = Dataset.from_pandas(df)

    display(dataset[0])

    display(dataset.features) # just to check the type of the features

    return dataset

In [None]:
test_dataset = convert_from_numpy_to_dataset_test_type(test_data)


### Splitting the dataset

* Here we preprocess the data by splitting the data in Original, Scemantic Reconstruction and Context Reconstruction. 

* We then split the train dataset into train, validation and test splits for each of the three types of data. This is done before shuffling in order to retain the same ids in the training, validation and test sets regarding the three types of data.

After that we concatenate the data and shuffle it in each of the three sets (Original, Scemantic, Context).

* This is done because of the absence of a test set in the dataset in the beginning of the competition.


* After that we are using some logic to transform the multiple choice task into a binary classification task. For each unique id we create from the 4 multiple choices of the original problem, 4 binary classification problems. Because the fourth option is always "None of above", we skip it and we have 3 binary classification problems for each unique id. This is done by creating a new column `label` which is 1 if the answer is correct and 0 otherwise. Then we ask the model if the answer is correct for each of the 3 binary classification problems.

The above are done for the three splits of the train dataset but also for the test dataset of the competition.

#### Train dataset

In [17]:
from sklearn.model_selection import train_test_split

In [18]:
ori_original_dataset = train_dataset.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
ori_scemantic_dataset = train_dataset.filter(lambda data: "_SR" in data["id"]) # SR => Semantic Reconstruction
ori_context_dataset = train_dataset.filter(lambda data: "_CR" in data["id"]) # CR => Context Reconstruction

# print(f"Original dataset size: {len(ori_original_dataset)}")
# print(f"Semantic dataset size: {len(ori_scemantic_dataset)}")
# print(f"Context dataset size: {len(ori_context_dataset)}")

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

In [19]:
def splitting_dataset(dataset, split_size):
    from datasets import DatasetDict

    #split_size% test + validation
    train_testvalid = dataset.train_test_split(test_size=split_size, shuffle=False)

    # Split the rest test + valid in half test, half valid
    test_valid = train_testvalid["test"].train_test_split(test_size=0.5, shuffle=False)

    # gather everyone if you want to have a single DatasetDict
    datasets = DatasetDict({
        "train": train_testvalid["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"]})

    return datasets


Here we are splitting the dataset into train, validation and test sets. **A good rule of thumb is to use 70% of the data for training, 15% for validation and 15% for testing.**

<u>**WE DO NOT WANT TO SHUFFLE THE DATASET BEFORE SPLITTING IT TO KEEP THE ORDER OF THE SENTENCES!!!**</u>

In [20]:
original_dataset = splitting_dataset(ori_original_dataset, 0.3)
scemantic_dataset = splitting_dataset(ori_scemantic_dataset, 0.3)
context_dataset = splitting_dataset(ori_context_dataset, 0.3)


Now we will make the only dataset that we will use for training and validation.
The testing will be done on several datasets.

In [21]:
from datasets import concatenate_datasets

assert original_dataset["train"].features.type == scemantic_dataset["train"].features.type
assert original_dataset["train"].features.type == context_dataset["train"].features.type
training_dataset = concatenate_datasets([original_dataset["train"], scemantic_dataset["train"], context_dataset["train"]])
# print(f"Training set size: {len(temp_training_dataset)}")
# print(temp_training_dataset)

assert original_dataset["valid"].features.type == scemantic_dataset["valid"].features.type
assert original_dataset["valid"].features.type == context_dataset["valid"].features.type
valid_dataset = concatenate_datasets([original_dataset["valid"], scemantic_dataset["valid"], context_dataset["valid"]])
# print(f"Validation set size: {len(valid_dataset)}")
# print(valid_dataset)

In [22]:
training_dataset = training_dataset.shuffle(seed=42)
valid_dataset = valid_dataset.shuffle(seed=42)


my_dataset = DatasetDict({
    "train": training_dataset,
    "valid": valid_dataset})

# print(my_dataset)

#### Test dataset

In [23]:
test_original_dataset = test_dataset.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
test_scemantic_dataset = test_dataset.filter(lambda data: "_SR" in data["id"]) # SR => Semantic Reconstruction	
test_context_dataset = test_dataset.filter(lambda data: "_CR" in data["id"]) # CR => Context Reconstruction

print(f"Original dataset size: {len(ori_original_dataset)}")
print(f"Semantic dataset size: {len(ori_scemantic_dataset)}")
print(f"Context dataset size: {len(ori_context_dataset)}")

Filter:   0%|          | 0/120 [00:00<?, ? examples/s]

Filter:   0%|          | 0/120 [00:00<?, ? examples/s]

Filter:   0%|          | 0/120 [00:00<?, ? examples/s]

Original dataset size: 169
Semantic dataset size: 169
Context dataset size: 169


### Tokenize after splitting

`create_binary_pairs` is a function that takes a row of our dataset and creates the binary pairs. It returns a list of the new rows.

In [24]:
def create_binary_pairs(row):
    id = row['id']
    question = row['question']
    correct_answer = row['answer']
    choices = row['choice_list']
    # choice_order = row['choice_order']

    binary_pairs = []

    # check if question contains '?' at the end
    question = question.strip()
    if question[-1] != '?':
        question = question + '?'

    for i in range(len(choices)):
        choice = choices[i]

        # if choice contains "None of the above" skip it
        if "none of above" in choice.lower():
            continue

        # handle choice format
        formatted_choice = choice.strip()
        if formatted_choice[-1] != '.':
            formatted_choice = formatted_choice + '.'

        is_correct = (choice == correct_answer)
        label = 1 if is_correct else 0

        # Concatenate the question and choice to create a new question
        new_question = f"{question} {formatted_choice}"

        # create new id to group these binary pairs together
        new_id = f"{id}_{i}"
        # new_id = f"{i}"

        # Create a binary pair with the new question and label
        pair = {'id': new_id, 'question': new_question, 'label': label}
        binary_pairs.append(pair)

    # row['binary_pairs'] = binary_pairs

    # row['binary_pairs'] = binary_pairs

    return binary_pairs


`create_binary_dataset` is a function that takes the dataset and creates the binary dataset. It returns a new dataset as a list.

In [26]:
binary_dataset = []
def create_binary_dataset(example):
    binary_questions = create_binary_pairs(example)
    binary_dataset.extend(binary_questions)


In [None]:
my_dataset["train"].map(create_binary_dataset)

# print("Length of binary dataset: ", len(binary_dataset))
display(binary_dataset[:3])

Now we will create a binary pair dataset for the train, validation and test sets.

In [None]:
list_of_datasets = [my_dataset["train"], my_dataset["valid"], original_dataset["test"], scemantic_dataset["test"], context_dataset["test"], test_original_dataset, test_scemantic_dataset, test_context_dataset]

all_data = []

for i, dataset in enumerate(list_of_datasets):
    binary_dataset = []
    dataset.map(create_binary_dataset)
    print("Length of binary dataset: ", len(binary_dataset))
    
    all_data.append(binary_dataset)   
    
# print(len(all_data))


In [None]:
my_train_dataset = Dataset.from_list(all_data[0])
my_valid_dataset = Dataset.from_list(all_data[1])

my_original_test_dataset = Dataset.from_list(all_data[2])
my_scemantic_test_dataset = Dataset.from_list(all_data[3])
my_context_test_dataset = Dataset.from_list(all_data[4])

testset_original_test_dataset = Dataset.from_list(all_data[5])
testset_scemantic_test_dataset = Dataset.from_list(all_data[6])
testset_context_test_dataset = Dataset.from_list(all_data[7])


# Print the resulting dataset
print(my_train_dataset)
print(my_valid_dataset)

print(my_original_test_dataset)
print(my_scemantic_test_dataset)
print(my_context_test_dataset)

print(testset_original_test_dataset)
print(testset_scemantic_test_dataset)
print(testset_context_test_dataset)

In [31]:
my_dataset = DatasetDict({
    "train": my_train_dataset,
    "valid": my_valid_dataset})

# print(my_dataset)

## Making template for model

Here we define the template for our model. As our problem is a binary classification problem, we format the dataset accordingly.

In [32]:
# Modified template for text classification task
template = """

### Instruction:
Below is an instruction that describes a text classification task. Determine whether the following statement is true or false.
Select the most suitable answer while making the necessary assumptions. Give only answer and a short explanation of two or three sentences. Nothing else.

### Input:
Statement: {prompt}\n
Is the statement true or false?

### Answer:
The statement is {answer}"""

# prompt = template.format(prompt, answer)

The following logic is used for mapping the correct answer to label 1 and the other answers to label 0.

In [33]:
def format_text(example):
    """ fill inputs in promt for a sample  """

    if example['label'] == 1:
        answer = "True"
    else:
        answer = "False"

    text = template.format(prompt=example['question'],
                        answer=answer)
    return {"text": text}

In [34]:
my_dataset["train"] = my_dataset["train"].map(format_text)
temp = my_dataset["train"]
# print(f"Training set size: {len(temp)}")


Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

In [35]:
my_dataset["valid"] = my_dataset["valid"].map(format_text)
temp = my_dataset["valid"]
# print(f"Valid set size: {len(temp)}")

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

# Set up training arguments

In [36]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py

def find_linear_layers(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])


    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


In [37]:
def create_peft_config(modules, lora_r, lora_alpha, lora_dropout):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply Lora to
    """
    config = LoraConfig(
        r=lora_r,  # dimension of the updated matrices
        lora_alpha=lora_alpha,  # parameter for scaling
        target_modules=modules,
        lora_dropout=lora_dropout,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config

In [38]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param} || trainable params: {trainable_params} || trainable%: {100 * trainable_params / all_param}"
    )

Here we define the LoRA config.

- `r` is the rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained. A higher rank will allow for more expressivity, but there is a compute tradeoff.

- `alpha` is the scaling factor for the learned weights. The weight matrix is scaled by `alpha/r`, and thus a higher value for alpha assigns more weight to the LoRA activations.

We experiment with different values for `r` and `alpha` to find the best combination for our model.

In [39]:
################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64
# lora_r = 8

# Alpha parameter for LoRA scaling
lora_alpha = 64
# lora_alpha = 16

# Dropout probability for LoRA layers
# lora_dropout = 0.05
lora_dropout = 0.1

# Bias
bias = "none"

# Task type
task_type = "CAUSAL_LM"



In [None]:
# Get LoRA module names
target_modules = find_linear_layers(model)
# print(target_modules)

#for llama 2
qlora_config = create_peft_config(target_modules, lora_r, lora_alpha, lora_dropout)

# Print information about the percentage of trainable parameters
# print_trainable_parameters(model, True)

Here we define the training arguments for our model.

- `train_batch_size` and `eval_batch_size` are the batch sizes for training and evaluation. If the model is too large to fit in memory, you can reduce these values.

- `gradient_accumulation_steps` is the number of steps to accumulate gradients before performing an optimization step. This is useful when the batch size is too large to fit in memory.

- `learning_rate` is the learning rate for the optimizer. We set it to 2e-5, but you can experiment with different values.

- `optimizer` is the optimizer used for training. We use AdamW, which is a popular optimizer for training transformer models. If less memory is available, you can use a quantized optimizer.

- `num_train_epochs` is the number of epochs for training. We set it to 3 for demonstration purposes, but you can increase it for better results.

- `max_steps` is the maximum number of training steps. We set it to 250 because we are using a small dataset. It is used instead of `num_train_epochs` if it is specified. If you want to train for a specific number of epochs, you can remove this argument.


In [42]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./outputs"

# Batch size per GPU for training
train_batch_size = 1
eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Initial learning rate (AdamW optimizer)
lr = 2e-5

# Optimizer to use
optim = "paged_adamw_32bit"
# optim="paged_adamw_8bit

# Number of training epochs
# num_of_epochs = 2
max_steps = 250


Several arguments that we difine are the following:

- `output_dir`: The directory where model checkpoints and outputs will be saved.
- `logging_steps`: Log metrics every specified number of training steps.
- `logging_strategy`: Specify whether logging is done by "steps" or "epoch".
- `warmup_ratio`: Percentage of total training steps used for learning rate warm-up.
- `group_by_length`: Whether to group training samples by sequence length for more efficient processing.
- `lr_scheduler_type`: Type of learning rate scheduler, such as "linear" or "constant".
- `weight_decay`: Strength of weight decay regularization applied to the optimizer.
- `fp16`: Enable mixed-precision training using 16-bit floating-point format.
- `save_strategy`: Strategy for saving model checkpoints, either by "epoch" or "steps".
- `save_steps`: Save a model checkpoint every specified number of steps.
- `save_total_limit`: Maximum number of checkpoints to keep.
- `evaluation_strategy`: Strategy for evaluating the model during training.
- `eval_steps`: Evaluate the model every specified number of training steps.
- `do_eval`: Whether to perform evaluation during training.
- `report_to`: Where to report evaluation results, set to "none" to disable reporting.


In [43]:
# "max_steps=1" is just for testing execution
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=lr,
    logging_steps=25,
    logging_strategy="steps",
    warmup_ratio=0.03,
    group_by_length = True,
    # warmup_steps=2,
    lr_scheduler_type = "constant",
    weight_decay= 0.001,
#     num_train_epochs=num_of_epochs,
    max_steps=max_steps,
    fp16=True,
    # save_strategy="epoch",       # Save the model checkpoint every logging step
    save_strategy="steps",       # Save the model checkpoint every logging step
    save_steps=250,               # Save every 10 checkpoints
    save_total_limit=5,
    evaluation_strategy="steps", # Evaluate the model every logging step
    eval_steps=25,               # Evaluate and save checkpoints every 50 steps
    do_eval=True,
    # bf16=True
    # run_name="baseline-llama2-sft",
    # save_total_limit=1,  # can be increased, but but beware of kaggle notebook output size limit
    report_to="none"
)

Here we define our `response_template`. This template will be used to format the output of our model.

We also use `DataCollatorForCompletionOnleLM` in order to calculate the loss of our model only using the generated text.

In [None]:
response_template = "### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

In [47]:

supervised_finetuning_trainer = SFTTrainer(
    model,
    train_dataset=my_dataset["train"],
    eval_dataset=my_dataset["valid"],
    args=training_args,
    tokenizer=tokenizer,
    peft_config=qlora_config,
    dataset_text_field="text",
    max_seq_length=4096,
    packing=False,
    data_collator=collator
)

Map:   0%|          | 0/1062 [00:00<?, ? examples/s]

Map:   0%|          | 0/225 [00:00<?, ? examples/s]

We disable Weights & Biases. You'll need to apply an API key when prompted if you use it for tracking the training metrics.

In [48]:
import os
os.environ["WANDB_DISABLED"] = "true"
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"

Now we are ready to train our model!

In [None]:
train_result = supervised_finetuning_trainer.train()


In [51]:
metrics = train_result.metrics
# supervised_finetuning_trainer.log_metrics("train", metrics)
supervised_finetuning_trainer.save_metrics("train", metrics)
supervised_finetuning_trainer.save_state()
# print(metrics)


# Test split of the training data

## Format test split

Here we define the template for our test set. The only difference is that we don't include the {answer} field, as this is what we are trying to predict.

In [52]:
# Modified template for text classification task
template = """

### Instruction:
Below is an instruction that describes a text classification task. Determine whether the following statement is true or false.
Select the most suitable answer while making the necessary assumptions. Give only answer and a short explanation of two or three sentences. Nothing else.

### Input:
Statement: {prompt}\n
Is the statement true or false?

### Answer:
The statement is {answer}"""

# prompt = template(prompt=prompt, answer=answer)

In [53]:
# We don't have answers for test
def format_text_test(example):
    text = template.format(prompt=example['question'],
                         answer='')
    return {"text": text}

In [54]:
original_test_dataset = my_original_test_dataset.map(format_text_test)
# print(f"Test set size: {len(my_original_test_dataset)}")

scemantic_test_dataset = my_scemantic_test_dataset.map(format_text_test)
# print(f"Test set size: {len(my_scemantic_test_dataset)}")

context_test_dataset = my_context_test_dataset.map(format_text_test)
# print(f"Test set size: {len(my_context_test_dataset)}")

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

Map:   0%|          | 0/78 [00:00<?, ? examples/s]

In [None]:
# display(original_test_dataset[0])

With the following function we are keeping the triplets of binary classification problems for each unique id, in order to calculate the final prediction of the original multiple choice problem.

In [78]:
def group_same_dataset(dataset):
    # Initialize a dictionary to store the results
    grouped_pairs = {}

    for id1 in dataset['id']:
        # print(id1)
        id1_list = id1.split('_')
        if len(id1_list) > 2:
            id1_list[0] = id1_list[0] + '_' + id1_list[1]
            id1_list[1] = id1_list[2]

        grouped_pairs[id1_list[0]] = [id1]

        for id2 in dataset['id']:
            id2_list = id2.split('_')
            if len(id2_list) > 2:
                id2_list[0] = id2_list[0] + '_' + id2_list[1]
                id2_list[1] = id2_list[2]
            if id1_list[0] in id2_list[0] and id1_list[1] != id2_list[1] and len(id1_list[0]) == len(id2_list[0]):
                grouped_pairs[id1_list[0]].append(id2)

    # print(grouped_pairs)

    for key in grouped_pairs:
        if len(grouped_pairs[key]) != 3:
            print("Error in triplets!!!")
            print(key)
            print(grouped_pairs[key])
            print()
    # assert len(grouped_pairs.values()) == 3
    return grouped_pairs

In [59]:
grouped_pairs_original = group_same_dataset(my_original_test_dataset)
grouped_pairs_scemantic = group_same_dataset(my_scemantic_test_dataset)
grouped_pairs_context = group_same_dataset(my_context_test_dataset)

In [None]:
# print(grouped_pairs_original)

## Predict with fine-tuned model

In order to find wich option of the multiple choice is the correct one, we will generate text using the model and then scemantically find the closest option to the generated text.

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Initialize Sentence Transformer model
model_st = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [62]:
eval_tokenizer = AutoTokenizer.from_pretrained(model_name, add_bos_token=True, trust_remote_code=True)

##### Accuracy on each dataset (original, scemanic, context) by itself

The following function will generate text for each of the datasets' binary classification problems and then calculate the accuracy of the model on each dataset.
It also keeps track of the wrong predictions and the ids of the wrong predictions, as well the generated text of the mispredicted ids.
* We can also control if we want to print the generated text of each id through the `printall` argument.



In [63]:
def dataset_compute (row, model, printall=False):

    # preds = []
    preds = {}
    # id = dataset[idx]['id']
    with torch.no_grad():

        truth_answer = row["label"][0]
        correct_answer = ["False", "True"][truth_answer]
#         print(truth_answer)
        
        model_inputs = eval_tokenizer(row["text"], return_tensors="pt").to("cuda")
        output = model.generate(**model_inputs, max_new_tokens=90, repetition_penalty=1.15)

        generated_text = eval_tokenizer.decode(output[0], skip_special_tokens=True)
        print("#"*80)
        print(generated_text)
        print("#"*80)


        ################## Extracting answer ##################
        # Extracting the answer
        start_index = generated_text.find("The statement is ")
        explanation_start_index = generated_text.find("### Explanation:")
        end_index = explanation_start_index - 1
        generated_answer = generated_text[start_index + len("The statement is "):end_index].strip()

        # Extracting the explanation
        # Find the next occurrence of "###" after the explanation start index
        next_header_index = generated_text.find("###", explanation_start_index + len("### Explanation:"))

        # Determine the end index of the explanation
        if next_header_index != -1:
            # If "###" is found, stop before it
            end_index = next_header_index - 1
        else:
            # If "###" is not found, extract until the end of the string
            end_index = len(generated_text)

        # Extract the explanation
        explanation = generated_text[explanation_start_index + len("### Explanation:"):end_index].strip()
        
        ################## Matching  answer ##################
        # Encode the answer text and multiple choices into sentence embeddings
        generated_embedding = model_st.encode(generated_answer)
        choices_embeddings = ["False", "True"]
        choices_embeddings = model_st.encode(choices_embeddings)
        
        # Compute the cosine similarity between the generated answer and the choices
        # The higher the value, the more similar the two vectors are
        cosine_scores = cosine_similarity([generated_embedding], choices_embeddings)[0]
        # similarities = cosine_similarity(generated_embedding.reshape(1, -1), choices_embeddings)

        # Get the index of the highest score
        predicted_answer_index = cosine_scores.argmax()
        # Get the predicted answer
        predicted_answer = ["False", "True"][predicted_answer_index]
        
        ################## Testing puproses ##################

        if printall == True:
            print("#"*80)
            print("Correct answer: {}".format(correct_answer))

            print("="*80)
            print("Generated answer: {}".format(generated_answer))
            print("Explanation: {}".format(explanation))
            print("="*80)

            print("+"*80)
            print("Predicted answer: {}".format(predicted_answer))
            print("+"*80)

            print("#"*80)
        ##########################################
#         all_answers.append(predicted_answer_index)

    return row['question'][0].strip(), row['label'][0],  predicted_answer_index

The following function computes the accuracy of the model on the original multiple choice problem. It takes as input the predictions of the binary classification problems and the original multiple choice problem. 

It returns the accuracy of the model on the original multiple choice problem.

In [64]:
def compute_triplets_acc(dataset, group_pairs):

    none_of_above = {}

    total_correct_groups = 0.0
    total_groups = len(grouped_pairs_original)
    every_id = {}

    group_acc = {}

    # we take question id and binary pairs id
    for group_id, group_ids_list in group_pairs.items():
        print("*"*120)
        
        # print(group_id, group_ids_list)
        # Initialize a variable to check if all three ids in the group are correct
        all_correct = True

        a = {}
        correct_label = []

        # Check each id in the group
        for single_id in group_ids_list:
            # print(single_id)

            # we first filter the dataset to get the binary pair
            original_data = dataset.filter(lambda example: example['id'] == single_id)
#             print(original_data[0])

            # we then extract the prompt, true label and predicted label
            prompt, true_label_original, predicted_class = dataset_compute(original_data, model) #True
#             if predicted_class == "True":
#                 predicted_class = 1
#             else:
#                 predicted_class = 0

            print(true_label_original, predicted_class)

            # we store the prompt, true label and predicted label in a dictionary
            # a[single_id] = [prompt, true_label_original, predicted_class]

            correct_label.append(true_label_original)

            # Check if the prediction is correct
            if predicted_class != true_label_original:
                # we store the prompt, true label and predicted label in a dictionary
                a[single_id[-1]] = [prompt, true_label_original, predicted_class]

                all_correct = False
                # print("False")
                # break  # No need to check further if one is incorrect

        # print(len(correct_label))
        # print("#############################################")

        if correct_label[0] == correct_label[1] == correct_label[2] == 0:
            # a.append("None of above")
            none_of_above[group_id] = group_ids_list
            # print(group_id, group_ids_list)
        every_id[group_id] = a

        # If all three ids in the group are correct, increment the total correct groups
        if all_correct:

            group_acc[group_id] = 1
            total_correct_groups += 1
        else:
            group_acc[group_id] = 0
            
#         break
    # remove key-value pair if the value is empty
    every_id = {key: value for key, value in every_id.items() if value}

    # Compute accuracy based on the total correct groups and total groups
    accuracy = total_correct_groups / total_groups

    # print(f"Accuracy: {accuracy * 100:.2f}%")

    return every_id, none_of_above, accuracy, group_acc

In [None]:
original_ids, original_none_of_above, original_acc, original_wrong_ids = compute_triplets_acc(original_test_dataset, grouped_pairs_original)
scemantic_ids, scemantic_none_of_above, scemantic_acc, scemantic_wrong_ids = compute_triplets_acc(scemantic_test_dataset, grouped_pairs_scemantic)
context_ids, context_none_of_above, context_acc, context_wrong_ids = compute_triplets_acc(context_test_dataset, grouped_pairs_context)


In [None]:
print("Accuracy of original dataset:")
print(round(original_acc, 3))
print("Accuracy of scemantic dataset:")
print(round(scemantic_acc, 3))
print("Accuracy of context dataset:")
print(round(context_acc, 3))
print()


Here based on the `group` number we will calculate the accuracy of the model on that group.

Here we implement the logic to calculate group-based accuracy. We need the list of the wrong ids of each dataset in order to calculate the group-based accuracy.

In the following function we are creating a detailed output of the predictions of the model on each group-based metric
The function takes as input:
- a key of the dataset row that misspredictions were made
- the details of these prerpdictions
- the dataset name

In [68]:
def output_details(output_key, triplet_details, dataset_name, dataset):
    # Initialize the output template
    output_template = ""

    # give title to the output
    output_template += "  {} dataset:\n".format(dataset_name)

    ############################ Initial Dataset ############################
    dataset_entry = dataset.filter(lambda example: example['id'] == output_key)[0]
    output_template += "    Prompt: {}\n".format(dataset_entry['question'])
    output_template += "    True Label: {} -> {}\n".format(dataset_entry['label'], dataset_entry['choice_list'][dataset_entry['label']].strip())

    ############################ Triplets Dataset ############################
    infos = triplet_details[output_key]

    for element in infos:
        true_label = infos[element][1]

        element = int(element)

        if dataset_entry['label'] == element and true_label == 0:
            output_template += "    Predicted Label as correct: {} -> {}\n".format(element, dataset_entry['choice_list'][element].strip())
        elif dataset_entry['label'] == element and true_label == 1:
            output_template += "    Predicted Label as wrong: {} -> {}\n".format(element, dataset_entry['choice_list'][element].strip())
        else:
            if true_label == 0:
                output_template += "    Mispredicted Label as correct also: {} -> {}\n".format(element, dataset_entry['choice_list'][element].strip())
            else:
                output_template += "    Mispredicted Label as wrong also: {} -> {}\n".format(element, dataset_entry['choice_list'][element].strip())

    output_template += "\n"
    return output_template



In [69]:
def group_accuracy(dataset, original_triplet_res, original_triplet_details,  scemantic_triplet_res, scemantic_triplet_details, context_triplet_res, context_triplet_details, num_groups=2):
    # correct_predictions = {}
    wrong_predictions = {}
    total_correct = 0
    model.eval()  # Set the model to evaluation mode

    # Iterate over keys
    for i, key in enumerate(original_triplet_res.keys()):
        
        ############################ original dataset ############################
        is_original_correct = original_triplet_res[key]
        
        
        ############################ semantic dataset ############################
        is_semantic_correct = scemantic_triplet_res[key+'_SR']
        
        if num_groups == 3:
        ############################ context dataset ############################
            is_context_correct = context_triplet_res[key+'_CR']

        # print(key)
        # if num_groups == 2:
        if is_original_correct and is_semantic_correct:
            total_correct += 1
            
        if not is_original_correct:
            if key not in wrong_predictions:
                wrong_predictions[key] = output_details(key, original_triplet_details, "Original", dataset)
            else:
                wrong_predictions[key] += output_details(key, original_triplet_details, "Original", dataset)

        if not is_semantic_correct:
            if key not in wrong_predictions:
                wrong_predictions[key] = output_details(key+'_SR', scemantic_triplet_details, "Semantic", dataset)
            else:
                wrong_predictions[key] += output_details(key+'_SR', scemantic_triplet_details, "Semantic", dataset)
            
        if num_groups == 3: 
            if not is_context_correct:
                if key not in wrong_predictions:
                    wrong_predictions[key] = output_details(key+'_CR', context_triplet_details, "Context", dataset)
                else:
                    wrong_predictions[key] += output_details(key+'_CR', context_triplet_details, "Context", dataset)

        total_instances = i + 1
    accuracy = round(total_correct / total_instances, 3)
    if num_groups ==2:
        print("Accuracy Ori & Sem: {} -> {}/{}".format(round(total_correct / total_instances, 3), total_correct, total_instances))
    else:
        print("Accuracy Ori & Sem & Con: {} -> {}/{}".format(round(total_correct / total_instances, 3), total_correct, total_instances))
    
    return wrong_predictions, accuracy


### Ori & Sem Accuracy


In [70]:
wrong_preds, ori_sem_accuracy = group_accuracy(train_dataset, original_wrong_ids, original_ids, scemantic_wrong_ids, scemantic_ids, context_wrong_ids, context_ids, num_groups=2)

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Accuracy Ori & Sem: 0.231 -> 6/26


In [None]:
print("Accuracy is: ", ori_sem_accuracy)
for key in wrong_preds:
    print(key)
    print(wrong_preds[key])
    
    
ori_sem_details = "Accuracy: " + str(ori_sem_accuracy) + '\n\n'
for key in wrong_preds:
    ori_sem_details += key + '\n'
    ori_sem_details += wrong_preds[key] + '\n'

### Ori & Sem & Con Accuracy

In [72]:
wrong_preds, ori_sem_con_accuracy = group_accuracy(train_dataset, original_wrong_ids, original_ids, scemantic_wrong_ids, scemantic_ids, context_wrong_ids, context_ids, num_groups=3)

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Accuracy Ori & Sem & Con: 0.231 -> 6/26


In [None]:
print("Accuracy is: ", ori_sem_con_accuracy)
for key in wrong_preds:
    print(key)
    print(wrong_preds[key])
    
    
ori_sem_con_details = "Accuracy is: " + str(ori_sem_con_accuracy) + '\n\n'

for key in wrong_preds:
    ori_sem_con_details += key + '\n'
    ori_sem_con_details += wrong_preds[key] + '\n'

## For the competion Try the Test dataset!

Here we handle the test set that is provided by the competition. We are following the same logic as above.

In [82]:
testset_original_datasets = testset_original_test_dataset.map(format_text_test)
# print(f"Test set size: {len(my_original_test_dataset)}")

testset_scemantic_datasets = testset_scemantic_test_dataset.map(format_text_test)
# print(f"Test set size: {len(my_scemantic_test_dataset)}")

testset_context_datasets = testset_context_test_dataset.map(format_text_test)
# print(f"Test set size: {len(my_context_test_dataset)}")

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [None]:
display(testset_original_datasets[3])

In [79]:
grouped_pairs_testset_original = group_same_dataset(testset_original_test_dataset)
grouped_pairs_testset_scemantic = group_same_dataset(testset_scemantic_test_dataset)
grouped_pairs_testset_context = group_same_dataset(testset_context_test_dataset)

In [None]:
test_set_original_ids, test_set_original_none_of_above, test_set_original_acc, test_set_original_wrong_ids = compute_triplets_acc(testset_original_datasets, grouped_pairs_testset_original)
test_set_scemantic_ids, test_set_scemantic_none_of_above, test_set_scemantic_acc, test_set_scemantic_wrong_ids = compute_triplets_acc(testset_scemantic_datasets, grouped_pairs_testset_scemantic)
test_set_context_ids, test_set_context_none_of_above, test_set_context_acc, test_set_context_wrong_ids = compute_triplets_acc(testset_context_datasets, grouped_pairs_testset_context)

In [None]:
print("Accuracy of original dataset:")
print(round(test_set_original_acc, 3))
print("Accuracy of scemantic dataset:")
print(round(test_set_scemantic_acc, 3))
print("Accuracy of context dataset:")
print(round(test_set_context_acc, 3))
print()


### Ori & Sem Accuracy


In [None]:
test_set_wrong_preds, test_set_ori_sem_accuracy = group_accuracy(test_dataset, test_set_original_wrong_ids, test_set_original_ids, test_set_scemantic_wrong_ids, test_set_scemantic_ids, test_set_context_wrong_ids, test_set_context_ids, num_groups=2)

In [None]:
print("Accuracy is: ", test_set_ori_sem_accuracy)
for key in test_set_wrong_preds:
    print(key)
    print(test_set_wrong_preds[key])
    

test_set_ori_sem_details = "Accuracy is: " + str(test_set_ori_sem_accuracy) + "\n\n"

for key in test_set_wrong_preds:
    test_set_ori_sem_details += key + '\n'
    test_set_ori_sem_details += test_set_wrong_preds[key] + '\n'    

### Ori & Sem & Con Accuracy

In [None]:
test_set_wrong_preds, test_set_ori_sem_con_accuracy = group_accuracy(test_dataset, test_set_original_wrong_ids, test_set_original_ids, test_set_scemantic_wrong_ids, test_set_scemantic_ids, test_set_context_wrong_ids, test_set_context_ids, num_groups=3)

In [None]:
print("Accuracy is: ", test_set_ori_sem_con_accuracy)
for key in test_set_wrong_preds:
    print(key)
    print(test_set_wrong_preds[key])
    
test_set_ori_sem_con_details = "Accuracy is: " + str(test_set_ori_sem_con_accuracy) + "\n\n"

for key in test_set_wrong_preds:
    test_set_ori_sem_con_details += key + '\n'
    test_set_ori_sem_con_details += test_set_wrong_preds[key] + '\n'

Gathering results to a json

In [None]:
date_of_run = pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")
check = model_name[:model_name.find('/')]

run_dir = "./" +task+ "__TxtCls" + "_" + check + "_" + date_of_run
# print(run_dir)

# Create the directory if it does not exist
if not os.path.exists(run_dir):
    os.makedirs(run_dir)
    
os.chdir(run_dir)

Gathering results to a json

In [None]:
import pandas as pd

# df_res = pd.read_csv('../results/results.csv')

# df_res = pd.DataFrame(columns=['checkpoint', 'task',  'lr', 'batch_size', 'num_epochs', 'original_acc', 'scemantic_acc', 'context_acc', 'ori_sem_acc', 'ori_sem_con_acc', 'date_of_run'])

# Create a dictionary for the new row
new_row_data = {
    'checkpoint': [model_name],
    'task': [task+"__TxtCls"],
    'lr': [lr],
    'batch_size': [train_batch_size],
    'num_steps': [max_steps],
    'original_acc': [original_acc],
    'semantic_acc': [scemantic_acc],
    'context_acc': [context_acc],
    'ori_sem_acc': [ori_sem_accuracy],
    'ori_sem_con_acc': [ori_sem_con_accuracy],
    'date_of_run': pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")
}

# Append the new row to the DataFrame
df_train = pd.DataFrame(new_row_data)

# display(df_temp)
# df_temp.to_csv('./results.csv', index=False)

new_row_test_set_data = {
    'checkpoint': [model_name],
    'task': [task+"__TxtCls_test_set"],
    'lr': [lr],
    'batch_size': [train_batch_size],
    'num_steps': [max_steps],
    'original_acc': [test_set_original_acc],
    'semantic_acc': [test_set_scemantic_acc],
    'context_acc': [test_set_context_acc],
    'ori_sem_acc': [test_set_ori_sem_accuracy],
    'ori_sem_con_acc': [test_set_ori_sem_con_accuracy],
    'date_of_run': pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")
}

# Append the new row to the DataFrame
df_test = pd.DataFrame(new_row_test_set_data)

# display(df_temp)
# df_temp.to_csv('./results.csv', index=False)

df_res = df_train._append(df_test, ignore_index=False)
display(df_res)
df_res.to_csv('./results.csv', index=False)

# # df_res.to_csv('/kaggle/input/results/results.csv', index=True)
# df_res.to_csv('../results/results.csv', index=False)

In [None]:
def save_to_text_file(content, filename):
    with open(filename, 'w') as file:
        file.write(content)

In [None]:
save_to_text_file(test_set_ori_sem_details, './ori_sem_wrong.txt')
save_to_text_file(test_set_ori_sem_con_details, './ori_sem_con_wrong.txt')

## Logic to export the results when running in Kaggle

* The following logic produces a zip file of the results in order to download it. The zip file name can be change through the `NAME_OF_ZIP_FILE` variable.

In [None]:
print(os.listdir())

In [None]:
from zipfile import ZipFile
from IPython.display import FileLink

NAME_OF_ZIP_FILE = run_dir

# Directory to be zipped
directory_to_zip = '/kaggle/working/' + run_dir

# Zip file name
zip_file_name = '{}.zip'.format(NAME_OF_ZIP_FILE)

# Create a ZipFile object
with ZipFile(zip_file_name, 'w') as zip_obj:
    # Iterate over all files and directories in the specified directory
    for root, dirs, files in os.walk(directory_to_zip):
        for file in files:
            file_path = os.path.join(root, file)
            zip_obj.write(file_path, os.path.relpath(file_path, directory_to_zip))

# Generate FileLink for the zipped file
FileLink(zip_file_name)


The following code is used to check the contents of the zip file.

In [None]:
from zipfile import ZipFile

# Path to the ZIP file
zip_file_path = 'NAME_OF_ZIP_FILE.zip'  # Update with the path to your ZIP file

# Open the ZIP file in read mode
with ZipFile(zip_file_path, 'r') as zip_file:
    # Print the list of elements (files and directories) inside the ZIP file
    print("Elements inside the ZIP file:")
    for element in zip_file.namelist():
        print(element)
