# Install packages

In [None]:
!pip install -q -U bitsandbytes==0.42.0
!pip install -q -U peft==0.8.2
!pip install -q -U trl==0.7.10
!pip install -q -U accelerate==0.27.1
!pip install -q -U datasets==2.17.0
!pip install -q -U transformers==4.38.1

!pip install torch

!pip install scipy
!pip install -U sentence-transformers


# Imports

In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
# from string import Template
# from pathlib import Path


import os

import warnings
warnings.simplefilter("ignore")

from tqdm.notebook import tqdm

# for training
from peft import LoraConfig
from transformers import TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM

# for training set
# from datasets import load_dataset
# from langchain.prompts import PromptTemplate
# import matplotlib.pyplot as plt
import bitsandbytes as bnb
import numpy as np
import pandas as pd


from IPython.display import Markdown, display

2024-03-30 19:20:28.665523: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-03-30 19:20:28.665681: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-03-30 19:20:28.830115: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Load model and tokenizer

To access certain Language Model Models (LLMs) through the Hugging Face library, you may need to obtain an access token. You can acquire a token by signing up on the Hugging Face website and gaining permission to use the specific model you're interested in. 

The following cell demonstrates how to pass your access token in order to download the model and tokenizer. Put your access token in the `YOUR_HUGGING_FACE_TOKEN` variable.

In [4]:
from huggingface_hub import login

login(token='YOUR_HUGGING_FACE_TOKEN')

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


Here we determine the model we are using, the sub-task we are solving (Sentence Puzzle or Word Puzzle), and the maximum steps of our training.

In [5]:
model_name = "microsoft/phi-2"

task = "SP"
max_steps = 250

### Creating Bitsandbytes Configuration

Before loading the model, we will define a function `create_bnb_config` to define the `bitsandbytes` configuration. The `bitsandbytes` library allows model quantization. Quantization is a technique used to compress deep learning models by reducing the number of bits used to represent their weights and activations. This compression allows for faster inference and reduced memory consumption, making it possible to deploy these models on edge devices with limited resources.

By using 4-bit transformer language models, we can achieve impressive results while significantly reducing memory and computational requirements.

Hugging Face Transformers (`transformers`) is closely integrated with `bitsandbytes`. The `BitsAndBytesConfig` class from the `transformers` library allows configuring the model quantization method.

Parameters:

`load_in_4bit`: Load the model in 4-bit precision, i.e., divide memory usage by 4.

`bnb_4bit_use_double_quant`: Use nested quantization techniques for more memory-efficient inference at no additional cost.

`bnb_4bit_quant_type`: Set quantization data type. The options are either FP4 (4-bit precision), which is the default quantization data type, or NF4 (Normal Float 4), a new 4-bit data type adapted for weights that have been initialized using a normal distribution.

`bnb_4bit_compute_dtype`: Set the computational data type for 4-bit models. Default value: torch.float32

In [6]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False, #True
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtyp=torch.float16  #torch.bfloat16,
)

compute_dtype = getattr(torch, "float16")

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map = "auto", 
    trust_remote_code=True,
)

# this should be set as False for finetuning
model.config.use_cache = False

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
# tokenizer.pad_token = "<PAD>"
tokenizer.padding_side = "left"

# Prepare training data

### Importing into Kaggle

Here we demonstrate how to import data into Kaggle. We have uploaded the data folder of the repository to a private Kaggle dataset. Our dataset is called `sem-dataset`.

In [None]:
import os
for dirname, _, filenames in os.walk('/kaggle/input/sem-dataset'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/sem-dataset/WP_val_question_random.npy
/kaggle/input/sem-dataset/WP_new_test.npy
/kaggle/input/sem-dataset/SP_test_labeled.npy
/kaggle/input/sem-dataset/annotated_SP_new_test.npy
/kaggle/input/sem-dataset/WP_test_labeled.npy
/kaggle/input/sem-dataset/SP-train.npy
/kaggle/input/sem-dataset/WP-train.npy
/kaggle/input/sem-dataset/WP_eval_data_for_practice.npy
/kaggle/input/sem-dataset/annotated_WP_new_test.npy
/kaggle/input/sem-dataset/SP_eval_data_for_practice.npy
/kaggle/input/sem-dataset/SP_val_question_random.npy
/kaggle/input/sem-dataset/SP_new_test.npy


Here we import train and test data from the dataset.

In [9]:
train_data = np.load('/kaggle/input/sem-dataset/'+task+'-train.npy', allow_pickle=True)

test_data = np.load('/kaggle/input/sem-dataset/'+task+'_test_labeled.npy', allow_pickle=True)

### Importing into Colab

Here we demonstrate how to import data into Colab. We have uploaded the data folder of the repository to a private Google Drive folder. Our folder is called `sem-dataset`.

In [12]:
# from google.colab import drive
# drive.mount('/content/drive')

In [13]:
# os.chdir('/content/drive/My Drive/sem-dataset')


In [14]:
# train_data = np.load('./data/'+task+'-train.npy', allow_pickle=True)

# test_data = np.load('./data/'+task+'_test_labeled.npy', allow_pickle=True)

### Make directory for our output

In [15]:

date_of_run = pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")

if '/' in model_name:
    # Split the model_name by "/"
    parts = model_name.split("/")
    
    # Check if there are at least 4 parts
    if len(parts) >= 5:
        # Concatenate the 3rd and 4th parts with an underscore
        model_suffix = parts[3] + "_" + parts[5]
    else:
        model_suffix = model_name

run_dir = "./sftt_Mlt_" + task + "_" + model_suffix + "_" + date_of_run
# print(run_dir)

# Create the directory if it does not exist
if not os.path.exists(run_dir):
    os.makedirs(run_dir)

os.chdir(run_dir)

./sftt_Mlt_SP_phi_2_2024_03_30_19_21


# Basic preprocessing

* Here we preprocess the data by splitting the data in Original, Scemantic Reconstruction and Context Reconstruction. 

* We then split the data into train, validation and test sets for each of the three types of data. This is done before shuffling in order to retain the same ids in the training, validation and test sets regarding the three types of data.

After that we concatenate the data and shuffle it in each of the three sets (Original, Scemantic, Context).


We create a test split of the given training data to evaluate the model on unseen data. 
* This is done because of the absence of a test set in the dataset in the beginning of the competition.

In [16]:
def convert_from_numpy_to_dataset_type (numpy_array, split):
    from datasets import Dataset
    data_list = numpy_array.tolist()
    df = pd.DataFrame(data_list)

    # display(df.head(1))
    if split == "train":
      df['id'] = df['id'].astype(str)
      df['distractor1'] = df['distractor1'].astype(str)
      df['distractor2'] = df['distractor2'].astype(str)
      df['distractor(unsure)'] = df['distractor(unsure)'].astype(str)
      df['label'] = df['label'].astype(int)

    # dataset = Dataset.from_pandas(df,  split=split)
    dataset = Dataset.from_pandas(df)


    # display(dataset[0])

    # display(dataset.features) # just to check the type of the features

    return dataset

In [17]:
train_dataset = convert_from_numpy_to_dataset_type(train_data, "train")

## Splitting the dataset

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# print(train_dataset.column_names)

In [22]:
ori_original_dataset = train_dataset.filter(lambda data: "_SR" not in data["id"] and "_CR" not in data["id"])
ori_scemantic_dataset = train_dataset.filter(lambda data: "_SR" in data["id"]) # SR => Semantic Reconstruction
ori_context_dataset = train_dataset.filter(lambda data: "_CR" in data["id"]) # CR => Context Reconstruction

# print(f"Original dataset size: {len(ori_original_dataset)}")
# print(f"Semantic dataset size: {len(ori_scemantic_dataset)}")
# print(f"Context dataset size: {len(ori_context_dataset)}")

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

Filter:   0%|          | 0/507 [00:00<?, ? examples/s]

In [23]:
def splitting_dataset(dataset, split_size):
    from datasets import DatasetDict

    #split_size% test + validation
    train_testvalid = dataset.train_test_split(test_size=split_size, shuffle=False)

    # Split the rest test + valid in half test, half valid
    test_valid = train_testvalid["test"].train_test_split(test_size=0.5, shuffle=False)

    # gather everyone if you want to have a single DatasetDict
    datasets = DatasetDict({
        "train": train_testvalid["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"]})

    return datasets


In [24]:
original_dataset = splitting_dataset(ori_original_dataset, 0.3)
scemantic_dataset = splitting_dataset(ori_scemantic_dataset, 0.3)
context_dataset = splitting_dataset(ori_context_dataset, 0.3)


In [25]:
from datasets import concatenate_datasets

assert original_dataset["train"].features.type == scemantic_dataset["train"].features.type
assert original_dataset["train"].features.type == context_dataset["train"].features.type
train_dataset = concatenate_datasets([original_dataset["train"], scemantic_dataset["train"], context_dataset["train"]])
# print(f"Training set size: {len(temp_train_dataset)}")
# print(temp_train_dataset)

assert original_dataset["valid"].features.type == scemantic_dataset["valid"].features.type
assert original_dataset["valid"].features.type == context_dataset["valid"].features.type
valid_dataset = concatenate_datasets([original_dataset["valid"], scemantic_dataset["valid"], context_dataset["valid"]])
# print(f"Validation set size: {len(valid_dataset)}")
# print(valid_dataset)

In [26]:
train_dataset = train_dataset.shuffle(seed=42)
valid_dataset = valid_dataset.shuffle(seed=42)

from datasets import DatasetDict

my_dataset = DatasetDict({
    "train": train_dataset,
    "valid": valid_dataset})

# print(my_dataset)

## Making template for model

Here we define the template for our model. As our problem is a multi-class classification problem, we format the dataset accordingly.
We are using the following instructions and format as a training template.

In [28]:
# Give only answer and a short explanation of two or three sentences. Nothing else.

template = """
### Instruction:
Below is an instruction that describes a multiple choice task. Answer the following multiple choice question by giving the most appropriate response. 
Answer should be one among options provided after the question. Select the most suitable answer from the given ones.
Give only answer and a short explanation of two or three sentences. Nothing else.

Question: {prompt}\n
1) {a}\n
2) {b}\n
3) {c}\n
4) {d}\n

### Answer:
The correct answer is: {label}) {answer}"""

In [None]:
# # display sample to see template
# sample = my_dataset["train"][0]
# display(Markdown(template.format(prompt=sample['question'],
#                                a=sample['choice_list'][0],
#                                b=sample['choice_list'][1],
#                                c=sample['choice_list'][2],
#                                d=sample['choice_list'][3],
#                                answer=sample['answer'],
#                                label=int(sample['label'])+1)))

In [32]:
def format_text(example):
    """ fill inputs in promt for a sample  """
    text = template.format(prompt=example['question'],
                               a=example['choice_list'][0],
                               b=example['choice_list'][1],
                               c=example['choice_list'][2],
                               d=example['choice_list'][3],
                               answer=example['answer'],
                               label=int(example['label'])+1)
    return {"text": text}

Now we are using the `.map` function to apply the `preprocess_function` to the dataset. This function will preprocess the data and format it according to the template we defined earlier.
This is done for both the training and validation datasets.

In [33]:
my_dataset["train"] = my_dataset["train"].map(format_text)
temp = my_dataset["train"]
# print(f"Training set size: {len(temp)}")


Map:   0%|          | 0/354 [00:00<?, ? examples/s]

In [34]:
my_dataset["valid"] = my_dataset["valid"].map(format_text)
temp = my_dataset["valid"]
# print(f"Valid set size: {len(temp)}")

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

# Set up training arguments

In [36]:
# SOURCE https://github.com/artidoro/qlora/blob/main/qlora.py

def find_linear_layers(model):
    cls = bnb.nn.Linear4bit #if args.bits == 4 else (bnb.nn.Linear8bitLt if args.bits == 8 else torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])


    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


In [37]:
def create_peft_config(modules, lora_r, lora_alpha, lora_dropout):
    """
    Create Parameter-Efficient Fine-Tuning config for your model
    :param modules: Names of the modules to apply Lora to
    """
    config = LoraConfig(
        r=lora_r,  # dimension of the updated matrices
        lora_alpha=lora_alpha,  # parameter for scaling
        target_modules=modules,
        lora_dropout=lora_dropout,  # dropout probability for layers
        bias="none",
        task_type="CAUSAL_LM",
    )

    return config

In [38]:
def print_trainable_parameters(model, use_4bit=False):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # if using DS Zero 3 and the weights are initialized empty
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel

        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    if use_4bit:
        trainable_params /= 2
    print(
        f"all params: {all_param} || trainable params: {trainable_params} || trainable%: {100 * trainable_params / all_param}"
    )

Here we define the LoRA config.

- `r` is the rank of the low-rank matrix used in the adapters, which thus controls the number of parameters trained. A higher rank will allow for more expressivity, but there is a compute tradeoff.

- `alpha` is the scaling factor for the learned weights. The weight matrix is scaled by `alpha/r`, and thus a higher value for alpha assigns more weight to the LoRA activations.

We experiment with different values for `r` and `alpha` to find the best combination for our model.

In [39]:
## # QLoRA parameters
################################################################################

# LoRA attention dimension
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 128

# Dropout probability for LoRA layers
# lora_dropout = 0.05
lora_dropout = 0.1

# Bias
bias = "none"

# Task type
task_type = "CAUSAL_LM"



In [40]:
# Get LoRA module names
target_modules = find_linear_layers(model)
# print(target_modules)

#for llama 2
qlora_config = create_peft_config(target_modules, lora_r, lora_alpha, lora_dropout)

# Print information about the percentage of trainable parameters
# print_trainable_parameters(model, True)

['v_proj', 'dense', 'k_proj', 'q_proj', 'fc2', 'fc1']
all params: 1521392640 || trainable params: 131182080.0 || trainable%: 8.622499974759966


Here we define the training arguments for our model.

- `train_batch_size` and `eval_batch_size` are the batch sizes for training and evaluation. If the model is too large to fit in memory, you can reduce these values.

- `gradient_accumulation_steps` is the number of steps to accumulate gradients before performing an optimization step. This is useful when the batch size is too large to fit in memory.

- `learning_rate` is the learning rate for the optimizer. We set it to 2e-5, but you can experiment with different values.

- `optimizer` is the optimizer used for training. We use AdamW, which is a popular optimizer for training transformer models. If less memory is available, you can use a quantized optimizer.

- `num_train_epochs` is the number of epochs for training. We set it to 3 for demonstration purposes, but you can increase it for better results.

- `max_steps` is the maximum number of training steps. We set it to 250 because we are using a small dataset. It is used instead of `num_train_epochs` if it is specified. If you want to train for a specific number of epochs, you can remove this argument.


In [42]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./outputs"
# output_dir = "./SFT-llama2-7b"

# Batch size per GPU for training
train_batch_size = 1
eval_batch_size = 1

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 1

# Initial learning rate (AdamW optimizer)
lr = 2e-5

# Optimizer to use
optim = "paged_adamw_32bit"
# optim="paged_adamw_8bit"

# Number of training epochs
max_steps = 250


Several arguments that we difine are the following:

- `output_dir`: The directory where model checkpoints and outputs will be saved.
- `logging_steps`: Log metrics every specified number of training steps.
- `logging_strategy`: Specify whether logging is done by "steps" or "epoch".
- `warmup_ratio`: Percentage of total training steps used for learning rate warm-up.
- `group_by_length`: Whether to group training samples by sequence length for more efficient processing.
- `lr_scheduler_type`: Type of learning rate scheduler, such as "linear" or "constant".
- `weight_decay`: Strength of weight decay regularization applied to the optimizer.
- `fp16`: Enable mixed-precision training using 16-bit floating-point format.
- `save_strategy`: Strategy for saving model checkpoints, either by "epoch" or "steps".
- `save_steps`: Save a model checkpoint every specified number of steps.
- `save_total_limit`: Maximum number of checkpoints to keep.
- `evaluation_strategy`: Strategy for evaluating the model during training.
- `eval_steps`: Evaluate the model every specified number of training steps.
- `do_eval`: Whether to perform evaluation during training.
- `report_to`: Where to report evaluation results, set to "none" to disable reporting.


In [43]:
# "max_steps=1" is just for testing execution
training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=train_batch_size,
    per_device_eval_batch_size=eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    learning_rate=lr,
    logging_steps=25,
    logging_strategy="steps",
    warmup_ratio=0.03,
    group_by_length = True,
    # warmup_steps=2,
    lr_scheduler_type = "constant",
    weight_decay= 0.001,
#     num_train_epochs=num_of_epochs,
    max_steps=max_steps,
    fp16=True,
    # save_strategy="epoch",       # Save the model checkpoint every logging step
    save_strategy="steps",       # Save the model checkpoint every logging step
    save_steps=250,               # Save every 10 checkpoints
    save_total_limit=5,
    evaluation_strategy="steps", # Evaluate the model every logging step
    eval_steps=25,               # Evaluate and save checkpoints every 50 steps
    do_eval=True,
    # bf16=True
    # run_name="baseline-llama2-sft",
    # save_total_limit=1,  # can be increased, but but beware of kaggle notebook output size limit
    report_to="none"
)

Here we define our `response_template`. This template will be used to format the output of our model.

We also use `DataCollatorForCompletionOnleLM` in order to calculate the loss of our model only using the generated text.

In [46]:
# instruction_template = "### Instruction:"
response_template = "### Answer:"
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

In [47]:
supervised_finetuning_trainer = SFTTrainer(
    model,
    train_dataset=my_dataset["train"],
    eval_dataset=my_dataset["valid"],
    args=training_args,
#     tokenizer=tokenizer,
    peft_config=qlora_config,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=False,
    data_collator=collator
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map:   0%|          | 0/354 [00:00<?, ? examples/s]

Map:   0%|          | 0/75 [00:00<?, ? examples/s]

We disable Weights & Biases. You'll need to apply an API key when prompted if you use it for tracking the training metrics.

In [48]:
os.environ["WANDB_DISABLED"] = "true"

Now we are ready to train our model!

In [49]:
train_result = supervised_finetuning_trainer.train()


Step,Training Loss,Validation Loss
25,0.1607,0.090152
50,0.0986,0.09917
75,0.0426,0.056664
100,0.0796,0.101712
125,0.0394,0.064998
150,0.1995,0.046732
175,0.0591,0.03598
200,0.0309,0.04452
225,0.0162,0.138688
250,0.0871,0.046758


In [51]:
metrics = train_result.metrics
# supervised_finetuning_trainer.log_metrics("train", metrics)
supervised_finetuning_trainer.save_metrics("train", metrics)
supervised_finetuning_trainer.save_state()
# print(metrics)


# Test split of the training data

## Format test split

Here we define the template for our test set. The only difference is that we don't include the {answer} field, as this is what we are trying to predict.

In [57]:

template = """
### Instruction:
Below is an instruction that describes a multiple choice task. Answer the following multiple choice question by giving the most appropriate response. 
Answer should be one among options provided after the question. Select the most suitable answer from the given ones.
Give only answer and a short explanation of two or three sentences. Nothing else.

Question: {prompt}\n
1) {a}\n
2) {b}\n
3) {c}\n
4) {d}\n

### Answer:
The correct answer is: {answer}"""

# # Define your input variables
# prompt = "Your prompt here"
# options = ["Option A", "Option B", "Option C", "Option D"]
# answer = "Your chosen answer"
# label = "A"  # Replace with the label corresponding to the correct answer (e.g., "A")

# prompt = template.format(prompt=prompt, a=options[0], b=options[1], c=options[2], d=options[3], answer=answer, label=label)

In [58]:
# We don't have answers for test
def format_text_test(example):
    text = template.format(prompt=example['question'],
                         a=example['choice_list'][0],
                         b=example['choice_list'][1],
                         c=example['choice_list'][2],
                         d=example['choice_list'][3],
                         answer='')
    return {"text": text}

In [59]:
original_test_dataset = original_dataset["test"].map(format_text_test)
# print(f"Test set size: {len(original_test_dataset)}")

scemantic_test_dataset = scemantic_dataset["test"].map(format_text_test)
# print(f"Test set size: {len(scemantic_dataset["test"])}")

context_test_dataset = context_dataset["test"].map(format_text_test)
# print(f"Test set size: {len(context_dataset["test"])}")

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

Map:   0%|          | 0/26 [00:00<?, ? examples/s]

## Predict with fine-tuned model

In order to find wich option of the multiple choice is the correct one, we will generate text using the model and then scemantically find the closest option to the generated text.

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Initialize Sentence Transformer model
model_st = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [61]:
eval_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


##### Accuracy on each dataset (original, scemanic, context) by itself

The following function will generate text for each of the datasets and then calculate the accuracy of the model on each of the datasets.
It also keeps track of the wrong predictions and the ids of the wrong predictions, as well the generated text of the mispredicted ids.
* We can also control if we want to print the generated text of each id through the `printall` argument.
* As the datasets that are passed to the function might not have ids (test set for the competition), we can control if we want to keep track of the ids through the `id_exist` argument.

This function returns:
* The accuracy of the model on the dataset that is passed to the function.
* The wrong predictions of the model on the dataset that is passed to the function.
* The explanation of the wrong predictions.
* The wrong ids details in a format friendly to the human eye, so that we can understand the wrong predictions better.
* All the answers in a list format.



In [62]:

def generate_answers(dataset, printall=False, id_exist=True):
    # generated_outputs = []
    wrong_answers = []
    explanation_answers = {}
    correct_answers = 0
    total_answers = 0
    all_answers = []

    wrong_details = ""

    # Iterate through the dataset
    for i, row in (enumerate(dataset)):
        print("{} from {}".format(i, len(dataset)))
        ################## Generating answer ##################

        truth_answer = row["choice_list"][row["label"]]

        model_inputs = eval_tokenizer(row["text"], return_tensors="pt").to("cuda")
        output = model.generate(**model_inputs, max_new_tokens=90, repetition_penalty=1.15)

        # repetition_penalty: The parameter for repetition penalty. 1.0 means no penalty. Above 1.0 penalizes previously generated tokens. Between 0.0 and 1.0 rewards previously generated tokens.

        generated_text = eval_tokenizer.decode(output[0], skip_special_tokens=True)
        print(generated_text)

        ################## Extracting answer ##################
        # Extracting the answer
        start_index = generated_text.find("The correct answer is: ")
        explanation_start_index = generated_text.find("### Explanation:")
        end_index = explanation_start_index - 1
        generated_answer = generated_text[start_index + len("The correct answer is: "):end_index].strip()

        # Extracting the explanation
        # Find the next occurrence of "###" after the explanation start index
        next_header_index = generated_text.find("###", explanation_start_index + len("### Explanation:"))

        # Determine the end index of the explanation
        if next_header_index != -1:
            # If "###" is found, stop before it
            end_index = next_header_index - 1
        else:
            # If "###" is not found, extract until the end of the string
            end_index = len(generated_text)

        # Extract the explanation
        explanation = generated_text[explanation_start_index + len("### Explanation:"):end_index].strip()

        # explanation_end_index = generated_text.find("\n\n", explanation_start_index)
        # explanation = generated_text[explanation_start_index + len("### Explanation:"):].strip()

        ################## Matching  answer ##################
        # Encode the answer text and multiple choices into sentence embeddings
        generated_embedding = model_st.encode(generated_answer)
        choices_embeddings = model_st.encode(row["choice_list"])

        # Compute the cosine similarity between the generated answer and the choices
        # The higher the value, the more similar the two vectors are
        cosine_scores = cosine_similarity([generated_embedding], choices_embeddings)[0]
        # similarities = cosine_similarity(generated_embedding.reshape(1, -1), choices_embeddings)

        # Get the index of the highest score
        predicted_answer_index = cosine_scores.argmax()
        # Get the predicted answer
        predicted_answer = row["choice_list"][predicted_answer_index]

        ################## Testing puproses ##################

        if printall == True:
            print("#"*80)
            print("Correct answer: {}".format(truth_answer))

            print("="*80)
            print("Generated answer: {}".format(generated_answer))
            print("Explanation: {}".format(explanation))
            print("="*80)

            print("+"*80)
            print("Predicted answer: {}".format(predicted_answer))
            print("+"*80)

            print("#"*80)
        ##########################################
        all_answers.append(predicted_answer_index)


        if predicted_answer_index == row["label"]:
            correct_answers +=1

        else:
            wrong_details += "#"*30 + "\n"
            wrong_details += "Id is: {}\n".format(i)
            wrong_details += "Question: {}\n".format(row["question"])
            wrong_details += "Multiple choices: \n\t 0){}\n\t 1){}\n\t 2){}\n\t 3){}\n".format(row["choice_list"][0], row["choice_list"][1], row["choice_list"][2], row["choice_list"][3])
            wrong_details += "Correct answer -> \t{}\n".format(row["label"])
            wrong_details += "Predicted answer -> \t{}\n".format(predicted_answer_index)
            wrong_details += "Explanation: {}\n".format(explanation)
            wrong_details += "#"*30 + "\n\n"

            if id_exist:
                wrong_answers.append(row["id"])
                explanation_answers[row["id"]] = explanation

            else:
                wrong_answers.append(i)
                explanation_answers[i] = explanation


        total_answers +=1

    acc = correct_answers/total_answers
    print("Accuracy: {}".format(acc))

    return acc, wrong_answers, explanation_answers, wrong_details, all_answers


In [None]:
original_acc, original_wrong_ids, original_wrong_explanations, original_wrong_answers, original_all_answers = generate_answers(original_test_dataset, True, True)

In [64]:
# print("Accuracy: {}".format(original_acc))

Accuracy: 0.8076923076923077


In [None]:
scemantic_acc, scemantic_wrong_ids, scemantic_wrong_explanations, scemantic_wrong_answers, scemantic_all_answers = generate_answers(scemantic_test_dataset, True, True)


In [66]:
# print("Accuracy: {}".format(scemantic_acc))


Accuracy: 0.7307692307692307


In [None]:
context_acc, context_wrong_ids, context_wrong_explanations, context_wrong_answers, context_all_answers = generate_answers(context_test_dataset, True, True)


In [68]:
print("Accuracy: {}".format(context_acc))

Accuracy: 0.7307692307692307


Here we implement the logic to calculate group-based accuracy. We need the list of the wrong ids of each dataset in order to calculate the group-based accuracy.

In [69]:
lists = [
    original_wrong_ids,
    scemantic_wrong_ids,
    context_wrong_ids
]

new_lists = []

# Iterate through each list
for l in lists:
    new_list = []
    # Iterate through each element in the current list
    for item in l:
        # Find the index of the underscore character
        underscore_index = item.find('_')
        if underscore_index != -1:  # If underscore exists in the item
            # Remove everything from the underscore to the end
            new_item = item[:underscore_index]
            new_list.append(new_item)
        else:
            # If no underscore exists, keep the original item
            new_list.append(item)
    # Append the modified list to the new_lists list
    new_lists.append(new_list)

# Print the new lists
# for l in new_lists:
#     print(l)

['SP-173', 'SP-174', 'SP-193', 'SP-195', 'SP-201']
['SP-174', 'SP-180', 'SP-192', 'SP-193', 'SP-195', 'SP-201', 'SP-208']
['SP-173', 'SP-177', 'SP-180', 'SP-188', 'SP-192', 'SP-195', 'SP-200']


In [70]:
def count_unique_elements(list_of_lists):
    # Combine all lists into one list
    combined_list = [item for sublist in list_of_lists for item in sublist]

    # Convert the combined list into a set to remove duplicates
    unique_elements_set = set(combined_list)

    # Return the number of unique elements
    return len(unique_elements_set)

### Ori & Sem Accuracy


In [71]:
# print(new_lists[:2])
ori_scem_wrong = count_unique_elements(new_lists[:2])
# print(ori_scem_wrong)


ori_scem_wrong_ids = original_wrong_ids + scemantic_wrong_ids
# print(ori_scem_wrong_ids)

ori_scem_acc = 1 -(ori_scem_wrong/len(context_test_dataset))
print("Ori-Scem Accuracy: {}".format(ori_scem_acc))


Ori-Scem Accuracy: 0.6923076923076923


### Ori & Sem & Con Accuracy

In [72]:
# print(new_lists)
ori_scem_con_wrong = count_unique_elements(new_lists)
# print(ori_scem_con_wrong)

ori_scem_con_wrong_ids = original_wrong_ids + scemantic_wrong_ids + context_wrong_ids
# print(ori_scem_con_wrong_ids)


ori_scem_con_acc = 1 -(ori_scem_con_wrong/len(context_test_dataset))

print("Ori-Scem-Con Accuracy: {}".format(ori_scem_con_acc))


Ori-Scem-Con Accuracy: 0.5769230769230769


Gathering results to a json

In [None]:
# import pandas as pd

# df_res = pd.read_csv('./results/results.csv')

# Define the directory path
results_dir = './results/'

# Create the directory if it does not exist
if not os.path.exists(results_dir):
    os.makedirs(results_dir)

df_res = pd.DataFrame(columns=['checkpoint', 'task',  'lr', 'batch_size', 'lora_r', 'lora_alpha', 'lora_dropout',
                              #  'num_epochs',
                               'num_steps',
                               'original_acc', 'scemantic_acc', 'context_acc', 'ori_sem_acc', 'ori_sem_con_acc', 'date_of_run'])

# Create a dictionary for the new row
new_row_data = {
    'checkpoint': [model_name],
    'task': [task],
    'lr': [lr],
    'batch_size': [train_batch_size],
    'lora_r': [lora_r],
    'lora_alpha': [lora_alpha],
    'lora_dropout': [lora_dropout],
    # 'num_epochs': [num_of_epochs],
    'num_steps': [max_steps],
    'original_acc': [original_acc],
    'semantic_acc': [scemantic_acc],
    'context_acc': [context_acc],
    'ori_sem_acc': [ori_scem_acc],
    'ori_sem_con_acc': [ori_scem_con_acc],
    'date_of_run': pd.to_datetime('today').strftime("%Y_%m_%d_%H:%M")
}
# Append the new row to the DataFrame
df_temp = pd.DataFrame(new_row_data)

display(df_temp)

csv_path = os.path.join(results_dir, 'results.csv')
df_temp.to_csv(csv_path, index=False)


In [74]:
def save_to_text_file(content, filename):
    with open(filename, 'w') as file:
        file.write(content)

In [75]:
ori_sem_predictions = original_wrong_answers + "+"*30 + "\n" + scemantic_wrong_answers
ori_sem_con_predictions = original_wrong_answers + "+"*30 + "\n" + scemantic_wrong_answers + "+"*30 + "\n" + context_wrong_answers


In [76]:
save_to_text_file(ori_sem_con_predictions, './ori_sem_con_wrong.txt')
save_to_text_file(ori_sem_predictions, './ori_sem_wrong.txt')

## For the competion Try the Trained Model!

Here we handle the test set that is provided by the competition. We are following the same logic as above.

### Prepare test dataset

In [91]:
def convert_from_numpy_to_dataset_type (numpy_array):
    from datasets import Dataset
    data_list = numpy_array.tolist()
    df = pd.DataFrame(data_list)

    dataset = Dataset.from_pandas(df)
    display(dataset[0])
    display(dataset.features) # just to check the type of the features

    return dataset

In [92]:
test_dataset = convert_from_numpy_to_dataset_type(test_data)


{'question': 'In a small village, two farmers are working in their fields - a diligent farmer and a lazy farmer. The hardworking farmer is the son of the lazy farmer, but the lazy farmer is not the father of the hardworking farmer. Can you explain this unusual relationship?',
 'choice_list': ['The lazy farmer is his mother.',
  'The lazy farmer is not a responsible father as he is lazy.',
  'The diligent farmer devoted himself to the farm and gradually forgot his father.',
  'None of above.'],
 'label': 0}

{'question': Value(dtype='string', id=None),
 'choice_list': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'label': Value(dtype='int64', id=None)}

In [93]:
test_dataset = test_dataset.map(format_text_test)
print(f"Test set size: {len(test_dataset)}")

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Test set size: 120


In [95]:
# display(Markdown(test_dataset[0]["text"]))


### Instruction:
Below is an instruction that describes a multiple choice task. Answer the following multiple choice question by giving the most appropriate response. 
Answer should be one among options provided after the question. Select the most suitable answer from the given ones.
Give only answer and a short explanation of two or three sentences. Nothing else.

Question: In a small village, two farmers are working in their fields - a diligent farmer and a lazy farmer. The hardworking farmer is the son of the lazy farmer, but the lazy farmer is not the father of the hardworking farmer. Can you explain this unusual relationship?

1) The lazy farmer is his mother.

2) The lazy farmer is not a responsible father as he is lazy.

3) The diligent farmer devoted himself to the farm and gradually forgot his father.

4) None of above.


### Answer:
The correct answer is: 

### Predict with fine-tuned model

In [96]:
eval_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True) # add_bos_token=True, 

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
test_acc, test_wrong_ids, test_wrong_explanations, test_wrong_answers, test_all_answers = generate_answers(test_dataset, True, False)


In [98]:
# print("Accuracy: {}".format(test_acc))


Accuracy: 0.6


In [99]:
test_set_output = """
Accuracy: {}\n
Wrong ids: {}\n
Wrong answers number: {}\n

Wrong answers: \n
{}
""".format(test_acc, test_wrong_ids, len(test_wrong_ids), test_wrong_answers)

In [None]:
# print(test_set_output)

In [101]:
save_to_text_file(test_set_output, './test_set_output.txt')


### Create submission

Here we only keep ordered the predictions of the model in order to create the submission file.

In [102]:
print(test_all_answers)

[0, 2, 0, 1, 1, 0, 0, 2, 2, 1, 1, 0, 2, 0, 1, 1, 0, 2, 1, 3, 1, 1, 1, 0, 3, 2, 0, 2, 0, 0, 0, 2, 2, 2, 2, 2, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 2, 0, 0, 2, 0, 0, 3, 2, 1, 0, 2, 1, 1, 1, 3, 0, 1, 1, 1, 1, 2, 1, 2, 0, 2, 2, 2, 2, 1, 2, 1, 3, 1, 0, 1, 2, 1, 1, 0, 1, 3, 1, 2, 2, 2, 1, 3, 0, 2, 2, 1, 2, 0, 1, 1, 2, 2, 2, 2, 2, 1, 3, 2, 3, 1, 1, 1, 0, 2, 2, 2, 1, 1, 2]


In [103]:
def save_answers_to_file(filename, predictions):
    with open(filename, 'w') as file:
        for predicted_class in predictions:
            file.write(f'{predicted_class}\n')


In [104]:
if task == "SP":
    directory = './competition/' + model_suffix
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    save_answers_to_file(os.path.join(directory, 'answer_sen_'+ pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")+ '.txt'), test_all_answers)

if task == "WP":
    directory = './competition/' + model_suffix
    os.makedirs(directory, exist_ok=True)  # Create directory if it doesn't exist
    save_answers_to_file(os.path.join(directory, 'answer_word_'+ pd.to_datetime('today').strftime("%Y_%m_%d_%H_%M")+ '.txt'), test_all_answers)

## Logic to export the results when running in Kaggle



* The following logic produces a zip file of the results in order to download it. The zip file name can be change through the `NAME_OF_ZIP_FILE` variable.

* The submission file is in the directory ` /kaggle/working/' + run_dir + '/competition/ `

* The name of the submission file is based on the task according to the competition guidelines.

In [105]:
print(os.listdir())

['ori_sem_con_wrong.txt', 'competition', 'ori_sem_wrong.txt', 'results', 'outputs', 'test_set_output.txt']


In [106]:
import os
from zipfile import ZipFile
from IPython.display import FileLink

NAME_OF_ZIP_FILE = run_dir

# Directory to be zipped
directory_to_zip = '/kaggle/working/' + run_dir

# Zip file name
zip_file_name = '{}.zip'.format(NAME_OF_ZIP_FILE)

# Create a ZipFile object
with ZipFile(zip_file_name, 'w') as zip_obj:
    # Iterate over all files and directories in the specified directory
    for root, dirs, files in os.walk(directory_to_zip):
        for file in files:
            file_path = os.path.join(root, file)
            zip_obj.write(file_path, os.path.relpath(file_path, directory_to_zip))

# Generate FileLink for the zipped file
FileLink(zip_file_name)


The following code is used to check the contents of the zip file.

In [None]:
from zipfile import ZipFile

# Path to the ZIP file
zip_file_path = 'NAME_OF_ZIP_FILE.zip'  # Update with the path to your ZIP file

# Open the ZIP file in read mode
with ZipFile(zip_file_path, 'r') as zip_file:
    # Print the list of elements (files and directories) inside the ZIP file
    print("Elements inside the ZIP file:")
    for element in zip_file.namelist():
        print(element)
