# Load Valid Data

In [None]:
"""
This script demonstrates how to use a 40k dataset for fine-tuning a transformer model.
It imports necessary libraries, sets environment variables, and defines various parameters for training.
The script also specifies the Hugging Face model to be used for fine-tuning.
"""

import os
os.environ["CUDA_VISIBLE_DEVICES"]="0,1"

from typing import Optional, Union
import pandas as pd, numpy as np, torch
from datasets import Dataset
from dataclasses import dataclass
from transformers import AutoTokenizer
from transformers import EarlyStoppingCallback
from transformers.tokenization_utils_base import PreTrainedTokenizerBase, PaddingStrategy
from transformers import AutoModelForMultipleChoice, TrainingArguments, Trainer

# Version tag embedded in the checkpoint and saved-model directory names.
VER = 3
# How many MMLU questions are sampled into the training set.
NUM_MMLU_TRAIN_SAMPLES = 2_048
# Switch for parameter-efficient fine-tuning (LoRA via the peft package).
USE_PEFT = False
# Number of leading encoder layers whose weights are frozen
# (DeBERTa large has 24 layers in total).
FREEZE_LAYERS = 18
# Whether the embedding weights are frozen as well.
FREEZE_EMBEDDINGS = True
# Maximum tokenized length of context plus question and answer.
MAX_INPUT = 256
# Hugging Face model name or path to fine-tune.
MODEL = 'Model_Path'

In [None]:
# Read the validation questions from the Kaggle input directory.
df_valid = pd.read_csv(
    '/kaggle/input/60k-data-with-context-v2/train_with_context2.csv'
)
print('Validation data size:', df_valid.shape)
# Show the first rows as the cell output.
df_valid.head()

# Load Train Data 

In [None]:
# FUNCTIONS TO ADD A NEW RANDOM WRONG CHOICE
def _add_duplicate_choice(row, letters, new_letter, wrong_weight, right_weight):
    """
    Extend a question row with one extra option that duplicates a wrong choice.

    One existing option letter (``move``) is drawn at random. Its text is
    copied into ``new_letter``, and its old slot is overwritten with the text
    of a randomly drawn wrong option. The row therefore ends up with one more
    option, exactly one correct option, and one wrong text that appears twice.
    If the drawn letter was the correct one, 'answer' is updated to
    ``new_letter``.

    Args:
        row: A pandas Series (or similar mapping) indexed by option letter,
            with the correct option letter stored under 'answer'.
        letters: The option letters already present in the row.
        new_letter: The option letter to add.
        wrong_weight: How many times each wrong letter enters the draw pool.
        right_weight: How many times the correct letter enters the draw pool.

    Returns:
        The same row object, modified in place.
    """
    answer = row['answer']
    wrong = [letter for letter in letters if letter != answer]
    # The weights decide how often the correct answer is relocated to the new
    # letter: with the callers below it is 2/8 = 1/4 and 3/15 = 1/5.
    move = np.random.choice(wrong * wrong_weight + [answer] * right_weight)
    row[new_letter] = row[move]
    # Fill the vacated slot with a wrong text (possibly the one it already held).
    duplicate = np.random.choice(wrong)
    row[move] = row[duplicate]
    if move == answer:
        row['answer'] = new_letter
    return row


def make_random_4_from_3(row):
    """
    Turn a 3-option question (A, B, C) into a 4-option one by adding 'D'.

    The text of one randomly drawn option is moved to 'D' and its former slot
    is filled with a copy of a wrong option, so the extra choice is always a
    duplicated wrong answer. 'answer' becomes 'D' when the correct option was
    the one moved.

    Args:
        row: A pandas Series with entries 'A', 'B', 'C' and 'answer'.

    Returns:
        The modified row, now also containing 'D'.
    """
    return _add_duplicate_choice(
        row, ['A', 'B', 'C'], 'D', wrong_weight=3, right_weight=2
    )


def make_random_5_from_4(row):
    """
    Turn a 4-option question (A, B, C, D) into a 5-option one by adding 'E'.

    The text of one randomly drawn option is moved to 'E' and its former slot
    is filled with a copy of a wrong option, so the extra choice is always a
    duplicated wrong answer. 'answer' becomes 'E' when the correct option was
    the one moved.

    Args:
        row: A pandas Series with entries 'A', 'B', 'C', 'D' and 'answer'.

    Returns:
        The modified row, now also containing 'E'.
    """
    return _add_duplicate_choice(
        row, ['A', 'B', 'C', 'D'], 'E', wrong_weight=4, right_weight=3
    )

In [None]:
# LOAD 3 DATASETS AND FILTER
# Each CSV is read from the Kaggle input directory and filtered before use.

# MMLU: keep the rows flagged by 'is_question', then draw a fixed-seed sample.
MMLU = pd.read_csv('/kaggle/input/40k-data-with-context-v2/MMLU_17k_with_context2.csv')
MMLU = MMLU.loc[MMLU.is_question]
MMLU = MMLU.sample(NUM_MMLU_TRAIN_SAMPLES, random_state=42)
print('MMLU shape', MMLU.shape)

# ScienceQA: keep rows with no image, split by the value of 'ct' (3 or 4),
# and drop the option columns each subset does not use.
ScienceQA = pd.read_csv('/kaggle/input/40k-data-with-context-v2/ScienceQA_with_context2.csv')
ScienceQA_3 = ScienceQA.loc[
    ScienceQA.image.isna() & (ScienceQA.ct == 3)
].drop(columns=['D', 'E'])
ScienceQA_4 = ScienceQA.loc[
    ScienceQA.image.isna() & (ScienceQA.ct == 4)
].drop(columns=['E'])
print('ScienceQA_3 shape', ScienceQA_3.shape)
print('ScienceQA_4 shape', ScienceQA_4.shape)

# OpenBook: keep the rows flagged by 'is_question'.
OpenBook = pd.read_csv('/kaggle/input/40k-data-with-context-v2/OpenBook_with_context2.csv')
OpenBook = OpenBook.loc[OpenBook.is_question]
print('OpenBook shape', OpenBook.shape)

In [None]:
# PREPROCESS AND CONCATENATE 3 DATASETS
# Shared column set: every MMLU column except the 'is_question' filter column.
COLS = [col for col in MMLU.columns if col != 'is_question']

# Give the 3-option ScienceQA questions a fourth option, then align columns.
ScienceQA_3 = ScienceQA_3.apply(make_random_4_from_3, axis=1)[COLS]
ScienceQA_4 = ScienceQA_4[COLS]
OpenBook = OpenBook[COLS]

# Stack all sources into one frame with a fresh integer index.
df_train = pd.concat(
    [MMLU, ScienceQA_3, ScienceQA_4, OpenBook],
    axis=0,
    ignore_index=True,
)

# Add a fifth option to every question, shuffle the rows, keep only the
# columns the model pipeline reads, and replace missing values with ''.
df_train = df_train.apply(make_random_5_from_4, axis=1)
df_train = df_train.sample(frac=1).reset_index(drop=True)
df_train = df_train[['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer']].fillna('')
print('Train shape', df_train.shape)
df_train.head()

# Data Loader

In [None]:
# Answer letter -> class index (A=0 ... E=4).
option_to_index = dict(zip('ABCDE', range(5)))

# Class index -> answer letter (the inverse lookup).
index_to_option = dict(enumerate('ABCDE'))

def preprocess(example):
    """
    Tokenize one question into five (context, prompt + option) text pairs.

    Special tokens are written into the text by hand ("[CLS]", "[SEP]") and
    the tokenizer is called with add_special_tokens=False. Only the context
    side is truncated when a pair exceeds MAX_INPUT tokens.

    Args:
        example (dict): One row with keys 'context', 'prompt', 'A'..'E' and 'answer'.

    Returns:
        The tokenizer output for the five pairs, with the integer class index
        of the correct option stored under 'label'.
    """
    letters = 'ABCDE'

    # The same context text is paired with every option.
    context_text = "[CLS] " + example['context']
    context_side = [context_text for _ in letters]

    # Question plus one candidate answer per pair.
    question_side = [
        " #### " + example['prompt'] + " [SEP] " + example[letter] + " [SEP]"
        for letter in letters
    ]

    encoded = tokenizer(
        context_side,
        question_side,
        truncation='only_first',
        max_length=MAX_INPUT,
        add_special_tokens=False,
    )

    # Convert the answer letter into its class index.
    encoded['label'] = option_to_index[example['answer']]
    return encoded

@dataclass
class DataCollatorForMultipleChoice:
    """
    Batch multiple-choice examples whose fields hold one entry per choice.

    The padding options below are forwarded unchanged to ``tokenizer.pad``.
    """
    # Tokenizer whose pad() method builds the padded tensors.
    tokenizer: PreTrainedTokenizerBase
    # Padding strategy handed to tokenizer.pad.
    padding: Union[bool, str, PaddingStrategy] = True
    # Optional maximum length handed to tokenizer.pad.
    max_length: Optional[int] = None
    # Optional length multiple handed to tokenizer.pad.
    pad_to_multiple_of: Optional[int] = None

    def __call__(self, features):
        """
        Pad and stack a list of tokenized examples.

        Args:
            features (List[dict]): Tokenized examples. Each holds per-choice
                lists (including 'input_ids') plus a 'label' or 'labels'
                entry, which is removed from the dict by this call.

        Returns:
            dict: Tensors viewed as (batch_size, num_choices, -1) for every
            tokenizer field, plus an int64 'labels' tensor.
        """
        # Accept either spelling of the label key, decided by the first example.
        label_key = 'label' if 'label' in features[0].keys() else 'labels'
        targets = [example.pop(label_key) for example in features]

        n_examples = len(features)
        n_choices = len(features[0]['input_ids'])

        # One dict per (example, choice) pair, example-major order, so that
        # tokenizer.pad sees a flat list of sequences.
        per_choice = [
            {name: values[choice] for name, values in example.items()}
            for example in features
            for choice in range(n_choices)
        ]

        padded = self.tokenizer.pad(
            per_choice,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors='pt',
        )

        # Restore the (example, choice) structure after padding.
        batch = {
            name: tensor.view(n_examples, n_choices, -1)
            for name, tensor in padded.items()
        }
        batch['labels'] = torch.tensor(targets, dtype=torch.int64)
        return batch

In [None]:
# Load the tokenizer for MODEL and wrap both DataFrames as Hugging Face Datasets.
tokenizer = AutoTokenizer.from_pretrained(MODEL)
dataset_valid = Dataset.from_pandas(df_valid)
dataset = Dataset.from_pandas(df_train)
# Display the training dataset as the cell output.
dataset

In [None]:
# Tokenize both splits with preprocess() and drop the raw text columns.
tokenized_dataset_valid = dataset_valid.map(
    preprocess,
    remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
)
# The training split is mapped with two worker processes.
tokenized_dataset = dataset.map(
    preprocess,
    remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E', 'answer'],
    num_proc=2,
)
tokenized_dataset

# Build Model

In [None]:
# Load the pretrained MODEL weights into a multiple-choice model.
model = AutoModelForMultipleChoice.from_pretrained(MODEL)

In [None]:
# Install peft from a local wheel (no package index, no dependency resolution),
# and only when PEFT is enabled. The "!" line is IPython shell magic.
if USE_PEFT:
    !pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl

In [None]:
# Wrap the model with LoRA adapters when PEFT is switched on.
if USE_PEFT:
    print('We are using PEFT.')

    # Imported here because peft is only installed when USE_PEFT is set.
    from peft import LoraConfig, get_peft_model, TaskType

    # LoRA settings: adapters on the query/value projections, while the
    # classifier and pooler modules are kept trainable and saved.
    peft_config = LoraConfig(
        r=8,
        lora_alpha=4,
        task_type=TaskType.SEQ_CLS,
        lora_dropout=0.1,
        bias="none",
        inference_mode=False,
        target_modules=["query_proj", "value_proj"],
        modules_to_save=['classifier', 'pooler'],
    )

    model = get_peft_model(model, peft_config)

    # Report how many parameters remain trainable.
    model.print_trainable_parameters()

In [None]:
# Optionally stop gradient updates for the embedding weights.
if FREEZE_EMBEDDINGS:
    print('Freezing embeddings.')
    model.deberta.embeddings.requires_grad_(False)

# Optionally stop gradient updates for the first FREEZE_LAYERS encoder layers.
if FREEZE_LAYERS > 0:
    print(f'Freezing {FREEZE_LAYERS} layers.')
    for layer in model.deberta.encoder.layer[:FREEZE_LAYERS]:
        layer.requires_grad_(False)

# MAP@3 Metric
The competition metric is MAP@3, so we write a custom metric function and pass it to Hugging Face's Trainer. See the discussion [here][1].

[1]: https://www.kaggle.com/competitions/kaggle-llm-science-exam/discussion/435602

In [None]:
def map_at_3(predictions, labels):
    """
    Calculates the Mean Average Precision at 3 (MAP@3) score.

    Each row scores 1, 1/2 or 1/3 when the true label is the highest,
    second-highest or third-highest scored class, and 0 otherwise. The result
    is the mean over rows.

    Args:
        predictions (array-like): 2-D scores, one row per example and one
            column per class. Higher means more likely.
        labels (array-like): True class index for each example.

    Returns:
        float: The MAP@3 score, or 0.0 when there are no predictions.

    Raises:
        ValueError: If predictions and labels differ in length.
    """
    scores = np.asarray(predictions)
    # Nothing to rank; avoid the axis error argsort raises on an empty array.
    if scores.shape[0] == 0:
        return 0.0

    # Column vector so each row's label broadcasts against its top-3 indices.
    truth = np.asarray(labels).reshape(-1, 1)
    if truth.shape[0] != scores.shape[0]:
        raise ValueError(
            f'predictions has {scores.shape[0]} rows but labels has {truth.shape[0]}'
        )

    # Class indices of the (up to) three highest scores, best first.
    top3 = np.argsort(-1 * scores, axis=1)[:, :3]
    # Reciprocal-rank weight for each of the kept positions.
    weights = 1.0 / np.arange(1, top3.shape[1] + 1)
    return float(((top3 == truth) * weights).sum() / scores.shape[0])

def compute_metrics(p):
    """
    Metric callback for the Trainer: MAP@3 of an evaluation result.

    Args:
        p: Evaluation output exposing `predictions` and `label_ids`.

    Returns:
        dict: The score stored under the key "map@3".
    """
    score = map_at_3(p.predictions.tolist(), p.label_ids.tolist())
    return {"map@3": score}

# Train and Save 

In [None]:
# Training configuration handed to the Hugging Face Trainer.
training_args = TrainingArguments(
    # Where checkpoints are written, how often, and how many are kept
    output_dir=f'./checkpoints_{VER}',
    overwrite_output_dir=True,
    save_strategy="steps",
    save_steps=75,
    save_total_limit=2,
    # Optimisation schedule
    learning_rate=1e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    num_train_epochs=2,
    # Batch sizes, gradient accumulation and mixed precision
    per_device_train_batch_size=1,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    fp16=True,
    # Logging and evaluation cadence (no external reporting)
    logging_steps=75,
    evaluation_strategy='steps',
    eval_steps=75,
    report_to='none',
    # Best-model tracking (the best checkpoint is not reloaded at the end)
    load_best_model_at_end=False,
    metric_for_best_model='map@3',
)



In [None]:
# Assemble the Trainer from the model, arguments, data and metric callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    eval_dataset=tokenized_dataset_valid,
    tokenizer=tokenizer,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
    compute_metrics=compute_metrics,
    #callbacks=[EarlyStoppingCallback(early_stopping_patience=5)],
)

# Run fine-tuning.
trainer.train()

# Write the final model to a versioned directory.
trainer.save_model(f'model_v{VER}')

# Verify Saved Model

In [None]:
# Discard the in-memory training objects so the check below uses only what
# was written to disk.
del model, trainer
if USE_PEFT:
    # Rebuild the PEFT-wrapped model, then load the saved state dict into it.
    model = AutoModelForMultipleChoice.from_pretrained(MODEL)
    model = get_peft_model(model, peft_config)
    checkpoint = torch.load(f'model_v{VER}/pytorch_model.bin')
    model.load_state_dict(checkpoint)
else:
    model = AutoModelForMultipleChoice.from_pretrained(f'model_v{VER}')
# Use the same multiple-choice collator as training. The Trainer's default
# collator cannot batch the nested per-choice token lists unless every choice
# happens to have exactly the same length.
trainer = Trainer(
    model=model,
    data_collator=DataCollatorForMultipleChoice(tokenizer=tokenizer),
)

In [None]:
# Re-read the validation CSV and tokenize it with preprocess().
test_df = pd.read_csv('/kaggle/input/60k-data-with-context-v2/train_with_context2.csv')
tokenized_test_dataset = Dataset.from_pandas(test_df).map(
    preprocess,
    remove_columns=['prompt', 'context', 'A', 'B', 'C', 'D', 'E'],
)

# Score every option, then order the option indices from highest to lowest.
test_predictions = trainer.predict(tokenized_test_dataset).predictions
predictions_as_ids = np.argsort(-test_predictions, axis=1)
predictions_as_answer_letters = np.array(list('ABCDE'))[predictions_as_ids]

# Keep the three best letters per question as one space-separated string.
predictions_as_string = [
    ' '.join(top_three) for top_three in predictions_as_answer_letters[:, :3]
]
test_df['prediction'] = predictions_as_string

# Compute Validation Score

In [None]:
import numpy as np
def precision_at_k(r, k):
    """
    Return the fraction of relevant entries among the first k results.

    Parameters:
    r (list): Relevance flags (truthy/1 for a hit, 0 for a miss), best-ranked first.
    k (int): How many leading entries to consider; must be non-zero and at most len(r).

    Returns:
    float: Number of hits in r[:k] divided by k.
    """
    assert k <= len(r)
    assert k != 0
    hits = 0
    for flag in r[:k]:
        hits += int(flag)
    return hits / k

def MAP_at_3(predictions, true_items):
    """
    Compute MAP@3 from space-separated prediction strings.

    Parameters:
    predictions (list): One string per question holding the ranked guesses, separated by whitespace.
    true_items (list): The correct item for each question, in the same order.

    Returns:
    float: Sum over questions of precision@k at every hit position within the
    first three guesses, divided by the number of questions.
    """
    n_questions = len(predictions)
    running_total = 0.0
    for u in range(n_questions):
        ranked = predictions[u].split()
        target = true_items[u]
        hits = [1 if guess == target else 0 for guess in ranked]
        # Only the first three guesses can contribute.
        for pos in range(min(len(ranked), 3)):
            if not hits[pos]:
                # A miss contributes nothing to the total.
                continue
            running_total += precision_at_k(hits, pos + 1)
    return running_total / n_questions

In [None]:
# Score the top-3 prediction strings against the true answers.
m = MAP_at_3(test_df['prediction'].values, test_df['answer'].values)
print('CV MAP@3 =', m)