In [1]:
# installing offline dependencies

!pip install --no-index --no-deps /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
!pip install --no-index --no-deps /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl

Processing /kaggle/input/llm-whls/transformers-4.31.0-py3-none-any.whl
Installing collected packages: transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.30.2
    Uninstalling transformers-4.30.2:
      Successfully uninstalled transformers-4.30.2
Successfully installed transformers-4.31.0
Processing /kaggle/input/llm-whls/peft-0.4.0-py3-none-any.whl
Installing collected packages: peft
Successfully installed peft-0.4.0
Processing /kaggle/input/llm-whls/datasets-2.14.3-py3-none-any.whl
Installing collected packages: datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 2.1.0
    Uninstalling datasets-2.1.0:
      Successfully uninstalled datasets-2.1.0
Successfully installed datasets-2.14.3
Processing /kaggle/input/llm-whls/trl-0.5.0-py3-none-any.whl
Installing collected packages: trl
Successfully installed trl-0.5.0


In [2]:
# using single gpu for this case

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="0"

In [3]:
import torch
import numpy as np
from dataclasses import dataclass, field
from typing import Optional

from datasets import load_dataset, Dataset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model, PeftModel, PeftConfig, TaskType, PeftModelForSequenceClassification
from tqdm.auto import tqdm
from transformers import (
    AutoModelForSequenceClassification, AutoModelForCausalLM, AutoModel,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    DataCollatorWithPadding
)

import random
import os
import gc



from trl import RewardTrainer

import pandas as pd
pd.set_option('display.max_colwidth', None)
tqdm.pandas()

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
class CFG:
    base_model = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-large"
    seed = 42

In [5]:
def set_seed(seed = int):
    '''Sets the seed of the entire notebook so results are the same every time we run.
    This is for REPRODUCIBILITY.'''
    np.random.seed(seed)
    random_state = np.random.RandomState(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    return random_state


random_state = set_seed(CFG.seed)

In [6]:
# loading multi option dataset

train_df_0 = pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/6000_train_examples.csv')
train_df_1 = pd.read_csv('/kaggle/input/additional-train-data-for-llm-science-exam/extra_train_set.csv')
test_df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')


# merge and drop empty lines

train_df = pd.concat((train_df_0, train_df_1), axis=0)
train_df.dropna(inplace=True)
train_df.reset_index(drop=True, inplace=True)

In [7]:
def generate_new_dataframe(df):
    new_rows = []

    # Iterate through each row in the original DataFrame
    for _, row in df.iterrows():
        prompt = row['prompt']
        chosen_option = row[row['answer']]  # Get the text of the chosen option based on the 'answer' column

        # Iterate through each option
        for option in ['A', 'B', 'C', 'D', 'E']:
            if option != row['answer']:
                rejected_option = row[option]  # Get the text of the rejected option
                new_row = {'chosen': prompt + ' ' + chosen_option, 'rejected': prompt + ' ' + rejected_option}
                new_rows.append(new_row)

    # Create a new DataFrame from the new_rows list
    new_df = pd.DataFrame(new_rows)
    return new_df


train_df = generate_new_dataframe(train_df)
test_df = generate_new_dataframe(test_df)

In [8]:
# adding extra reward data

train_df_2 = pd.read_csv('/kaggle/input/rlhf-data-for-llm-science-exam/llm_rlhf_extra.csv')
train_df = pd.concat((train_df, train_df_2), axis=0)
train_df = train_df.sample(frac=1.0, random_state=CFG.seed)

In [9]:
# converting to dataset format

train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [10]:
# loading base model and tokenizers

model = AutoModelForSequenceClassification.from_pretrained(
    CFG.base_model,
    num_labels=1,

)

tokenizer = AutoTokenizer.from_pretrained(CFG.base_model)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at /kaggle/input/huggingfacedebertav3variants/deberta-v3-large and are newly initialized: ['classifier.bias', 'pooler.dense.bias', 'classifier.weight', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
peft_config = LoraConfig(
    r=8, lora_alpha=4, task_type=TaskType.SEQ_CLS, lora_dropout=0.1, 
    bias="none", inference_mode=False, target_modules=["query_proj", "value_proj"]
)

In [12]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 788,482 || all params: 435,850,242 || trainable%: 0.1809066335220711


In [13]:
def preprocess_function(examples):
    new_examples = {
        "input_ids_chosen": [],
        "attention_mask_chosen": [],
        "input_ids_rejected": [],
        "attention_mask_rejected": [],
    }
    for chosen, rejected in zip(examples["chosen"], examples["rejected"]):
        tokenized_j = tokenizer(chosen, truncation=True)
        tokenized_k = tokenizer(rejected, truncation=True)

        new_examples["input_ids_chosen"].append(tokenized_j["input_ids"])
        new_examples["attention_mask_chosen"].append(tokenized_j["attention_mask"])
        new_examples["input_ids_rejected"].append(tokenized_k["input_ids"])
        new_examples["attention_mask_rejected"].append(tokenized_k["attention_mask"])

    return new_examples

train_dataset = train_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)
train_dataset = train_dataset.filter(
    lambda x: len(x["input_ids_chosen"]) <= 256
    and len(x["input_ids_rejected"]) <= 256
)


test_dataset = test_dataset.map(
    preprocess_function,
    batched=True,
    num_proc=4,
)
test_dataset = test_dataset.filter(
    lambda x: len(x["input_ids_chosen"]) <= 2048
    and len(x["input_ids_rejected"]) <= 2048
)

Map (num_proc=4):   0%|          | 0/30132 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Filter:   0%|          | 0/30132 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/800 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Filter:   0%|          | 0/800 [00:00<?, ? examples/s]

In [14]:
training_args = TrainingArguments(
    output_dir='op',
    overwrite_output_dir = True,
    warmup_ratio=0.1,
    lr_scheduler_type='cosine',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    remove_unused_columns=False,
    optim="adafactor",
    logging_steps=250,
    eval_steps=250,
    evaluation_strategy='steps',
    load_best_model_at_end=True,
    save_total_limit = 2,
    fp16=True,
    bf16=False,
    weight_decay=0.01,
    report_to="none",
)


In [15]:
trainer = RewardTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    peft_config=peft_config,
    max_length=256,
)

In [16]:
model.config.use_cache = False
trainer.train()
trainer.save_model('deberta_adapter')

del model

gc.collect()
torch.cuda.empty_cache()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
Could not estimate the number of tokens of the input, floating-point operations will not be computed


Step,Training Loss,Validation Loss,Accuracy
250,0.6955,0.692644,0.6725
500,0.6573,0.559921,0.855
750,0.5708,0.473095,0.8675
1000,0.534,0.422163,0.87
1250,0.4993,0.411132,0.8725
1500,0.5008,0.408092,0.875
1750,0.5257,0.407692,0.87




In [17]:
# loading original training data for evaluation.
df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/train.csv')

In [18]:
def get_score(model, tokenizer, prompt, response):
    inputs = tokenizer(prompt + ' ' + response, return_tensors="pt", max_length=2048, padding='longest', truncation=True).to('cuda')
    model.to('cuda')
    model.eval()
    with torch.autocast('cuda', dtype=torch.float16):
        outputs = model(input_ids = inputs['input_ids'], attention_mask=inputs['attention_mask'])
    logits = outputs.logits

    return logits.item()

def get_top_3_winners(model, tokenizer, prompt, response_options):
    scores = []
    for index, response in enumerate(response_options):
        score = get_score(model, tokenizer, prompt, response)
        scores.append((index, score))

    
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    
    top_3_winners = sorted_scores[:3]
    top_3_winners = [t[0] for t in top_3_winners]

    int_to_string = {
    0: 'A',
    1: 'B',
    2: 'C',
    3: 'D',
    4: 'E'
    }

    top_3_winners = [int_to_string[val] for val in top_3_winners]

    
    return top_3_winners

In [19]:
preds = []
for _, row in tqdm(df.iterrows()):
    prompt = row['prompt']
    response_options = [
        row['A'],
        row['B'],
        row['C'],
        row['D'],
        row['E']
    ]
    top_3_winners = get_top_3_winners(trainer.model, tokenizer, prompt, response_options)
    preds.append(top_3_winners)
    
final_preds = [' '.join(pred) for pred in preds]

0it [00:00, ?it/s]

In [20]:
# source https://www.kaggle.com/code/philippsinger/h2ogpt-perplexity-ranking

def precision_at_k(r, k):
    """Precision at k"""
    assert k <= len(r)
    assert k != 0
    return sum(int(x) for x in r[:k]) / k

def MAP_at_3(predictions, true_items):
    """Score is mean average precision at 3"""
    U = len(predictions)
    map_at_3 = 0.0
    for u in range(U):
        user_preds = predictions[u]
        user_true = true_items[u]
        user_results = [1 if item == user_true else 0 for item in user_preds]
        for k in range(min(len(user_preds), 3)):
            map_at_3 += precision_at_k(user_results, k+1) * user_results[k]
    return map_at_3 / U


MAP_at_3(final_preds, df['answer'])

0.7516666666666666

In [21]:
df = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/test.csv')
preds = []
for _, row in tqdm(df.iterrows()):
    prompt = row['prompt']
    response_options = [
        row['A'],
        row['B'],
        row['C'],
        row['D'],
        row['E']
    ]
    top_3_winners = get_top_3_winners(trainer.model, tokenizer, prompt, response_options)
    preds.append(top_3_winners)
final_preds = [' '.join(pred) for pred in preds]

0it [00:00, ?it/s]

In [22]:
sub = pd.read_csv('/kaggle/input/kaggle-llm-science-exam/sample_submission.csv')
sub['prediction'] = final_preds

In [23]:
sub.to_csv('submission.csv', index=False)
pd.read_csv('submission.csv').head()

Unnamed: 0,id,prediction
0,0,D B E
1,1,A B C
2,2,A C E
3,3,C B A
4,4,D A B
