In [1]:
import os
from dotenv import load_dotenv
# Process-level settings, applied in this first cell so they are in place
# before torch / transformers are imported in the next cell.
os.environ.update(
    {
        "HF_HUB_ENABLE_HF_TRANSFER": "1",  # opt in to hf_transfer for faster Hub downloads
        "CUDA_VISIBLE_DEVICES": "0",  # expose only GPU 0 to this kernel
    }
)

In [2]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import transformers
transformers.logging.set_verbosity_info()

from transformers import AutoTokenizer, AutoModelForSequenceClassification, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType

In [31]:
# Load the combined stage-1 preference data (train and valid rows in one file).
df = pd.read_parquet("../data/train_combined_stage_1_v2.parquet")
# Uncomment to iterate quickly on a reproducible 1% sample of the rows.
# df = df.sample(frac=0.01, random_state=42).reset_index(drop=True)
print(df.shape)
# Preview the first two rows to check the column layout.
df.head(2)

(150309, 10)


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language,split,source
0,58210e39b3fd4441a2bd4a518bb44c2d,What is the difference between OpenCL and CUDA?,OpenCL and CUDA are two different programming ...,OpenCL and CUDA are both programming languages...,model_b,chatglm-6b,koala-13b,English,train,lmsys_33k
1,90bfd142157948aba01931726c888e7f,"Fuji vs. Nikon, which is better?",Both Fuji and Nikon are popular camera brands ...,This is a subjective question and the answer d...,model_b,koala-13b,oasst-pythia-12b,English,train,lmsys_33k


In [32]:
# Number of rows contributed by each upstream data source.
df.source.value_counts()

source
current_comp        48439
orpo_dpo_mix_40k    44245
prev_comp           39716
lmsys_33k           17909
Name: count, dtype: int64

In [33]:
# Hugging Face Hub id of the base checkpoint; used for both the model and the tokenizer.
model_id = "google/gemma-2b-it"

In [34]:
# Load the base checkpoint 4-bit quantized as a single-output sequence classifier
# (one scalar score per sequence), then attach LoRA adapters.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,               # Enable 4-bit quantization
    bnb_4bit_quant_type='nf4',       # Use Normal Float 4 quantization
    bnb_4bit_use_double_quant=True,   # Enable double quantization for better efficiency
    bnb_4bit_compute_dtype=torch.bfloat16  # Set computation type to bfloat16 for faster performance
)
# num_labels=1 gives the classifier a single output; the `score` head is newly
# initialized (see the load log) and has to be trained.
model = AutoModelForSequenceClassification.from_pretrained(model_id, num_labels=1, quantization_config=bnb_config)
# NOTE(review): num_labels=1 was already passed to from_pretrained above, so this
# assignment looks redundant - confirm before removing.
model.config.num_labels = 1
lora_config = LoraConfig(
    r=64,
    lora_alpha=64,
    # "all-linear" adapts every linear projection (attention AND MLP), not only
    # self-attention; the trainable-parameter count logged by the trainer
    # (78,448,640) matches adapters on q/k/v/o plus gate/up/down in all 18 layers.
    target_modules="all-linear",
    lora_dropout=0,
    bias="none",
    task_type=TaskType.SEQ_CLS,
)
# Prepare the quantized model for training before wrapping it with the adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Display the wrapped module tree (very long output).
model

loading configuration file config.json from cache at /home/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/config.json
Model config GemmaConfig {
  "_name_or_path": "google/gemma-2b-it",
  "architectures": [
    "GemmaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "eos_token_id": 1,
  "head_dim": 256,
  "hidden_act": "gelu",
  "hidden_activation": null,
  "hidden_size": 2048,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 16384,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 8192,
  "model_type": "gemma",
  "num_attention_heads": 8,
  "num_hidden_layers": 18,
  "num_key_value_heads": 1,
  "pad_token_id": 0,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.48.0",
  "use_cache": true,
  "vocab_size": 256000
}

The device_map was not initiali

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

All model checkpoint weights were used when initializing GemmaForSequenceClassification.

Some weights of GemmaForSequenceClassification were not initialized from the model checkpoint at google/gemma-2b-it and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): GemmaForSequenceClassification(
      (model): GemmaModel(
        (embed_tokens): Embedding(256000, 2048, padding_idx=0)
        (layers): ModuleList(
          (0-17): 18 x GemmaDecoderLayer(
            (self_attn): GemmaAttention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Identity()
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=2048, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
    

In [35]:
class SequenceProcessor:
    """Convert preference rows into tokenized chosen/rejected chat sequences.

    Each (prompt, response) pair is tokenized, truncated to a token budget by
    keeping the start and end of over-long texts, decoded back to text, and
    finally rendered and tokenized through the tokenizer's chat template.

    Args:
        model_name: Checkpoint name or path passed to ``AutoTokenizer.from_pretrained``.
        max_length: Overall token budget for one templated sequence.
        prompt_ratio: Fraction of the budget initially reserved for the prompt;
            whatever the prompt does not use is handed to the response.
    """

    def __init__(
        self,
        model_name: str,
        max_length: int = 1600,
        prompt_ratio: float = 0.3,  # Default 30% for prompt
    ):
        # NOTE(review): trust_remote_code=True allows code shipped with the
        # checkpoint repository to be executed - confirm it is actually needed
        # for the checkpoints used with this class.
        self.tokenizer = AutoTokenizer.from_pretrained(
            model_name,
            trust_remote_code=True,
            add_prefix_space=False,
        )

        self.max_length = max_length
        self.prompt_ratio = prompt_ratio
        # Tokens held back from the budget, presumably for the chat-template
        # markup added around the prompt and response - TODO confirm the value.
        self.template_overhead = 10

    def truncate_if_needed(self, tokens, max_tokens):
        """Shorten ``tokens`` to at most ``max_tokens`` entries.

        Sequences within budget are returned unchanged. Longer ones keep an
        equal number of tokens from the start and the end, joined by the
        tokenized marker ``" [...] "``. When the budget is too small to hold
        the marker plus at least one token on each side, the sequence is
        simply cut to its first ``max_tokens`` tokens.

        Args:
            tokens: List of token ids.
            max_tokens: Maximum number of tokens to return.

        Returns:
            A list of token ids no longer than ``max(max_tokens, 0)``.
        """
        if len(tokens) <= max_tokens:
            return tokens

        ellipsis_tokens = self.tokenizer.encode(" [...] ", add_special_tokens=False)

        keep_tokens = (max_tokens - len(ellipsis_tokens)) // 2
        if keep_tokens <= 0:
            # Guard: with keep_tokens == 0 the slice tokens[-0:] is the WHOLE
            # list (and a negative value slices from the wrong side), which
            # would make the "truncated" result longer than the input.
            return tokens[:max(max_tokens, 0)]
        return tokens[:keep_tokens] + ellipsis_tokens + tokens[-keep_tokens:]

    def process_single_sequence(self, prompt, response):
        """Tokenize one prompt/response pair as a two-turn chat.

        The budget is ``max_length - template_overhead`` tokens. The prompt
        gets up to ``prompt_ratio`` of it; any unused prompt budget is given
        to the response. Truncation happens on the raw text tokens, before
        the chat template is applied, and no cap is enforced afterwards, so
        the final length can differ from the budget.

        Args:
            prompt: User message text.
            response: Assistant message text.

        Returns:
            Dict with ``input_ids`` (templated token ids) and an all-ones
            ``attention_mask`` of the same length.
        """
        available_tokens = self.max_length - self.template_overhead

        # Initial token allocation
        prompt_max = int(available_tokens * self.prompt_ratio)
        response_max = available_tokens - prompt_max

        # Tokenize without special tokens; the chat template adds them later.
        prompt_tokens = self.tokenizer(prompt, add_special_tokens=False)["input_ids"]
        response_tokens = self.tokenizer(response, add_special_tokens=False)["input_ids"]

        # Hand whatever the prompt does not need over to the response.
        prompt_needed = min(len(prompt_tokens), prompt_max)
        response_max += prompt_max - prompt_needed

        # Apply truncation if needed
        prompt_tokens = self.truncate_if_needed(prompt_tokens, prompt_needed)
        response_tokens = self.truncate_if_needed(response_tokens, response_max)

        # Decode back to text so the chat template can be applied to strings.
        prompt = self.tokenizer.decode(prompt_tokens, skip_special_tokens=False)
        response = self.tokenizer.decode(response_tokens, skip_special_tokens=False)

        conversation = [
            {"role": "user", "content": prompt},
            {"role": "assistant", "content": response},
        ]

        # Render and tokenize the conversation in a single step.
        tokenized_conversation = self.tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
        )

        return {
            "input_ids": tokenized_conversation,
            "attention_mask": [1] * len(tokenized_conversation),
        }

    def process_sequence(self, row):
        """Build the chosen/rejected pair for one dataset row.

        Args:
            row: Mapping with ``prompt``, ``response_a``, ``response_b`` and
                ``winner`` (expected to be ``'model_a'`` or ``'model_b'``).

        Returns:
            Dict with ``input_ids_chosen``, ``attention_mask_chosen``,
            ``input_ids_rejected``, ``attention_mask_rejected`` and
            ``length`` (token count of the longer of the two sequences).
        """
        prompt = row["prompt"]
        winner = row["winner"]  # Expecting 'model_a' or 'model_b'

        # Process both sequences
        sequence_a = self.process_single_sequence(prompt, row["response_a"])
        sequence_b = self.process_single_sequence(prompt, row["response_b"])

        # Any winner value other than "model_a" is treated as a model_b win.
        if winner == "model_a":
            chosen, rejected = sequence_a, sequence_b
        else:  # winner == "model_b"
            chosen, rejected = sequence_b, sequence_a

        return {
            "input_ids_chosen": chosen["input_ids"],
            "attention_mask_chosen": chosen["attention_mask"],
            "input_ids_rejected": rejected["input_ids"],
            "attention_mask_rejected": rejected["attention_mask"],
            "length": max(len(sequence_a["input_ids"]), len(sequence_b["input_ids"])),
        }

In [36]:
# Build the processor for the same checkpoint as the model; max_length=1536
# overrides the class default of 1600.
processor = SequenceProcessor(model_name=model_id, max_length=1536)

loading file tokenizer.model from cache at /home/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/tokenizer.model
loading file tokenizer.json from cache at /home/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /home/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/special_tokens_map.json
loading file tokenizer_config.json from cache at /home/.cache/huggingface/hub/models--google--gemma-2b-it/snapshots/96988410cbdaeb8d5093d1ebdc5a8fb563e02bad/tokenizer_config.json
loading file chat_template.jinja from cache at None


In [37]:
# Pick a single example row to sanity-check the processing pipeline by hand.
row = df.iloc[2]
row

id                             a7c5accc53e649a3bc6b2e41d962ebc4
prompt                      How to build an arena for chatbots?
response_a    Building an arena for chatbots can be done by ...
response_b    Building an arena for chatbots is a great way ...
winner                                                  model_b
model_a                                              vicuna-13b
model_b                                        oasst-pythia-12b
language                                                English
split                                                     train
source                                                lmsys_33k
Name: 2, dtype: object

In [38]:
# Raw text of response B (the winner for this row) before any processing.
row["response_b"]

'Building an arena for chatbots is a great way to encourage users to interact with your bot and provides a space for your bot to showcase its abilities. Here are some steps you can follow to build an arena for chatbots:\n\n1. Define the objective: Start by defining the objective of your arena. This could be to create a platform where users can interact with chatbots, ask questions, or simply have conversations.\n\n2. Choose a location: Decide on a location for your arena and make sure it is accessible and has enough space to accommodate the number of bots you plan to have.\n\n3. Set up the infrastructure: Set up the infrastructure needed to run your arena, including a server, database, and programming tools.\n\n4. Design the bots: Design the bots that will be participating in your arena. Consider the various features and capabilities that your bots will need to perform and create a design that will allow them to interact with users in a natural and intuitive way.\n\n5. Develop the soft

In [39]:
# Run the full chosen/rejected processing on the sample row.
tok = processor.process_sequence(row)

In [40]:
# Raw prompt text, for comparison with the decoded sequences in the next cells.
row["prompt"]

'How to build an arena for chatbots?'

In [41]:
# Decode the rejected sequence (response A here, since the winner is model_b)
# to check that the chat template was applied.
processor.tokenizer.decode(tok['input_ids_rejected'])

'<bos><start_of_turn>user\nHow to build an arena for chatbots?<end_of_turn>\n<start_of_turn>model\nBuilding an arena for chatbots can be done by following these steps:\n\n1. Define the purpose of the arena: The first step is to clearly define the purpose of the arena, such as testing chatbots for a specific task or evaluating their performance in a certain scenario.\n2. Choose a platform: There are several platforms available for building chatbot arenas, such as Dialogflow, Botpress, and Rasa. Choose a platform that best fits your needs and has the necessary features for building the arena.\n3. Create the bot: Create the chatbot that will participate in the arena. This bot should be designed to perform a specific task or set of tasks that are relevant to the purpose of the arena.\n4. Define the evaluation criteria: Determine how the chatbots will be evaluated in the arena. This could include factors such as accuracy, speed, and user experience.\n5. Develop the arena: Use the platform t

In [42]:
# Decode the chosen sequence (response B here, since the winner is model_b).
processor.tokenizer.decode(tok['input_ids_chosen'])

'<bos><start_of_turn>user\nHow to build an arena for chatbots?<end_of_turn>\n<start_of_turn>model\nBuilding an arena for chatbots is a great way to encourage users to interact with your bot and provides a space for your bot to showcase its abilities. Here are some steps you can follow to build an arena for chatbots:\n\n1. Define the objective: Start by defining the objective of your arena. This could be to create a platform where users can interact with chatbots, ask questions, or simply have conversations.\n\n2. Choose a location: Decide on a location for your arena and make sure it is accessible and has enough space to accommodate the number of bots you plan to have.\n\n3. Set up the infrastructure: Set up the infrastructure needed to run your arena, including a server, database, and programming tools.\n\n4. Design the bots: Design the bots that will be participating in your arena. Consider the various features and capabilities that your bots will need to perform and create a design 

In [43]:
# data_collator = RewardDataCollatorWithPadding(processor.tokenizer)

In [44]:
# Partition the rows using the precomputed "split" column.
is_train = df["split"].eq("train")
is_valid = df["split"].eq("valid")
train_df = df.loc[is_train].reset_index(drop=True)
eval_df = df.loc[is_valid].reset_index(drop=True)
print(train_df.shape, eval_df.shape)

(145470, 10) (4839, 10)


In [45]:
from datasets import Dataset
# Wrap both pandas frames as Hugging Face datasets so they can be mapped in parallel.
train_ds, eval_ds = (Dataset.from_pandas(frame) for frame in (train_df, eval_df))

In [48]:
# Tokenize every row into chosen/rejected sequences using 16 worker processes.
# process_sequence also adds a "length" column, which the training config
# references through length_column_name.
train_tok_ds = train_ds.map(processor.process_sequence, num_proc=16)
eval_tok_ds = eval_ds.map(processor.process_sequence, num_proc=16)

Map (num_proc=16):   0%|          | 0/145470 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/4839 [00:00<?, ? examples/s]

In [50]:
# Token counts of the chosen and rejected sequence of every training example,
# gathered in a single pass over the dataset.
length_pairs = [
    (len(example["input_ids_chosen"]), len(example["input_ids_rejected"]))
    for example in train_tok_ds
]
chosen_lengths = [chosen_len for chosen_len, _ in length_pairs]
rejected_lengths = [rejected_len for _, rejected_len in length_pairs]

def count_numbers_in_ranges(numbers, range_step):
    """Histogram non-negative integers into consecutive buckets of fixed width.

    Buckets are half-open ``[start, start + range_step)``, start at 0 and run
    up to and including the bucket that starts at the largest multiple of
    ``range_step`` below ``max(numbers) + range_step``. Negative values fall
    into no bucket and are ignored.

    Args:
        numbers: Iterable of integers (e.g. sequence lengths).
        range_step: Positive integer bucket width.

    Returns:
        Dict mapping ``"start-end"`` labels to counts, in ascending bucket
        order. Empty input yields an empty dict.

    Raises:
        ValueError: If ``range_step`` is not positive.
    """
    if range_step <= 0:
        raise ValueError("range_step must be a positive integer")

    numbers = list(numbers)
    if not numbers:
        return {}

    bucket_starts = range(0, max(numbers) + range_step, range_step)
    bucket_counts = [0] * len(bucket_starts)
    # Single pass over the data instead of rescanning the list once per bucket.
    for num in numbers:
        if num >= 0:
            bucket_counts[num // range_step] += 1

    return {
        f"{start}-{start + range_step}": count
        for start, count in zip(bucket_starts, bucket_counts)
    }

# Length distribution of the chosen and rejected sequences in 100-token buckets.
chosen_counts = count_numbers_in_ranges(chosen_lengths, range_step=100)
print(chosen_counts)
rejected_counts = count_numbers_in_ranges(rejected_lengths, range_step=100)
print(rejected_counts)

{'0-100': 14488, '100-200': 18767, '200-300': 20323, '300-400': 20438, '400-500': 17756, '500-600': 13709, '600-700': 9920, '700-800': 7536, '800-900': 5875, '900-1000': 4366, '1000-1100': 2935, '1100-1200': 2081, '1200-1300': 1579, '1300-1400': 1179, '1400-1500': 850, '1500-1600': 3668, '1600-1700': 0}
{'0-100': 21152, '100-200': 26754, '200-300': 21712, '300-400': 18804, '400-500': 14808, '500-600': 11303, '600-700': 7761, '700-800': 5742, '800-900': 4300, '900-1000': 3463, '1000-1100': 2388, '1100-1200': 1578, '1200-1300': 1158, '1300-1400': 956, '1400-1500': 612, '1500-1600': 2979, '1600-1700': 0}


In [None]:
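# NOTE: legacy hand-rolled reward trainer, kept commented out for reference only;
# the notebook uses trl's RewardTrainer instead.
# from transformers import Trainer, TrainingArguments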
# def compute_metrics(eval_pred):
#     result = {}
#     pos_predictions_scores = eval_pred.predictions[0]
#     neg_predictions_scores = eval_pred.predictions[1]
#     # We assume that the first sample is preferred by default in groundtruth
#     result['accuracy'] = np.sum(
#         pos_predictions_scores > neg_predictions_scores) / len(pos_predictions_scores)
#     return result


# class RewardTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         rewards = model(
#             input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"]
#         )[0]
#         print(f"rewards shape: {rewards.shape}")
#         bsz = rewards.size(0)
#         jidx = torch.arange(0, bsz, 2)
#         print(f"jidx shape: {jidx.shape}")
#         kidx = jidx + 1
#         rewards_j = rewards[jidx]
#         print(f"rewards_j: {rewards_j}")
#         rewards_k = rewards[kidx]
#         print(f"rewards_k: {rewards_k}")
#         loss = -nn.functional.logsigmoid(rewards_j - rewards_k).mean()
#         if return_outputs:
#             return loss, {"rewards_j": rewards_j, "rewards_k": rewards_k}
#         return loss

In [19]:
from trl import RewardConfig, RewardTrainer

In [29]:
training_args = RewardConfig(
    # Output and logging
    output_dir="output",
    overwrite_output_dir=True,
    report_to="none",
    logging_steps=1,
    # Schedule and optimizer
    num_train_epochs=0.2,
    learning_rate=7e-5,
    warmup_steps=0,
    optim="adamw_8bit",
    # Batching: 8 per device x 2 accumulation steps
    per_device_train_batch_size=8,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=1,
    # Precision
    bf16=True,
    bf16_full_eval=True,
    # Evaluation and checkpointing (no checkpoints are written)
    eval_strategy="epoch",
    save_strategy="no",
    # save_steps=200,
    # Data handling: keep the pre-tokenized columns as they are
    remove_unused_columns=False,
    group_by_length=False,
    length_column_name="length",
)

PyTorch: setting up devices


In [30]:
# Build the reward trainer on the pre-tokenized chosen/rejected datasets and
# start training. The LoRA-wrapped quantized model is trained; the tokenizer
# is passed as the processing class.
trainer = RewardTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tok_ds,
    eval_dataset=eval_tok_ds,
    processing_class=processor.tokenizer,
    # Custom metrics are disabled; the trainer's own accuracy is reported.
    # compute_metrics=compute_metrics,
)


# Run training; returns a TrainOutput with the step count, loss and timing metrics.
trainer.train()

Using auto half precision backend
***** Running training *****
  Num examples = 1,447
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 2
  Total optimization steps = 18
  Number of trainable parameters = 78,448,640
  return fn(*args, **kwargs)


Epoch,Training Loss,Validation Loss,Accuracy
0,1.6692,1.208923,0.571429





***** Running Evaluation *****
  Num examples = 56
  Batch size = 1


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=18, training_loss=0.45442743794733864, metrics={'train_runtime': 236.8912, 'train_samples_per_second': 1.222, 'train_steps_per_second': 0.076, 'total_flos': 0.0, 'train_loss': 0.45442743794733864, 'epoch': 0.19889502762430938})

In [None]:
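# Results of repeated runs (accuracy, wall-clock time, group_by_length setting):
# 0.625, 3:48, group_by_length=False
# 0.58,  2:14, group_by_length=True
# 0.607, 2:14, group_by_length=True
# 0.607, 2:14, group_by_length=True
# 0.57,  3:48, group_by_length=False

# Conclusion: it is safe to use group_by_length with RewardTrainer (I checked the
# code internally). Runs with it enabled finished in 2:14 instead of 3:48, with
# accuracy in the same range.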