In [1]:
import torch

seed = 28
torch.manual_seed(seed)

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)

cuda


### Load model to check

In [2]:
# from transformers import AutoTokenizer, AutoModelForCausalLM


# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [3]:
# messages = [{"role": "user", "content": "What is gravity?"}]
# input_text=tokenizer.apply_chat_template(messages, tokenize=False)

# inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
# outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
# print(tokenizer.decode(outputs[0]))

### Load and prepare dataset

In [4]:
from datasets import load_dataset

dataset = load_dataset("esfrankel17/HelpSteer2_binarized", split='average_rating_split')

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
dataset

Dataset({
    features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
    num_rows: 8678
})

In [6]:
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=seed)

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
        num_rows: 6942
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
        num_rows: 1736
    })
})

In [8]:
dataset["train"][0]

{'prompt': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>Assistant\nThe point group of m3m is m3m, which has order 16 and is a cubic point group. The crystal structure of m3m is cubic, and the symmetry of the m3m group is referred to as Oh. The point group C4 also has order 16 and is a cubic point group, and has a cubic crystal structure with symmetry referred to as Ci.\n<extra_id_1>User\nare m3m and C4 describing the same structure?\n<extra_id_1>Assistant\nYes, m3m and C4 describe the same crystal structure. This structure has a four-fold rotation axis along the center of the cube, and eight mirror planes along the diagonal directions of the cube.\n<extra_id_1>User\nGenerate a picture for me ?',
 'chosen': [{'content': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n

In [9]:
# check if there is more than one request-response pair
for example in dataset["train"]:
    if len(example["chosen"]) > 2:
        print(example["chosen"])
        break
    if len(example["rejected"]) > 2:
        print(example["rejected"])
        break

In [10]:
def process_example(example):
    chosen_text = " ".join([msg["content"] for msg in example["chosen"]])
    rejected_text = " ".join([msg["content"] for msg in example["rejected"]])
    margin = example["chosen_rating"] - example["rejected_rating"]
    
    return {"chosen": chosen_text, "rejected": rejected_text, "margin": margin}

In [11]:
processed_dataset = dataset.map(process_example)

In [12]:
processed_dataset["train"][0]

{'prompt': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>Assistant\nThe point group of m3m is m3m, which has order 16 and is a cubic point group. The crystal structure of m3m is cubic, and the symmetry of the m3m group is referred to as Oh. The point group C4 also has order 16 and is a cubic point group, and has a cubic crystal structure with symmetry referred to as Ci.\n<extra_id_1>User\nare m3m and C4 describing the same structure?\n<extra_id_1>Assistant\nYes, m3m and C4 describe the same crystal structure. This structure has a four-fold rotation axis along the center of the cube, and eight mirror planes along the diagonal directions of the cube.\n<extra_id_1>User\nGenerate a picture for me ?',
 'chosen': "What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>A

In [13]:
from transformers import AutoTokenizer


model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [14]:
def tokenize_example(example):
    def tokenize_text(text):
        return tokenizer(
            text,
            truncation=True,
            max_length=512,
            padding="max_length"
        )
    
    tokenized_chosen = tokenize_text(example["chosen"])
    tokenized_rejected = tokenize_text(example["rejected"])

    return {
        "input_ids_chosen": tokenized_chosen["input_ids"],
        "attention_mask_chosen": tokenized_chosen["attention_mask"],
        "input_ids_rejected": tokenized_rejected["input_ids"],
        "attention_mask_rejected": tokenized_rejected["attention_mask"],
        "margin": example["margin"]
    }

In [15]:
tokenized_dataset = processed_dataset.map(tokenize_example)

In [16]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating', 'margin', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 6942
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating', 'margin', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 1736
    })
})

In [17]:
tokenized_dataset["train"][0]

{'prompt': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>Assistant\nThe point group of m3m is m3m, which has order 16 and is a cubic point group. The crystal structure of m3m is cubic, and the symmetry of the m3m group is referred to as Oh. The point group C4 also has order 16 and is a cubic point group, and has a cubic crystal structure with symmetry referred to as Ci.\n<extra_id_1>User\nare m3m and C4 describing the same structure?\n<extra_id_1>Assistant\nYes, m3m and C4 describe the same crystal structure. This structure has a four-fold rotation axis along the center of the cube, and eight mirror planes along the diagonal directions of the cube.\n<extra_id_1>User\nGenerate a picture for me ?',
 'chosen': "What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>A

In [18]:
columns_to_keep = [
    "input_ids_chosen",
    "attention_mask_chosen",
    "input_ids_rejected",
    "attention_mask_rejected",
    "margin"
]

tokenized_dataset["train"] = tokenized_dataset["train"].remove_columns(
    [col for col in tokenized_dataset["train"].column_names if col not in columns_to_keep]
)

tokenized_dataset["test"] = tokenized_dataset["test"].remove_columns(
    [col for col in tokenized_dataset["test"].column_names if col not in columns_to_keep]
)

In [19]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['margin', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 6942
    })
    test: Dataset({
        features: ['margin', 'input_ids_chosen', 'attention_mask_chosen', 'input_ids_rejected', 'attention_mask_rejected'],
        num_rows: 1736
    })
})

### Training Reward model using LoRa and RewardTrainer

In [None]:
from peft import LoraConfig, TaskType
from transformers import AutoModelForSequenceClassification, TrainingArguments
from trl import RewardTrainer, RewardConfig


model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

training_args = RewardConfig(
    output_dir="./RM_output",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    eval_accumulation_steps=2,
    learning_rate=5e-5,
    max_length=512,
    fp16=True,
    logging_steps=50,
    eval_steps=1000,
    center_rewards_coefficient=0.01,  # encouraging the model to produce mean-zero outputs
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    peft_config=peft_config,
)

trainer.train()

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
50,1.4315
100,1.2823
150,1.2952
200,1.2775
250,1.2453
300,1.3488
350,1.2289
400,1.2193
450,1.2764
500,1.2223


TrainOutput(global_step=868, training_loss=1.2665826801880165, metrics={'train_runtime': 665.7197, 'train_samples_per_second': 10.428, 'train_steps_per_second': 1.304, 'total_flos': 0.0, 'train_loss': 1.2665826801880165, 'epoch': 1.0})