In [1]:
import torch
import gc

# Fix the global PyTorch RNG seed so later random operations are repeatable.
seed = 28
torch.manual_seed(seed)

# Collect unreachable Python objects, then release PyTorch's unused cached GPU memory.
gc.collect()
torch.cuda.empty_cache()

# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


### Load the base model for a quick sanity check (optional)

In [2]:
# from transformers import AutoTokenizer, AutoModelForCausalLM


# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

In [3]:
# messages = [{"role": "user", "content": "What is gravity?"}]
# input_text=tokenizer.apply_chat_template(messages, tokenize=False)

# inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
# outputs = model.generate(inputs, max_new_tokens=50, temperature=0.2, top_p=0.9, do_sample=True)
# print(tokenizer.decode(outputs[0]))

### Load and prepare dataset

In [4]:
from datasets import load_dataset

# Load only the 'average_rating_split' split of the binarized HelpSteer2 preference data.
dataset = load_dataset(
    path="esfrankel17/HelpSteer2_binarized",
    split="average_rating_split",
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Display the dataset summary (feature names and number of rows).
dataset

Dataset({
    features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
    num_rows: 8678
})

In [6]:
# Hold out 20% of the rows as a test split, shuffling with the notebook-wide seed.
# The name `dataset` is rebound from a single Dataset to a DatasetDict here, and a
# DatasetDict has no `train_test_split`; the guard makes re-running this cell a no-op
# instead of an AttributeError.
if hasattr(dataset, "train_test_split"):
    dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=seed)

In [7]:
# Display the train/test splits produced by the previous cell.
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
        num_rows: 6942
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
        num_rows: 1736
    })
})

In [8]:
# Look at the first training record to see the raw layout of each field.
dataset["train"][0]

{'prompt': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>Assistant\nThe point group of m3m is m3m, which has order 16 and is a cubic point group. The crystal structure of m3m is cubic, and the symmetry of the m3m group is referred to as Oh. The point group C4 also has order 16 and is a cubic point group, and has a cubic crystal structure with symmetry referred to as Ci.\n<extra_id_1>User\nare m3m and C4 describing the same structure?\n<extra_id_1>Assistant\nYes, m3m and C4 describe the same crystal structure. This structure has a four-fold rotation axis along the center of the cube, and eight mirror planes along the diagonal directions of the cube.\n<extra_id_1>User\nGenerate a picture for me ?',
 'chosen': [{'content': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n

In [9]:
# Look for a conversation holding more than two messages, i.e. more than one
# request-response pair; print the first one found (chosen is checked before rejected).
for example in dataset["train"]:
    longer = next(
        (example[side] for side in ("chosen", "rejected") if len(example[side]) > 2),
        None,
    )
    if longer is not None:
        print(longer)
        break

In [10]:
import numpy as np

# Pool the ratings of both responses so the statistics cover every rating in the train split.
# Each column is materialized with list(...) before concatenation: depending on the installed
# `datasets` version, dataset["train"][name] may be a lazy column object rather than a plain
# list, and such an object is not guaranteed to support `+`.
all_rewards = np.array(
    list(dataset["train"]["chosen_rating"]) + list(dataset["train"]["rejected_rating"])
)

mean_reward = np.mean(all_rewards)
std_reward = np.std(all_rewards)

print(f"mean = {mean_reward}\nstd = {std_reward}")

mean = 2.6000432152117545
std = 0.6627274849063368


In [11]:
def process_example(example):
    """Flatten one preference pair into plain-text fields plus a rating margin.

    Args:
        example: Mapping with ``chosen`` and ``rejected`` (each an iterable of
            message dicts carrying a ``content`` key) and the numeric
            ``chosen_rating`` / ``rejected_rating`` values.

    Returns:
        dict with ``chosen`` and ``rejected`` holding the message contents joined
        by a single space, and ``margin`` holding
        ``chosen_rating - rejected_rating``.
    """
    def _flatten(messages):
        # Concatenate the text of every message, separated by one space.
        return " ".join(message["content"] for message in messages)

    return {
        "chosen": _flatten(example["chosen"]),
        "rejected": _flatten(example["rejected"]),
        "margin": example["chosen_rating"] - example["rejected_rating"],
    }

In [12]:
# Apply process_example to every row of every split; the returned keys overwrite
# 'chosen' and 'rejected' and add a new 'margin' column.
processed_dataset = dataset.map(function=process_example)

In [13]:
# Look at the first processed training record to check the flattened text fields.
processed_dataset["train"][0]

{'prompt': 'What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>Assistant\nThe point group of m3m is m3m, which has order 16 and is a cubic point group. The crystal structure of m3m is cubic, and the symmetry of the m3m group is referred to as Oh. The point group C4 also has order 16 and is a cubic point group, and has a cubic crystal structure with symmetry referred to as Ci.\n<extra_id_1>User\nare m3m and C4 describing the same structure?\n<extra_id_1>Assistant\nYes, m3m and C4 describe the same crystal structure. This structure has a four-fold rotation axis along the center of the cube, and eight mirror planes along the diagonal directions of the cube.\n<extra_id_1>User\nGenerate a picture for me ?',
 'chosen': "What is the point group of m3m? any crystal group get this symmetry? Since it is a cubic point group, referred to Oh? is C4 also a cubic point group?\n<extra_id_1>A

In [14]:
columns_to_keep = [
    "chosen",
    "rejected",
    "margin"
]

# Drop every column that is not in columns_to_keep, once per split. Looping over the
# split names replaces the copy-pasted train/test statements and covers whichever
# splits the DatasetDict holds. list(...) snapshots the keys before values are reassigned.
for split_name in list(processed_dataset):
    split = processed_dataset[split_name]
    processed_dataset[split_name] = split.remove_columns(
        [col for col in split.column_names if col not in columns_to_keep]
    )

In [15]:
# Display the splits again to confirm which columns remain after pruning.
processed_dataset

DatasetDict({
    train: Dataset({
        features: ['chosen', 'rejected', 'margin'],
        num_rows: 6942
    })
    test: Dataset({
        features: ['chosen', 'rejected', 'margin'],
        num_rows: 1736
    })
})

### Training the reward model using LoRA and RewardTrainer

In [16]:
from transformers import AutoTokenizer


# Checkpoint whose tokenizer is loaded here.
model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# If the tokenizer ships without a padding token, reuse the end-of-sequence token for padding.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [17]:
import os
import shutil

# Start from a clean slate: remove the directory left behind by a previous run, if any.
directory_path = r'./RM_output'

# os.path.isdir is already False for a missing path, so no separate existence check is needed.
if not os.path.isdir(directory_path):
    print(f"Directory '{directory_path}' does not exist.")
else:
    shutil.rmtree(directory_path)
    print(f"Directory '{directory_path}' has been deleted.")

Directory './RM_output' has been deleted.


In [18]:
from peft import LoraConfig, TaskType
from transformers import AutoModelForSequenceClassification
from trl import RewardTrainer, RewardConfig


model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# Use the same repo id that the upload cell pushes to, so a Trainer-side push would
# target the same repository instead of the base model's organisation.
finetune_name = "MilyaShams/SmolLM2-135M-Instruct-Reward"

# num_labels=1 gives the classification head a single scalar output per sequence.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
# The sequence-classification head relies on config.pad_token_id to tell padding from
# real tokens in a batch, so keep it in sync with the tokenizer that does the padding.
model.config.pad_token_id = tokenizer.pad_token_id

# LoRA adapter settings for a sequence-classification task.
peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
    r=16,
    lora_alpha=32,
    lora_dropout=0.2,
)

training_args = RewardConfig(
    output_dir="./RM_output",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    eval_accumulation_steps=4,
    learning_rate=5e-5,
    max_length=512,
    # fp16 mixed precision is rejected when no GPU is available, so enable it only on
    # CUDA; the CPU fallback chosen in the setup cell then trains in full precision.
    fp16=(device == "cuda"),
    logging_steps=25,
    eval_strategy="steps",
    eval_steps=25,
    center_rewards_coefficient=0.01,  # encouraging the model to produce mean-zero outputs
    report_to=["tensorboard"],
    logging_dir="./RM_output/logs",
    hub_model_id=finetune_name,
)

trainer = RewardTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=processed_dataset["train"],
    eval_dataset=processed_dataset["test"],
    peft_config=peft_config,
)

trainer.train()

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
25,1.4457,1.361001,0.466817
50,1.2822,1.257607,0.502247
75,1.2775,1.221443,0.533183
100,1.2282,1.205081,0.538808
125,1.2545,1.194427,0.538202
150,1.1623,1.186594,0.546067
175,1.212,1.178316,0.551181
200,1.1289,1.173591,0.564045
225,1.1941,1.170951,0.567416
250,1.1873,1.167656,0.568539












TrainOutput(global_step=324, training_loss=1.2153587341308594, metrics={'train_runtime': 1355.3971, 'train_samples_per_second': 7.694, 'train_steps_per_second': 0.239, 'total_flos': 0.0, 'train_loss': 1.2153587341308594, 'epoch': 2.994246260069045})

### Save the model

In [25]:
from huggingface_hub import login
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Authenticate with the Hugging Face Hub using the HF_TOKEN environment variable,
# so the token itself never appears in the notebook.
login(token=os.environ.get("HF_TOKEN"))

Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.


In [26]:
# Merge the trained LoRA adapter into the base model so it can be saved as one plain model.
merged_model = trainer.model.merge_and_unload()

# Write the merged model and its tokenizer to the same local directory.
merged_model.save_pretrained("./reward_model")
tokenizer.save_pretrained("./reward_model")

('./reward_model\\tokenizer_config.json',
 './reward_model\\special_tokens_map.json',
 './reward_model\\vocab.json',
 './reward_model\\merges.txt',
 './reward_model\\added_tokens.json',
 './reward_model\\tokenizer.json')

In [28]:
# Target repository on the Hugging Face Hub.
repo_name = "MilyaShams/SmolLM2-135M-Instruct-Reward"

# Upload the merged model (tagged "LoRa") and then the tokenizer to the same repository.
merged_model.push_to_hub(repo_id=repo_name, tags=["LoRa"])
tokenizer.push_to_hub(repo_id=repo_name)

model.safetensors: 100%|██████████| 538M/538M [00:53<00:00, 9.97MB/s]   


CommitInfo(commit_url='https://huggingface.co/MilyaShams/SmolLM2-135M-Instruct-Reward/commit/a5c252c8aceb8f538fbe22a1d0ea41254d40ef0a', commit_message='Upload tokenizer', commit_description='', oid='a5c252c8aceb8f538fbe22a1d0ea41254d40ef0a', pr_url=None, repo_url=RepoUrl('https://huggingface.co/MilyaShams/SmolLM2-135M-Instruct-Reward', endpoint='https://huggingface.co', repo_type='model', repo_id='MilyaShams/SmolLM2-135M-Instruct-Reward'), pr_revision=None, pr_num=None)