### Setup

In [None]:
!pip install transformers==4.47.0
!pip install datasets==3.2.0
!pip install trl==0.14.0
!pip install peft==0.14.0
!pip install numpy==1.26.4
!pip install huggingface_hub==0.27.0
!pip install tqdm==4.67.1

In [1]:
import torch
import gc
import random
import numpy

# empty cache
gc.collect()
torch.cuda.empty_cache()

# set the seed
seed = 28
torch.manual_seed(seed)
numpy.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# device check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


### Load the model

In [2]:
# from transformers import AutoTokenizer, AutoModelForCausalLM

# model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# policy_model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
# policy_model.train()

In [3]:
from peft import LoraConfig, get_peft_model
from transformers import AutoTokenizer, AutoModelForCausalLM


lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
base_model = AutoModelForCausalLM.from_pretrained(model_name)

policy_model = get_peft_model(base_model, lora_config).to(device)
policy_model.train()
policy_model.print_trainable_parameters()

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

trainable params: 1,843,200 || all params: 136,358,208 || trainable%: 1.3517


In [4]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.padding_side = "left"
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

In [5]:
trainable_params = [n for n, p in policy_model.named_parameters() if p.requires_grad]
print(f"Number of trainable blocks: {len(trainable_params)}")
print("Trainable blocks:")
for param_name in trainable_params:
    print(f"- {param_name}")

total_trainable = sum(p.numel() for p in policy_model.parameters() if p.requires_grad)
print(f"\nTotal trainable parameters: {total_trainable:,}")

Number of trainable blocks: 120
Trainable blocks:
- base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
- base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
- base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
- base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
- base_model.model.model.layers.1.self_attn.q_proj.lora_A.default.weight
- base_model.model.model.layers.1.self_attn.q_proj.lora_B.default.weight
- base_model.model.model.layers.1.self_attn.v_proj.lora_A.default.weight
- base_model.model.model.layers.1.self_attn.v_proj.lora_B.default.weight
- base_model.model.model.layers.2.self_attn.q_proj.lora_A.default.weight
- base_model.model.model.layers.2.self_attn.q_proj.lora_B.default.weight
- base_model.model.model.layers.2.self_attn.v_proj.lora_A.default.weight
- base_model.model.model.layers.2.self_attn.v_proj.lora_B.default.weight
- base_model.model.model.layers.3.self_attn.q_proj.lora_A.default.weight
-

In [6]:
from transformers import AutoModelForSequenceClassification

rm_name = "MilyaShams/SmolLM2-135M-Instruct-Reward"
reward_model = AutoModelForSequenceClassification.from_pretrained(rm_name, num_labels=1).to(device)
reward_model.eval()

config.json:   0%|          | 0.00/1.07k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

LlamaForSequenceClassification(
  (model): LlamaModel(
    (embed_tokens): Embedding(49152, 576, padding_idx=2)
    (layers): ModuleList(
      (0-29): 30 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=576, out_features=576, bias=False)
          (k_proj): Linear(in_features=576, out_features=192, bias=False)
          (v_proj): Linear(in_features=576, out_features=192, bias=False)
          (o_proj): Linear(in_features=576, out_features=576, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
          (up_proj): Linear(in_features=576, out_features=1536, bias=False)
          (down_proj): Linear(in_features=1536, out_features=576, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e-05)
      )
 

### Load and prepare dataset

In [7]:
from datasets import load_dataset

dataset = load_dataset("esfrankel17/HelpSteer2_binarized", split='average_rating_split')

README.md:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

(…)rage_rating_split-00000-of-00001.parquet:   0%|          | 0.00/22.4M [00:00<?, ?B/s]

(…)o_verbosity_split-00000-of-00001.parquet:   0%|          | 0.00/21.5M [00:00<?, ?B/s]

(…)_complexity_split-00000-of-00001.parquet:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

(…)dness_score_split-00000-of-00001.parquet:   0%|          | 0.00/21.1M [00:00<?, ?B/s]

Generating average_rating_split split:   0%|          | 0/8678 [00:00<?, ? examples/s]

Generating average_rating_no_verbosity_split split:   0%|          | 0/8315 [00:00<?, ? examples/s]

Generating average_rating_no_verbosity_no_complexity_split split:   0%|          | 0/8025 [00:00<?, ? examples…

Generating goodness_score_split split:   0%|          | 0/8124 [00:00<?, ? examples/s]

In [8]:
dataset

Dataset({
    features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
    num_rows: 8678
})

In [9]:
prompts = [len(item["prompt"]) for item in dataset]

n_long = 0
n_ok = 0
n_good = 0

for i in prompts:
    if i > 512:
        n_long += 1
    if i < 512:
        n_ok += 1
    if i < 200:
        n_good += 1

print(n_long, n_ok, n_good)

3361 5310 3996


Let's leave only medium level f length, not so long prompts, because in the original paper they are restrict the both models to max context length equals 512.

In [10]:
filtered_dataset = dataset.filter(lambda example: len(example['prompt']) < 200)

Filter:   0%|          | 0/8678 [00:00<?, ? examples/s]

In [11]:
filtered_dataset

Dataset({
    features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
    num_rows: 3996
})

In [12]:
filtered_dataset = filtered_dataset.train_test_split(test_size=0.2, shuffle=True, seed=seed)

In [13]:
filtered_dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
        num_rows: 3196
    })
    test: Dataset({
        features: ['prompt', 'chosen', 'chosen_rating', 'rejected', 'rejected_rating'],
        num_rows: 800
    })
})

In [14]:
filtered_dataset["train"][0]

{'prompt': "What is the equivalent resistance between two nodes separated by a knight's move of an infinite square grid of resistors of resistance R?",
 'chosen': [{'content': "What is the equivalent resistance between two nodes separated by a knight's move of an infinite square grid of resistors of resistance R?",
   'role': 'user'},
  {'content': "The equivalent resistance between two nodes separated by a knight's move of an infinite square grid of resistors of resistance R can be calculated using the following steps:\n\n1. Draw a circuit diagram of the square grid and the two nodes. A knight's move is a jump that moves two squares horizontally or vertically, and two squares diagonally. The two nodes are connected by a knight's move.\n\n2. Apply the series-parallel rule to the circuit. The series-parallel rule states that the equivalent resistance of a series circuit is the sum of the individual resistances, while the equivalent resistance of a parallel circuit is the reciprocal of t

In [15]:
from torch.utils.data import DataLoader


def collate_prompts(batch):
    return [item['prompt'] for item in batch]

batch_size = 16  # на GPU Р100 Kaggle
dataloader_train = DataLoader(filtered_dataset["train"], batch_size=batch_size, num_workers=4, collate_fn=collate_prompts)
dataloader_val = DataLoader(filtered_dataset["test"], batch_size=batch_size, num_workers=4, collate_fn=collate_prompts)

In [16]:
for batch in dataloader_train:
    prompts = batch
    for prompt in prompts:
        print(prompt)
    break

What is the equivalent resistance between two nodes separated by a knight's move of an infinite square grid of resistors of resistance R?
Lets play Dungeons and Dragons. I'm a halfling rogue and you're the DM.
You are a helpful teacher
Write the parable of the lost sheep in the style of Gordon Ramsay yelling at the shepherd.
elaborate CAD data structure
why are 3rd party plugins in fl studio detached by defaultShare Prompt
what's more correct: "Person List" or "People List"
make a list of cautionary phrases and clauses
SSL handshake optimisation using reverse proxy
There's more to your philosophy than just the Socratic Method, isn't there?
Explain Domain Driven design with a realtime example
I would like to name our child "Maxwell Stomacher" but my sister says this name is inappropriate. Provide three bullet points rebutting this
Best methods to slowly ease into keto or low carb diets to achieve ketosis and not get the keto flu.
Are there web crawlers in email services? 
If the snow is

### Training

In [17]:
from torch.utils.tensorboard import SummaryWriter
from torch.optim import AdamW
import torch.nn.functional as F
from tqdm import tqdm


num_epochs = 1
learning_rate = 5e-5
total_rewards = []
baseline = None

optimizer = AdamW(
    filter(lambda p: p.requires_grad, policy_model.parameters()), 
    lr=learning_rate
)
scaler = torch.amp.GradScaler('cuda')

log_dir = "runs/REINFORCE_with_baseline_logs"
writer = SummaryWriter(log_dir)

In [18]:
print(f"Number of trainable blocks in optimizer: {len(optimizer.param_groups[0]['params'])}")

Number of trainable blocks in optimizer: 120


In [22]:
def generate_batch_response_and_logprob(prompts):
    inputs = tokenizer(prompts, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)
    
    with torch.no_grad():
        output_ids = policy_model.generate(
            **inputs,
            max_length=512,
            do_sample=True
        )
    
    responses = [tokenizer.decode(ids, skip_special_tokens=True) for ids in output_ids]
    
    with torch.autocast('cuda', dtype=torch.float16):
        outputs = policy_model(output_ids[:, :-1])
        logits = outputs.logits
        log_probs = F.log_softmax(logits, dim=-1)
        target_tokens = output_ids[:, 1:]
        token_log_probs = torch.gather(log_probs, dim=-1, index=target_tokens.unsqueeze(-1)).squeeze(-1)
        total_log_probs = token_log_probs.sum(dim=-1)
    
    return responses, total_log_probs

In [23]:
def compute_reward(prompt, response):
    input_text = prompt + "\n" + response
    inputs = tokenizer(input_text, return_tensors="pt", max_length=512, truncation=True, padding=True).to(device)
    
    with torch.no_grad():
        outputs = reward_model(**inputs)
    
    logits = outputs.logits
    reward = torch.sigmoid(logits)[0, 0].item()
    return reward

In [24]:
def train_epoch(policy_model, dataloader_train, optimizer, scaler, total_rewards):
    policy_model.train()
    epoch_loss = 0.0
    epoch_advantages = []
    num_batches = 0

    for batch in tqdm(dataloader_train, desc="Training"):
        prompts = batch
        optimizer.zero_grad()
        
        responses, log_probs = generate_batch_response_and_logprob(prompts)
        
        batch_loss = 0.0
        for i, prompt in enumerate(prompts):
            response = responses[i]
            log_prob = log_probs[i]
            reward = compute_reward(prompt, response)
            total_rewards.append(reward)
            
            baseline = sum(total_rewards) / len(total_rewards)
            advantage = reward - baseline
            epoch_advantages.append(advantage)
            
            loss = -advantage * log_prob
            batch_loss += loss
        
        batch_loss = batch_loss / len(prompts)
        
        scaler.scale(batch_loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        epoch_loss += batch_loss.item()
        num_batches += 1

    avg_epoch_loss = epoch_loss / num_batches
    avg_advantage = sum(epoch_advantages) / len(epoch_advantages) if epoch_advantages else 0.0

    return avg_epoch_loss, avg_advantage, baseline

In [25]:
def validate_model(policy_model, dataloader_val):
    policy_model.eval()
    total_reward_val = 0.0
    num_val = 0
    with torch.no_grad():
        for prompts in tqdm(dataloader_val, desc="Validation"):
            responses, _ = generate_batch_response_and_logprob(prompts)
            for i, prompt in enumerate(prompts):
                reward = compute_reward(prompt, responses[i])
                total_reward_val += reward
                num_val += 1
    avg_reward = total_reward_val / num_val if num_val > 0 else 0.0
    return avg_reward

In [26]:
print("Pre-training validation:")
pretrain_val_reward = validate_model(policy_model, dataloader_val)
print(f"Pre-training average reward: {pretrain_val_reward:.4f}")

Pre-training validation:


Validation: 100%|██████████| 50/50 [14:08<00:00, 16.97s/it]

Pre-training average reward: 0.4745





In [27]:
for epoch in range(num_epochs):
    print(f"\nEpoch {epoch+1}/{num_epochs}")
    
    train_loss, avg_advantage, baseline = train_epoch(policy_model, dataloader_train, optimizer, scaler, total_rewards)
    print(f"Epoch {epoch+1}, loss: {train_loss:.4f}, Advantage mean: {avg_advantage:.4f}, Baseline: {baseline:.4f}")
    writer.add_scalar("Train/Loss", train_loss, epoch + 1)
    writer.add_scalar("Train/Advantage Mean", avg_advantage, epoch + 1)
    writer.add_scalar("Train/Baseline", baseline, epoch + 1)
    
    val_reward = validate_model(policy_model, dataloader_val)
    print(f"Validation average reward: {val_reward:.4f}")
    writer.add_scalar("Validation/Average Reward", val_reward, epoch + 1)

print("Training complete")
writer.close()


Epoch 1/1


Training: 100%|██████████| 200/200 [1:03:28<00:00, 19.04s/it]


Epoch 1, loss: -569.5966, Advantage mean: 0.0090, Baseline: 0.4932


Validation: 100%|██████████| 50/50 [14:47<00:00, 17.75s/it]

Validation average reward: 0.5027
Training complete



