### 0. Installation

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

### 1. Preprocessing

In [2]:
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/131M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/62135 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
dataset

Dataset({
    features: ['chosen', 'rejected', 'score_chosen', 'score_rejected'],
    num_rows: 62135
})

In [4]:
# Разделение на train и val (например, 90% train, 10% val)
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)  # seed для воспроизводимости

# Получение train и val выборок
train = split_dataset['train']
val = split_dataset['test']

In [5]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct",
                                          use_fast=True,
                                          truncation=True,
                                          padding_side="left")
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

### 2. Reward model

Добавляем один линейный слой к sft моделе

In [6]:
class RewardModel(nn.Module):
  def __init__(self, model_name):
    super().__init__()
    self.base_model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        output_hidden_states=True
    )
    self.head = nn.Linear(self.base_model.config.hidden_size, 10)

  def target_distribution(self, scores, possible_values):
    distance = (possible_values - scores.unsqueeze(-1)) ** 2
    return torch.softmax(-distance / 0.5, dim=-1)

  def forward(self, input_ids, attention_mask):
    outputs = self.base_model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        return_dict=True
      )

    last_hidden_state = outputs.hidden_states[-1]
    cls_hidden = last_hidden_state[:, 0, :]
    logits = self.head(cls_hidden)
    return torch.softmax(logits, dim=-1)

Формула функции потерь подробно разобрана в README.

In [None]:
def compute_loss(model, chosen_input_ids, chosen_attention_mask,
                 rejected_input_ids, rejected_attention_mask,
                 score_chosen, score_rejected, alpha = 0.6):

  # Предсказание
  p_chosen = model(
      input_ids=chosen_input_ids,
      attention_mask=chosen_attention_mask
  )
  p_rejected = model(
      input_ids=rejected_input_ids,
      attention_mask=rejected_attention_mask
  )
  t_chosen = score_chosen.clone().detach().requires_grad_(False)
  t_rejected = score_rejected.clone().detach().requires_grad_(False)

  # Потеря на предпочтениях
  possible_values = torch.arange(1, p_chosen.shape[1] + 1, dtype=torch.float32).to(device)
  e_chosen = torch.sum(possible_values * p_chosen)
  e_rejected = torch.sum(possible_values * p_rejected)
  loss_pref = -F.logsigmoid(
      e_chosen -
      e_rejected
  )

  # Потеря на оценках
  target_chosen = model.target_distribution(t_chosen, possible_values)
  target_rejected = model.target_distribution(t_rejected, possible_values)
  loss_score = (
      F.kl_div(p_chosen.log(), target_chosen, reduction='batchmean') +
      F.kl_div(p_rejected.log(), target_rejected, reduction='batchmean')
    )/2

  # Итоговый loss
  total_loss = alpha * loss_pref + (1 - alpha) * loss_score
  return total_loss

### 3. Preprocess for Reward Model

In [None]:
def extract_text(example):
  # Для chosen: объединяем все content в одну строку
  chosen_text = "\n".join([msg["content"] for msg in example["chosen"]])
  # Для rejected: аналогично
  rejected_text = "\n".join([msg["content"] for msg in example["rejected"]])
  return {"chosen": chosen_text, "rejected": rejected_text}

# Применяем преобразование ко всему датасету
train_dataset = train.map(extract_text, batched=False)
val_dataset = val.map(extract_text, batched=False)

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/6214 [00:00<?, ? examples/s]

In [None]:
def tokenize_function(examples):
  # Токенизация с проверкой длины
  tokenized_chosen = tokenizer(
      examples["chosen"],
      padding="max_length",
      truncation=True,
      max_length=512,
      return_tensors="pt"
  )
  tokenized_rejected = tokenizer(
      examples["rejected"],
      padding="max_length",
      truncation=True,
      max_length=512,
      return_tensors="pt"
  )

  # Проверка на пустые последовательности
  if tokenized_chosen["input_ids"].shape[1] == 0 or tokenized_rejected["input_ids"].shape[1] == 0:
      return None  # Пропустить пример

  return {
      "chosen_input_ids": tokenized_chosen["input_ids"],
      "chosen_attention_mask": tokenized_chosen["attention_mask"],
      "rejected_input_ids": tokenized_rejected["input_ids"],
      "rejected_attention_mask": tokenized_rejected["attention_mask"],
      "score_chosen": examples["score_chosen"],
      "score_rejected": examples["score_rejected"]
  }

In [None]:
train_dataset = train_dataset.map(
  tokenize_function,
  remove_columns=["chosen", "rejected"]  # Удаляем исходные текстовые колонки
  )
val_dataset = val_dataset.map(
  tokenize_function,
  remove_columns=["chosen", "rejected"]  # Удаляем исходные текстовые колонки
  )

### 4. Training Reward Model

In [None]:
reward_model = RewardModel("HuggingFaceTB/SmolLM2-135M-Instruct").to(device)
optimizer = optim.Adam(reward_model.parameters(), lr=1e-5)

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
accumulation_steps = 8
for epoch in range(1):
  reward_model.train()
  for i, batch in enumerate(tqdm(train_dataset)):
    # Берем по одному примеру из train_dataset
    batch = {
        k: torch.tensor(v).to(device)
        for k, v in batch.items()
    }
    # Накапливаем градиенты
    loss = compute_loss(model=reward_model, **batch)
    loss.backward()

    # Оптимизируем каждые 8 шагов
    if i % accumulation_steps == 0:
      optimizer.step()
      optimizer.zero_grad()

    # Валидация
    if i % 1000 == 0:
        reward_model.eval()
        with torch.no_grad():
          val_batch = val_dataset[i//1000]
          val_batch = {
                    k: torch.tensor(v).to(device)
                    for k, v in val_batch.items()
                }
          save_path = f"reward_model/reward_model_checkpoint_iter_{i}.pth"
          torch.save({
            'model_state_dict': reward_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'iteration': i,
            'loss': loss.item()
          }, save_path)
          val_loss = compute_loss(model=reward_model, **val_batch).item()
        print(f"Iter: {i}, Train Loss: {loss.item():.4f}, Val Loss: {val_loss:.4f}")
        reward_model.train()

### 5. REINFORCE class

Класс датасета, к которому удобно будет обращаться во время обучения модели

In [7]:
class REINFORCEDataset(Dataset):
  def __init__(self, prompts, tokenizer, max_length=512):

    self.prompts = prompts
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.prompts)

  def __getitem__(self, idx):
    prompt = self.prompts[idx]
    encoded = self.tokenizer(
        prompt,
        max_length=self.max_length,
        truncation=True,
        padding="max_length",
        return_tensors="pt"
      )
    return {
        "input_ids": encoded["input_ids"].squeeze(),
        "attention_mask": encoded["attention_mask"].squeeze(),
        "prompt": prompt
      }

In [8]:
class REINFORCETrainer:
    def __init__(
        self,
        model,
        sft_model,
        reward_model,
        tokenizer,
        optimizer,
        batch_size=4,
        baseline_alpha=0.8,
        kl_coef=0.1,
        max_new_tokens=256,
        device="cpu",
        max_length=512
    ):
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self.device = device
        self.model = model.to(self.device).train()
        self.sft_model = sft_model.to(self.device).eval()
        self.reward_model = reward_model.to(self.device).eval()
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.baseline = torch.tensor(0.0, device=self.device)
        self.baseline_alpha = baseline_alpha
        self.kl_coef = kl_coef
        self.max_new_tokens = max_new_tokens
        self.max_length = max_length

    def _prepare_text_pair(self, prompts, responses):
      # Нужный формат для input
      return [f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n{r}<|im_end|>"
              for p, r in zip(prompts, responses)]

    def compute_rewards(self, prompts, responses):
        combined_texts = self._prepare_text_pair(prompts, responses)

        encoded = self.tokenizer(
            combined_texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
          # Получаем reward
          prob_dist = self.reward_model(**encoded)
          epsilon = torch.std(prob_dist, dim=1).mean().add(1)
          possible_values = torch.arange(
              1,
              prob_dist.shape[1] + 1,
              dtype=torch.float32,
              device=self.device
          )
          # Нормируем reward
          rewards = torch.sum(possible_values * prob_dist, dim=1) / 10

        return rewards, epsilon

    def compute_kl_penalty(self, prompts, responses):
        combined_texts = self._prepare_text_pair(prompts, responses)

        encoded = self.tokenizer(
            combined_texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).to(self.device)

        # Полуаем SFT model logprobs
        with torch.no_grad():
            sft_logits = self.sft_model(**encoded).logits
            sft_logprobs = torch.log_softmax(sft_logits, dim=-1)

        # Получаем текущую политику logprobs
        current_logits = self.model(**encoded).logits
        current_logprobs = torch.log_softmax(current_logits, dim=-1)

        # Считаем KL-дивергенцию
        kl_div = torch.exp(current_logprobs) * (current_logprobs - sft_logprobs)
        return self.kl_coef * kl_div.sum(dim=-1).mean(dim=1)

    def train(self, train_prompts, max_iterations=12):
        dataset = REINFORCEDataset(train_prompts, self.tokenizer, self.max_length)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        progress_bar = tqdm(total=max_iterations)

        for iteration, batch in enumerate(dataloader, 1):
            if iteration > max_iterations:
                break

            self.optimizer.zero_grad()

            # Подгототавливаем inpu
            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)
            prompts = batch["prompt"]

            # Генерируем ответы на промпт
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=self.max_new_tokens,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            # Токенизируем
            responses = [
                self.tokenizer.decode(
                    o[len(input_ids[i]):],
                    skip_special_tokens=True
                ) for i, o in enumerate(outputs)
            ]

            # Считаем reward и KL-дивергенцию
            with torch.no_grad():
                rewards, epsilon = self.compute_rewards(prompts, responses)
                kl_penalty = self.compute_kl_penalty(prompts, responses)
                total_rewards = (rewards - kl_penalty)/epsilon

            # Обновляем baseline
            advantage = (total_rewards - self.baseline)
            self.baseline = self.baseline_alpha * self.baseline
            self.baseline += (1 - self.baseline_alpha) * total_rewards.mean()

            response_texts = self._prepare_text_pair(prompts, responses)
            response_encoded = self.tokenizer(
                response_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            ).to(self.device)
            # Считаем log probabilities
            logits = self.model(**response_encoded).logits
            log_probs = torch.log_softmax(logits, dim=-1)

            shifted_ids = response_encoded.input_ids[:, 1:]
            shifted_log_probs = log_probs[:, :-1]
            log_probs_values = shifted_log_probs.gather(2, shifted_ids.unsqueeze(-1)).squeeze(-1)

            mask = response_encoded.attention_mask[:, 1:].float()
            log_probs = (log_probs_values * mask).sum(dim=1)

            # Считаем loss
            loss = -(log_probs * advantage).mean()
            loss.backward()
            self.optimizer.step()

            progress_bar.update(1)
            progress_bar.set_postfix({
                "loss": loss.item(),
                "reward": rewards.mean().item(),
                "kl_penalty": kl_penalty.mean().item(),
                "baseline": self.baseline.item(),
                "iteration": iteration
            })

        progress_bar.close()


### 6. Training REINFORCE

In [10]:
reward_model = RewardModel("HuggingFaceTB/SmolLM2-135M-Instruct").to(device)

file_path = "reward_model_checkpoint_iter_29000.pth"
checkpoint = torch.load(file_path, map_location='cpu')
reward_model.load_state_dict(checkpoint['model_state_dict'])

reward_model.to(device)

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RewardModel(
  (base_model): LlamaForSequenceClassification(
    (model): LlamaModel(
      (embed_tokens): Embedding(49152, 576, padding_idx=2)
      (layers): ModuleList(
        (0-29): 30 x LlamaDecoderLayer(
          (self_attn): LlamaAttention(
            (q_proj): Linear(in_features=576, out_features=576, bias=False)
            (k_proj): Linear(in_features=576, out_features=192, bias=False)
            (v_proj): Linear(in_features=576, out_features=192, bias=False)
            (o_proj): Linear(in_features=576, out_features=576, bias=False)
          )
          (mlp): LlamaMLP(
            (gate_proj): Linear(in_features=576, out_features=1536, bias=False)
            (up_proj): Linear(in_features=576, out_features=1536, bias=False)
            (down_proj): Linear(in_features=1536, out_features=576, bias=False)
            (act_fn): SiLU()
          )
          (input_layernorm): LlamaRMSNorm((576,), eps=1e-05)
          (post_attention_layernorm): LlamaRMSNorm((576,), eps=1e

In [11]:
sft_model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
reinforce_model = AutoModelForCausalLM.from_pretrained(sft_model_name).to(device)
original_sft_model = AutoModelForCausalLM.from_pretrained(sft_model_name).to(device)

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

Подготовка данных к обучению

In [12]:
def replace_prompt_with_chosen(example):
  example["prompt"] = example["chosen"][0]['content']
  return example

train_data = train.map(replace_prompt_with_chosen)
val_data = val.map(replace_prompt_with_chosen)

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/6214 [00:00<?, ? examples/s]

In [13]:
def preprocess_prompt(prompt):
  tokens = tokenizer.encode(prompt, truncation=True, max_length=512)
  return tokenizer.decode(tokens, skip_special_tokens=True)

train_data = train_data.map(lambda x: {"prompt": preprocess_prompt(x["prompt"])})
val_data = train_data.map(lambda x: {"prompt": preprocess_prompt(x["prompt"])})

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

In [17]:
optimizer = torch.optim.AdamW(reinforce_model.parameters(), lr=1e-3)

trainer = REINFORCETrainer(
    model=reinforce_model,
    sft_model=original_sft_model,
    reward_model=reward_model,
    tokenizer=tokenizer,
    optimizer=optimizer,
    device="cpu"
  )

trainer.train(train_data["prompt"])




  0%|          | 0/12 [00:00<?, ?it/s][A[A[A


  8%|▊         | 1/12 [06:52<1:15:34, 412.23s/it][A[A[A


  8%|▊         | 1/12 [06:52<1:15:34, 412.23s/it, loss=456, reward=0.662, kl_penalty=0, baseline=0.123, iteration=1][A[A[A


 17%|█▋        | 2/12 [13:07<1:05:06, 390.62s/it, loss=456, reward=0.662, kl_penalty=0, baseline=0.123, iteration=1][A[A[A


  0%|          | 0/12 [29:30<?, ?it/s]
  0%|          | 0/12 [25:20<?, ?it/s]



 25%|██▌       | 3/12 [19:38<58:35, 390.61s/it, loss=389, reward=0.666, kl_penalty=0.0357, baseline=0.215, iteration=2]  [A[A[A


 25%|██▌       | 3/12 [19:38<58:35, 390.61s/it, loss=293, reward=0.661, kl_penalty=0.0682, baseline=0.282, iteration=3][A[A[A


 33%|███▎      | 4/12 [26:26<52:59, 397.47s/it, loss=293, reward=0.661, kl_penalty=0.0682, baseline=0.282, iteration=3][A[A[A


 33%|███▎      | 4/12 [26:26<52:59, 397.47s/it, loss=219, reward=0.659, kl_penalty=0.0798, baseline=0.333, iteration=4][A[A[A


 42%|████▏     | 5/12 [

### 7. Evaluation

In [18]:
def format_input(prompt, response):
  # Нужный формат для input
  return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"

In [23]:
def evaluate(model, reward_model, tokenizer, val_prompts, batch_size=2, device="cpu"):
  model.eval()
  reward_model.eval()

  total_reward = 0.0
  num_samples = 0
  all_rewards = []

  tokenizer.pad_token = tokenizer.eos_token

  with torch.no_grad():
    for i in range(0, len(val_prompts), batch_size):
      batch_prompts = val_prompts[i:i+batch_size]

      inputs = tokenizer(
          batch_prompts,
          return_tensors="pt",
          padding=True,
          truncation=True,
          max_length=256
        ).to(device)

      # Генерируем ответ на промпт

      generated = model.generate(
          input_ids=inputs["input_ids"],
          attention_mask=inputs["attention_mask"],
          max_new_tokens=256,
          do_sample=False,
          pad_token_id=tokenizer.eos_token_id,
          eos_token_id=tokenizer.eos_token_id,
          early_stopping=True
        )

      responses = [
          tokenizer.decode(g[len(inputs["input_ids"][j]):], skip_special_tokens=True)
          for j, g in enumerate(generated)
        ]


      combined_texts = [format_input(p, r) for p, r in zip(batch_prompts, responses)]

      reward_inputs = tokenizer(
          combined_texts,
          padding=True,
          truncation=True,
          max_length=512,
          return_tensors="pt"
        ).to(device)

      # Подсчет reward

      rewards_out = reward_model(**reward_inputs)

      if isinstance(rewards_out, torch.Tensor):
          rewards = rewards_out.squeeze(-1)
      else:
          rewards = rewards_out.logits.squeeze(-1)


      if len(rewards.shape) > 1 and rewards.shape[1] > 1:
          possible_values = torch.arange(1, rewards.shape[1] + 1, dtype=torch.float32, device=device)
          rewards = torch.sum(possible_values * rewards, dim=1)
          rewards = rewards/10
      else:
          rewards = rewards.squeeze(-1)

      all_rewards.extend(rewards.tolist())
      total_reward += rewards.sum().item()
      num_samples += len(batch_prompts)

  # Анализ результатов
  print(f"\nReward statistics:")
  print(f"Min: {min(all_rewards):.2f}")
  print(f"Max: {max(all_rewards):.2f}")
  print(f"Mean: {sum(all_rewards)/len(all_rewards):.2f}")

  return total_reward / num_samples if num_samples > 0 else 0.0

In [25]:
print("Evaluating SFT model...")
sft = evaluate(original_sft_model, reward_model, tokenizer, val_data["prompt"][2:12])
print("Evaluating trained model...")
trained = evaluate(reinforce_model, reward_model, tokenizer, val_data["prompt"][2:12])

print(f"SFT Reward: {sft:.3f}")
print(f"Trained Reward: {trained:.3f}")

Evaluating SFT model...

Reward statistics:
Min: 0.66
Max: 0.67
Mean: 0.66
Evaluating trained model...

Reward statistics:
Min: 0.60
Max: 0.72
Mean: 0.66
SFT Reward: 0.659
Trained Reward: 0.662
