### 0. Installation

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.wh

In [None]:
!pip install trl

Collecting trl
  Downloading trl-0.16.1-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=2.0.0->accelerate>=0.34.0->trl)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Colle

### 1. Preprocessing

In [2]:
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from trl import RewardConfig, RewardTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification
import torch
from torch.utils.data import Dataset, DataLoader

In [3]:
dataset = load_dataset("trl-lib/ultrafeedback_binarized", split="train")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/131M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/62135 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [4]:
# Разделение на train и val (например, 90% train, 10% val)
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)

# Получение train и val выборок
train = split_dataset['train']
val = split_dataset['test']

In [None]:
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M-Instruct",
                                          use_fast=True,
                                          truncation=True,
                                          padding_side='left')
model = AutoModelForSequenceClassification.from_pretrained(
    "HuggingFaceTB/SmolLM2-135M-Instruct", num_labels=1
)
model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/3.76k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

Some weights of LlamaForSequenceClassification were not initialized from the model checkpoint at HuggingFaceTB/SmolLM2-135M-Instruct and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


### 2. Reward model


In [None]:
training_args = RewardConfig(output_dir="HuggingFaceTB/SmolLM2-135M-Instruct-Reward",
                             learning_rate=5e-5,
                             per_device_train_batch_size=8,
                             max_length=512,
                             fp16=True,
                             remove_unused_columns=False,
                             num_train_epochs=1,)

In [None]:
reward = RewardTrainer(
  args=training_args,
  model=model,
  processing_class=tokenizer,
  train_dataset=train,
  eval_dataset=val
)
reward.train()

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Filter:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/6214 [00:00<?, ? examples/s]

Map:   0%|          | 0/6214 [00:00<?, ? examples/s]

Filter:   0%|          | 0/6214 [00:00<?, ? examples/s]

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmakaninkirmh[0m ([33mmakaninkirmh-nsu[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss
500,0.6603
1000,0.6133
1500,0.6017
2000,0.5957
2500,0.5833
3000,0.5651
3500,0.5653


TrainOutput(global_step=3655, training_loss=0.5972115951099735, metrics={'train_runtime': 3010.7516, 'train_samples_per_second': 9.712, 'train_steps_per_second': 1.214, 'total_flos': 0.0, 'train_loss': 0.5972115951099735, 'epoch': 1.0})

In [None]:
reward.save_model("reward_model1")

### 3. Reinforce class

In [6]:
class REINFORCEDataset(Dataset):
  def __init__(self, prompts, tokenizer, max_length=512):
    self.prompts = prompts
    self.tokenizer = tokenizer
    self.max_length = max_length

  def __len__(self):
    return len(self.prompts)

  def __getitem__(self, idx):
    prompt = self.prompts[idx]
    encoded = self.tokenizer(
      prompt,
      max_length=self.max_length,
      truncation=True,
      padding="max_length",
      return_tensors="pt"
    )
    return {
      "input_ids": encoded["input_ids"].squeeze(),
      "attention_mask": encoded["attention_mask"].squeeze(),
      "prompt": prompt
    }

In [7]:
class REINFORCETrainer:
    def __init__(
        self,
        model,
        sft_model,
        reward_model,
        tokenizer,
        optimizer,
        batch_size=4,
        baseline_alpha=0.8,
        kl_coef=0.1,
        max_new_tokens=256,
        device="cpu",
        max_length=512
    ):
        self.tokenizer = tokenizer
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self.device = device
        self.model = model.to(self.device).train()
        self.sft_model = sft_model.to(self.device).eval()
        self.reward_model = reward_model.to(self.device).eval()
        self.optimizer = optimizer
        self.batch_size = batch_size
        self.baseline = torch.tensor(0.0, device=self.device)
        self.baseline_alpha = baseline_alpha
        self.kl_coef = kl_coef
        self.max_new_tokens = max_new_tokens
        self.max_length = max_length

    def _prepare_text_pair(self, prompts, responses):
      # Нужный формат для input
      return [f"<|im_start|>user\n{p}<|im_end|>\n<|im_start|>assistant\n{r}<|im_end|>"
                for p, r in zip(prompts, responses)]

    def compute_rewards(self, prompts, responses):
        combined_texts = self._prepare_text_pair(prompts, responses)

        encoded = self.tokenizer(
            combined_texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).to(self.device)

        with torch.no_grad():
          # Получаем reward
          logits = self.reward_model(**encoded).logits
          # Нормируем reward
          rewards = torch.sigmoid(logits).squeeze(-1)

        return rewards

    def compute_kl_penalty(self, prompts, responses):
        combined_texts = self._prepare_text_pair(prompts, responses)

        encoded = self.tokenizer(
            combined_texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt"
        ).to(self.device)

        # Полуаем SFT model logprobs
        with torch.no_grad():
            sft_logits = self.sft_model(**encoded).logits
            sft_logprobs = torch.log_softmax(sft_logits, dim=-1)

        # Получаем текущую политику logprobs
        current_logits = self.model(**encoded).logits
        current_logprobs = torch.log_softmax(current_logits, dim=-1)

        # Считаем KL-дивергенцию
        kl_div = torch.exp(current_logprobs) * (current_logprobs - sft_logprobs)
        return self.kl_coef * kl_div.sum(dim=-1).mean(dim=1)

    def train(self, train_prompts, max_iterations=12):
        dataset = REINFORCEDataset(train_prompts, self.tokenizer, self.max_length)
        dataloader = DataLoader(dataset, batch_size=self.batch_size, shuffle=True)

        self.model.train()
        progress_bar = tqdm(total=max_iterations)

        for iteration, batch in enumerate(dataloader, 1):
            if iteration > max_iterations:
                break

            self.optimizer.zero_grad()

            # Подгототавливаем input
            input_ids = batch["input_ids"].to(self.device)
            attention_mask = batch["attention_mask"].to(self.device)
            prompts = batch["prompt"]

            # Генерируем ответы на промпт
            outputs = self.model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_new_tokens=self.max_new_tokens,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id
            )

            # Токенизируем
            responses = [
                self.tokenizer.decode(
                    o[len(input_ids[i]):],
                    skip_special_tokens=True
                ) for i, o in enumerate(outputs)
            ]

            # Считаем reward и KL-дивергенцию
            with torch.no_grad():
                rewards = self.compute_rewards(prompts, responses)
                kl_penalty = self.compute_kl_penalty(prompts, responses)
                total_rewards = rewards - kl_penalty

            # Обновляем baseline
            advantage = total_rewards - self.baseline
            self.baseline = self.baseline_alpha * self.baseline
            self.baseline += (1 - self.baseline_alpha) * total_rewards.mean()

            response_texts = self._prepare_text_pair(prompts, responses)
            response_encoded = self.tokenizer(
                response_texts,
                padding=True,
                truncation=True,
                max_length=self.max_length,
                return_tensors="pt"
            ).to(self.device)

            # Считаем log probabilities
            logits = self.model(**response_encoded).logits
            log_probs = torch.log_softmax(logits, dim=-1)

            shifted_ids = response_encoded.input_ids[:, 1:]
            shifted_log_probs = log_probs[:, :-1]
            log_probs_values = shifted_log_probs.gather(2, shifted_ids.unsqueeze(-1)).squeeze(-1)

            mask = response_encoded.attention_mask[:, 1:].float()
            log_probs = (log_probs_values * mask).sum(dim=1)

            # Считаем loss
            loss = -(log_probs * advantage).mean()
            loss.backward()
            self.optimizer.step()

            progress_bar.update(1)
            progress_bar.set_postfix({
                "loss": loss.item(),
                "reward": rewards.mean().item(),
                "kl_penalty": kl_penalty.mean().item(),
                "baseline": self.baseline.item(),
                "iteration": iteration
            })

        progress_bar.close()

### 4. Reinforce training

In [8]:
sft_model_name = "HuggingFaceTB/SmolLM2-135M-Instruct"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
reinforce_model = AutoModelForCausalLM.from_pretrained(sft_model_name).to(device)
original_sft_model = AutoModelForCausalLM.from_pretrained(sft_model_name).to(device)

config.json:   0%|          | 0.00/861 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

In [11]:
reward_model= AutoModelForSequenceClassification.from_pretrained("reward_model").to(device)

Подготавливаем данные к обучению

In [9]:
def replace_prompt_with_chosen(example):
  example["prompt"] = example["chosen"][0]['content']
  return example

train_data = train.map(replace_prompt_with_chosen)
val_data = val.map(replace_prompt_with_chosen)

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/6214 [00:00<?, ? examples/s]

In [10]:
def preprocess_prompt(prompt):
  tokens = tokenizer.encode(prompt, truncation=True, max_length=512)
  return tokenizer.decode(tokens, skip_special_tokens=True)

train_data = train_data.map(lambda x: {"prompt": preprocess_prompt(x["prompt"])})
val_data = train_data.map(lambda x: {"prompt": preprocess_prompt(x["prompt"])})

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

Map:   0%|          | 0/55921 [00:00<?, ? examples/s]

In [None]:
optimizer = torch.optim.AdamW(reinforce_model.parameters(), lr=1e-3)

trainer = REINFORCETrainer(
    model=reinforce_model,
    sft_model=original_sft_model,
    reward_model=reward_model,
    tokenizer=tokenizer,
    optimizer=optimizer,
    device="cpu"
  )

trainer.train(train_data["prompt"])

### 4. SFT vs Reinforce

In [13]:
def format_input(prompt, response):
  # Нужный формат для input
  return f"<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n{response}<|im_end|>"

In [14]:
def evaluate(model, reward_model, tokenizer, val_prompts, batch_size=2,
             device="cpu"):
  model.eval()
  reward_model.eval()

  total_reward = 0.0
  num_samples = 0
  all_rewards = []

  tokenizer.pad_token = tokenizer.eos_token

  with torch.no_grad():
    for i in range(0, len(val_prompts), batch_size):
      batch_prompts = val_prompts[i:i+batch_size]

      inputs = tokenizer(
          batch_prompts,
          return_tensors="pt",
          padding=True,
          truncation=True,
          max_length=256
        ).to(device)

      # Генерируем ответ на промпт

      generated = model.generate(
          input_ids=inputs["input_ids"],
          attention_mask=inputs["attention_mask"],
          max_new_tokens=256,
          do_sample=False,
          pad_token_id=tokenizer.eos_token_id,
          eos_token_id=tokenizer.eos_token_id,
          early_stopping=True
        )

      responses = [
        tokenizer.decode(g[len(inputs["input_ids"][j]):],
                         skip_special_tokens=True)
        for j, g in enumerate(generated)
        ]

      combined_texts = [format_input(p, r) for p, r in zip(batch_prompts, responses)]

      # Получаем reward

      reward_inputs = tokenizer(
          combined_texts,
          padding=True,
          truncation=True,
          max_length=512,
          return_tensors="pt"
        ).to(device)

      rewards = reward_model(**reward_inputs).logits.squeeze(-1)
      rewards = torch.sigmoid(rewards).squeeze(-1)

      all_rewards.extend(rewards.tolist())
      total_reward += rewards.sum().item()
      num_samples += len(batch_prompts)

  # Анализ результатов
  print(f"\nReward statistics:")
  print(f"Min: {min(all_rewards):.2f}")
  print(f"Max: {max(all_rewards):.2f}")
  print(f"Mean: {sum(all_rewards)/len(all_rewards):.2f}")

  return total_reward / num_samples if num_samples > 0 else 0.0

In [15]:
print("Evaluating SFT model...")
sft = evaluate(original_sft_model, reward_model,
               tokenizer, val_data["prompt"][2:12])
print("Evaluating trained model...")
trained = evaluate(reinforce_model, reward_model,
                   tokenizer, val_data["prompt"][2:12])

print(f"SFT Reward: {sft:.3f}")
print(f"Trained Reward: {trained:.3f}")

Evaluating SFT model...





Reward statistics:
Min: 0.03
Max: 0.79
Mean: 0.33
Evaluating trained model...

Reward statistics:
Min: 0.19
Max: 0.72
Mean: 0.43
SFT Reward: 0.325
Trained Reward: 0.434
