In [4]:
 !pip install trl -q
 !pip install transformers -q
 !pip install vllm -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m335.7/335.7 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m293.6/293.6 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m97.9/97.9 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m71.6/71.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m111.0/111.0 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m87.6/87.6 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m766.7/766.7 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m0:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m79.2 MB/s[0m eta [36m0:0

In [5]:
import os
import random
import numpy as np
import torch
import torch.nn as nn
import wandb
import torch.nn.functional as F

from typing import List
from dataclasses import dataclass

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from trl import DataCollatorForCompletionOnlyLM, OnlineDPOConfig, DPOConfig, OnlineDPOTrainer, DPOTrainer, FDivergenceType, FDivergenceConstants
from vllm import LLM, SamplingParams

INFO 03-25 19:54:22 [__init__.py:239] Automatically detected platform cuda.


In [6]:
@dataclass
class Config:
  seed: int = 42
  dataset_path: str = 'HuggingFaceH4/ultrafeedback_binarized'
  model_name: str = 'HuggingFaceTB/SmolLM-135M-Instruct'

  pair_rm_model: str = "llm-blender/PairRM"
  eval_size: int = 100

  temperature: float = 0.2
  top_p: float = 0.95


  max_seq_len: int = 1024

  output_dir: str = "/kaggle/working/trained_dpo"
  eval_steps: int = 10
  gradient_accumulation_steps: int = 4
  gradient_checkpointing: bool = True
  batch_size: int = 4
  max_prompt_length: int = 128
  max_completion_length: int = 256
  max_steps: int = 200
  lr_scheduler_type: str = "cosine" 
  learning_rate: int = 5e-5
  logging_steps: int = 10
  bf16: bool = True # try on P100
  fp16: bool = False
  tf32: bool = False
  beta: float = 5
  #beta = {0.1, 0.05, 1, 5} in paper

  max_tokens_output: int = 768

  def __post_init__(self):
    self.num_proc = os.cpu_count() // torch.cuda.device_count() if torch.cuda.is_available() else 1
    self.model_rlhf = "/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1"

config = Config()
# print(config.num_proc)

In [4]:
def seed_env(seed: int = Config.seed) -> None:
  random.seed(seed)
  os.environ['PYTHONHASHSEED'] = str(seed)
  np.random.seed(seed)

def seed_torch(seed: int = Config.seed) -> None:
  torch.manual_seed(seed)
  torch.cuda.manual_seed(seed)
  torch.backends.cudnn.deterministic = True
  torch.backends.cudnn.benchmark = False

def seed_everything() -> None:
  """Set seeds"""
  seed_torch()
  seed_env()

def init_wandb() -> None:
  wandb.login(key="abc76743c2934682510c9f2cf81c1896709fe072")
  wandb.init(project='DPO, HW2', entity='lulim')

seed_everything()
init_wandb()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mlulim[0m ([33mturbo-alignment[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mlulim[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [4]:
def filter_dataset(sample: Dataset) -> bool:
    tokenizer = AutoTokenizer.from_pretrained(
        config.model_name,
        padding="right",
        truncation_side="right",
        use_fast=True,
        model_max_length=config.max_seq_len
    )

    
    def len_filter(sample: Dataset) -> bool:
        chosen_conversation = sample["chosen"]
        tokenized_chosen = tokenizer.apply_chat_template(
            conversation=chosen_conversation,
            add_generation_prompt=False,
            tokenize=True,
            truncation=False,
            return_dict=True
        )

        rejected_conversation = sample["rejected"]
        tokenized_rejected = tokenizer.apply_chat_template(
            conversation=chosen_conversation,
            add_generation_prompt=False,
            tokenize=True,
            truncation=False,
            return_dict=True
        )
        return (len(tokenized_chosen["input_ids"]) <= config.max_seq_len) and (len(tokenized_rejected["input_ids"]) <= config.max_seq_len)
    return len_filter(sample)

In [5]:
ds = load_dataset(config.dataset_path)
ds["train_prefs"] = ds["train_prefs"].select(range(1000)).filter(filter_dataset, batched=False, num_proc=config.num_proc).shuffle(seed=config.seed)
ds["test_prefs"] = ds["test_prefs"].select(range(1000)).filter(filter_dataset, batched=False, num_proc=config.num_proc).shuffle(seed=config.seed)

# ds["train_prefs"] = ds["train_prefs"].select(range(500)).filter(filter_dataset, batched=False, num_proc=config.num_proc).shuffle(seed=config.seed)
# ds["test_prefs"] = ds["test_prefs"].select(range(500)).filter(filter_dataset, batched=False, num_proc=config.num_proc).shuffle(seed=config.seed)
ds


README.md: 0.00B [00:00, ?B/s]

train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [4]:
model = AutoModelForCausalLM.from_pretrained(
    config.model_name,
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(
        config.model_name,
        padding="right",
        truncation_side="right",
        use_fast=True,
        model_max_length=config.max_seq_len
)
tokenizer.pad_token = tokenizer.eos_token

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

## Vanila DPO

In [10]:
config.output_dir = config.output_dir + f"_beta_{config.beta}"
config.output_dir

'/kaggle/working/trained_dpo_beta_5'

In [11]:
training_args = DPOConfig(
    output_dir=config.output_dir,
    eval_strategy="steps",
    eval_steps=config.eval_steps,
    # report_to="wandb",
    report_to="none",
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    gradient_checkpointing=config.gradient_checkpointing,
    per_device_train_batch_size=config.batch_size,
    per_device_eval_batch_size=config.batch_size,
    max_prompt_length=config.max_prompt_length,
    max_completion_length=config.max_completion_length,
    max_steps=config.max_steps,
    lr_scheduler_type=config.lr_scheduler_type,
    learning_rate=config.learning_rate,
    logging_steps=config.logging_steps,
    beta=config.beta,
    fp16=config.fp16,
    tf32=config.tf32,
    bf16=config.bf16
)
dpo_trainer = DPOTrainer(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train_prefs"].select_columns(["prompt", "chosen", "rejected"]),
    eval_dataset=ds["test_prefs"].select_columns(["prompt", "chosen", "rejected"])
)
dpo_trainer.train()
dpo_trainer.save_model()

Extracting prompt in train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1036 > 1024). Running this sequence through the model will result in indexing errors


Extracting prompt in eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,1.8319,2.126009,-1.241936,-1.77351,0.542918,0.531574,-295.413422,-289.308014,-0.505283,-0.338918
20,1.5864,2.143726,-0.53751,-1.21772,0.531116,0.68021,-295.272522,-289.196808,-0.479461,-0.313732
30,2.1687,2.167031,0.41859,-0.230132,0.54721,0.648721,-295.081299,-288.999298,-0.459458,-0.29347
40,2.1797,2.164281,0.700673,0.172875,0.520386,0.527798,-295.024902,-288.918701,-0.448208,-0.282238
50,1.8272,2.067701,0.981626,0.299071,0.543991,0.682554,-294.968719,-288.893463,-0.451606,-0.285932
60,1.8732,1.960009,0.628709,-0.262658,0.548283,0.891367,-295.039337,-289.005829,-0.462894,-0.297051
70,0.3299,2.125365,0.248159,-0.38399,0.545064,0.632149,-295.115417,-289.03009,-0.467956,-0.30163
80,0.5477,2.121719,0.374749,-0.41698,0.555794,0.791729,-295.090057,-289.036682,-0.471225,-0.30513
90,0.3602,1.952107,0.306544,-0.715746,0.550429,1.022291,-295.103729,-289.096405,-0.46451,-0.298507
100,0.5088,2.127689,0.101115,-0.648776,0.541846,0.74989,-295.144806,-289.083008,-0.470504,-0.305494


In [12]:
!zip -r /kaggle/working/trained_dpo_beta_5.zip /kaggle/working/trained_dpo_beta_5

  adding: kaggle/working/trained_dpo_beta_5/ (stored 0%)
  adding: kaggle/working/trained_dpo_beta_5/model.safetensors (deflated 21%)
  adding: kaggle/working/trained_dpo_beta_5/tokenizer.json (deflated 82%)
  adding: kaggle/working/trained_dpo_beta_5/merges.txt (deflated 55%)
  adding: kaggle/working/trained_dpo_beta_5/tokenizer_config.json (deflated 85%)
  adding: kaggle/working/trained_dpo_beta_5/config.json (deflated 46%)
  adding: kaggle/working/trained_dpo_beta_5/training_args.bin (deflated 52%)
  adding: kaggle/working/trained_dpo_beta_5/vocab.json (deflated 59%)
  adding: kaggle/working/trained_dpo_beta_5/checkpoint-200/ (stored 0%)
  adding: kaggle/working/trained_dpo_beta_5/checkpoint-200/optimizer.pt (deflated 24%)
  adding: kaggle/working/trained_dpo_beta_5/checkpoint-200/model.safetensors (deflated 21%)
  adding: kaggle/working/trained_dpo_beta_5/checkpoint-200/rng_state.pth (deflated 25%)
  adding: kaggle/working/trained_dpo_beta_5/checkpoint-200/tokenizer.json (deflated 

In [13]:
import os 
from IPython.display import FileLink


os.chdir(r'/kaggle/working')
FileLink(r'trained_dpo_beta_5.zip')

# Pair eval

In [7]:
!pip install git+https://github.com/yuchenlin/LLM-Blender.git -q

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for llm_blender (setup.py) ... [?25l[?25hdone


In [8]:
import llm_blender
from llm_blender.pair_ranker.pairrm import DebertaV2PairRM

pair_rm = llm_blender.Blender()
pair_rm.loadranker(config.pair_rm_model)

ds = load_dataset(config.dataset_path)
np.random.seed(config.seed)
eval_idxs = np.random.randint(low=0, high=len(ds), size=config.eval_size)
# print(f"{eval_idxs}")
ds_eval = ds["test_sft"].select(eval_idxs).select_columns(["prompt"]).shuffle(seed=config.seed)
ds_eval = [sample["prompt"] for sample in ds_eval]
ds_eval[0]

Fetching 11 files:   0%|          | 0/11 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/508 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/130 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/13.7k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.00k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.79k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.66M [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/874M [00:00<?, ?B/s]

Successfully loaded ranker from  /root/.cache/huggingface/hub/llm-blender/PairRM


README.md:   0%|          | 0.00/6.53k [00:00<?, ?B/s]

train_prefs-00000-of-00001.parquet:   0%|          | 0.00/226M [00:00<?, ?B/s]

test_prefs-00000-of-00001.parquet:   0%|          | 0.00/7.29M [00:00<?, ?B/s]

test_sft-00000-of-00001.parquet:   0%|          | 0.00/3.72M [00:00<?, ?B/s]

train_gen-00000-of-00001.parquet:   0%|          | 0.00/184M [00:00<?, ?B/s]

test_gen-00000-of-00001.parquet:   0%|          | 0.00/3.02M [00:00<?, ?B/s]

Generating train_prefs split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating train_sft split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_prefs split:   0%|          | 0/2000 [00:00<?, ? examples/s]

Generating test_sft split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Generating train_gen split:   0%|          | 0/61135 [00:00<?, ? examples/s]

Generating test_gen split:   0%|          | 0/1000 [00:00<?, ? examples/s]

'Evaluate the extent to which web usability is affected by various design styles, including color scheme, typography, layout, and navigation. Provide specific examples and empirical evidence to support your analysis.'

In [9]:
def give_text_answers(prompts, pair_rm_model, model_sft, model_rlhf, sampling_params):
  model_sft = LLM(model=model_sft, dtype="float16", gpu_memory_utilization=0.4)
  model_rlhf = LLM(model=model_rlhf, dtype="float16", gpu_memory_utilization=0.4)

  outputs_sft = model_sft.generate(prompts, sampling_params)
  outputs_rlhf = model_rlhf.generate(prompts, sampling_params)

  only_outputs_sft = [output.outputs[0].text for output in outputs_sft]
  only_outputs_rlhf = [output.outputs[0].text for output in outputs_rlhf]

  # print(outputs_sft)
  # print(outputs_sft[0]) 
  only_outputs_sft_logprobs = [output.outputs[0].logprobs for output in outputs_sft]
  only_outputs_rlhf_logprobs = [output.outputs[0].logprobs for output in outputs_rlhf]
  # print(outputs_rlhf)
  comparison_results = pair_rm_model.compare(prompts, only_outputs_rlhf, only_outputs_sft, 
                                             batch_size=2) # A > B
  # scores_sft = pair_rm.compare(inputs, only_outputs_sft, , batch_size=2)

  return comparison_results, only_outputs_rlhf, only_outputs_sft, \
        only_outputs_rlhf_logprobs, only_outputs_sft_logprobs

In [9]:
sampling_params = SamplingParams(temperature=config.temperature, top_p=config.top_p,
                                 max_tokens=config.max_tokens_output, 
                                 logprobs=20
)
comparison_results, outputs_rlhf, outputs_sft, only_outputs_rlhf_logprobs, only_outputs_sft_logprobs = give_text_answers(ds_eval, pair_rm, config.model_name, config.model_rlhf, sampling_params) # for debug equal models

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

INFO 03-24 21:06:00 [config.py:583] This model supports multiple tasks: {'generate', 'score', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-24 21:06:00 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.1) with config: model='HuggingFaceTB/SmolLM-135M-Instruct', speculative_config=None, tokenizer='HuggingFaceTB/SmolLM-135M-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), s

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/565 [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

INFO 03-24 21:06:03 [cuda.py:234] Cannot use FlashAttention-2 backend for Volta and Turing GPUs.
INFO 03-24 21:06:03 [cuda.py:282] Using XFormers backend.
INFO 03-24 21:06:24 [parallel_state.py:967] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 03-24 21:06:24 [model_runner.py:1110] Starting to load model HuggingFaceTB/SmolLM-135M-Instruct...
INFO 03-24 21:06:24 [weight_utils.py:257] Using model weights format ['*.safetensors']


model.safetensors:   0%|          | 0.00/269M [00:00<?, ?B/s]

INFO 03-24 21:06:25 [weight_utils.py:273] Time spent downloading weights for HuggingFaceTB/SmolLM-135M-Instruct: 1.176216 seconds
INFO 03-24 21:06:25 [weight_utils.py:307] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-24 21:06:26 [loader.py:429] Loading weights took 0.27 seconds
INFO 03-24 21:06:26 [model_runner.py:1146] Model loading took 0.2547 GB and 1.961097 seconds
INFO 03-24 21:06:27 [worker.py:267] Memory profiling takes 0.82 seconds
INFO 03-24 21:06:27 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.40) = 5.90GiB
INFO 03-24 21:06:27 [worker.py:267] model weights take 0.25GiB; non_torch_memory takes 0.05GiB; PyTorch activation peak memory takes 0.46GiB; the rest of the memory reserved for KV Cache is 5.14GiB.
INFO 03-24 21:06:28 [executor_base.py:111] # cuda blocks: 14969, # CPU blocks: 11650
INFO 03-24 21:06:28 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 116.95x
INFO 03-24 21:06:34 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:34<00:00,  1.00it/s]

INFO 03-24 21:07:09 [model_runner.py:1570] Graph capturing finished in 35 secs, took 0.21 GiB
INFO 03-24 21:07:09 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 42.62 seconds





INFO 03-24 21:07:09 [config.py:583] This model supports multiple tasks: {'generate', 'score', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-24 21:07:09 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.1) with config: model='/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1', speculative_config=None, tokenizer='/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-24 21:07:13 [loader.py:429] Loading weights took 3.13 seconds
INFO 03-24 21:07:14 [model_runner.py:1146] Model loading took 0.2548 GB and 3.187235 seconds
INFO 03-24 21:07:15 [worker.py:267] Memory profiling takes 0.64 seconds
INFO 03-24 21:07:15 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.40) = 5.90GiB
INFO 03-24 21:07:15 [worker.py:267] model weights take 0.25GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.45GiB; the rest of the memory reserved for KV Cache is 5.19GiB.
INFO 03-24 21:07:15 [executor_base.py:111] # cuda blocks: 15129, # CPU blocks: 11650
INFO 03-24 21:07:15 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 118.20x
INFO 03-24 21:07:22 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:43<00:00,  1.24s/it]

INFO 03-24 21:08:05 [model_runner.py:1570] Graph capturing finished in 44 secs, took 0.21 GiB
INFO 03-24 21:08:05 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 51.63 seconds



Processed prompts: 100%|██████████| 100/100 [00:34<00:00,  2.94it/s, est. speed input: 1277.11 toks/s, output: 2022.50 toks/s]
Processed prompts: 100%|██████████| 100/100 [00:33<00:00,  2.99it/s, est. speed input: 1298.22 toks/s, output: 2092.07 toks/s]
Ranking candidates: 100%|██████████| 50/50 [01:47<00:00,  2.14s/it]


In [10]:
def get_logprob(vllm_outputs):
    logprobs = [output for output in vllm_outputs[0]]
    logprobs = [[value for key, value in item.items()] for item in logprobs]
    logprobs = [logprob[0].logprob for logprob in logprobs]
    print(f"len logprobs: {len(logprobs)}")
    return logprobs

In [None]:
get_logprob(only_outputs_rlhf_logprobs)

In [None]:
get_logprob(only_outputs_sft_logprobs)

In [20]:
# rlhf_logprob = [output for output in only_outputs_rlhf_logprobs[0]]
# rlhf_logprob
# only_outputs_rlhf_logprobs[0]

In [10]:
rlhf_logprob = [output for output in only_outputs_rlhf_logprobs[0]]
rlhf_logprob = [[value for key, value in item.items()] for item in rlhf_logprob]
rlhf_logprob = [logprobs[0].logprob for logprobs in rlhf_logprob]
rlhf_logprob = np.mean(rlhf_logprob)
rlhf_logprob

-0.10280211417199071

In [11]:
sft_logprob = [output for output in only_outputs_sft_logprobs[0]]
sft_logprob = [[value for key, value in item.items()] for item in sft_logprob]
sft_logprob = [logprobs[0].logprob for logprobs in sft_logprob]
sft_logprob = np.mean(sft_logprob)
sft_logprob

-0.1221480804435428

In [12]:
kl = rlhf_logprob - sft_logprob
kl

0.019345966271552092

In [None]:
print(f"model after dpo win rate: {comparison_results.sum() / comparison_results.shape[0]}")

In [11]:
def get_win_rate_kl(model_path, sft_model, pair_rm):
    sampling_params = SamplingParams(temperature=config.temperature, top_p=config.top_p,
                                 max_tokens=config.max_tokens_output, 
                                 logprobs=20
    )
    comparison_results, outputs_rlhf, outputs_sft, only_outputs_rlhf_logprobs, only_outputs_sft_logprobs = give_text_answers(ds_eval, pair_rm, config.model_name, model_path, sampling_params)
    model_logprobs = np.array(get_logprob(only_outputs_rlhf_logprobs))
    sft_logprobs = np.array(get_logprob(only_outputs_sft_logprobs))
    min_len = min(len(model_logprobs), len(sft_logprobs))
    model_kl = np.mean(model_logprobs[-min_len:] - sft_logprobs[-min_len:])

    win_rate = comparison_results.sum() / comparison_results.shape[0]
    return {"model_path": model_path, "win_rate": win_rate, "kl": model_kl}

# BEYOND REVERSE KL DIVERGANCE

In [None]:
class DPOForwardKL(DPOTrainer):
    def __init__(self, model, processing_class, args, train_dataset, eval_dataset):
        self.model = model
        self.processing_class = processing_class
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.eps = 1e-5
        super().__init__(model=model, processing_class=processing_class,
                         args=args, train_dataset=train_dataset, eval_dataset=eval_dataset)

    def custom_func_derivative(self, x: float) -> torch.tensor:
        # print(f"arg: {x}")
        return -torch.pow(x + self.eps, -1)

    def dpo_loss(
        self,
        chosen_logps: torch.FloatTensor,
        rejected_logps: torch.FloatTensor,
        ref_chosen_logps: torch.FloatTensor,
        ref_rejected_logps: torch.FloatTensor,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """
        Compute the DPO loss for a batch of policy and reference model log probabilities.

        Args:
                Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`.
            rejected_logps (`torch.FloatTensor`):
                Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`.
            ref_chosen_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`.
            ref_rejected_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`.

        Returns:
            A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`.
            The losses tensor contains the DPO loss for each example in the batch.
            The `chosen_rewards` and `rejected_rewards` tensors contain the rewards for the chosen and rejected
            responses, respectively.
        """
        # print(f"calc dpo loss")
        # print(f"chosen_logps: {chosen_logps}")
        # print(f"rejected_logps: {rejected_logps}")
        # print(f"ref_chosen_logps: {ref_chosen_logps}")
        # print(f"ref_rejected_logps: {ref_rejected_logps}")
        device = self.accelerator.device
        chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device)
        rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device)

        # print(f"chosen_logratios: {chosen_logratios}")
        # print(f"rejected_logratios: {rejected_logratios}")
        if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value:
            # The alpha-divergence formula: (1 - u^-alpha) / alpha
            # The divergence difference between the chosen and rejected sample is:
            #     (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
            #        = (u[l]^-alpha - u[w]^-alpha) / alpha
            # where u[w] and u[l] are the policy/reference probability ratios
            # for the chosen and rejected samples, respectively.
            alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT
            if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY in self.f_divergence_params:
                alpha_coef = float(self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY])
            logits = (cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef)) / alpha_coef
        else:
            logratios = chosen_logps - rejected_logps
            if self.reference_free:
                ref_logratios = torch.tensor([0], dtype=logratios.dtype, device=logratios.device)
            else:
                ref_logratios = ref_chosen_logps - ref_rejected_logps

            logratios = logratios.to(self.accelerator.device)
            ref_logratios = ref_logratios.to(self.accelerator.device)
            logits = logratios - ref_logratios

            if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value:
                # The js-divergence formula: log(2 * u / (1 + u))
                # The divergence difference between the chosen and rejected sample is:
                #     log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
                #       = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l]))
                # where u[w] and u[l] are the policy/reference probability ratios
                # for the chosen and rejected samples, respectively.
                logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios)
        losses = (- F.logsigmoid(self.beta * self.custom_func_derivative(chosen_logratios) - \
                    self.beta * self.custom_func_derivative(rejected_logratios)))
        # print(f"losses: {losses}")
        # print(f"beta: {self.beta}")
        chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
        rejected_rewards = self.beta * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
        # print(f"{losses}, {chosen_rewards}, {rejected_rewards}")
        return losses, chosen_rewards, rejected_rewards

In [11]:
class DPOReverseKL(DPOTrainer):
    def __init__(self, model, processing_class, args, train_dataset, eval_dataset):
        self.model = model
        self.processing_class = processing_class
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.eps = 1e-5
        super().__init__(model=model, processing_class=processing_class,
                         args=args, train_dataset=train_dataset, eval_dataset=eval_dataset)

    def custom_func_derivative(self, x: torch.tensor) -> torch.tensor:
        x = F.softplus(x)
        return torch.log(x + self.eps) + 1

    def dpo_loss(
        self,
        chosen_logps: torch.FloatTensor,
        rejected_logps: torch.FloatTensor,
        ref_chosen_logps: torch.FloatTensor,
        ref_rejected_logps: torch.FloatTensor,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """
        Compute the DPO loss for a batch of policy and reference model log probabilities.

        Args:
            chosen_logps (`torch.FloatTensor`):
                Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`.
            rejected_logps (`torch.FloatTensor`):
                Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`.
            ref_chosen_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`.
            ref_rejected_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`.

        Returns:
            A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`.
            The losses tensor contains the DPO loss for each example in the batch.
            The `chosen_rewards` and `rejected_rewards` tensors contain the rewards for the chosen and rejected
            responses, respectively.
        """
        # print(f"calc dpo loss")
        device = self.accelerator.device
        # print(f"chosen_logps: {chosen_logps}, ref_chosen_logps: {ref_chosen_logps}")
        # print(f"rejected_logps: {rejected_logps}, ref_rejected_logps: {ref_rejected_logps}")
        chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device)
        rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device)

        # print(f"chosen_logratios: {chosen_logratios}")
        # print(f"rejected_logratios: {rejected_logratios}")
        if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value:
            # The alpha-divergence formula: (1 - u^-alpha) / alpha
            # The divergence difference between the chosen and rejected sample is:
            #     (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
            #        = (u[l]^-alpha - u[w]^-alpha) / alpha
            # where u[w] and u[l] are the policy/reference probability ratios
            # for the chosen and rejected samples, respectively.
            alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT
            if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY in self.f_divergence_params:
                alpha_coef = float(self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY])
            logits = (cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef)) / alpha_coef
        else:
            logratios = chosen_logps - rejected_logps
            if self.reference_free:
                ref_logratios = torch.tensor([0], dtype=logratios.dtype, device=logratios.device)
            else:
                ref_logratios = ref_chosen_logps - ref_rejected_logps

            logratios = logratios.to(self.accelerator.device)
            ref_logratios = ref_logratios.to(self.accelerator.device)
            logits = logratios - ref_logratios

            if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value:
                # The js-divergence formula: log(2 * u / (1 + u))
                # The divergence difference between the chosen and rejected sample is:
                #     log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
                #       = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l]))
                # where u[w] and u[l] are the policy/reference probability ratios
                # for the chosen and rejected samples, respectively.
                logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios)
        losses = (- F.logsigmoid(self.beta * self.custom_func_derivative(chosen_logratios) - \
                    self.beta * self.custom_func_derivative(rejected_logratios)))
        # print(f"losses: {losses}")

        chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
        rejected_rewards = self.beta * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
        # print(f"{chosen_rewards}, {rejected_rewards}")
        return losses, chosen_rewards, rejected_rewards

In [None]:
@dataclass
class Config:
  seed: int = 42
  dataset_path: str = 'HuggingFaceH4/ultrafeedback_binarized'
  model_name: str = 'HuggingFaceTB/SmolLM-135M-Instruct'

  pair_rm_model: str = "llm-blender/PairRM"
  eval_size: int = 100

  temperature: float = 0.2
  top_p: float = 0.95


  max_seq_len: int = 1024

  output_dir: str = "/kaggle/working/trained_dpo_forward_kl"
  eval_steps: int = 10
  gradient_accumulation_steps: int = 4
  gradient_checkpointing: bool = True
  batch_size: int = 4
  max_prompt_length: int = 128
  max_completion_length: int = 256
  max_steps: int = 200
  lr_scheduler_type: str = "cosine" 
  learning_rate: int = 5e-5
  logging_steps: int = 10
  bf16: bool = True # try on P100
  fp16: bool = False
  tf32: bool = False
  beta: float = 0.1

  max_tokens_output: int = 768

  def __post_init__(self):
    self.num_proc = os.cpu_count() // torch.cuda.device_count() if torch.cuda.is_available() else 1
    self.model_rlhf = "/kaggle/input/trained-dpo-a-5e-5/kaggle/working/trained_dpo"

forward_kl_config = Config()
forward_kl_config.beta = 0.1
# print(config.num_proc)

In [None]:
training_args = DPOConfig(
    output_dir=forward_kl_config.output_dir,
    eval_strategy="steps",
    eval_steps=forward_kl_config.eval_steps,
    report_to="wandb",
    # report_to="none",
    gradient_accumulation_steps=forward_kl_config.gradient_accumulation_steps,
    gradient_checkpointing=forward_kl_config.gradient_checkpointing,
    per_device_train_batch_size=forward_kl_config.batch_size,
    per_device_eval_batch_size=forward_kl_config.batch_size,
    max_prompt_length=forward_kl_config.max_prompt_length,
    max_completion_length=forward_kl_config.max_completion_length,
    max_steps=forward_kl_config.max_steps,
    lr_scheduler_type=forward_kl_config.lr_scheduler_type,
    learning_rate=forward_kl_config.learning_rate,
    logging_steps=forward_kl_config.logging_steps,
    beta=forward_kl_config.beta,
    fp16=forward_kl_config.fp16,
    tf32=forward_kl_config.tf32,
    bf16=forward_kl_config.bf16,
    loss_type="forward_kl"
)
forward_dpo_trainer = DPOForwardKL(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train_prefs"].select_columns(["prompt", "chosen", "rejected"]),
    eval_dataset=ds["test_prefs"].select_columns(["prompt", "chosen", "rejected"])
)
forward_dpo_trainer.train()
forward_dpo_trainer.save_model()

In [None]:
!zip -r /kaggle/working/trained_dpo_forward_kl.zip /kaggle/working/trained_dpo_forward_kl

In [None]:
import os 
from IPython.display import FileLink


os.chdir(r'/kaggle/working')
FileLink(r'trained_dpo_forward_kl.zip')

In [9]:
reverse_kl_config = Config()
reverse_kl_config.beta = 0.1
reverse_kl_config.output_dir = "/kaggle/working/trained_dpo_reverse_kl"

In [12]:
training_args = DPOConfig(
    output_dir=reverse_kl_config.output_dir,
    eval_strategy="steps",
    eval_steps=reverse_kl_config.eval_steps,
    report_to="wandb",
    # report_to="none",
    gradient_accumulation_steps=reverse_kl_config.gradient_accumulation_steps,
    gradient_checkpointing=reverse_kl_config.gradient_checkpointing,
    per_device_train_batch_size=reverse_kl_config.batch_size,
    per_device_eval_batch_size=reverse_kl_config.batch_size,
    max_prompt_length=reverse_kl_config.max_prompt_length,
    max_completion_length=reverse_kl_config.max_completion_length,
    max_steps=reverse_kl_config.max_steps,
    lr_scheduler_type=reverse_kl_config.lr_scheduler_type,
    learning_rate=reverse_kl_config.learning_rate,
    logging_steps=reverse_kl_config.logging_steps,
    beta=reverse_kl_config.beta,
    fp16=reverse_kl_config.fp16,
    tf32=reverse_kl_config.tf32,
    bf16=reverse_kl_config.bf16,
    loss_type="reverse_kl"
)
reverse_dpo_trainer = DPOReverseKL(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train_prefs"].select_columns(["prompt", "chosen", "rejected"]),
    eval_dataset=ds["test_prefs"].select_columns(["prompt", "chosen", "rejected"])
)
reverse_dpo_trainer.train()
reverse_dpo_trainer.save_model()

Extracting prompt in train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1036 > 1024). Running this sequence through the model will result in indexing errors


Extracting prompt in eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.6848,0.687834,-0.068932,-0.085605,0.542918,0.016672,-295.85434,-289.809326,-0.53663,-0.369855
20,0.6889,0.683202,-0.087601,-0.117411,0.55794,0.02981,-296.041046,-290.12738,-0.530776,-0.364754
30,0.6781,0.679506,-0.092928,-0.133583,0.567597,0.040655,-296.09433,-290.289093,-0.521573,-0.355945
40,0.6749,0.67764,-0.104654,-0.151251,0.583691,0.046597,-296.211578,-290.465759,-0.516229,-0.351464
50,0.6855,0.676777,-0.110211,-0.16027,0.584764,0.050059,-296.26712,-290.555969,-0.517217,-0.352834
60,0.6223,0.677001,-0.133105,-0.184919,0.571888,0.051815,-296.496063,-290.80246,-0.534291,-0.370242
70,0.6043,0.674172,-0.160841,-0.221464,0.58691,0.060622,-296.773438,-291.167908,-0.543769,-0.379036
80,0.5915,0.673954,-0.185953,-0.249024,0.565451,0.063071,-297.024567,-291.443512,-0.560539,-0.396896
90,0.588,0.672208,-0.204645,-0.273968,0.57618,0.069324,-297.211487,-291.692963,-0.566233,-0.402573
100,0.5876,0.671099,-0.215355,-0.287397,0.587983,0.072041,-297.318604,-291.827209,-0.572336,-0.408166


In [13]:
!zip -r /kaggle/working/trained_dpo_reverse_kl.zip /kaggle/working/trained_dpo_reverse_kl

  adding: kaggle/working/trained_dpo_reverse_kl/ (stored 0%)
  adding: kaggle/working/trained_dpo_reverse_kl/merges.txt (deflated 55%)
  adding: kaggle/working/trained_dpo_reverse_kl/tokenizer.json (deflated 82%)
  adding: kaggle/working/trained_dpo_reverse_kl/generation_config.json (deflated 30%)
  adding: kaggle/working/trained_dpo_reverse_kl/config.json (deflated 46%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/ (stored 0%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/rng_state.pth (deflated 25%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/merges.txt (deflated 55%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/trainer_state.json (deflated 78%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/tokenizer.json (deflated 82%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/generation_config.json (deflated 30%)
  adding: kaggle/working/trained_dpo_reverse_kl/checkpoint-200/config.json (de

In [14]:
import os 
from IPython.display import FileLink


os.chdir(r'/kaggle/working')
FileLink(r'trained_dpo_reverse_kl.zip')

In [8]:
class AlphaDivergence(DPOTrainer):
    def __init__(self, model, processing_class, args, train_dataset, eval_dataset):
        self.model = model
        self.processing_class = processing_class
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.eps = 1e-5
        self.alpha = 0.5
        super().__init__(model=model, processing_class=processing_class,
                         args=args, train_dataset=train_dataset, eval_dataset=eval_dataset)

    def custom_func_derivative(self, x: float) -> torch.tensor:
        # pow with 0 < rate < 1 => x > 0 - domain
        x = F.softplus(x)
        return (1 - torch.pow(x + self.eps, -self.alpha)) / self.alpha

    def dpo_loss(
        self,
        chosen_logps: torch.FloatTensor,
        rejected_logps: torch.FloatTensor,
        ref_chosen_logps: torch.FloatTensor,
        ref_rejected_logps: torch.FloatTensor,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """
        Compute the DPO loss for a batch of policy and reference model log probabilities.

        Args:
            chosen_logps (`torch.FloatTensor`):
                Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`.
            rejected_logps (`torch.FloatTensor`):
                Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`.
            ref_chosen_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`.
            ref_rejected_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`.

        Returns:
            A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`.
            The losses tensor contains the DPO loss for each example in the batch.
            The `chosen_rewards` and `rejected_rewards` tensors contain the rewards for the chosen and rejected
            responses, respectively.
        """
        device = self.accelerator.device
        chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device)
        rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device)

        if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value:
            # The alpha-divergence formula: (1 - u^-alpha) / alpha
            # The divergence difference between the chosen and rejected sample is:
            #     (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
            #        = (u[l]^-alpha - u[w]^-alpha) / alpha
            # where u[w] and u[l] are the policy/reference probability ratios
            # for the chosen and rejected samples, respectively.
            alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT
            if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY in self.f_divergence_params:
                alpha_coef = float(self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY])
            logits = (cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef)) / alpha_coef
        else:
            logratios = chosen_logps - rejected_logps
            if self.reference_free:
                ref_logratios = torch.tensor([0], dtype=logratios.dtype, device=logratios.device)
            else:
                ref_logratios = ref_chosen_logps - ref_rejected_logps

            logratios = logratios.to(self.accelerator.device)
            ref_logratios = ref_logratios.to(self.accelerator.device)
            logits = logratios - ref_logratios

            if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value:
                # The js-divergence formula: log(2 * u / (1 + u))
                # The divergence difference between the chosen and rejected sample is:
                #     log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
                #       = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l]))
                # where u[w] and u[l] are the policy/reference probability ratios
                # for the chosen and rejected samples, respectively.
                logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios)
        losses = (- F.logsigmoid(self.beta * self.custom_func_derivative(chosen_logratios) - \
                    self.beta * self.custom_func_derivative(rejected_logratios)))

        chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
        rejected_rewards = self.beta * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
        return losses, chosen_rewards, rejected_rewards

In [16]:
class JSDivergence(DPOTrainer):
    def __init__(self, model, processing_class, args, train_dataset, eval_dataset):
        self.model = model
        self.processing_class = processing_class
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.eps = 1e-5
        super().__init__(model=model, processing_class=processing_class,
                         args=args, train_dataset=train_dataset, eval_dataset=eval_dataset)

    def custom_func_derivative(self, x: float) -> torch.tensor:
        # print(f"arg: {x}")
        # log: domain > 0
        x = F.softplus(2 * x / (1 + x))
        return torch.log(x + self.eps)

    def dpo_loss(
        self,
        chosen_logps: torch.FloatTensor,
        rejected_logps: torch.FloatTensor,
        ref_chosen_logps: torch.FloatTensor,
        ref_rejected_logps: torch.FloatTensor,
    ) -> tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]:
        """
        Compute the DPO loss for a batch of policy and reference model log probabilities.

        Args:
            chosen_logps (`torch.FloatTensor`):
                Log probabilities of the model for the chosen responses. Shape: `(batch_size,)`.
            rejected_logps (`torch.FloatTensor`):
                Log probabilities of the model for the rejected responses. Shape: `(batch_size,)`.
            ref_chosen_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the chosen responses. Shape: `(batch_size,)`.
            ref_rejected_logps (`torch.FloatTensor`):
                Log probabilities of the reference model for the rejected responses. Shape: `(batch_size,)`.

        Returns:
            A tuple of three tensors: `(losses, chosen_rewards, rejected_rewards)`.
            The losses tensor contains the DPO loss for each example in the batch.
            The `chosen_rewards` and `rejected_rewards` tensors contain the rewards for the chosen and rejected
            responses, respectively.
        """
        # print(f"calc dpo loss")
        device = self.accelerator.device
        # print(f"device: {device}")
        chosen_logratios = chosen_logps.to(device) - (not self.reference_free) * ref_chosen_logps.to(device)
        rejected_logratios = rejected_logps.to(device) - (not self.reference_free) * ref_rejected_logps.to(device)

        # print(f"chosen_logratios: {chosen_logratios}")
        # print(f"rejected_logratios: {rejected_logratios}")
        if self.f_divergence_type == FDivergenceType.ALPHA_DIVERGENCE.value:
            # The alpha-divergence formula: (1 - u^-alpha) / alpha
            # The divergence difference between the chosen and rejected sample is:
            #     (1 - u[w]^-alpha) / alpha - (1 - u[l]^-alpha) / alpha
            #        = (u[l]^-alpha - u[w]^-alpha) / alpha
            # where u[w] and u[l] are the policy/reference probability ratios
            # for the chosen and rejected samples, respectively.
            alpha_coef = FDivergenceConstants.ALPHA_DIVERGENCE_COEF_DEFAULT
            if self.f_divergence_params and FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY in self.f_divergence_params:
                alpha_coef = float(self.f_divergence_params[FDivergenceConstants.ALPHA_DIVERGENCE_COEF_KEY])
            logits = (cap_exp(rejected_logratios * -alpha_coef) - cap_exp(chosen_logratios * -alpha_coef)) / alpha_coef
        else:
            logratios = chosen_logps - rejected_logps
            if self.reference_free:
                ref_logratios = torch.tensor([0], dtype=logratios.dtype, device=logratios.device)
            else:
                ref_logratios = ref_chosen_logps - ref_rejected_logps

            logratios = logratios.to(self.accelerator.device)
            ref_logratios = ref_logratios.to(self.accelerator.device)
            logits = logratios - ref_logratios

            if self.f_divergence_type == FDivergenceType.JS_DIVERGENCE.value:
                # The js-divergence formula: log(2 * u / (1 + u))
                # The divergence difference between the chosen and rejected sample is:
                #     log(2 * u[w] / (1 + u[w])) - log(2 * u[l] / (1 + u[l]))
                #       = log(u[w]) - log(u[l]) - (log(1 + u[w]) - log(1 + u[l]))
                # where u[w] and u[l] are the policy/reference probability ratios
                # for the chosen and rejected samples, respectively.
                logits -= F.softplus(chosen_logratios) - F.softplus(rejected_logratios)
        losses = (- F.logsigmoid(self.beta * self.custom_func_derivative(chosen_logratios) - \
                    self.beta * self.custom_func_derivative(rejected_logratios)))
        # print(f"losses: {losses}")

        chosen_rewards = self.beta * (chosen_logps.to(device) - ref_chosen_logps.to(device)).detach()
        rejected_rewards = self.beta * (rejected_logps.to(device) - ref_rejected_logps.to(device)).detach()
        # print(f"{chosen_rewards}, {rejected_rewards}")
        return losses, chosen_rewards, rejected_rewards

In [9]:
alpha_kl_config = Config()
alpha_kl_config.beta = 0.1
alpha_kl_config.output_dir = "/kaggle/working/trained_dpo_alpha_kl"

In [None]:
training_args = DPOConfig(
    output_dir=alpha_kl_config.output_dir,
    eval_strategy="steps",
    eval_steps=alpha_kl_config.eval_steps,
    report_to="wandb",
    # report_to="none",
    gradient_accumulation_steps=alpha_kl_config.gradient_accumulation_steps,
    gradient_checkpointing=alpha_kl_config.gradient_checkpointing,
    per_device_train_batch_size=alpha_kl_config.batch_size,
    per_device_eval_batch_size=alpha_kl_config.batch_size,
    max_prompt_length=alpha_kl_config.max_prompt_length,
    max_completion_length=alpha_kl_config.max_completion_length,
    max_steps=alpha_kl_config.max_steps,
    lr_scheduler_type=alpha_kl_config.lr_scheduler_type,
    learning_rate=alpha_kl_config.learning_rate,
    logging_steps=alpha_kl_config.logging_steps,
    beta=alpha_kl_config.beta,
    fp16=alpha_kl_config.fp16,
    tf32=alpha_kl_config.tf32,
    bf16=alpha_kl_config.bf16,
    loss_type="alpha_kl"
)
alpha_dpo_trainer = AlphaDivergence(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train_prefs"].select_columns(["prompt", "chosen", "rejected"]),
    eval_dataset=ds["test_prefs"].select_columns(["prompt", "chosen", "rejected"])
)
alpha_dpo_trainer.train()
alpha_dpo_trainer.save_model()

Extracting prompt in train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Applying chat template to train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/934 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1036 > 1024). Running this sequence through the model will result in indexing errors


Extracting prompt in eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Applying chat template to eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.6861,0.686631,-0.071496,-0.089563,0.534335,0.018067,-295.880005,-289.848907,-0.538522,-0.371807


In [None]:
!zip -r /kaggle/working/trained_dpo_alpha_kl.zip /kaggle/working/trained_dpo_alpha_kl

In [None]:
import os 
from IPython.display import FileLink


os.chdir(r'/kaggle/working')
FileLink(r'trained_dpo_alpha_kl.zip')

In [15]:
js_kl_config = Config()
js_kl_config.beta = 0.1
js_kl_config.output_dir = "/kaggle/working/trained_dpo_js_kl"

In [20]:
training_args = DPOConfig(
    output_dir=js_kl_config.output_dir,
    eval_strategy="steps",
    eval_steps=js_kl_config.eval_steps,
    report_to="wandb",
    # report_to="none",
    gradient_accumulation_steps=js_kl_config.gradient_accumulation_steps,
    gradient_checkpointing=js_kl_config.gradient_checkpointing,
    per_device_train_batch_size=js_kl_config.batch_size,
    per_device_eval_batch_size=js_kl_config.batch_size,
    max_prompt_length=js_kl_config.max_prompt_length,
    max_completion_length=js_kl_config.max_completion_length,
    max_steps=js_kl_config.max_steps,
    lr_scheduler_type=js_kl_config.lr_scheduler_type,
    learning_rate=js_kl_config.learning_rate,
    logging_steps=js_kl_config.logging_steps,
    beta=js_kl_config.beta,
    fp16=js_kl_config.fp16,
    tf32=js_kl_config.tf32,
    bf16=js_kl_config.bf16,
    loss_type="js_kl"
)
js_dpo_trainer = JSDivergence(
    model=model,
    processing_class=tokenizer,
    args=training_args,
    train_dataset=ds["train_prefs"].select_columns(["prompt", "chosen", "rejected"]),
    eval_dataset=ds["test_prefs"].select_columns(["prompt", "chosen", "rejected"])
)
js_dpo_trainer.train()
js_dpo_trainer.save_model()

Applying chat template to eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/932 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1059 > 1024). Running this sequence through the model will result in indexing errors


Step,Training Loss,Validation Loss,Rewards/chosen,Rewards/rejected,Rewards/accuracies,Rewards/margins,Logps/chosen,Logps/rejected,Logits/chosen,Logits/rejected
10,0.7141,0.735056,-0.097483,-0.115338,0.533262,0.017855,-296.139862,-290.106659,-0.559159,-0.392593
20,0.7253,0.719982,-0.111158,-0.130998,0.536481,0.01984,-296.276581,-290.263245,-0.568547,-0.402633
30,0.7344,0.72693,-0.087303,-0.10732,0.541846,0.020018,-296.038055,-290.026489,-0.557687,-0.392267
40,0.7104,0.715051,-0.074721,-0.095524,0.543991,0.020803,-295.912262,-289.908508,-0.546171,-0.380089
50,0.7022,0.724574,-0.057147,-0.086139,0.571888,0.028992,-295.736481,-289.814667,-0.530055,-0.36465
60,0.666,0.730312,-0.057353,-0.086081,0.578326,0.028728,-295.738525,-289.814087,-0.526059,-0.360249
70,0.7184,0.719466,-0.061785,-0.088576,0.552575,0.026792,-295.782898,-289.83902,-0.523502,-0.358059
80,0.7248,0.726447,-0.051547,-0.082127,0.567597,0.03058,-295.680511,-289.774536,-0.519247,-0.353434
90,0.7238,0.722084,-0.04725,-0.073177,0.560086,0.025927,-295.637543,-289.685059,-0.515074,-0.349967
100,0.7093,0.716559,-0.04504,-0.071376,0.577253,0.026337,-295.615417,-289.667053,-0.513197,-0.347867


In [21]:
!zip -r /kaggle/working/trained_dpo_js_kl.zip /kaggle/working/trained_dpo_js_kl

  adding: kaggle/working/trained_dpo_js_kl/ (stored 0%)
  adding: kaggle/working/trained_dpo_js_kl/merges.txt (deflated 55%)
  adding: kaggle/working/trained_dpo_js_kl/tokenizer.json (deflated 82%)
  adding: kaggle/working/trained_dpo_js_kl/generation_config.json (deflated 30%)
  adding: kaggle/working/trained_dpo_js_kl/config.json (deflated 46%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/ (stored 0%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/rng_state.pth (deflated 25%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/merges.txt (deflated 55%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/trainer_state.json (deflated 78%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/tokenizer.json (deflated 82%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/generation_config.json (deflated 30%)
  adding: kaggle/working/trained_dpo_js_kl/checkpoint-200/config.json (deflated 46%)
  adding: kaggle/working/trained_dpo_js_kl/check

In [22]:
import os 
from IPython.display import FileLink


os.chdir(r'/kaggle/working')
FileLink(r'trained_dpo_js_kl.zip')

In [37]:
result_dpo_01 = get_win_rate_kl("/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1",
                               config.model_name,
                               pair_rm)
result_dpo_01
# {'model_path': '/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1',
#  'win_rate': 0.43,
#  'kl': 0.002710602007052546}

INFO 03-24 22:09:41 [config.py:583] This model supports multiple tasks: {'generate', 'score', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-24 22:09:41 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.1) with config: model='HuggingFaceTB/SmolLM-135M-Instruct', speculative_config=None, tokenizer='HuggingFaceTB/SmolLM-135M-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), s

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-24 22:09:45 [loader.py:429] Loading weights took 0.26 seconds
INFO 03-24 22:09:48 [model_runner.py:1146] Model loading took 0.2551 GB and 0.488409 seconds
INFO 03-24 22:09:53 [worker.py:267] Memory profiling takes 2.65 seconds
INFO 03-24 22:09:53 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.40) = 5.90GiB
INFO 03-24 22:09:53 [worker.py:267] model weights take 0.26GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.45GiB; the rest of the memory reserved for KV Cache is 5.19GiB.
INFO 03-24 22:09:55 [executor_base.py:111] # cuda blocks: 15128, # CPU blocks: 11650
INFO 03-24 22:09:55 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 118.19x
INFO 03-24 22:09:56 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [02:57<00:00,  5.07s/it]

INFO 03-24 22:12:53 [model_runner.py:1570] Graph capturing finished in 177 secs, took 0.04 GiB
INFO 03-24 22:12:53 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 185.25 seconds
INFO 03-24 22:12:53 [config.py:583] This model supports multiple tasks: {'generate', 'score', 'classify', 'reward', 'embed'}. Defaulting to 'generate'.
INFO 03-24 22:12:53 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.1) with config: model='/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1', speculative_config=None, tokenizer='/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=




INFO 03-24 22:12:56 [model_runner.py:1110] Starting to load model /kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-24 22:12:56 [loader.py:429] Loading weights took 0.25 seconds
INFO 03-24 22:12:59 [model_runner.py:1146] Model loading took 0.2541 GB and 0.302271 seconds
INFO 03-24 22:13:04 [worker.py:267] Memory profiling takes 2.74 seconds
INFO 03-24 22:13:04 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.40) = 5.90GiB
INFO 03-24 22:13:04 [worker.py:267] model weights take 0.25GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.45GiB; the rest of the memory reserved for KV Cache is 5.19GiB.
INFO 03-24 22:13:07 [executor_base.py:111] # cuda blocks: 15131, # CPU blocks: 11650
INFO 03-24 22:13:07 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 118.21x
INFO 03-24 22:13:07 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [03:03<00:00,  5.25s/it]

INFO 03-24 22:16:11 [model_runner.py:1570] Graph capturing finished in 184 secs, took 0.04 GiB
INFO 03-24 22:16:11 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 191.97 seconds



Processed prompts: 100%|██████████| 100/100 [00:34<00:00,  2.91it/s, est. speed input: 1265.33 toks/s, output: 2010.51 toks/s]
Processed prompts: 100%|██████████| 100/100 [00:31<00:00,  3.21it/s, est. speed input: 1396.40 toks/s, output: 2220.51 toks/s]
Ranking candidates: 100%|██████████| 50/50 [01:50<00:00,  2.21s/it]


len logprobs: 560
len logprobs: 514


{'model_path': '/kaggle/input/trained-dpo-beta-01-2v/kaggle/working/trained_dpo_beta_0.1',
 'win_rate': 0.43,
 'kl': 0.002710602007052546}

In [13]:
result_dpo_005 = get_win_rate_kl("/kaggle/input/trained-dpo-beta-005/kaggle/working/trained_dpo_beta_0.05",
                               config.model_name,
                               pair_rm)
result_dpo_005
# {'model_path': '/kaggle/input/trained-dpo-beta-005/kaggle/working/trained_dpo_beta_0.05',
#  'win_rate': 0.56,
#  'kl': 0.04896892805607254}

0.04896892805607254

In [15]:
result_dpo_forward = get_win_rate_kl("/kaggle/input/trained-dpo-forward-kl/kaggle/working/trained_dpo_forward_kl",
                               config.model_name,
                               pair_rm)
result_dpo_forward

# {'model_path': '/kaggle/input/trained-dpo-forward-kl/kaggle/working/trained_dpo_forward_kl',
#  'win_rate': 0.44,
#  'kl': 0.030309515614999075}

0.030309515614999075

In [17]:
result_dpo_alpha = get_win_rate_kl("/kaggle/input/trained-dpo-alpha-kl/kaggle/working/trained_dpo_alpha_kl",
                               config.model_name,
                               pair_rm)
result_dpo_alpha

INFO 03-25 20:11:29 [config.py:585] This model supports multiple tasks: {'generate', 'embed', 'score', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 03-25 20:11:29 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2) with config: model='HuggingFaceTB/SmolLM-135M-Instruct', speculative_config=None, tokenizer='HuggingFaceTB/SmolLM-135M-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto,  device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), s

Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-25 20:11:31 [loader.py:447] Loading weights took 0.25 seconds
INFO 03-25 20:11:31 [model_runner.py:1146] Model loading took 0.2541 GB and 0.376847 seconds
INFO 03-25 20:11:32 [worker.py:267] Memory profiling takes 0.65 seconds
INFO 03-25 20:11:32 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.40) = 5.90GiB
INFO 03-25 20:11:32 [worker.py:267] model weights take 0.25GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.45GiB; the rest of the memory reserved for KV Cache is 5.19GiB.
INFO 03-25 20:11:33 [executor_base.py:111] # cuda blocks: 15130, # CPU blocks: 11650
INFO 03-25 20:11:33 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 118.20x
INFO 03-25 20:11:33 [model_runner.py:1442] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI. 

Capturing CUDA graph shapes: 100%|██████████| 35/35 [00:37<00:00,  1.08s/it]

INFO 03-25 20:12:11 [model_runner.py:1570] Graph capturing finished in 38 secs, took 0.04 GiB
INFO 03-25 20:12:11 [llm_engine.py:447] init engine (profile, create kv cache, warmup model) took 39.83 seconds
INFO 03-25 20:12:11 [config.py:585] This model supports multiple tasks: {'generate', 'embed', 'score', 'reward', 'classify'}. Defaulting to 'generate'.
INFO 03-25 20:12:11 [llm_engine.py:241] Initializing a V0 LLM engine (v0.8.2) with config: model='/kaggle/input/trained-dpo-alpha-kl/kaggle/working/trained_dpo_alpha_kl', speculative_config=None, tokenizer='/kaggle/input/trained-dpo-alpha-kl/kaggle/working/trained_dpo_alpha_kl', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, 




INFO 03-25 20:12:12 [model_runner.py:1110] Starting to load model /kaggle/input/trained-dpo-alpha-kl/kaggle/working/trained_dpo_alpha_kl...


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


INFO 03-25 20:12:15 [loader.py:447] Loading weights took 2.79 seconds
INFO 03-25 20:12:15 [model_runner.py:1146] Model loading took 0.2536 GB and 2.847477 seconds
INFO 03-25 20:12:17 [worker.py:267] Memory profiling takes 0.71 seconds
INFO 03-25 20:12:17 [worker.py:267] the current vLLM instance can use total_gpu_memory (14.74GiB) x gpu_memory_utilization (0.40) = 5.90GiB
INFO 03-25 20:12:17 [worker.py:267] model weights take 0.25GiB; non_torch_memory takes 0.00GiB; PyTorch activation peak memory takes 0.45GiB; the rest of the memory reserved for KV Cache is 5.20GiB.
INFO 03-25 20:12:17 [executor_base.py:111] # cuda blocks: 15132, # CPU blocks: 11650
INFO 03-25 20:12:17 [executor_base.py:116] Maximum concurrency for 2048 tokens per request: 118.22x


OutOfMemoryError: CUDA out of memory. Tried to allocate 178.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 132.12 MiB is free. Process 3363 has 14.58 GiB memory in use. Of the allocated memory 14.02 GiB is allocated by PyTorch, with 48.00 MiB allocated in private pools (e.g., CUDA Graphs), and 89.80 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)