In [None]:
%%capture
!pip install unsloth
!pip install --force-reinstall --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git

In [3]:
import torch

In [4]:
from huggingface_hub import login
from kaggle_secrets import UserSecretsClient

# Kaggle secrets client; later cells reuse it to read other stored secrets.
user_secrets = UserSecretsClient()

# Fetch the Hugging Face token stored under "HUGGINGFACE_TOKEN" and log in with it.
hf_token = user_secrets.get_secret("HUGGINGFACE_TOKEN")
login(token=hf_token)

In [5]:
import wandb

# Authenticate with Weights & Biases using the key stored under the "wandb" secret.
wb_token = user_secrets.get_secret("wandb")
wandb.login(key=wb_token)

# Start a W&B run for this notebook; the handle is kept in `run`.
run = wandb.init(
    project="Fine-tune-gemma-2b-it on ultrafeedback_binarized Dataset",
    job_type="training",
    anonymous="allow",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mjanvi24[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [8]:
from unsloth import FastLanguageModel

# Loading options, kept as module-level names and passed straight to from_pretrained.
max_seq_length = 2048
dtype = None
load_in_4bit = True

# Load the model and tokenizer from the checkpoint directory attached as a
# Kaggle input (path suggests Gemma 2b-it, version 3).
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="/kaggle/input/gemma/transformers/2b-it/3",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hf_token,
)

# Attach LoRA adapters (rank 8, alpha 16) to the attention and MLP projection
# modules listed below; `model` is rebound to the PEFT-wrapped model.
model = FastLanguageModel.get_peft_model(
    model,
    r=8,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,
    use_rslora=False,
    loftq_config=None,
)

Unsloth 2025.2.12 patched 18 layers with 18 QKV layers, 18 O layers and 18 MLP layers.


In [9]:
from datasets import load_dataset

# Load only the first 500 examples of the train split.
train_dataset = load_dataset(
    path="trl-lib/ultrafeedback_binarized",
    split="train[:500]",
)

# Display the first record to inspect its structure.
train_dataset[0]

README.md:   0%|          | 0.00/643 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/131M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/2.14M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/62135 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1000 [00:00<?, ? examples/s]

{'chosen': [{'content': 'Use the pygame library to write a version of the classic game Snake, with a unique twist',
   'role': 'user'},
  {'content': "Sure, I'd be happy to help you write a version of the classic game Snake using the pygame library! Here's a basic outline of how we can approach this:\n\n1. First, we'll need to set up the game display and create a game object that we can use to handle the game's state.\n2. Next, we'll create the game's grid, which will be used to represent the game board. We'll need to define the size of the grid and the spaces within it.\n3. After that, we'll create the snake object, which will be used to represent the player's movement. We'll need to define the size of the snake and the speed at which it moves.\n4. We'll also need to create a food object, which will be used to represent the food that the player must collect to score points. We'll need to define the location of the food and the speed at which it moves.\n5. Once we have these objects se

In [19]:
from trl import DPOConfig, DPOTrainer
from unsloth import is_bfloat16_supported

# DPO training configuration for a short demonstration run.
training_args = DPOConfig(
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,  # 2 x 4 = 8 examples per optimizer step
    # Use num_train_epochs = 1, warmup_ratio for full training runs!
    warmup_steps=5,
    # Must exceed warmup_steps, otherwise every optimizer step falls inside the
    # learning-rate warmup. 10 also matches the recorded training log
    # ("Total steps = 10", global_step=10).
    max_steps=10,
    learning_rate=2e-4,
    # Exactly one mixed-precision mode is enabled: bf16 when supported, else fp16.
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
)

# Build the DPO trainer over the LoRA-wrapped model and the 500-example subset.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
    train_dataset=train_dataset,
)

In [20]:
# Run training on `trainer`; the return value is displayed as the cell output
# (a TrainOutput with global_step, training_loss and metrics in the recorded run).
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 500 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 10
 "-____-"     Number of trainable parameters = 9,805,824


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss,aux_loss
1,0.6931,0.0,0.0,0.0,0.0,-489.975464,-529.761597,-22.995388,-23.634632,0,0,0,0
2,0.6931,0.0,0.0,0.0,0.0,-643.203064,-503.523987,-22.909473,-22.580498,No Log,No Log,No Log,No Log
3,0.6901,0.016118,0.009942,0.625,0.006176,-649.033081,-461.669556,-21.761307,-22.803642,No Log,No Log,No Log,No Log
4,0.6322,0.250476,0.113498,0.625,0.136978,-698.312866,-520.073242,-23.086992,-22.621698,No Log,No Log,No Log,No Log
5,0.6986,0.276359,0.280949,0.375,-0.00459,-457.854492,-521.314819,-25.344511,-23.820574,No Log,No Log,No Log,No Log
6,0.667,0.690035,0.523746,0.5,0.166289,-594.36499,-597.737183,-22.775448,-23.079523,No Log,No Log,No Log,No Log
7,0.7455,0.475509,0.499598,0.375,-0.024089,-552.280579,-603.661865,-21.269972,-21.110535,No Log,No Log,No Log,No Log
8,0.695,0.389698,0.255133,0.625,0.134565,-603.745972,-509.499451,-22.787825,-24.457851,No Log,No Log,No Log,No Log
9,0.6133,0.118943,-0.072188,0.75,0.191131,-308.556213,-394.061584,-20.878235,-21.048401,No Log,No Log,No Log,No Log
10,0.8379,0.293554,0.285103,0.375,0.008451,-584.921875,-457.522583,-22.806589,-23.305445,No Log,No Log,No Log,No Log


TrainOutput(global_step=10, training_loss=0.6966004014015198, metrics={'train_runtime': 143.2304, 'train_samples_per_second': 0.559, 'train_steps_per_second': 0.07, 'total_flos': 0.0, 'train_loss': 0.6966004014015198, 'epoch': 0.16})

In [24]:
# Upload is disabled by default; flip this flag to True to push the merged model.
# NOTE(review): the output recorded below this cell does not come from this code
# as written (the guard is off). It appears to come from an earlier GGUF export
# attempt, which ended with "OSError: [Errno 30] Read-only file system" on the
# /kaggle/input checkpoint path - confirm before re-enabling the GGUF line.
PUSH_TO_HUB = False

if PUSH_TO_HUB:
    model.push_to_hub_merged(
        "rlhf-dpo-gemma-2b-it-model",
        tokenizer,
        save_method="merged_16bit",
    )
    # model.push_to_hub_gguf("rlhf-dpo-gemma-2b-it-model", tokenizer, quantization_method = "q8_0")

Unsloth: ##### The current model auto adds a BOS token.
Unsloth: ##### Your chat template has a BOS token. We shall remove it temporarily.


Cloning into 'llama.cpp'...
Submodule 'kompute' (https://github.com/nomic-ai/kompute.git) registered for path 'ggml/src/ggml-kompute/kompute'
Cloning into '/kaggle/working/llama.cpp/ggml/src/ggml-kompute/kompute'...
Submodule path 'ggml/src/ggml-kompute/kompute': checked out '4565194ed7c32d1d2efa32ceab4d3c6cae006306'
make: Entering directory '/kaggle/working/llama.cpp'
-- The C compiler identification is GNU 11.4.0
-- The CXX compiler identification is GNU 11.4.0
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Check for working C compiler: /usr/bin/cc - skipped
-- Detecting C compile features
-- Detecting C compile features - done
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Check for working CXX compiler: /usr/bin/c++ - skipped
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Found Git: /usr/bin/git (found version "2.34.1")
-- Performing Test CMAKE_HAVE_LIBC_PTHREAD
-- Performing Test CMAKE_HAV

100%|██████████| 18/18 [00:00<00:00, 26.20it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving rlhf-dpo-gemma-2b-it-model/pytorch_model-00001-of-00002.bin...
Unsloth: Saving rlhf-dpo-gemma-2b-it-model/pytorch_model-00002-of-00002.bin...
Done.


OSError: [Errno 30] Read-only file system: '/kaggle/input/gemma/transformers/2b-it/3/tokenizer_config.json'