In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
import torch
import pandas as pd
from datasets import Dataset
from unsloth import FastLanguageModel, PatchDPOTrainer
from trl import DPOConfig, DPOTrainer
import ast

# 1. Configuration & Model Loading
# Token budget per sequence; the same value is reused as the DPO `max_length`.
max_seq_length = 1024
model_name = "unsloth/gpt-oss-20b"

# Load the base checkpoint and its tokenizer through Unsloth.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    load_in_4bit=True,        # request 4-bit weight loading
    offload_embedding=True,   # keep the embedding matrix in system RAM, not VRAM
    max_seq_length=max_seq_length,
)

# 2. Add LoRA Adapters (The "Unsloth" way)
# Projection layers that receive adapters, grouped by the sub-layer they belong to.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Wrap the loaded model with rank-16 LoRA adapters (alpha = 32).
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=attention_projections + mlp_projections,
    r=16,
    lora_alpha=32,
    random_state=3407,                     # fixed seed for adapter initialisation
    use_gradient_checkpointing="unsloth",  # Unsloth's gradient-checkpointing mode
)

# 3. Handle Special Tokens
# Register the custom marker token with the tokenizer.
tokenizer.add_special_tokens({'additional_special_tokens': ['<CUSTOM>']})

# Resize only when the tokenizer has outgrown the embedding matrix. Checkpoints
# may ship an embedding matrix with more rows than the tokenizer has entries;
# an unconditional resize to len(tokenizer) would then shrink the matrix and
# rebuild the embedding layers for no benefit.
embedding_rows = model.get_input_embeddings().weight.shape[0]
if len(tokenizer) > embedding_rows:
    model.resize_token_embeddings(len(tokenizer))
# NOTE(review): the embedding layers are not among the LoRA target modules, so
# the row for '<CUSTOM>' presumably stays at its initial value during training
# - confirm this is acceptable for how the token is used in the dataset.

# 4. Data Preparation
# Location of the generated preference dataset (CSV export).
csv_path = "/home/nam/projects/sid/RLHF-Experiments/datasets/custom_genz_dataset_in_hf_format.csv"

# Read the whole file into a DataFrame; each row is reformatted further down.
df = pd.read_csv(csv_path)

def _parse_messages(value, column):
    """Return the message list stored in *value*, parsing it when it is a string."""
    # A CSV round-trip stores each conversation as the repr of a Python list;
    # literal_eval parses that without executing arbitrary code.
    messages = ast.literal_eval(value) if isinstance(value, str) else value
    if not isinstance(messages, (list, tuple)) or len(messages) < 2:
        raise ValueError(
            f"Column {column!r} must hold a conversation with at least two "
            f"messages, got: {value!r}"
        )
    return messages


def format_dpo_dataset(row):
    """Flatten one preference row into plain prompt/chosen/rejected strings.

    Args:
        row: Mapping with 'chosen' and 'rejected' entries. Each entry is a
            list of message dicts carrying a 'content' key, or the string
            repr of such a list. The first message is taken as the prompt
            and the second as the response.

    Returns:
        A dict with the keys 'prompt', 'chosen' and 'rejected'. The prompt is
        read from the first message of the 'chosen' conversation.

    Raises:
        ValueError: If an entry does not contain at least two messages.
        SyntaxError: If a string entry is not a valid Python literal.
    """
    c_list = _parse_messages(row['chosen'], 'chosen')
    r_list = _parse_messages(row['rejected'], 'rejected')

    return {
        "prompt"  : c_list[0]['content'],
        "chosen"  : c_list[1]['content'],
        "rejected": r_list[1]['content'],
    }

# Convert to HF Dataset and reformat
# Each row is passed through format_dpo_dataset, which supplies the
# prompt / chosen / rejected values for that row.
dataset = Dataset.from_pandas(df).map(format_dpo_dataset)

# 5. Training Arguments
training_args = DPOConfig(
    # Output and logging
    output_dir="outputs",
    logging_steps=1,
    report_to="none",
    # Batching: 1 sample per device, accumulated over 4 steps per optimizer update
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    # Optimisation
    optim="adamw_8bit",
    learning_rate=5e-5,
    lr_scheduler_type="linear",
    bf16=True,
    # DPO-specific settings
    beta=0.1,
    max_length=max_seq_length,
    max_prompt_length=512,
)

# 6. Initialize Trainer
# Apply Unsloth's DPO patch before the trainer is constructed.
PatchDPOTrainer()

# No separate reference model is supplied (ref_model=None).
# NOTE(review): presumably the reference log-probabilities are then derived
# from the same weights with the adapters disabled - confirm against TRL docs.
trainer = DPOTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    tokenizer=tokenizer,
    ref_model=None,
)

# 7. Train
# Left as a bare expression so the notebook displays the returned value.
trainer.train()

  from .autonotebook import tqdm as notebook_tqdm


🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2026.2.1: Fast Gpt_Oss patching. Transformers: 4.57.6.
   \\   /|    NVIDIA GeForce RTX 3090. Num GPUs = 1. Max memory: 23.568 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.10.0+cu128. CUDA: 8.6. CUDA Toolkit: 12.8. Triton: 3.6.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.35. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████| 4/4 [00:04<00:00,  1.23s/it]


Unsloth: Offloading embeddings to RAM to save 1.08 GB.
Unsloth: Detected MoE model with num_experts = 32 and target_modules = ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj']. Enabling LoRA on MoE parameters: ['mlp.experts.gate_up_proj', 'mlp.experts.down_proj']
Unsloth: PEFT set target_parameters but found no matching parameters.
This is expected for MoE models - Unsloth handles MoE expert LoRA targeting separately.
Unsloth: Making `model.base_model.model.model` require gradients


Map: 100%|██████████| 33/33 [00:00<00:00, 4300.91 examples/s]
Extracting prompt in train dataset (num_proc=28): 100%|██████████| 33/33 [00:01<00:00, 23.79 examples/s]
Applying chat template to train dataset (num_proc=28): 100%|██████████| 33/33 [00:10<00:00,  3.22 examples/s]
Tokenizing train dataset (num_proc=28): 100%|██████████| 33/33 [00:10<00:00,  3.21 examples/s]
/home/nam/miniconda3/envs/diff/compiler_compat/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/home/nam/miniconda3/envs/diff/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `dlvsym'
/home/nam/miniconda3/envs/diff/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `dlopen'
/home/nam/miniconda3/envs/diff/compiler_compat/ld: /usr/local/cuda-12.2/lib64/libcufile.so: undefined reference to `dlclose'
/home/nam/miniconda3/envs/diff/compi

Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,rewards / chosen,rewards / rejected,rewards / accuracies,rewards / margins,logps / chosen,logps / rejected,logits / chosen,logits / rejected,eval_logits / chosen,eval_logits / rejected,nll_loss
1,0.6921,-0.002112,-0.00415,0.5,0.002039,-169.516846,-146.300171,-3.654345,-3.650956,0,0,0
2,0.6801,0.046743,0.0204,0.75,0.026343,-154.655579,-135.907684,-3.49157,-3.497302,No Log,No Log,No Log
3,0.6683,0.150496,0.099512,1.0,0.050984,-201.934021,-185.959717,-3.552143,-3.537108,No Log,No Log,No Log
4,0.6583,-0.007164,-0.079444,0.75,0.072279,-106.189804,-101.56398,-3.381341,-3.388906,No Log,No Log,No Log
5,0.6253,-0.089056,-0.235957,0.75,0.146901,-188.257584,-172.525269,-3.564541,-3.541771,No Log,No Log,No Log
6,0.5778,0.117275,-0.133143,1.0,0.250419,-73.133133,-64.255936,-2.99672,-2.992593,No Log,No Log,No Log
7,0.4682,0.005726,-0.560424,1.0,0.56615,-175.582794,-163.114334,-3.511209,-3.490071,No Log,No Log,No Log
8,0.4163,-0.062543,-0.736217,1.0,0.673673,-255.276215,-248.773071,-3.52792,-3.524178,No Log,No Log,No Log
9,0.531,-0.159795,-0.515517,1.0,0.355723,-94.171394,-86.939911,-3.266786,-3.25534,No Log,No Log,No Log
10,0.4511,-0.414763,-1.019576,1.0,0.604813,-126.772316,-118.482544,-3.242893,-3.22073,No Log,No Log,No Log




TrainOutput(global_step=27, training_loss=0.37476068031456733, metrics={'train_runtime': 412.5234, 'train_samples_per_second': 0.24, 'train_steps_per_second': 0.065, 'total_flos': 0.0, 'train_loss': 0.37476068031456733, 'epoch': 3.0})