In [None]:
!pip install trl accelerate transformers datasets peft

In [None]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import DPOTrainer, DPOConfig
import torch

# Fetch every split of the PKU-SafeRLHF preference dataset
dataset = load_dataset(path="PKU-Alignment/PKU-SafeRLHF")

# Work on the leading rows only (5000 train / 500 test) to stay within
# Kaggle memory constraints
train_subset = dataset["train"].select(indices=range(5000))
test_subset = dataset["test"].select(indices=range(500))

# Convert to DPO format
def convert_to_dpo(example):
    """Turn one pairwise-comparison row into a DPO preference triple.

    Args:
        example: mapping with the keys "prompt", "response_0", "response_1"
            and "better_response_id".

    Returns:
        dict with "prompt", "chosen" and "rejected". "response_0" is the
        chosen answer when "better_response_id" equals 0; for any other
        value "response_1" is chosen.
    """
    first_is_better = example["better_response_id"] == 0
    chosen_key, rejected_key = (
        ("response_0", "response_1") if first_is_better else ("response_1", "response_0")
    )
    chosen = example[chosen_key]
    rejected = example[rejected_key]
    return {"prompt": example["prompt"], "chosen": chosen, "rejected": rejected}

# Apply convert_to_dpo to every row of the train subset, then the test subset
dpo_train_dataset, dpo_test_dataset = (
    subset.map(convert_to_dpo) for subset in (train_subset, test_subset)
)

# Spot-check one converted row
print("Train sample:", dpo_train_dataset[1])

In [None]:
# Checkpoint that DPO starts from (an SFT model, going by the directory name)
model_name_or_path = "/kaggle/input/Model/SFT_best_model"

# Tokenizer: reuse the EOS token as the padding token
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
tokenizer.pad_token = tokenizer.eos_token

# Policy model: float16 when a CUDA device is present, float32 otherwise
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float32 if not torch.cuda.is_available() else torch.float16,
)

In [None]:
# Single source of truth for the DPO run. DPOConfig extends
# transformers.TrainingArguments, so it is passed to the trainer directly.
# Previously a second TrainingArguments object was built from it with
# conflicting settings (step-based eval/save with eval_steps=None, bf16 instead
# of fp16) and without `beta`, and the trainer was given a `dpo_config=`
# keyword that DPOTrainer does not accept.
dpo_config = DPOConfig(
    beta=0.30,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    gradient_accumulation_steps=1,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    output_dir="/kaggle/working/SFT_dpo_model",  # Save model here
    logging_dir="/kaggle/working/SFT_dpo_model/logs",
    report_to="none",  # prevent wandb init in Kaggle
    remove_unused_columns=False,
    # fp16 and bf16 are mutually exclusive in TrainingArguments; only fp16 is
    # requested because bf16 is not supported by every CUDA device.
    # NOTE(review): the model cell loads the weights in float16 on GPU; fp16
    # mixed precision on float16 weights can fail with "Attempting to unscale
    # FP16 gradients" - confirm, and load the model in float32 if it does.
    fp16=torch.cuda.is_available(),
)

# Alias kept so the `training_args` name this cell used to define still exists.
training_args = dpo_config

# Initialize trainer. With ref_model=None the trainer derives the reference
# policy from `model` itself (see the TRL DPOTrainer documentation).
trainer = DPOTrainer(
    model=model,
    ref_model=None,
    args=dpo_config,
    train_dataset=dpo_train_dataset,
    eval_dataset=dpo_test_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

In [None]:
# Write the trained model and then the tokenizer into the run's output directory
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(dpo_config.output_dir)

print("DPO fine-tuning complete. Model saved to:", dpo_config.output_dir)

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
from tqdm import tqdm
import numpy as np

# Load the DPO fine-tuned model and tokenizer from the directory the training
# cell saved them to. The path previously pointed at the SFT input checkpoint,
# so the reported "DPO Evaluation Accuracy" described the model before DPO.
model_path = "/kaggle/working/SFT_dpo_model"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.pad_token = tokenizer.eos_token

# Inference only: switch to eval mode and move to the GPU when one is present
model.eval()
if torch.cuda.is_available():
    model.cuda()


# First 500 rows of the test split (same slice the training cells evaluate on)
dataset = load_dataset("PKU-Alignment/PKU-SafeRLHF")["test"].select(range(500))
def convert_to_dpo(example):
    """Map a comparison row to the prompt / chosen / rejected layout.

    Args:
        example: mapping with the keys "prompt", "response_0", "response_1"
            and "better_response_id".

    Returns:
        dict with "prompt", "chosen" and "rejected"; "response_0" is chosen
        only when "better_response_id" equals 0, otherwise "response_1" is.
    """
    preferred = 0 if example["better_response_id"] == 0 else 1
    chosen = example[f"response_{preferred}"]
    rejected = example[f"response_{1 - preferred}"]
    return {"prompt": example["prompt"], "chosen": chosen, "rejected": rejected}

# Run convert_to_dpo over every row of the evaluation slice
eval_dataset = dataset.map(function=convert_to_dpo)


def compute_log_prob(prompt, response, max_length=512):
    """Sum of the log-probabilities the model assigns to the response tokens.

    The prompt and response are concatenated, tokenized (truncated to
    `max_length` tokens) and scored in one forward pass. Only positions after
    the prompt contribute, and tokens equal to the pad token id are skipped.
    The result is an unnormalized sum, so it depends on response length.

    Uses the module-level `tokenizer` and `model`.

    Args:
        prompt: prompt text.
        response: candidate continuation to score.
        max_length: token budget for prompt + response (default 512).

    Returns:
        float: total log-probability of the scored response tokens; 0.0 when
        truncation leaves no response tokens.
    """
    encoded = tokenizer(
        prompt + response, return_tensors="pt", truncation=True, max_length=max_length
    )
    # Number of leading positions occupied by the prompt, measured by
    # tokenizing the prompt on its own.
    response_start = len(tokenizer(prompt)["input_ids"])

    with torch.no_grad():
        input_ids = encoded["input_ids"].to(model.device)
        attention_mask = encoded["attention_mask"].to(model.device)
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

        # Position t predicts token t + 1. Upcast before the softmax: the
        # model may run in float16, which is too coarse for summing many
        # per-token log-probabilities and comparing the totals.
        shifted_logits = logits[:, :-1, :].float()
        labels = input_ids[:, 1:]
        token_log_probs = (
            torch.nn.functional.log_softmax(shifted_logits, dim=-1)
            .gather(2, labels.unsqueeze(-1))
            .squeeze(-1)
        )

        # Keep response tokens only: drop pad tokens and everything in the prompt.
        keep = input_ids[0] != tokenizer.pad_token_id
        keep[:response_start] = False

        # keep[1:] lines the per-token mask up with the shifted labels.
        return token_log_probs[0][keep[1:]].sum().item()

# Preference accuracy: share of pairs where the chosen response gets a
# strictly higher summed log-probability than the rejected one
total = len(eval_dataset)
correct = 0

for example in tqdm(eval_dataset, desc="Evaluating"):
    chosen_score = compute_log_prob(example["prompt"], example["chosen"])
    rejected_score = compute_log_prob(example["prompt"], example["rejected"])
    # Ties count as misses
    correct += int(chosen_score > rejected_score)

accuracy = correct / total
print(f"\nDPO Evaluation Accuracy: {accuracy * 100:.2f}% ({correct}/{total})")

### Trial 1
- **LoRA Rank:** 8  
- **DPO Beta:** 0.10  
- **Learning Rate:** 5e-6  
- **Batch Size:** 2  
- **Epochs:** 1  

trial_args = DPOConfig(
    beta=0.10,
    learning_rate=5e-6,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=1,
    gradient_accumulation_steps=4,  # 2 samples x 4 accumulation steps per optimizer update, per device
    evaluation_strategy="epoch",
    save_strategy="no",  # no checkpoints are written for this trial
    logging_steps=10,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)
trainer = DPOTrainer(  # NOTE(review): the trial notes list a LoRA rank, but no peft_config is passed - confirm
    model=model,  # NOTE(review): uses whatever `model` is currently bound, so trials are not independent unless it is reloaded - confirm
    ref_model=ref_model,  # NOTE(review): `ref_model` is not assigned anywhere in the visible notebook - confirm its origin
    args=trial_args,
    tokenizer=tokenizer,
    train_dataset=dpo_train_dataset,
    eval_dataset=dpo_test_dataset,
)
trainer.train()


### Trial 2
- **LoRA Rank:** 16  
- **DPO Beta:** 0.20  
- **Learning Rate:** 1e-5  
- **Batch Size:** 4  
- **Epochs:** 2  

trial_args = DPOConfig(
    beta=0.20,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=2,
    gradient_accumulation_steps=2,  # 4 samples x 2 accumulation steps per optimizer update, per device
    evaluation_strategy="epoch",
    save_strategy="no",  # no checkpoints are written for this trial
    logging_steps=10,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)
trainer = DPOTrainer(  # NOTE(review): the trial notes list a LoRA rank, but no peft_config is passed - confirm
    model=model,  # NOTE(review): uses whatever `model` is currently bound, so trials are not independent unless it is reloaded - confirm
    ref_model=ref_model,  # NOTE(review): `ref_model` is not assigned anywhere in the visible notebook - confirm its origin
    args=trial_args,
    tokenizer=tokenizer,
    train_dataset=dpo_train_dataset,
    eval_dataset=dpo_test_dataset,
)
trainer.train()

### Trial 3
- **LoRA Rank:** 8  
- **DPO Beta:** 0.05  
- **Learning Rate:** 2e-6  
- **Batch Size:** 1  
- **Epochs:** 1  

trial_args = DPOConfig(
    beta=0.05,
    learning_rate=2e-6,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=1,
    gradient_accumulation_steps=8,  # 1 sample x 8 accumulation steps per optimizer update, per device
    evaluation_strategy="epoch",
    save_strategy="no",  # no checkpoints are written for this trial
    logging_steps=10,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)
trainer = DPOTrainer(  # NOTE(review): the trial notes list a LoRA rank, but no peft_config is passed - confirm
    model=model,  # NOTE(review): uses whatever `model` is currently bound, so trials are not independent unless it is reloaded - confirm
    ref_model=ref_model,  # NOTE(review): `ref_model` is not assigned anywhere in the visible notebook - confirm its origin
    args=trial_args,
    tokenizer=tokenizer,
    train_dataset=dpo_train_dataset,
    eval_dataset=dpo_test_dataset,
)
trainer.train()

### Trial 4 - Best Accuracy
- **LoRA Rank:** 16  
- **DPO Beta:** 0.30  
- **Learning Rate:** 3e-5  
- **Batch Size:** 4  
- **Epochs:** 3  

trial_args = DPOConfig(
    beta=0.30,
    learning_rate=3e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    gradient_accumulation_steps=1,  # every batch of 4 triggers an optimizer update
    evaluation_strategy="epoch",
    save_strategy="no",  # no checkpoints are written for this trial
    logging_steps=10,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)
trainer = DPOTrainer(  # NOTE(review): the trial notes list a LoRA rank, but no peft_config is passed - confirm
    model=model,  # NOTE(review): uses whatever `model` is currently bound, so trials are not independent unless it is reloaded - confirm
    ref_model=ref_model,  # NOTE(review): `ref_model` is not assigned anywhere in the visible notebook - confirm its origin
    args=trial_args,
    tokenizer=tokenizer,
    train_dataset=dpo_train_dataset,
    eval_dataset=dpo_test_dataset,
)
trainer.train()

### Trial 5
- **LoRA Rank:** 8  
- **DPO Beta:** 0.15  
- **Learning Rate:** 7e-6  
- **Batch Size:** 2  
- **Epochs:** 2  

trial_args = DPOConfig(
    beta=0.15,
    learning_rate=7e-6,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=2,
    gradient_accumulation_steps=4,  # 2 samples x 4 accumulation steps per optimizer update, per device
    evaluation_strategy="epoch",
    save_strategy="no",  # no checkpoints are written for this trial
    logging_steps=10,
    remove_unused_columns=False,
    fp16=torch.cuda.is_available(),
)
trainer = DPOTrainer(  # NOTE(review): the trial notes list a LoRA rank, but no peft_config is passed - confirm
    model=model,  # NOTE(review): uses whatever `model` is currently bound, so trials are not independent unless it is reloaded - confirm
    ref_model=ref_model,  # NOTE(review): `ref_model` is not assigned anywhere in the visible notebook - confirm its origin
    args=trial_args,
    tokenizer=tokenizer,
    train_dataset=dpo_train_dataset,
    eval_dataset=dpo_test_dataset,
)
trainer.train()
