In [None]:
import torch

# Clear the CUDA cache: when a CUDA device is available, ask PyTorch to
# release the cached GPU memory it is not currently using.
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
import json
import torch
import pandas as pd
from datasets import Dataset
from unsloth import FastLanguageModel
from torch.utils.data import DataLoader, WeightedRandomSampler
import numpy as np
import re
import math

In [None]:
import copy
# Load the raw preference data from JSON.
# The file is opened explicitly as UTF-8 so that parsing does not depend on
# the locale's default text encoding (which `open()` uses when none is given).
with open(
    "/root/autodl-tmp/dataset/new_data/train/dpo_data_with_cot.json",
    "r",
    encoding="utf-8",
) as f:
    raw_data = json.load(f)

# Reshape every raw record into the prompt / chosen / rejected columns that
# are handed to the DPO trainer later in the notebook.
#   - "prompt" is taken from the record's "input" field (the concatenated input).
#   - "chosen" / "rejected" are coerced to strings via f-string formatting.
dpo_samples = [
    {
        "prompt": item["input"],
        "chosen": f'{item["chosen"]}',
        "rejected": f'{item["rejected"]}',
    }
    for item in raw_data
]

# Build a Hugging Face Dataset from the records and shuffle it with a fixed
# seed so the example order is reproducible across runs.
dataset = Dataset.from_pandas(pd.DataFrame(dpo_samples))
dataset = dataset.shuffle(seed=42)

In [None]:
# Sanity check: print how many preference records were parsed from the JSON file.
print(len(dpo_samples))
# Optional spot check of the first (shuffled) example, left disabled:
# print(dataset[0]["chosen"])
# print(dataset[0]["rejected"])

In [None]:
# Loader settings for the SFT checkpoint that DPO starts from.
load_in_4bit = True    # 4-bit quantization to reduce memory usage; may be set to False
dtype = None           # None = auto detection (float16 for Tesla T4 / V100, bfloat16 for Ampere+)
max_seq_length = 5120  # free to choose; the loader handles RoPE scaling internally

# Collect the keyword arguments first, then load the model and its tokenizer
# in a single call.
# A `token = "hf_..."` entry can be added here for gated models such as
# meta-llama/Llama-2-7b-hf.
load_kwargs = dict(
    model_name = "/root/autodl-tmp/dataset/new_model/qwen-sft",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

In [None]:
# Names of the projection modules that receive LoRA adapters.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters. `model` is rebound to the wrapped
# model, so this cell is meant to run once per freshly loaded model.
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,               # LoRA rank; any number > 0 (suggested: 8, 16, 32, 64, 128)
    lora_alpha = 64,
    target_modules = lora_target_modules,
    lora_dropout = 0,     # template note: only dropout = 0 is currently supported
    bias = "none",        # template note: only bias = "none" is currently supported
    # "unsloth" checkpointing is advertised as using 30% less VRAM and fitting
    # 2x larger batch sizes; True or "unsloth" is suggested for very long context.
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,   # rank-stabilized LoRA is available but disabled here
    loftq_config = None,  # LoftQ is available but not used here
)

In [None]:
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported
from trl import DPOTrainer
        
# Sequence-length budget used when the trainer tokenizes each preference pair.
# The total budget reuses the length the model was loaded with, so the two
# settings cannot drift apart. The prompt budget is deliberately smaller than
# the total: with both limits equal (previously 5120 / 5120) an over-long
# example could spend the whole budget on the prompt and leave no tokens for
# the chosen / rejected completions after truncation.
# NOTE(review): half of the budget is a conservative default — tune it to the
# actual prompt / completion length distribution of the dataset.
dpo_max_length = max_seq_length
dpo_max_prompt_length = dpo_max_length // 2

dpo_trainer = DPOTrainer(
    model = model,
    # No separate reference model is supplied.
    # NOTE(review): with a PEFT-wrapped policy TRL is expected to use the base
    # weights (adapters disabled) as the reference — confirm for the installed version.
    ref_model = None,
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2,  # 2 x 2 = 4 examples per optimizer step per device
        warmup_ratio = 0.1,
        num_train_epochs = 3,
        learning_rate = 1e-5,
        # Exactly one mixed-precision mode is enabled: bf16 when the hardware
        # supports it, fp16 otherwise.
        fp16 = not is_bfloat16_supported(),
        bf16 = is_bfloat16_supported(),
        logging_steps = 5,
        lr_scheduler_type = "cosine",
        optim = "adamw_8bit",
        seed = 42,
        output_dir = "/root/dpo-outputs",
    ),
    beta = 0.2,
    train_dataset = dataset,
    # eval_dataset = YOUR_DATASET_HERE,
    tokenizer = tokenizer,
    max_length = dpo_max_length,
    max_prompt_length = dpo_max_prompt_length,
)


In [None]:
# Run DPO training with the trainer built in the previous cell. As the last
# expression in the cell, the value returned by train() is shown as the cell output.
dpo_trainer.train()

In [None]:
# Directory that receives the trained model files and the tokenizer files.
new_model_local = "/root/qwen-dpo"

# Save the model first, then the tokenizer, into the same directory.
# NOTE(review): `model` is the PEFT-wrapped model, so this presumably writes
# the LoRA adapter weights rather than a full checkpoint — confirm.
for artifact in (model, tokenizer):
    artifact.save_pretrained(new_model_local)

# Additionally export a merged 16-bit copy together with the tokenizer.
# NOTE(review): this path is relative, so it resolves against the kernel's
# current working directory, unlike the absolute path above — confirm intended.
model.save_pretrained_merged(
    "new_model/qwen-dpo1",
    tokenizer,
    save_method="merged_16bit",
)

In [None]:
# Inspection only: show the trainer object's repr as the cell output.
dpo_trainer