In [1]:
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
import json
import torch

  from .autonotebook import tqdm as notebook_tqdm


#### 加载本地模型和分词器（Tokenizer）

In [2]:
model_path = "/data/hf-models/llama-3-8b-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="cuda:3",  # 或 "cuda:0" 取决于你的设备
    torch_dtype=torch.float16,  # 建议使用 float16 加速
    trust_remote_code=True,
    low_cpu_mem_usage=True
)


Loading checkpoint shards: 100%|██████████| 4/4 [00:05<00:00,  1.27s/it]


#### 准备LoRA配置并应用

In [None]:
from peft import LoraConfig, get_peft_model

lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],  # 根据llama结构调整
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM
)

model = prepare_model_for_kbit_training(model)  # 如果你使用量化
model = get_peft_model(model, lora_config)


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 3 has a total capacity of 31.61 GiB of which 186.06 MiB is free. Including non-PyTorch memory, this process has 31.41 GiB memory in use. Of the allocated memory 28.01 GiB is allocated by PyTorch, and 3.05 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

#### 加载并格式化你的 JSON 数据

In [4]:
def load_and_flatten_json(json_path):
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return [{"input": v["rewritten"], "output": v["original"]} for v in data.values()]

data_list = load_and_flatten_json("/home/zhenghaoran/RAG_toxic/Adversarial_RL/result/results_dis_seed2.json")
dataset = Dataset.from_list(data_list)


#### 构造训练样本格式

In [6]:
def format_example(example):
    # 添加明确的结束标记，帮助模型理解输出结束位置
    return {
        "text": f"### Instruction:\nRewrite to original style:\n{example['input']}\n\n### Response:\n{example['output']}\n"
    }

dataset = dataset.map(format_example)


Map: 100%|██████████| 5000/5000 [00:00<00:00, 36987.20 examples/s]


#### Tokenization 和 DataCollator

In [7]:
tokenizer.pad_token = tokenizer.eos_token

def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=1024
    )

tokenized_dataset = dataset.map(tokenize)

# 对于因果语言模型，使用专门的数据整理器
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


Map: 100%|██████████| 5000/5000 [00:02<00:00, 1975.39 examples/s]


#### 设置训练参数并训练

In [8]:
training_args = TrainingArguments(
    output_dir="./lora-llama3-style-transfer",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    logging_steps=10,
    save_strategy="epoch",
    learning_rate=2e-4,
    bf16=True,  # 如果你的GPU支持
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator
)

trainer.train()


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
  trainer = Trainer(


ValueError: You can't train a model that has been loaded in 8-bit or 4-bit precision on a different device than the one you're training on. Make sure you loaded the model on the correct device using for example `device_map={'':torch.cuda.current_device()}` or `device_map={'':torch.xpu.current_device()}`