In [10]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from peft import LoraConfig, get_peft_model
from datasets import load_dataset
import torch

## 数据集路径
DATA_PATH = "./data/reformat.jsonl"

## 基础模型路径
BASE_MODEL_NAME = "/mnt/proj/jupyter/qwen3_4b"

## 微调模型输出路径
OUTPUT_MODEL = "/mnt/proj/jupyter/qwen3_4b_law"
OUTPUT_FINAL_MODEL = "/mnt/proj/jupyter/qwen3_4b_merge"

# 1. 加载模型和tokenizer
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm
[4pdvGPU Msg(3056:140607572618432:libvgpu.c:869)]: Initializing.....
[4pdvGPU Warn(3056:140607572618432:hook.c:475)]: remap handles for device 0
[4pdvGPU Warn(3056:140607572618432:hook.c:475)]: remap handles for device 1
[4pdvGPU Warn(3056:140607572618432:utils.c:228)]: get default cuda 2 from (null)
[4pdvGPU Msg(3056:140607572618432:libvgpu.c:902)]: Initialized
[4pdvGPU Msg(3056:140607572618432:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 usage=236978176
[4pdvGPU Msg(3056:140607572618432:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 usage=236978176
Loading checkpoint shards:   0%|                                                     | 0/3 [00:00<?, ?it/s][4pdvGPU Msg(3056:140607572618432:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 usage=236978176
[4pdvGPU Msg(3056:140607572618432:memory.c:566)]: orig free=24971051008 total=25217466368 limit=2576980377

In [12]:

def preprocess_function(examples):
    texts = []
    for instruction, input_text, output in zip(
        examples["instructions"],
        examples["input"],
        examples["output"]
    ):
        if input_text:
            text = f"Instruction: {instruction}\nInput: {input_text}\nResponse: {output}"
        else:
            text = f"Instruction: {instruction}\nResponse: {output}"
        texts.append(text)
    
    # 对完整文本进行tokenize
    tokenized = tokenizer(
        texts,
        truncation=True,
        max_length=256,
        padding="max_length",
        return_tensors="pt"
    )
    
    # 创建labels（与input_ids相同）
    tokenized["labels"] = tokenized["input_ids"].clone()
    return tokenized

# 加载并预处理数据
dataset = load_dataset("json", data_files=DATA_PATH, split="train")
dataset = dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=["instructions", "input", "output"]
)

Map: 100%|██████████████████████████████████████████████████| 10877/10877 [00:09<00:00, 1207.44 examples/s]


In [13]:
# 3. 极简LoRA配置
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj", # Attention 线性层
    "gate_proj", "up_proj", "down_proj"     # MLP 线性层
]

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=lora_target_modules,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    inference_mode=False
)
model = get_peft_model(model, peft_config)

In [16]:
# 4-1 模型训练-参数准备
training_args = TrainingArguments(
    output_dir=OUTPUT_MODEL,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    learning_rate=2e-5,
    # max_steps=200,
    num_train_epochs=1,
    logging_steps=10,
    save_steps=50,
    fp16=True,
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03, 
    report_to="none",
    save_total_limit=3,
    label_names=["labels"],
    remove_unused_columns=True
)

# 4-2 模型训练-数据收集器
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

# 4-3 模型训练-创建Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

[4pdvGPU Warn(4527:140414721386304:hook.c:475)]: remap handles for device 0
[4pdvGPU Warn(4527:140414721386304:hook.c:475)]: remap handles for device 1
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
# 4-4 模型训练-开始训练
print("开始训练...")
trainer.train()

# 5. 保存适配器
model.save_pretrained(f"{OUTPUT_MODEL}/adapter_model")

开始训练...


Step,Training Loss
10,1.664
20,1.6662
30,1.4845
40,1.3845
50,1.2856
60,1.2918
70,1.2241
80,1.2122
90,1.1784
100,1.1955


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer    
from peft import PeftModel    
import torch    

BASE_MODEL_NAME = "/home/mw/input/models4804"    
OUTPUT_ADAPTER_MODEL = "/home/mw/project/output/adapter_model"    
OUTPUT_FINAL_MODEL = "/home/mw/project/output/final_model"    

# 1. 加载基础模型    
base_model = AutoModelForCausalLM.from_pretrained(    
    BASE_MODEL_NAME,    
    torch_dtype=torch.float16,    
    device_map='auto',    
    trust_remote_code=True    
)    

# 2. 加载适配器    
peft_model = PeftModel.from_pretrained(    
    base_model,    
    OUTPUT_ADAPTER_MODEL    
)    

# 3. 合并模型（关键步骤）    
merged_model = peft_model.merge_and_unload()    

# 4. 保存完整模型    
merged_model.save_pretrained(OUTPUT_FINAL_MODEL)    
AutoTokenizer.from_pretrained(BASE_MODEL_NAME).save_pretrained(OUTPUT_FINAL_MODEL)    

print('✅ 模型已合并保存到 final_pirate_model 目录')