In [1]:
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)

# Dataset path
DATA_PATH = "./data/reformat.jsonl"

## Base model path
BASE_MODEL_NAME = "../model/Qwen2.5-3B-Instruct"

## Fine-tuned model output paths
OUTPUT_MODEL = "../model/Qwen2.5_3B_Instruct_law"
OUTPUT_FINAL_MODEL = "../model/Qwen2.5_3B_Instruct_merge"


# 4-bit quantization config.
# NF4 + double quantization is the usual recipe for training adapters on a
# 4-bit base model. The compute dtype is set explicitly to float16 so that the
# de-quantized matmuls match the fp16 mixed precision requested in the
# TrainingArguments of this notebook (otherwise it defaults to float32).
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

# 1. Load the model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL_NAME, trust_remote_code=True)

# Only fall back to EOS when the tokenizer ships without a pad token.
# DataCollatorForLanguageModeling(mlm=False) replaces every label equal to
# pad_token_id with -100, so forcing pad == eos would also mask real EOS
# tokens out of the loss.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

  from .autonotebook import tqdm as notebook_tqdm
[4pdvGPU Msg(13248:140590362519360:libvgpu.c:869)]: Initializing.....
[4pdvGPU Warn(13248:140590362519360:hook.c:475)]: remap handles for device 0
[4pdvGPU Warn(13248:140590362519360:hook.c:475)]: remap handles for device 1
[4pdvGPU Warn(13248:140590362519360:utils.c:228)]: get default cuda 2 from (null)
[4pdvGPU Msg(13248:140590362519360:libvgpu.c:902)]: Initialized
[4pdvGPU Msg(13248:140590362519360:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 usage=236978176
[4pdvGPU Msg(13248:140590362519360:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 usage=236978176
Loading checkpoint shards:   0%|                                        | 0/2 [00:00<?, ?it/s][4pdvGPU Msg(13248:140590362519360:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 usage=236978176
[4pdvGPU Msg(13248:140590362519360:memory.c:566)]: orig free=24971051008 total=25217466368 limit=25769803776 us

In [2]:

def preprocess_function(examples):
    """Build the training prompts for one batch and tokenize them.

    Args:
        examples: a batch from ``datasets.map(batched=True)``; reads the
            ``"instructions"``, ``"input"`` and ``"output"`` columns, which
            are iterated in parallel.

    Returns:
        The tokenizer output for the batch (``input_ids`` / ``attention_mask``
        as variable-length lists), truncated to 4096 tokens.

    Notes:
        * Sequences are NOT padded here. Padding every sample to 4096 tokens
          makes each training batch maximally large regardless of the real
          text length; the ``DataCollatorForLanguageModeling`` used by the
          Trainer in this notebook pads each batch to its longest sample.
        * ``labels`` are not created here: with ``mlm=False`` that collator
          derives them from ``input_ids`` and masks padding with -100, so a
          precomputed copy would be redundant (and unpaddable once the
          sequences have different lengths).
        * ``tokenizer`` is read from the notebook's global scope.
    """
    # Shared instruction prefix; concatenated directly with "Instruction: ...".
    prompt_prefix = "请根据给定问题，按照以下格式生成答案：首先，展示你在思考问题时的过程（用<think>标签包裹），然后给出最终的答案（用<answer>标签包裹）。确保思考过程清晰，逐步推理，并最终给出完整的回答。"

    texts = []
    for instruction, input_text, output in zip(
        examples["instructions"],
        examples["input"],
        examples["output"]
    ):
        if input_text:
            text = f"{prompt_prefix}Instruction: {instruction}\nInput: {input_text}\nResponse: {output}"
        else:
            # Empty/missing input: omit the "Input:" line entirely.
            text = f"{prompt_prefix}Instruction: {instruction}\nResponse: {output}"
        texts.append(text)

    # Tokenize the full prompt+response text; plain Python lists are what
    # datasets.map stores, so no tensor conversion is requested.
    return tokenizer(
        texts,
        truncation=True,
        max_length=4096,
    )

# Load the JSONL training file and run the preprocessing over it in batches.
# The raw text columns are dropped once they have been tokenized.
raw_text_columns = ["instructions", "input", "output"]
dataset = load_dataset("json", data_files=DATA_PATH, split="train")
dataset = dataset.map(preprocess_function, batched=True, remove_columns=raw_text_columns)

In [3]:
# 3. Minimal LoRA configuration
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",  # attention linear layers
    "gate_proj", "up_proj", "down_proj"      # MLP linear layers
]

# The base model was loaded with a 4-bit quantization config, so prepare it
# for k-bit training before attaching adapters. Per the PEFT documentation
# this freezes the base weights, upcasts the remaining half-precision
# parameters for stability, and (by default) turns on gradient checkpointing
# with inputs that require grad, which lowers activation memory.
model = prepare_model_for_kbit_training(model)

# The generation KV cache is not needed for training and is incompatible with
# gradient checkpointing.
model.config.use_cache = False

peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    target_modules=lora_target_modules,
    lora_dropout=0.05,
    task_type="CAUSAL_LM",
    inference_mode=False
)
model = get_peft_model(model, peft_config)

In [4]:
# 4-1 Training: hyperparameters
# Collected in a dict first so the individual knobs are easy to scan/tweak.
training_kwargs = dict(
    output_dir=OUTPUT_MODEL,
    # batch size: 2 per device, accumulated over 8 steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    # optimizer / schedule
    learning_rate=2e-5,
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # duration (max_steps=200 is intentionally left disabled)
    num_train_epochs=1,
    # precision
    fp16=True,
    # logging / checkpointing
    logging_steps=10,
    save_steps=50,
    save_total_limit=3,
    report_to="none",
    # dataset column handling
    label_names=["labels"],
    remove_unused_columns=True,
)
training_args = TrainingArguments(**training_kwargs)

# 4-2 Training: data collator (causal LM, i.e. no masked-language-modeling)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# 4-3 Training: build the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

[4pdvGPU Warn(13401:139880174495552:hook.c:475)]: remap handles for device 0
[4pdvGPU Warn(13401:139880174495552:hook.c:475)]: remap handles for device 1
Detected kernel version 3.10.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [5]:
# 4-4 Training: run the training loop
print("开始训练...")
trainer.train()

# 5. Save the adapter under the training output directory
adapter_dir = f"{OUTPUT_MODEL}/adapter_model"
model.save_pretrained(adapter_dir)

开始训练...


[4pdvGPU ERROR (pid:13248 thread=140590362519360 allocator.c:119)]: cuMemoryAllocate failed res=2
[4pdvGPU ERROR (pid:13248 thread=140590362519360 allocator.c:119)]: cuMemoryAllocate failed res=2
[4pdvGPU ERROR (pid:13248 thread=140590362519360 allocator.c:119)]: cuMemoryAllocate failed res=2
[4pdvGPU Msg(13248:140590362519360:memory.c:566)]: orig free=134479872 total=25217466368 limit=25769803776 usage=25025446912


RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlDeviceGetHandleByPciBusId_v2_( pci_id, &nvml_device) INTERNAL ASSERT FAILED at "/pytorch/c10/cuda/CUDACachingAllocator.cpp":1001, please report a bug to PyTorch. 

In [None]:
from modelscope import AutoModelForCausalLM, AutoTokenizer

model_name = "Qwen/Qwen3-4B"

# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype="auto", device_map="auto")

# Build the chat-formatted prompt
prompt = "Give me a short introduction to large language model."
messages = [{"role": "user", "content": prompt}]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
    # Switches between thinking and non-thinking modes. Default is True.
    enable_thinking=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate, then keep only the tokens produced after the prompt
generated_ids = model.generate(**model_inputs, max_new_tokens=32768)
prompt_length = len(model_inputs.input_ids[0])
output_ids = generated_ids[0][prompt_length:].tolist()

# Split the output at the LAST occurrence of token 151668 (</think>):
# `index` ends up one past that token, or 0 when the token never appears.
index = 0
for position in range(len(output_ids), 0, -1):
    if output_ids[position - 1] == 151668:
        index = position
        break

thinking_ids = output_ids[:index]
answer_ids = output_ids[index:]
thinking_content = tokenizer.decode(thinking_ids, skip_special_tokens=True).strip("\n")
content = tokenizer.decode(answer_ids, skip_special_tokens=True).strip("\n")

print("thinking content:", thinking_content)
print("content:", content)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from peft import PeftModel
import torch

import gc

BASE_MODEL_NAME = "/mnt/proj/jupyter/qwen3_4b"
OUTPUT_ADAPTER_MODEL = "/home/mw/input/output/adapter_model"

# Test prompts
test_cases = [
    "今天的天气怎么样？",
    "你最喜欢的食物是什么？",
    "如何学习编程？",
    "给我讲个笑话",
    "推荐一部电影",
    "Python是最好的语言吗？"
]

# --- Load the instance used to test the original (un-tuned) model ---
print("加载原始模型 (for orig_pipe)...")
original_base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
original_tokenizer = AutoTokenizer.from_pretrained(
    BASE_MODEL_NAME,
    trust_remote_code=True
)
orig_pipe = pipeline("text-generation", model=original_base_model, tokenizer=original_tokenizer)

print("\n[原始模型------ (original_base_model)]")
for question in test_cases:
    prompt = f"Instruction: 用冷静的哲学家风格回答\nInput: {question}\nResponse:"

    print(f"\n{'='*50}")
    print(f"问题: {question}")

    generated_text = orig_pipe(
        prompt,
        max_length=100,
        do_sample=True,
        pad_token_id=original_tokenizer.eos_token_id
    )[0]['generated_text']
    # Split only at the first "Response:" (the one that ends the prompt) so an
    # answer that itself contains "Response:" is not cut short.
    orig_output = generated_text.split("Response:", 1)[1].strip()
    print(orig_output)

# Release the original model. The pipeline holds its own reference to the
# model, so it must be dropped too; collect garbage before asking the CUDA
# caching allocator to return the freed blocks.
del orig_pipe, original_base_model
gc.collect()
torch.cuda.empty_cache()

# --- Load the instance the fine-tuned adapter will be attached to ---
# Loaded only after the original model has been released, so that two full
# copies of the weights are never resident at the same time.
print("加载原始模型 (for ft_pipe)...")
base_model_for_finetuning = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    device_map="auto",
    torch_dtype=torch.float16,
    trust_remote_code=True
)
tokenizer_for_finetuning = AutoTokenizer.from_pretrained(
    BASE_MODEL_NAME,
    trust_remote_code=True
)

In [None]:
import gc

print("加载并合并微调适配器...")
finetuned_model = PeftModel.from_pretrained(base_model_for_finetuning, OUTPUT_ADAPTER_MODEL)
finetuned_model = finetuned_model.merge_and_unload()
ft_pipe = pipeline("text-generation", model=finetuned_model, tokenizer=tokenizer_for_finetuning)

# Test prompts
test_cases = [
    "今天的天气怎么样？",
    "你最喜欢的食物是什么？",
    "如何学习编程？",
    "给我讲个笑话",
    "推荐一部电影",
    "Python是最好的语言吗？"
]

print("\n[微调模型---- (finetuned_model)]")
for question in test_cases:
    prompt = f"Instruction: 用冷静的哲学家风格回答\nInput: {question}\nResponse:"

    print(f"\n{'='*50}")
    print(f"问题: {question}")

    generated_text = ft_pipe(
        prompt,
        max_length=100,
        do_sample=True,
        pad_token_id=tokenizer_for_finetuning.eos_token_id
    )[0]['generated_text']
    # Split only at the first "Response:" (the one that ends the prompt) so an
    # answer that itself contains "Response:" is not cut short.
    ft_output = generated_text.split("Response:", 1)[1].strip()
    print(ft_output)

# Release the model. Every name that still refers to it has to go: the
# pipeline, the merged model, and the base model the adapter was attached to.
# Garbage is collected first so the CUDA cache can actually be emptied.
del ft_pipe, finetuned_model, base_model_for_finetuning
gc.collect()
torch.cuda.empty_cache()

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer    
from peft import PeftModel    
import torch    

BASE_MODEL_NAME = "/home/mw/input/models4804"
OUTPUT_ADAPTER_MODEL = "/home/mw/project/output/adapter_model"
OUTPUT_FINAL_MODEL = "/home/mw/project/output/final_model"

# 1. Load the base model
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL_NAME,
    torch_dtype=torch.float16,
    device_map='auto',
    trust_remote_code=True
)

# 2. Attach the trained adapter
peft_model = PeftModel.from_pretrained(
    base_model,
    OUTPUT_ADAPTER_MODEL
)

# 3. Merge the adapter weights into the base model (key step)
merged_model = peft_model.merge_and_unload()

# 4. Save the full model together with its tokenizer, so the output directory
#    can be loaded on its own. The tokenizer is loaded with the same
#    trust_remote_code setting as the model.
merged_model.save_pretrained(OUTPUT_FINAL_MODEL)
AutoTokenizer.from_pretrained(
    BASE_MODEL_NAME,
    trust_remote_code=True
).save_pretrained(OUTPUT_FINAL_MODEL)

# Report the directory that was actually written.
print(f'✅ 模型已合并保存到 {OUTPUT_FINAL_MODEL} 目录')