# 👾Qwen2大模型微调入门

作者：林泽毅

教程文章：https://zhuanlan.zhihu.com/p/702491999  

显存要求：10GB左右  

实验过程看：https://swanlab.cn/@ZeyiLin/Qwen2-fintune/runs/cfg5f8dzkp6vouxzaxlx6/chart

## 1.安装环境

本案例测试于modelscope==1.14.0、transformers==4.41.2、datasets==2.18.0、peft==0.11.1、accelerate==0.30.1、swanlab==0.3.9

In [None]:
%pip install torch swanlab modelscope transformers datasets peft pandas accelerate

如果是第一次使用SwanLab，则前往[SwanLab](https://swanlab.cn)注册账号后，在[用户设置](https://swanlab.cn/settings/overview)复制API Key，如果执行下面的代码：

In [2]:
!swanlab login

[1m[34mswanlab[0m[0m: You are already logged in. Use `[1mswanlab login --relogin[0m` to force relogin.


## 2. 数据集加载

1. 在[zh_cls_fudan-news - modelscope](https://modelscope.cn/datasets/huangjintao/zh_cls_fudan-news/files)下载train.jsonl和test.jsonl到同级目录下。

<img src="../assets/dataset.png" width=600>

2. 将train.jsonl和test.jsonl进行处理，转换成new_train.jsonl和new_test.jsonl

In [1]:
# 2.将train.jsonl和test.jsonl进行处理，转换成new_train.jsonl和new_test.jsonl
import json
import pandas as pd
import os

def dataset_jsonl_transfer(origin_path, new_path):
    """
    将原始数据集转换为大模型微调所需数据格式的新数据集
    """
    messages = []

    # 读取旧的JSONL文件
    with open(origin_path, "r") as file:
        for line in file:
            # 解析每一行的json数据
            data = json.loads(line)
            context = data["text"]
            catagory = data["category"]
            label = data["output"]
            message = {
                "instruction": "你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型",
                "input": f"文本:{context},类型选型:{catagory}",
                "output": label,
            }
            messages.append(message)

    # 保存重构后的JSONL文件
    with open(new_path, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")


# 加载、处理数据集和测试集
train_dataset_path = "../train.jsonl"
test_dataset_path = "../test.jsonl"

train_jsonl_new_path = "new_train.jsonl"
test_jsonl_new_path = "new_test.jsonl"

if not os.path.exists(train_jsonl_new_path):
    dataset_jsonl_transfer(train_dataset_path, train_jsonl_new_path)
if not os.path.exists(test_jsonl_new_path):
    dataset_jsonl_transfer(test_dataset_path, test_jsonl_new_path)

train_df = pd.read_json(train_jsonl_new_path, lines=True)[:1000]  # 取前1000条做训练（可选）
test_df = pd.read_json(test_jsonl_new_path, lines=True)[:10]  # 取前10条做主观评测

In [2]:
# 检查原始数据中的output
print("原始output:", train_df.iloc[0]['output'])

原始output: History


## 3. 下载/加载模型和tokenizer

In [3]:
from modelscope import snapshot_download, AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import torch

# 在modelscope上下载Qwen模型到本地目录下
model_dir = snapshot_download("qwen/Qwen2-1.5B-Instruct", cache_dir="./", revision="master")

# Mac 电脑
device = "mps" if torch.backends.mps.is_available() else "cpu"

print(f"使用设备: {device}")

model = AutoModelForCausalLM.from_pretrained(
    "./qwen/Qwen2-1___5B-Instruct/",
    torch_dtype=torch.float32  # MPS 仅支持 float32
).to(device)

tokenizer = AutoTokenizer.from_pretrained(
    "./qwen/Qwen2-1___5B-Instruct/",
    use_fast=False,
    trust_remote_code=True
)

model.enable_input_require_grads()  # 开启梯度检查点时，要执行该方法

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model from https://www.modelscope.cn to directory: ./qwen/Qwen2-1.5B-Instruct


2025-03-25 13:54:59,842 - modelscope - INFO - Target directory already exists, skipping creation.
Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


使用设备: mps


## 4. 预处理训练数据

In [4]:
def process_func(example):
    """
    将数据集进行预处理
    """
    MAX_LENGTH = 384
    input_ids, attention_mask, labels = [], [], []
    
    # 打印 example 内容
    # print("example:", example)
    
    instruction = tokenizer(
        f"<|im_start|>system\n你是一个文本分类领域的专家，你会接收到一段文本和几个潜在的分类选项，请输出文本内容的正确类型<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)
    
    # 打印 instruction 和 response
    #print("instruction input_ids:", instruction["input_ids"])
    #print("response input_ids:", response["input_ids"])
    
    # 计算总长度
    total_length = len(instruction["input_ids"]) + len(response["input_ids"]) + 1  # +1 是为了添加 pad_token_id
    
    # 如果总长度超过 MAX_LENGTH，截断 instruction 部分
    if total_length > MAX_LENGTH:
        # 计算需要截断的长度
        truncate_length = total_length - MAX_LENGTH
        
        # 从 instruction 部分的末尾截断
        instruction["input_ids"] = instruction["input_ids"][:-truncate_length]
        instruction["attention_mask"] = instruction["attention_mask"][:-truncate_length]
    
    # 构造 input_ids, attention_mask 和 labels
    input_ids = instruction["input_ids"] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = instruction["attention_mask"] + response["attention_mask"] + [1]
    labels = (
        [-100] * len(instruction["input_ids"])
        + response["input_ids"]
        + [tokenizer.pad_token_id]
    )
    
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

In [5]:
from datasets import Dataset

train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(process_func, remove_columns=train_ds.column_names)

Map: 100%|██████████| 1000/1000 [01:06<00:00, 15.06 examples/s]


In [6]:
sample = train_dataset[0]

output_ids = [id for id, label in zip(sample['input_ids'], sample['labels']) if label != -100]
print("Decoded output:", tokenizer.decode(output_ids))

Decoded output: History<|endoftext|>


## 5. 设置LORA

In [7]:
from peft import LoraConfig, TaskType, get_peft_model

config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    inference_mode=False,  # 训练模式
    r=8,  # Lora 秩
    lora_alpha=8,  # 原32改为8，保持α/r=1
    lora_dropout=0.1,  # Dropout 比例
)

model = get_peft_model(model, config)

## 6. 训练

In [8]:
args = TrainingArguments(
    output_dir="./output/Qwen1.5",
    per_device_train_batch_size=2,  # 小batch_size
    gradient_accumulation_steps=8,
    logging_steps=10,
    num_train_epochs=2,
    learning_rate=1e-5,  # 降低学习率
    warmup_ratio=0.05,   # 预热比例
    max_grad_norm=0.5,   # 梯度裁剪
    optim="adamw_torch_fused",
    lr_scheduler_type="cosine",
    save_steps=100,
    gradient_checkpointing=True,
    report_to="none",
    fp16=False,          # MPS必须关闭
    bf16=False,          # MPS不支持
    remove_unused_columns=False
)

In [9]:
from swanlab.integration.transformers import SwanLabCallback
import swanlab

swanlab_callback = SwanLabCallback(
    project="Qwen2-fintune",
    experiment_name="Qwen2-1.5B-Instruct",
    description="修正后的稳定训练版本",
    config={
        "model": "qwen/Qwen2-1.5B-Instruct",
        "device": device,
        "lora_config": config.to_dict(),
        "learning_rate": 1e-5,
        "batch_size": 2
    },
)

In [10]:
# 训练前诊断
print("==== 设备诊断 ====")
print(f"PyTorch版本: {torch.__version__}")
print(f"使用设备: {device}")
print(f"模型参数示例设备: {next(model.parameters()).device}")

==== 设备诊断 ====
PyTorch版本: 2.6.0
使用设备: mps
模型参数示例设备: mps:0


In [11]:
# 训练
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(
        tokenizer=tokenizer,
        padding=True,
        pad_to_multiple_of=8
    ),
    callbacks=[swanlab_callback],
)

trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.5.3                                   
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/Users/wangtianqing/Project/LLM-Finetune/notebook/swanlog/run-20250325_135702-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mgrissom[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mQwen2-1.5B-Instruct[0m to the cloud
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@grissom/Qwen2-fintune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@grissom/Qwen2-fintune/runs/9yt3l66vtjmwhu6f9b9xe[0m[0m


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss
10,11.3794
20,12.0266
30,11.2192
40,10.6475
50,10.6646
60,10.3584
70,9.7544
80,9.4273
90,8.5473
100,8.5536




TrainOutput(global_step=124, training_loss=9.9164367798836, metrics={'train_runtime': 14252.8907, 'train_samples_per_second': 0.14, 'train_steps_per_second': 0.009, 'total_flos': 6004078226767872.0, 'train_loss': 9.9164367798836, 'epoch': 1.976})

In [13]:
for name, param in model.named_parameters():
    if param.requires_grad:
        # 检查参数中是否存在 NaN 值
        if torch.isnan(param).any():
            print(f"{name}: max {param.max()} min {param.min()}")

In [16]:
# ====== 训练结束后的预测 ===== #
def predict(messages, model, tokenizer):
    device = "mps" if torch.backends.mps.is_available() else "cpu"
    
    # 1. 处理输入文本
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    if not text.strip():
        raise ValueError("Generated text is empty, check input formatting.")

    # 2. 生成输入张量
    model_inputs = tokenizer(
        [text], 
        return_tensors="pt", 
        padding=True,  # 添加 padding
        truncation=True  # 避免输入过长
    ).to(device)

    # 3. 检查 input_ids 是否为空
    input_ids = model_inputs.input_ids
    attention_mask = model_inputs.attention_mask
    if input_ids.numel() == 0:
        raise ValueError("Input tensor is empty, check tokenizer output.")

    # 4. 生成输出
    with torch.no_grad():
        generated_ids = model.generate(
            input_ids=input_ids, 
            attention_mask=attention_mask, 
            max_new_tokens=128  # 限制 max_new_tokens，避免计算问题
        )

    # 5. 过滤前缀 token
    generated_ids = [
        output_ids[len(input_ids):] for input_ids, output_ids in zip(input_ids, generated_ids)
    ]

    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(response)
    
    return response
    

test_text_list = []
for index, row in test_df.iterrows():

    # 避免内存不足
    if index > 2:
        break
    instruction = row["instruction"]
    input_value = row["input"]

    messages = [
        {"role": "system", "content": f"{instruction}"},
        {"role": "user", "content": f"{input_value}"},
    ]
    response = predict(messages, model, tokenizer)
    messages.append({"role": "assistant", "content": f"{response}"})
    result_text = f"{messages[0]}\n\n{messages[1]}\n\n{messages[2]}"
    test_text_list.append(swanlab.Text(result_text, caption=response))

swanlab.log({"Prediction": test_text_list})
swanlab.finish()

教育
Military
Literature
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@grissom/Qwen2-fintune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@grissom/Qwen2-fintune/runs/9yt3l66vtjmwhu6f9b9xe[0m[0m
                                                                                                    