# 使用 Unsloth 实现 Qwen2.5-7B 的指令微调

In [24]:
# Show the interpreter that is actually running this kernel.
# (`!which python` only reports the first `python` on the shell PATH, which can
# differ from the kernel's interpreter, e.g. when several conda envs exist.)
import sys
print(sys.executable)

# Show the current working directory; relative paths used by this notebook
# (such as output directories) are resolved against it.
import os
print(os.getcwd())

/root/miniconda3/envs/unsloth/bin/python
/root/Unsloth


In [None]:
# 安装依赖
!pip install unsloth

# 加载预训练模型和分词器

In [3]:
from unsloth import FastLanguageModel
import torch

# Maximum sequence length used for fine-tuning; RoPE (rotary position embedding)
# scaling is handled internally.
# This only bounds the input length during fine-tuning; the maximum sequence
# length of the fine-tuned model is still determined by the base architecture.
max_seq_length = 2048

# Tensor dtype. None means auto-detect; Float16 suits Tesla T4 / V100,
# Bfloat16 suits Ampere-architecture GPUs.
dtype = None

# Whether to load the weights with 4-bit quantization to reduce memory use
# (True enables it).
load_in_4bit = True

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
# More models at https://huggingface.co/unsloth

# Load the pretrained model and its tokenizer.
# The values defined above are passed through unchanged and are reused by
# later cells, so keep their names.
model, tokenizer = FastLanguageModel.from_pretrained(
    # Can select any from the below:
    # "unsloth/Qwen2.5-0.5B", "unsloth/Qwen2.5-1.5B", "unsloth/Qwen2.5-3B"
    # "unsloth/Qwen2.5-14B",  "unsloth/Qwen2.5-32B",  "unsloth/Qwen2.5-72B",
    # And also all Instruct versions and Math / Coding versions!
    # Here a local checkpoint directory is used; adjust the path to wherever
    # the weights live on your machine.
    model_name = "/root/Unsloth/models/Qwen2.5-7B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.518 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.32s/it]


# 使用LoRA（低秩适配）技术对模型进行微调
现在我们添加了 LoRA 适配器，因此只需更新所有参数中的 1%到 10%！

In [9]:
# Attach LoRA adapters to the loaded model; `model` is rebound to the result.
# NOTE: when this cell is re-run on a model that already carries adapters,
# Unsloth reports "Already have LoRA adapters! We shall skip this step."
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,     # LoRA rank; any value > 0 works, suggested values are 8, 16, 32, 64, 128.
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],    # Modules that LoRA is applied to.
    lora_alpha = 16,    # LoRA scaling factor.
    lora_dropout = 0,   # Supports any, but = 0 is optimized
    bias = "none",      # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407, # Fixed seed passed to the adapter setup.
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth: Already have LoRA adapters! We shall skip this step.


# 数据准备
我们现在使用来自 [yahma 的 Alpaca 数据集](https://huggingface.co/datasets/yahma/alpaca-cleaned)，它是原始 Alpaca 数据集 52K 的过滤版本。您可以用自己的数据预处理替换这部分代码。
[注] 若只想在回答（completion）部分上计算损失并训练、忽略用户输入部分，请参阅 TRL 文档中关于“仅对回答部分训练（train on completions only）”的说明。
[注意] 记得在每条训练文本的末尾添加 EOS_TOKEN！否则模型学不会在何处停止，生成将无休止地继续下去！
如果您想将 llama-3 对话模板用于 ShareGPT 格式的数据集，请尝试我们的对话式微调（conversational）笔记本。
对于像小说写作这样的文本补全，请试试这个笔记本。

In [11]:
# Alpaca-style prompt template. It has three positional `{}` slots that are
# filled, in order, with the instruction, the input (extra context) and the
# response.
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# End-of-sequence token taken from the tokenizer; it is appended to every
# training sample to mark where generation should end.
EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN

def formatting_prompts_func(examples):
    """Render a batch of Alpaca records into full training strings.

    Args:
        examples: A batched mapping with the parallel sequences
            ``"instruction"``, ``"input"`` and ``"output"``.

    Returns:
        A dict with a single key ``"text"`` holding one formatted string per
        record: the ``alpaca_prompt`` template filled with the record's three
        fields, followed by ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # EOS_TOKEN must terminate every sample, otherwise generation goes on forever.
    rendered = [
        alpaca_prompt.format(instruction, context, response) + EOS_TOKEN
        for instruction, context, response in rows
    ]
    return {"text": rendered}

# Load the "train" split of the dataset from a local directory, then add a
# "text" column by applying formatting_prompts_func to the records in batches.
from datasets import load_dataset
data_path = "/root/Unsloth/dataset/alpaca-cleaned"  # local copy; adjust to your machine
dataset = load_dataset(data_path, split = "train")
dataset = dataset.map(formatting_prompts_func, batched = True,)

# 训练模型
现在让我们使用 Hugging Face TRL 的 SFTTrainer！更多说明请参阅 TRL 的 SFT 文档。这里只训练 60 步（max_steps=60）以加快演示速度；若要完整训练一个 epoch，可以设置 num_train_epochs=1，并去掉 max_steps 的限制（例如注释掉 max_steps=60）。我们还支持 TRL 的 DPOTrainer！

In [12]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Choose the mixed-precision mode once: bf16 where the GPU supports it,
# fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Optimisation settings for a short demonstration run.
training_args = TrainingArguments(
    output_dir = "outputs",
    seed = 3407,
    report_to = "none", # Use this for WandB etc
    logging_steps = 1,
    # Effective batch size per optimizer step: 2 samples x 4 accumulation steps = 8.
    per_device_train_batch_size = 2,
    gradient_accumulation_steps = 4,
    # Training length: capped at 60 optimizer steps.
    # num_train_epochs = 1, # Set this for 1 full training run.
    max_steps = 60,
    warmup_steps = 5,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    optim = "adamw_8bit",
    weight_decay = 0.01,
    bf16 = use_bf16,
    fp16 = not use_bf16,
)

# Supervised fine-tuning trainer over the "text" column of the dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = training_args,
)

Unsloth: Tokenizing ["text"] (num_proc=2): 100%|█████████████████████████| 51760/51760 [00:06<00:00, 8114.92 examples/s]


## 开始训练

In [13]:
# Run the fine-tuning loop on the configured trainer; the object returned by
# train() is kept in trainer_stats.
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 51,760 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 40,370,176/7,000,000,000 (0.58% trained)


Step,Training Loss
1,0.7964
2,0.7102
3,0.9016
4,0.9711
5,0.9467
6,0.8824
7,0.6653
8,0.9814
9,0.9094
10,0.848


# 推理
让我们运行模型！您可以更改指令和输入 - 输出留空！

In [14]:
# alpaca_prompt is the template defined in the data-preparation cell.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill the template; the response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "Continue the fibonnaci sequence.", # instruction
    "1, 1, 2, 3, 5, 8", # input
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Generate up to 64 new tokens and display the decoded batch as the cell output.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
tokenizer.batch_decode(outputs)

['Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.\n\n### Instruction:\nContinue the fibonnaci sequence.\n\n### Input:\n1, 1, 2, 3, 5, 8\n\n### Response:\n13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6']

您还可以使用 TextStreamer 进行流式推理——这样可以逐个 token 地看到生成结果，而不必等到全部生成完毕！

In [15]:
from transformers import TextStreamer

# alpaca_prompt is the template defined in the data-preparation cell.
FastLanguageModel.for_inference(model) # Enable native 2x faster inference

# Fill the template; the response slot stays empty so the model writes it.
prompt = alpaca_prompt.format(
    "Continue the fibonnaci sequence.", # instruction
    "1, 1, 2, 3, 5, 8", # input
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Pass a streamer so generated text is shown while generation is running;
# the return value of generate() is not needed here.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
Continue the fibonnaci sequence.

### Input:
1, 1, 2, 3, 5, 8

### Response:
13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597, 2584, 4181, 6765, 10946, 17711, 25841, 41577, 67418, 109051, 176530, 285471, 


# 保存、加载微调模型

In [16]:
# Save the fine-tuned model (the LoRA adapter) and the tokenizer into the local
# "lora_model" directory. The last call's return value (the written tokenizer
# file paths) is displayed as the cell output.
model.save_pretrained("lora_model")  # Local saving
tokenizer.save_pretrained("lora_model")
# Alternative: upload to the Hugging Face Hub instead (uncomment, then fill in
# your repo id and access token).
# model.push_to_hub("your_name/lora_model", token = "...") # Online saving
# tokenizer.push_to_hub("your_name/lora_model", token = "...") # Online saving

('lora_model/tokenizer_config.json',
 'lora_model/special_tokens_map.json',
 'lora_model/vocab.json',
 'lora_model/merges.txt',
 'lora_model/added_tokens.json',
 'lora_model/tokenizer.json')

现在，如果您想加载我们刚刚保存的用于推理的 LoRA 适配器，请将 False 设为 True：

In [19]:
if False:
    # Opt-in: reload the model and tokenizer from the saved "lora_model"
    # directory, reusing the loading settings defined earlier in the notebook.
    from unsloth import FastLanguageModel
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name = "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        max_seq_length = max_seq_length,
        dtype = dtype,
        load_in_4bit = load_in_4bit,
    )
    FastLanguageModel.for_inference(model) # Enable native 2x faster inference

from transformers import TextStreamer

# alpaca_prompt must already be defined (see the data-preparation cell).
prompt = alpaca_prompt.format(
    "What is a famous tall tower in Paris?", # instruction
    "", # input
    "", # output - leave this blank for generation!
)
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Pass a streamer so generated text is shown while generation is running.
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 128)

==((====))==  Unsloth 2025.3.19: Fast Qwen2 patching. Transformers: 4.51.3.
   \\   /|    NVIDIA GeForce RTX 4090. Num GPUs = 1. Max memory: 23.518 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.9. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.31s/it]


Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
What is a famous tall tower in Paris?

### Input:


### Response:
One of the most famous tall towers in Paris is the Eiffel Tower, which stands at 324 meters (1,004 feet) tall and was completed in 1889. It was designed by the French engineer Gustave Eiffel and has become one of the most recognizable landmarks in the world.<|im_end|>


您也可以使用 Hugging Face PEFT 的 AutoPeftModelForCausalLM。仅建议在没有安装 unsloth 的情况下使用：由于不支持下载 4bit 模型，它的速度会慢得令人绝望，而 Unsloth 的推理速度是它的 2 倍。

In [22]:
if False:
    # I highly do NOT suggest - use Unsloth if possible
    from peft import AutoPeftModelForCausalLM
    from transformers import AutoTokenizer, BitsAndBytesConfig
    model = AutoPeftModelForCausalLM.from_pretrained(
        "lora_model", # YOUR MODEL YOU USED FOR TRAINING
        # Passing `load_in_4bit` directly is deprecated (transformers warns
        # about it); hand over a BitsAndBytesConfig instead. When 4-bit loading
        # is disabled, pass None so that no quantization is applied.
        quantization_config = (
            BitsAndBytesConfig(load_in_4bit = True) if load_in_4bit else None
        ),
    )
    tokenizer = AutoTokenizer.from_pretrained("lora_model")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.32s/it]


# 保存为 float16 以供 vLLM 使用
我们还支持直接保存为 float16。保存为 float16 请选择 merged_16bit，保存为 int4 请选择 merged_4bit。也可以只保存 LoRA 适配器作为备用（save_method 设为 lora）。使用 push_to_hub_merged 可上传到你的 Hugging Face 账户！个人访问令牌可在 https://huggingface.co/settings/tokens 获取。

In [23]:
# Every line below is an opt-in toggle: change `if False` to `if True` to run it.
# "model" is the local output directory; "hf/model" is the Hub repo id and
# `token` your Hugging Face access token (both are placeholders to fill in).

# Merge to 16bit (float16 weights)
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")

# Merge to 4bit (int4 weights)
if False: model.save_pretrained_merged("model", tokenizer, save_method = "merged_4bit",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_4bit", token = "")

# Just LoRA adapters (no merge), kept as a fallback
if False: model.save_pretrained_merged("model", tokenizer, save_method = "lora",)
if False: model.push_to_hub_merged("hf/model", tokenizer, save_method = "lora", token = "")

# GGUF / llama.cpp 转换
为了保存为 GGUF / llama.cpp 格式，我们现在已原生支持！我们会克隆 llama.cpp，并默认保存为 q8_0 格式。同时也支持其他方法，比如 q4_k_m。
使用 save_pretrained_gguf 方法进行本地保存，使用 push_to_hub_gguf 方法上传到 Hugging Face。

以下是部分支持的量化方法（完整列表可参考我们的 Wiki 页面）：

q8_0：快速转换。资源占用较高，但通常可以接受。

q4_k_m：推荐使用。对于一半的 attention.wv 和 feed_forward.w2 张量使用 Q6_K 量化，其余使用 Q4_K。

q5_k_m：推荐使用。对于一半的 attention.wv 和 feed_forward.w2 张量使用 Q6_K 量化，其余使用 Q5_K。

In [None]:
# Every statement below is an opt-in toggle: change `if False` to `if True` to
# run it. "model" is the local output directory; "hf/model" is the Hub repo id
# and `token` your Hugging Face access token (both are placeholders).

# Save to 8bit Q8_0 (the default when no quantization_method is given)
if False: model.save_pretrained_gguf("model", tokenizer,)
# Remember to go to https://huggingface.co/settings/tokens for a token!
# And change hf to your username!
if False: model.push_to_hub_gguf("hf/model", tokenizer, token = "")

# Save to 16bit GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "f16")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "f16", token = "")

# Save to q4_k_m GGUF
if False: model.save_pretrained_gguf("model", tokenizer, quantization_method = "q4_k_m")
if False: model.push_to_hub_gguf("hf/model", tokenizer, quantization_method = "q4_k_m", token = "")

# Save to multiple GGUF options - much faster if you want multiple!
# A list of quantization methods is passed in a single call.
if False:
    model.push_to_hub_gguf(
        "hf/model", # Change hf to your username!
        tokenizer,
        quantization_method = ["q4_k_m", "q8_0", "q5_k_m",],
        token = "", # Get a token at https://huggingface.co/settings/tokens
    )

现在，您可以在 llama.cpp 中使用生成的 model-unsloth.gguf 或 model-unsloth-Q4_K_M.gguf 文件，也可以使用带图形界面的系统，例如 Jan 或 Open WebUI（安装方式请参见它们各自的官方网站）。

大功告成！如果您对 Unsloth 有任何疑问，欢迎联系我们的 Discord 频道！如果您发现任何错误，或想了解最新的 LLM 内容，或者需要帮助、加入项目等，欢迎加入我们的 Discord！

其他链接：

训练您自己的推理模型 - Llama GRPO 笔记本 免费 Colab

将微调保存到 Ollama。免费笔记本

Llama 3.2 视觉微调 - 放射学用例。免费 Colab

在我们的文档中查看 DPO、ORPO、持续预训练、对话式微调等笔记本！