In [1]:
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
import torch
 
# 加载 Qwen2.5-VL-3B-Instruct
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
 
# 加载 tokenizer 和 processor
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")
 
# 允许梯度更新
model.enable_input_require_grads()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [2]:
from datasets import Dataset
import json
data_path = "dataset/data_vl.json"
with open(data_path, 'r') as f:
    data = json.load(f)
    train_data = data[:-4]  # 划分数据集，保留最后4个样本作为测试集
    test_data = data[-4:]
# 保存数据
with open("train_data.json", "w") as f:
    json.dump(train_data, f)
with open("test_data.json", "w") as f:
    json.dump(test_data, f)
# 加载数据集
train_ds = Dataset.from_json("train_data.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [3]:
from qwen_vl_utils import process_vision_info
import torch
def process_func(example):
    """
    预处理输入数据
    """
    MAX_LENGTH = 8192
    conversation = example["conversations"]
    input_content = conversation[0]["value"]
    output_content = conversation[1]["value"]
    file_path = input_content.split("<|vision_start|>")[1].split("<|vision_end|>")[0]
    # 构造多模态对话
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": f"{file_path}", "resized_height": 256, "resized_width": 256},
                {"type": "text", "text": "你是一位机械制造领域的专家，擅长优化安装流程以提升效率。我将提供一张图片，图片中展示了一个待安装SMA（表面贴装组件）的盒子，盒子共有20个空位，其中2个空位已完成安装。请你根据图片识别盒子的空位分布及已完成安装的位置，并规划剩余空位的最优安装顺序，以最小化工具移动距离。输出时，请列出每个空位的安装顺序及其对应坐标，并说明优化逻辑。如果图片信息不清晰或存在歧义，请提出具体问题以便进一步确认。"},
            ],
        }
    ]
    
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = {key: value.tolist() for key, value in inputs.items()}
    
    # 构造目标输出
    response = tokenizer(f"{output_content}", add_special_tokens=False)
    input_ids = inputs["input_ids"][0] + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = inputs["attention_mask"][0] + response["attention_mask"] + [1]
    labels = [-100] * len(inputs["input_ids"][0]) + response["input_ids"] + [tokenizer.pad_token_id]
    # 截断
    if len(input_ids) > MAX_LENGTH:
        input_ids = input_ids[:MAX_LENGTH]
        attention_mask = attention_mask[:MAX_LENGTH]
        labels = labels[:MAX_LENGTH]
    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
        "pixel_values": torch.tensor(inputs["pixel_values"]),
        "image_grid_thw": torch.tensor(inputs["image_grid_thw"]).squeeze(0)
    }

In [4]:
# 处理数据
train_dataset = train_ds.map(process_func)
# 确保数据加载成功
print(f"Train dataset size: {len(train_dataset)}")

Map:   0%|          | 0/38 [00:00<?, ? examples/s]

Train dataset size: 38


In [7]:
from peft import LoraConfig, get_peft_model
 
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)
 
# 将 LoRA 应用于模型
peft_model = get_peft_model(model, config)



In [8]:
from swanlab.integration.huggingface import SwanLabCallback
import swanlab

swanlab_callback = SwanLabCallback(
    project="Qwen2.5-fintune",
    experiment_name="Qwen2.5-3B-VL",
    description="使用通义千问Qwen2.5-3B-VL模型微调。",
    config={
        "model": "qwen/Qwen2.5-VL-3B-VL",
        "dataset": "dataset/data_vl.json",
    },
)

  swanlab_callback = SwanLabCallback(


In [9]:
from transformers import TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
 
args = TrainingArguments(
    output_dir="output/Qwen2.5-VL-LoRA",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    num_train_epochs=1,
    save_steps=74,
    learning_rate=1e-4,
    gradient_checkpointing=True,
)
 
trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=train_dataset,  
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)
 
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.5.3                                   
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1mc:\Users\Nack\Desktop\Nack-graduation-project\Qwen2.5-VL\swanlog\run-20250406_161944-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mguibaoshan[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33moutput/Qwen2.5-VL-LoRA[0m to the cloud
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@guibaoshan/Qwen2.5-VL[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@guibaoshan/Qwen2.5-VL/runs/4459zzomycna0h7gy6kef[0m[0m


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


OutOfMemoryError: CUDA out of memory. Tried to allocate 3.28 GiB. GPU 0 has a total capacity of 12.00 GiB of which 0 bytes is free. Of the allocated memory 12.23 GiB is allocated by PyTorch, and 3.39 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from peft import PeftModel
 
peft_model_path = "output/Qwen2.5-VL-LoRA/checkpoint-155"
val_peft_model = PeftModel.from_pretrained(model, peft_model_path, config=config)
 
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "dataset/images/图1.jpg"},
            {"type": "text", "text": "你是一位机械制造领域的专家，擅长优化安装流程以提升效率。我将提供一张图片，图片中展示了一个待安装SMA（表面贴装组件）的盒子，盒子共有20个空位，其中2个空位已完成安装。请你根据图片识别盒子的空位分布及已完成安装的位置，并规划剩余空位的最优安装顺序，以最小化工具移动距离。输出时，请列出每个空位的安装顺序及其对应坐标，并说明优化逻辑。如果图片信息不清晰或存在歧义，请提出具体问题以便进一步确认。"},
        ],
    }
]
 
def predict(messages, model):
    """ 用于推理验证的函数 """
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)
 
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    # 取生成的后半部分
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False
    )
    return output_text[0]
 
response = predict(messages, val_peft_model)
print(response)



A woman with long hair and a white shirt.
