In [2]:
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from swanlab.integration.transformers import SwanLabCallback
from qwen_vl_utils import process_vision_info
from peft import LoraConfig, TaskType, get_peft_model, PeftModel
from transformers import (
    TrainingArguments,
    Trainer,
    DataCollatorForSeq2Seq,
    Qwen2VLForConditionalGeneration,
    AutoProcessor,
)
import swanlab
import json

In [3]:
def process_func(example):
    """
    Turn one raw dataset record into the tensors used for supervised fine-tuning.

    Relies on the module-level ``processor`` and ``tokenizer`` objects.

    Args:
        example: A record with a ``"conversations"`` list of two turns. The
            ``"value"`` of turn 0 contains the image path wrapped between
            ``<|vision_start|>`` and ``<|vision_end|>``; the ``"value"`` of
            turn 1 is the target answer.

    Returns:
        A dict with ``input_ids``, ``attention_mask`` and ``labels`` (1-D
        tensors; prompt positions in ``labels`` are set to -100), plus the
        ``pixel_values`` and ``image_grid_thw`` produced by the processor.
    """
    MAX_LENGTH = 8192
    conversation = example["conversations"]
    input_content = conversation[0]["value"]
    output_content = conversation[1]["value"]
    # The image path is embedded between the two vision marker tokens.
    file_path = input_content.split("<|vision_start|>")[1].split("<|vision_end|>")[0]
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": f"{file_path}",
                    "resized_height": 512,
                    "resized_width": 512,
                },
                {"type": "text", "text": "Please analyze the content of the picture and the text in the picture. If you think the combination of the content and the text is harmful (such as violence, pornography, discrimination and other harmful behaviors), please output 1; if you think the combination is harmless, please output 0:"},
            ],
        }
    ]
    # Render the chat prompt as plain text, ending with the generation prompt.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    # Load / preprocess the visual inputs referenced by the messages.
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )

    # Only the token sequences are converted to Python lists (for easy
    # concatenation); the large pixel tensor is kept as a tensor instead of
    # being round-tripped through nested lists.
    prompt_ids = inputs["input_ids"][0].tolist()
    prompt_mask = inputs["attention_mask"][0].tolist()

    response = tokenizer(f"{output_content}", add_special_tokens=False)

    # Prompt + answer + one terminating pad token that the model must predict.
    input_ids = prompt_ids + response["input_ids"] + [tokenizer.pad_token_id]
    attention_mask = prompt_mask + response["attention_mask"] + [1]
    # -100 masks the prompt positions so only the answer contributes to the loss.
    labels = (
        [-100] * len(prompt_ids)
        + response["input_ids"]
        + [tokenizer.pad_token_id]
    )

    # Truncate to MAX_LENGTH; slicing is a no-op for shorter sequences.
    input_ids = input_ids[:MAX_LENGTH]
    attention_mask = attention_mask[:MAX_LENGTH]
    labels = labels[:MAX_LENGTH]

    return {
        "input_ids": torch.tensor(input_ids),
        "attention_mask": torch.tensor(attention_mask),
        "labels": torch.tensor(labels),
        "pixel_values": inputs["pixel_values"],
        # Drop the leading axis added by the processor for the single image
        # (presumably (1, 3) -> (3,); confirm against the processor output).
        "image_grid_thw": inputs["image_grid_thw"].squeeze(0),
    }


def predict(messages, model):
    """
    Generate the model's reply for a single chat-style multimodal request.

    Relies on the module-level ``processor`` object.

    Args:
        messages: Chat messages in the format accepted by
            ``processor.apply_chat_template`` and ``process_vision_info``.
        model: The model whose ``generate`` method is used.

    Returns:
        The decoded text of the newly generated tokens for the first (only)
        sequence in the batch.
    """
    # Build the text prompt and the visual inputs for inference.
    text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    )
    # Place the inputs on the model's own device rather than assuming CUDA;
    # fall back to "cuda" when the model does not expose a ``device``.
    inputs = inputs.to(getattr(model, "device", "cuda"))

    # Generate, then strip the prompt tokens from each returned sequence.
    generated_ids = model.generate(**inputs, max_new_tokens=128)
    generated_ids_trimmed = [
        out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
    )

    return output_text[0]

In [4]:
# Download the Qwen2-VL model from ModelScope into the local directory
# model_dir = snapshot_download("Qwen/Qwen2-VL-2B-Instruct", cache_dir="./", revision="master")

# Load the tokenizer, processor and model weights with Transformers
tokenizer = AutoTokenizer.from_pretrained("./Qwen/Qwen2-VL-2B-Instruct/", use_fast=False, trust_remote_code=True)
processor = AutoProcessor.from_pretrained("./Qwen/Qwen2-VL-2B-Instruct")

# `trust_remote_code` only applies to Auto classes; on this concrete class it
# is ignored with a warning, so it is not passed.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    "./Qwen/Qwen2-VL-2B-Instruct/",
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
model.enable_input_require_grads()  # must be called when gradient checkpointing is enabled

# Prepare the dataset: read the JSON file, hold out the last 100 records as
# the test set, and save the two splits as data_vl_train.json / data_vl_test.json
train_json_path = "data_vl.json"
with open(train_json_path, "r", encoding="utf-8") as f:
    data = json.load(f)
train_data = data[:-100]
test_data = data[-100:]

with open("data_vl_train.json", "w", encoding="utf-8") as f:
    json.dump(train_data, f)

with open("data_vl_test.json", "w", encoding="utf-8") as f:
    json.dump(test_data, f)

# Tokenize / preprocess every training record
train_ds = Dataset.from_json("data_vl_train.json")
train_dataset = train_ds.map(process_func)





The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.
`Qwen2VLRotaryEmbedding` can now be fully parameterized by passing the model config through the `config` argument. All other arguments will be removed in v4.46


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

In [5]:
# Configure LoRA
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=False,  # training mode
    r=64,  # LoRA rank
    lora_alpha=16,  # LoRA alpha (scaling numerator)
    lora_dropout=0.05,  # dropout probability
    bias="none",
)

# Wrap the base model with the LoRA adapters
peft_model = get_peft_model(model, config)

# Configure the training arguments
args = TrainingArguments(
    output_dir="./output/Qwen2-VL-2B",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    logging_steps=10,
    logging_first_step=True,  # boolean flag: also log the very first step
    num_train_epochs=8,
    save_steps=100,
    learning_rate=1e-4,
    save_on_each_node=True,
    gradient_checkpointing=True,
    report_to="none",
)

# Set up the SwanLab callback; the LoRA hyper-parameters are read from
# `config` so the logged values cannot drift from the ones actually used.
swanlab_callback = SwanLabCallback(
    project="Qwen2-VL-finetune",
    experiment_name="qwen2-vl-hateful-memes",
    config={
        "model": "https://modelscope.cn/models/Qwen/Qwen2-VL-2B-Instruct",
        # "dataset": "https://modelscope.cn/datasets/modelscope/coco_2014_caption/quickstart",
        "github": "https://github.com/datawhalechina/self-llm",
        "prompt": "Please analyze the content of the picture and the text in the picture. If you think the combination of the content and the text is harmful (such as violence, pornography, discrimination and other harmful behaviors), please output 1; if you think the combination is harmless, please output 0: ",
        "train_data_number": len(train_data),
        "lora_rank": config.r,
        "lora_alpha": config.lora_alpha,
        "lora_dropout": config.lora_dropout,
    },
)

# Configure the Trainer
trainer = Trainer(
    model=peft_model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True),
    callbacks=[swanlab_callback],
)
# Start training
trainer.train()

[1m[34mswanlab[0m[0m: swanlab version 0.4.2 is available!  Upgrade: `pip install -U swanlab`
[1m[34mswanlab[0m[0m: Tracking run with swanlab version 0.3.25                                  
[1m[34mswanlab[0m[0m: Run data will be saved locally in [35m[1m/root/swanlog/run-20250114_135402-a3b1799d[0m[0m
[1m[34mswanlab[0m[0m: 👋 Hi [1m[39mluozhj33[0m[0m, welcome to swanlab!
[1m[34mswanlab[0m[0m: Syncing run [33mqwen2-vl-hateful-memes[0m to the cloud
[1m[34mswanlab[0m[0m: 🌟 Run `[1mswanlab watch /root/swanlog[0m` to view SwanLab Experiment Dashboard locally
[1m[34mswanlab[0m[0m: 🏠 View project at [34m[4mhttps://swanlab.cn/@luozhj33/Qwen2-VL-finetune[0m[0m
[1m[34mswanlab[0m[0m: 🚀 View run at [34m[4mhttps://swanlab.cn/@luozhj33/Qwen2-VL-finetune/runs/ses9zzqj4wfwx01ta8slx[0m[0m


  return F.conv3d(
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
1,7.885
10,7.8897
20,7.2144
30,6.0479
40,4.6731
50,3.7567


TrainOutput(global_step=56, training_loss=5.641982274396079, metrics={'train_runtime': 830.2925, 'train_samples_per_second': 1.084, 'train_steps_per_second': 0.067, 'total_flos': 4462272166625280.0, 'train_loss': 5.641982274396079, 'epoch': 0.9955555555555555})

In [7]:
import subprocess
import os

# Source the network-turbo script in a bash subshell and capture the proxy
# variables it exports.
result = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
)
output = result.stdout

# Copy every NAME=value line into this process's environment.
for entry in output.splitlines():
    name, separator, setting = entry.partition('=')
    if separator:
        os.environ[name] = setting

In [6]:
# ==================== Evaluation ====================
# LoRA configuration used for inference
val_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    inference_mode=True,  # inference mode
    r=64,  # LoRA rank
    lora_alpha=16,  # LoRA alpha (scaling numerator)
    lora_dropout=0.05,  # dropout probability
    bias="none",
)
swanlab.init()
# Build the evaluation model
# val_peft_model = PeftModel.from_pretrained(
#     Qwen2VLForConditionalGeneration.from_pretrained(
#             "Qwen/Qwen2-VL-2B-Instruct", device_map="auto", torch_dtype="auto", trust_remote_code=True)
# , model_id="Qwen/Qwen2-VL-2B-Instruct", config=val_config
# )
# Build the evaluation model from the saved adapter checkpoint
# NOTE(review): the path assumes a checkpoint was written at step 56 - confirm
# it matches the training run that produced ./output/Qwen2-VL-2B.
val_peft_model = PeftModel.from_pretrained(model, model_id="./output/Qwen2-VL-2B/checkpoint-56", config=val_config)

# Read the held-out test records
with open("data_vl_test.json", "r") as f:
    test_dataset = json.load(f)
    # print(test_dataset)
test_image_list = []
test_data = []
for item in test_dataset:
    input_image_prompt = item["conversations"][0]["value"]
    # Strip the surrounding <|vision_start|> / <|vision_end|> markers
    origin_image_path = input_image_prompt.split("<|vision_start|>")[1].split("<|vision_end|>")[0]
    print(origin_image_path)
    # NOTE(review): unlike process_func, the image entry here does not request
    # a 512x512 resize - confirm whether inference should match training.
    messages = [{
        "role": "user",
        "content": [
            {
            "type": "image",
            "image": origin_image_path
            },
            {
            "type": "text",
            "text": "Please analyze the content of the picture and the text in the picture. If you think the combination of the content and the text is harmful (such as violence, pornography, discrimination and other harmful behaviors), please output 1; if you think the combination is harmless, please output 0:"
            }
        ]}]

    response = predict(messages, val_peft_model)
    messages.append({"role": "assistant", "content": f"{response}"})
    # Index the last message itself; a slice (`messages[-1:]`) is a list and
    # cannot be subscripted with a string key.
    test_data.append(messages[-1]['content'])
    print(messages[-1])

    test_image_list.append(swanlab.Image(origin_image_path, caption=response))

swanlab.log({"Prediction": test_image_list})

# When running inside a Jupyter Notebook, swanlab.finish() must be called to stop SwanLab logging
swanlab.finish()

# Compare the collected predictions with the reference labels
import raw_test
raw_test.acc_rate(raw_test.real_data, test_data)


[1m[33mswanlab[0m[0m: You have already initialized a run, the init function will be ignored
datasets/train/08719.png


  return F.conv3d(


{'role': 'assistant', 'content': '1'}
datasets/train/08732.png
{'role': 'assistant', 'content': '0'}
datasets/train/08741.png
{'role': 'assistant', 'content': '1'}
datasets/train/08742.png
{'role': 'assistant', 'content': '0'}
datasets/train/08743.png
{'role': 'assistant', 'content': '0'}
datasets/train/08761.png
{'role': 'assistant', 'content': '1'}
datasets/train/08793.png
{'role': 'assistant', 'content': '1'}
datasets/train/08795.png
{'role': 'assistant', 'content': '1'}
datasets/train/08917.png
{'role': 'assistant', 'content': '1'}
datasets/train/08924.png
{'role': 'assistant', 'content': '0'}
datasets/train/08934.png
{'role': 'assistant', 'content': '1'}
datasets/train/08937.png
{'role': 'assistant', 'content': '1'}
datasets/train/08941.png
{'role': 'assistant', 'content': '1'}
datasets/train/08954.png
{'role': 'assistant', 'content': '0'}
datasets/train/08957.png
{'role': 'assistant', 'content': '1'}
datasets/train/08961.png
{'role': 'assistant', 'content': '1'}
datasets/train/08

In [None]:
# Ask PyTorch to release the cached GPU memory it is no longer using.
torch.cuda.empty_cache()
