In [None]:
!pwd
!ls

In [2]:

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# default: Load the model on the available device(s)
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    # "Qwen/Qwen2.5-VL-7B-Instruct",
    "Qwen/Qwen2.5-VL-3B-Instruct", 
    torch_dtype="auto", 
    device_map="auto"
)

# print model torch_dtype and device_map
print(f"Model torch_dtype: {model.dtype}")
print(f"Model device_map: {model.device_map}")

# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
# model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
#     "Qwen/Qwen2.5-VL-7B-Instruct",
#     torch_dtype=torch.bfloat16,
#     attn_implementation="flash_attention_2",
#     device_map="auto",
# )

# default processor
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# The default range for the number of visual tokens per image in the model is 4-16384.
# You can set min_pixels and max_pixels according to your needs, such as a token range of 256-1280, to balance performance and cost.
# min_pixels = 256*28*28
# max_pixels = 1280*28*28
# processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct", min_pixels=min_pixels, max_pixels=max_pixels)

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# Preparation for inference
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# Inference: Generation of the output
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

config.json:   0%|          | 0.00/1.37k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/65.4k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.53G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/3.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/216 [00:00<?, ?B/s]

Model torch_dtype: torch.bfloat16


AttributeError: 'Qwen2_5_VLForConditionalGeneration' object has no attribute 'device_map'

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor
from qwen_vl_utils import process_vision_info

# 加载模型：启用 4-bit 量化（NF4 类型）
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-7B-Instruct",
    device_map="auto",                      # 自动分配设备（支持多 GPU）
    load_in_4bit=True,                      # 启用 4-bit 量化
    bnb_4bit_quant_type="nf4",              # NF4 是一种适合预训练权重的量化类型
    bnb_4bit_use_double_quant=True,         # 双重量化进一步压缩
    bnb_4bit_compute_dtype=torch.bfloat16   # 计算时使用 bfloat16 提升性能
)

# 加载处理器（processor）
processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-7B-Instruct")

# 示例输入消息
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", 
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# 预处理文本部分
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# 处理视觉信息（提取图像/视频）
image_inputs, video_inputs = process_vision_info(messages)

# 构建最终输入张量
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)  # 移动到模型所在设备（如 GPU）

# 生成输出
generated_ids = model.generate(**inputs, max_new_tokens=128)

# 去除 prompt 部分，仅保留生成内容
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# 解码输出
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# 打印结果
print(output_text)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.


['The image depicts a serene beach scene during what appears to be either sunrise or sunset, as indicated by the warm, golden light. A woman is sitting on the sand, interacting with a light-colored dog that is wearing a harness. The woman is smiling and appears to be giving the dog a treat or a small object, which the dog is reaching for with its front paw. The ocean waves can be seen in the background, adding to the tranquil atmosphere of the setting. The overall mood of the image is peaceful and joyful, capturing a moment of connection between the woman and her pet.']


In [2]:
# 示例输入消息
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", 
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

# 预处理文本部分
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# 处理视觉信息（提取图像/视频）
image_inputs, video_inputs = process_vision_info(messages)

# 构建最终输入张量
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)  # 移动到模型所在设备（如 GPU）

# 生成输出
generated_ids = model.generate(**inputs, max_new_tokens=128)

# 去除 prompt 部分，仅保留生成内容
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# 解码输出
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# 打印结果
print(output_text)

['The image depicts a serene beach scene during what appears to be either sunrise or sunset, as indicated by the warm, golden light. A woman is sitting on the sand, interacting with a light-colored dog that is wearing a harness. The woman is smiling and appears to be giving the dog a treat or a small object, which the dog is reaching for with its front paw. The ocean waves can be seen in the background, adding to the tranquil atmosphere of the setting. The overall mood of the image is peaceful and joyful, capturing a moment of connection between the woman and her pet.']


In [3]:
# 示例输入消息
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", 
            },
            {"type": "text", "text": "Describe this image in brief."},
        ],
    }
]

# 预处理文本部分
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)

# 处理视觉信息（提取图像/视频）
image_inputs, video_inputs = process_vision_info(messages)

# 构建最终输入张量
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)  # 移动到模型所在设备（如 GPU）

# 生成输出
generated_ids = model.generate(**inputs, max_new_tokens=128)

# 去除 prompt 部分，仅保留生成内容
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]

# 解码输出
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# 打印结果
print(output_text)

['The image shows a person sitting on the beach, interacting with a light-colored dog. The person is wearing a plaid shirt and dark pants, and they appear to be giving a treat or a small object to the dog. The dog is wearing a harness and is sitting attentively, reaching out with its paw towards the person. The background features the ocean with gentle waves and a soft, warm light suggesting it might be late afternoon or early evening. The overall atmosphere is serene and joyful.']
