In [1]:
import torch
# ❗️ 主要修改点：导入正确的 AutoModel 类
from transformers import AutoModelForVision2Seq, AutoTokenizer 
from PIL import Image
import os


# --- 设置 (无需改动) ---
model_id = "Qwen/Qwen2.5-VL-3B-Instruct-AWQ"
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"使用的设备: {device}")

# --- 加载模型和分词器 ---
print("正在加载模型和分词器...")
try:
    # 分词器加载无需改动
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)
    
    # ❗️ 主要修改点：使用 AutoModelForVision2Seq
    model = AutoModelForVision2Seq.from_pretrained(
        model_id,
        torch_dtype=torch.float16,
        device_map="auto",
        trust_remote_code=True
    ).eval() # 设置为评估模式
    
    print("模型加载成功！")

except Exception as e:
    print(f"加载模型失败: {e}")
    exit()

# ... 后续的推理代码无需改动 ...

使用的设备: cuda
正在加载模型和分词器...


I have left this message as the final dev message to help you transition.

Important Notice:
- AutoAWQ is officially deprecated and will no longer be maintained.
- The last tested configuration used Torch 2.6.0 and Transformers 4.51.3.
- If future versions of Transformers break AutoAWQ compatibility, please report the issue to the Transformers project.

Alternative:
- AutoAWQ has been adopted by the vLLM Project: https://github.com/vllm-project/llm-compressor

For further inquiries, feel free to reach out:
- X: https://x.com/casper_hansen_
- LinkedIn: https://www.linkedin.com/in/casper-hansen-804005170/



模型加载成功！


In [2]:
# 纯文本对话
# prompt = "你好，请你用生动有趣的语言介绍一下自己。"

# # 构建消息格式
# messages = [{"role": "user", "content": prompt}]

# messages = [
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "image",
#                 "image": "/home/chen/dev/Factory/img/green_cube_2.png", 
#             },
#             {"type": "text", "text": "Describe this image."},
#         ],
#     }
# ]

# messages = [
#     {"role": "system", "content": "you are a helpful assistant."},
#     {"role": "user", "content": [
#             {"type": "text", "text": "Describe this image."},
#             {"image":  "/home/chen/dev/Factory/img/green_cube_2.png"},
#         ]
#     },
# ]

# # 使用分词器的聊天模板处理输入
# text = tokenizer.apply_chat_template(
#     messages,
#     tokenize=False,
#     add_generation_prompt=True
# )
# image = Image.open("/home/chen/dev/Factory/img/green_cube_2.png")
# from transformers import AutoProcessor
# processor = AutoProcessor.from_pretrained(model_id)
# model_inputs = processor(text=[text], images=[image], return_tensors="pt").to(device)

# # print model_inputs content
# print("模型输入内容:")
# print(model_inputs)

from transformers import AutoTokenizer, AutoProcessor
from PIL import Image

# Step 1: 构建 messages 格式
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image"}
        ]
    }
]

# Step 2: 加载图像
image = Image.open("/home/chen/dev/Factory/img/green_cube_2.png")

# Step 3: 加载 tokenizer 和 processor
tokenizer = AutoTokenizer.from_pretrained(model_id)
processor = AutoProcessor.from_pretrained(model_id)

# Step 4: 先生成 prompt 字符串
prompt = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True
)

# Step 5: 将 prompt 字符串 + 图像传入 processor
model_inputs = processor(
    text=prompt,       # ✅ 现在传入的是字符串
    images=image,
    return_tensors="pt"
).to(device)
print("Processor inputs keys:", model_inputs.keys())
# Step 6: 打印模型输入内容（调试用）
print("模型输入内容:")
print(model_inputs)





# messages=[
#     {
#         "role": "system",
#         "content": [{"type":"text","text": "Describe this image."}]},
#     {
#         "role": "user",
#         "content": [
#             {
#                 "type": "image_url",
#                 "min_pixels": 512*28*28,
#                 "max_pixels": 2048*28*28,
#                 # Pass in BASE64 image data. Note that the image format (i.e., image/{format}) must match the Content Type in the list of supported images. "f" is the method for string formatting.
#                 # PNG image:  f"data:image/png;base64,{base64_image}"
#                 # JPEG image: f"data:image/jpeg;base64,{base64_image}"
#                 # WEBP image: f"data:image/webp;base64,{base64_image}"
#                 "image_url": {"url": f"data:image/png;base64,{"/home/chen/dev/Factory/img/green_cube_2.png"}"},
#             },
#             {"type": "text", "text": "Describe this image."},
#         ],
#     }
# ]



# 生成回复
generated_ids = model.generate(
    model_inputs.input_ids,
    max_new_tokens=512
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]



response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

# print("\n--- 纯文本推理 ---")
# print(f"用户: {prompt}")
print(f"模型: {response}")

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Processor inputs keys: dict_keys(['input_ids', 'attention_mask', 'pixel_values', 'image_grid_thw'])
模型输入内容:
{'input_ids': tensor([[151644,   8948,    198,   2610,    525,    264,  10950,  17847,     13,
         151645,    198, 151644,    872,    198,  74785,    419,   2168,     13,
         151652, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655, 151655,
         151655, 151655, 151655, 151653, 151645,    198, 151644,  77091,    198]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]],
       device='cuda:0'), 'pixel_values': tensor([[0.9230, 0.9230, 0.9230,  ..., 0.9656, 0.9656, 0.9656],
        [0.9084, 0.9376, 0.9522,  ..., 0.9230, 0.9088, 0.8945],
        [0.9230, 0.9230, 0.9230,  ..., 0.9230, 0.9230, 0.9230],
        ...,
        [0.7333, 0.7333, 0.

In [3]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ['TORCH_USE_CUDA_DSA'] = '1'

messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg", 
            },
            {"type": "text", "text": "Describe this image."},
        ],
    }
]

from transformers import  AutoProcessor
processor = AutoProcessor.from_pretrained(model_id)
from qwen_vl_utils import process_vision_info

# 准备推理输入
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
inputs = inputs.to(model.device)

# 推理生成输出
generated_ids = model.generate(**inputs, max_new_tokens=128)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
print(output_text)

/pytorch/aten/src/ATen/native/cuda/TensorCompare.cu:112: _assert_async_cuda_kernel: block: [0,0,0], thread: [0,0,0] Assertion `probability tensor contains either `inf`, `nan` or element < 0` failed.


RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
