In [None]:
import os

import soundfile as sf
import torch
from PIL import Image
from qwen_omni_utils import process_mm_info
from transformers import (
    BitsAndBytesConfig,
    Qwen2_5OmniForConditionalGeneration,
    Qwen2_5OmniProcessor,
)

model_name = "Qwen/Qwen2.5-Omni-7B"
# Alternative checkpoint: "Qwen/Qwen2.5-Omni-7B-GPTQ-Int4".
# NOTE(review): that checkpoint is presumably already quantized and should be
# loaded WITHOUT the bitsandbytes config below; mixing the two looks like the
# source of "'BitsAndBytesConfig' object has no attribute
# 'get_loading_attributes'" -- confirm before switching.

# 4-bit quantization settings. Passing `load_in_4bit=...` straight to
# `from_pretrained` is deprecated; the settings belong in a
# `BitsAndBytesConfig` handed over through `quantization_config`.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # enable 4-bit quantization
    bnb_4bit_quant_type="nf4",              # NF4 quantization type
    bnb_4bit_use_double_quant=True,         # double quantization for extra compression
    bnb_4bit_compute_dtype=torch.bfloat16,  # run the matmuls in bfloat16
)

# Load the model with the quantization config above.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    model_name,
    device_map="auto",                        # let accelerate place the weights (multi-GPU aware)
    quantization_config=quantization_config,
    attn_implementation="flash_attention_2",  # use Flash Attention 2
)
model.disable_talker()  # speech output is not needed, so disable the Talker

# Load the processor that matches the checkpoint.
processor = Qwen2_5OmniProcessor.from_pretrained(model_name)

# Image preprocessing: force RGB, shrink, and write the copy that the
# conversation below refers to by path.
img_path = "../img/BlueUp1.jpg"
with Image.open(img_path) as source_image:
    # The context manager closes the file handle; convert()/resize() return
    # independent in-memory copies, so `image` stays valid afterwards.
    image = source_image.convert("RGB").resize((224, 224))  # or (384, 384)
image.save("resized.jpg")

# Build the chat-style input.
conversation = [
    {
        "role": "system",
        "content": [{"type": "text", "text": "You are a helpful assistant that can understand images and answer questions."}],
    },
    {
        "role": "user",
        "content": [
            {"type": "image", "image": "resized.jpg"},
            {"type": "text", "text": "Describe objects and their relative locations in details: "}
        ],
    },
]

# Multimodal preprocessing. The same flag is reused for the chat template
# helpers, the processor and generate() so the three can never disagree.
USE_AUDIO_IN_VIDEO = False
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

# Build the input tensors.
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = inputs.to(model.device).to(model.dtype)  # move to the model's device and dtype

# Generate text only (no audio) with nucleus sampling.
text_ids = model.generate(
    **inputs,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
    return_audio=False,
    max_new_tokens=20,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    repetition_penalty=1.1,
)

# Decode the first sequence of the batch and print it.
output_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
print(output_text)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Unrecognized keys in `rope_scaling` for 'rope_type'='default': {'mrope_section'}


AttributeError: 'BitsAndBytesConfig' object has no attribute 'get_loading_attributes'

In [None]:
from PIL import Image
import matplotlib.pyplot as plt


def inference_img(img_path,
                system_prompt="You are a helpful assistant that can understand images and answer questions.",
                user_prompt="Describe objects and their relative locations in details: ",
                resize_location="resized.jpg",
                resieze_size=(224, 224)):
    """Resize an image, show it, and print the model's answer to a prompt about it.

    Relies on the module-level ``processor``, ``model`` and ``process_mm_info``
    created in an earlier cell.

    Args:
        img_path: Path of the image to load.
        system_prompt: Text of the system message.
        user_prompt: Text of the user message placed after the image.
        resize_location: Path the resized RGB copy is written to; the
            conversation refers to the image through this path.
        resieze_size: ``(width, height)`` passed to ``Image.resize``. The
            spelling is kept as-is because it is part of the call signature.

    Returns:
        None. The decoded text is printed.
    """
    # Close the source file as soon as the in-memory resized copy exists.
    with Image.open(img_path) as source_image:
        image = source_image.convert("RGB").resize(resieze_size)
    image.save(resize_location)

    # Show the resized image that the model will actually receive.
    fig, ax = plt.subplots()
    ax.imshow(image)
    ax.axis('off')  # hide the axes
    ax.set_title("Resized Image")
    plt.show()

    conversation = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": resize_location},
                {"type": "text", "text": user_prompt}
            ],
        },
    ]

    # Image-only input: there is no video, so no audio track to extract.
    use_audio_in_video = False

    # Prepare the inference inputs.
    text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audios, images, videos = process_mm_info(conversation, use_audio_in_video=use_audio_in_video)
    print(f"audios: {audios}, images: {images}, videos: {videos}")
    inputs = processor(
        text=text,
        audio=audios,
        images=images,
        videos=videos,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=use_audio_in_video,
    )
    inputs = inputs.to(model.device).to(model.dtype)

    text_ids = model.generate(
        **inputs,
        use_audio_in_video=use_audio_in_video,
        return_audio=False,                 # text only, no audio output
        do_sample=True,                     # sample instead of greedy decoding
        temperature=0.5,                    # controls diversity
        top_p=0.9,                          # nucleus sampling
        repetition_penalty=1.1,             # discourage repeated output
    )

    # Decode the first sequence of the batch and print it.
    output_text = processor.batch_decode(text_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(output_text)
    
# Run the pipeline on the first sample image with an explicit user prompt.
inference_img(
    img_path="../img/img1.jpg",
    user_prompt="Describe objects and their relative locations in details:",
)

In [None]:
# Second sample image, using the default prompts.
inference_img("../img/img2.jpg")

In [None]:
# Run the default prompts over the remaining sample images, in order.
for sample_path in (
    "../img/BlueUp1.jpg",
    "../img/GreenUp1.jpg",
    "../img/red cube.png",
):
    inference_img(img_path=sample_path)