In [1]:
from qwen3vl.modeling_qwen3_vl import Qwen3VLForConditionalGeneration
from qwen3vl.processing_qwen3_vl import Qwen3VLProcessor
from transformers import AutoImageProcessor, AutoTokenizer, AutoProcessor

import os

# Location of the local Qwen3-VL checkpoint, defined once so the model, tokenizer
# and image processor are guaranteed to load from the same directory.
# Override with the QWEN3_VL_MODEL_PATH environment variable; the default is the
# path this notebook was originally run with.
MODEL_PATH = os.environ.get(
    "QWEN3_VL_MODEL_PATH", "/scratch/yh5961/RLSD/SpecForge/qwen3-vl"
)

# Load the model, passing "auto" for both the dtype and the device map.
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_PATH, torch_dtype="auto", device_map="auto"
)

# Manual instantiation without video processor
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
image_processor = AutoImageProcessor.from_pretrained(MODEL_PATH)

# Assemble the processor by hand from the two components above, reusing the
# chat template that ships with the tokenizer.
processor = Qwen3VLProcessor(
    image_processor=image_processor,
    tokenizer=tokenizer,
    video_processor=None,
    chat_template=tokenizer.chat_template,
)

  import pynvml  # type: ignore[import]
  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.30s/it]


In [2]:
# Imports used by this cell are grouped at the top so its dependencies are visible.
from transformers import AutoImageProcessor, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Borrow the chat template from the published Qwen2.5-VL processor and rebuild
# the processor around the already-loaded tokenizer / image processor with it.
aprocessor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

processor = Qwen3VLProcessor(
    image_processor=image_processor,
    tokenizer=tokenizer,
    video_processor=None,
    chat_template=aprocessor.chat_template,
)

# Single-turn conversation: one image followed by a text instruction.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "image": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg",
            },
            {"type": "text", "text": "Tell me about this image."},
        ],
    }
]

# Preparation for inference: render the prompt string, collect the vision
# inputs referenced by the messages, then tokenize / preprocess both together.
text = processor.apply_chat_template(
    messages, tokenize=False, add_generation_prompt=True
)
image_inputs, video_inputs = process_vision_info(messages)
inputs = processor(
    text=[text],
    images=image_inputs,
    videos=video_inputs,
    padding=True,
    return_tensors="pt",
)
# The model was loaded with device_map="auto", so follow the model's own device
# instead of assuming "cuda".
inputs = inputs.to(model.device)

# Greedy decoding (do_sample=False).
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
# generate() returns prompt + completion; slice off the prompt tokens per sample.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.tokenizer.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)

# Print the decoded list wrapped in ANSI bold-green escape codes.
print("\033[1;32m" + str(output_text) + "\033[0m")

The image processor of type `Qwen2VLImageProcessor` is now loaded as a fast processor by default, even if the model checkpoint was saved with a slow processor. This is a breaking change and may produce slightly different outputs. To continue using the slow processor, instantiate this class with `use_fast=False`. Note that this behavior will be extended to all models in a future release.
You have video processor config saved in `preprocessor.json` file which is deprecated. Video processor configs should be saved in their own `video_preprocessor.json` file. You can rename the file or load and save the processor back which renames it automatically. Loading from `preprocessor.json` will be removed in v5.0.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[1;32m['Of course, here is a detailed description of the image.\n\nThis is a heartwarming and serene photograph capturing a moment of connection between a woman and her dog on a beach at sunset or sunrise.\n\n---\n\n### Scene Description\n\n- **Main Subjects:** The central focus is on a young woman with long dark hair and a light-colored Labrador Retriever.\n- **Setting:** They are sitting together on a wide expanse of sand in front of the ocean. The background features gentle waves rolling onto the shore under a bright, hazy sky.\n- **Lighting:** The scene is bathed in warm, golden sunlight, characteristic of the "golden hour" just after sunrise or before sunset. This creates a soft, dreamy atmosphere and casts a beautiful lens flare from the sun on the right side of the frame.\n\n### Interaction\n\nThe most striking element of the photo is the interaction between the two subjects:\n- The woman is sitting cross-legged in the sand, smiling warmly as she looks at the dog.\n- The dog is

In [5]:
# Imports used by this cell are grouped at the top so its dependencies are visible.
from PIL import Image
import requests

# Processor built without a chat template: the prompt below is written out by
# hand with the special tokens in place.
processor = Qwen3VLProcessor(
    image_processor=image_processor,
    tokenizer=tokenizer,
    video_processor=None,
)

text = "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>Tell me about this image.<|im_end|>\n<|im_start|>assistant\n"

# Process the image: download it, failing fast on a hung connection or an HTTP
# error status rather than handing a non-image body to PIL.
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

inputs = processor(
    text=[text],
    images=[image],
    videos=None,
    padding=True,
    return_tensors="pt",
)
# The model was loaded with device_map="auto", so follow the model's own device
# instead of assuming "cuda".
inputs = inputs.to(model.device)

# Inference: Generation of the output (greedy decoding, do_sample=False)
generated_ids = model.generate(**inputs, max_new_tokens=1024, do_sample=False)
# generate() returns prompt + completion; slice off the prompt tokens per sample.
generated_ids_trimmed = [
    out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
output_text = processor.tokenizer.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)
# Print the decoded list wrapped in ANSI bold-green escape codes.
print("\033[1;32m" + str(output_text) + "\033[0m")

The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


[1;32m['Of course, here is a detailed description of the image.\n\nThis is a heartwarming and serene photograph capturing a tender moment between a woman and her dog on a beach at sunset or sunrise.\n\n### Scene Description\n\n- **Setting:** The scene is set on a wide, sandy beach with the ocean in the background. Gentle waves are visible breaking near the shore.\n- **Lighting:** The lighting is soft and warm, characteristic of the "golden hour" just after sunrise or before sunset. This creates a beautiful lens flare effect on the right side of the frame, bathing the subjects in a warm glow.\n- **Atmosphere:** The overall mood is peaceful, joyful, and intimate. The vastness of the sky and sea contrasts with the close bond shared by the two main subjects.\n\n---\n\n### Subjects\n\nThe central focus of the image is a woman and a large dog sitting together on the sand.\n\n- **The Woman:**\n    - She is sitting cross-legged on the sand, turned towards the dog.\n    - She has long, dark ha