In [1]:
from transformers import Qwen2VLForConditionalGeneration, AutoProcessor
from PIL import Image
import torch

model_name = "Qwen/Qwen2-VL-2B-Instruct"

model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # Efficient memory usage on 40GB GPU
    device_map="auto"
)
processor = AutoProcessor.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm
Fetching 2 files: 100%|██████████| 2/2 [01:11<00:00, 35.68s/it]
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.33it/s]
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
image_path = "images.jpeg"  # Replace with your actual image path in Kubeflow volume

# Load your image
image = Image.open(image_path).convert("RGB")  # Ensure image is in RGB

# Prepare your question and input
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image_url",
                "image_url": {"url": "image"}
            },
            {
                "type": "text",
                "text": "What is wrong with the tactics in the image? Your analysis should help the coach."
            }
        ]
    }
]

In [7]:
# Apply chat template
text_prompt = processor.apply_chat_template(
    messages,
    add_generation_prompt=True
)

# Process inputs
inputs = processor(
    text=[text_prompt],
    images=[image],
    padding=True,
    return_tensors="pt"
)

device = 'cuda'

# Move inputs to GPU explicitly for Kubeflow environment
inputs = {k: v.to(device) for k, v in inputs.items()}
model = model.to(device)

# Generate the response
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=512)

# Decode generated outputs
output_text = processor.batch_decode(
    output_ids,
    skip_special_tokens=True,
    clean_up_tokenization_spaces=True
)

# ========================
# Print the output with user input path
# ========================

print(f"User input image path: {image_path}")
print("Model analysis output:")
print(output_text[0])

User input image path: images.jpeg
Model analysis output:
system
You are a helpful assistant.
user
What is wrong with the tactics in the image? Your analysis should help the coach.
assistant
The image depicts a woman hugging a dog in a field. The woman appears to be smiling and the dog is looking at the camera. There are no apparent issues with the image itself. However, if the coach is looking for a specific analysis or critique, it would depend on the context or the purpose of the image. If the image is meant to convey a positive message about the bond between humans and animals, it might be considered a positive representation. If the image is meant to convey a message about the relationship between humans and their pets, it might be seen as a positive portrayal of pet ownership.
