In [None]:
import json
import random
from PIL import Image, ImageDraw, ImageFont
import os
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor

# @title inference function
def inference(model,
              processor,
              image_path,
              prompt,
              sys_prompt="You are a helpful assistant.",
              max_new_tokens=2048,
              return_input=False):
    """Ask a vision-language model one question about one image.

    A two-message chat (system + user) is rendered with the processor's chat
    template, the image and rendered text are turned into model inputs, and
    only the newly generated tokens are decoded and returned.

    Args:
        model: Object exposing ``generate(**inputs, max_new_tokens=...)``.
            When it has a ``device`` attribute the inputs are moved there;
            otherwise they are moved to ``'cuda'`` (the previous behaviour).
        processor: Object exposing ``apply_chat_template``, ``__call__`` and
            ``batch_decode``.
        image_path: Path of the image file, as ``str`` or any ``os.PathLike``.
        prompt: User text placed before the image in the user message.
        sys_prompt: Content of the system message.
        max_new_tokens: Forwarded unchanged to ``model.generate``.
        return_input: When true, the processed inputs are returned as well.

    Returns:
        The decoded reply for the first (and only) sequence in the batch, or
        the tuple ``(reply, inputs)`` when ``return_input`` is true.
    """
    # Accept pathlib.Path as well as str; the "file://" concatenation below
    # needs a plain string.
    image_path = os.fspath(image_path)
    image_local_path = "file://" + image_path

    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
                {"type": "text", "text": prompt},
                # Explicit "type" key so the entry is recognised as an image
                # even by templates that dispatch on "type" only.
                {"type": "image", "image": image_local_path},
            ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    print("text:", text)

    # The processor consumes the image inside the ``with`` block, so the
    # underlying file handle can be released as soon as the inputs exist.
    with Image.open(image_path) as image:
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")

    # Follow the model's own placement instead of assuming 'cuda'.
    inputs = inputs.to(getattr(model, "device", "cuda"))

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # generate() returns prompt + continuation; drop the prompt prefix of each
    # sequence so that only the new tokens are decoded.
    generated_ids = [
        full_sequence[len(prompt_ids):]
        for prompt_ids, full_sequence in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )

    if return_input:
        return output_text[0], inputs
    return output_text[0]

In [None]:
# Checkpoint identifier handed to both from_pretrained() calls below.
model_path = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the model in bfloat16 with the FlashAttention-2 attention backend and
# let device_map="auto" decide where the weights are placed.
# NOTE(review): "flash_attention_2" presumably needs the flash-attn package
# installed — confirm for the target environment.
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    model_path,
    device_map="auto",
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

# Matching processor, loaded from the same checkpoint; the notebook uses it for
# chat templating, input preparation and decoding.
processor = AutoProcessor.from_pretrained(model_path)

In [None]:
# Camera frame to query; .strip() removes the newlines that wrap the literal.
image_path = """
/local_data/projects/vlm/QA/structured-data/1d914f73a4a243c3acac50d24f083aac/1533202427548877/CAM_FRONT_raw.jpg
""".strip()

# Multiple-choice distance question; the model is asked to answer with the
# option letter wrapped in a small JSON object.
prompt = """
What is the distance between the trash compactor truck and the ego camera at frame 0 ? choose the closest one among the following options. \nA. 62.6\nB. 71.99\nC. 81.38\nD. 90.77\nE. 100.16\n (just reply the correct option's letter in json {'ans': ans}, unit in meters, frame idx starts from 0)
""".strip()

# Preview only: this copy is shrunk to fit within 640x640 for display.
# inference() below is given the path, so it opens the file itself rather than
# reusing this downsized object.
image = Image.open(image_path)
image.thumbnail((640, 640), resample=Image.Resampling.LANCZOS)
display(image)

# Run inference with the locally loaded Hugging Face model and show the reply.
response = inference(
    model,
    processor,
    image_path,
    prompt,
    max_new_tokens=128,
)
print(response)