In [1]:
from transformers import Qwen3VLForConditionalGeneration, AutoProcessor

In [2]:
# Load the checkpoint and its matching processor once; later cells reuse both.
# The checkpoint id is named in one place so the two loads cannot drift apart.
MODEL_ID = "Qwen/Qwen3-VL-2B-Instruct"

# "auto" for dtype and device_map asks the library to choose them
# (default: load the model on the available device(s)).
model = Qwen3VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype="auto",
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(MODEL_ID)

model.safetensors:   0%|          | 0.00/4.26G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/269 [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/390 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

video_preprocessor_config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

chat_template.json: 0.00B [00:00, ?B/s]

In [7]:
def choose_trajectory(
    image_path,
    prompt="Reason and pick the safest trajectory (color) for a car in the above image.",
    system_prompt="You are a helpful and precise assistant for car trajectory evaluation.",
    max_new_tokens=1024,
):
    """Ask the vision-language model to pick the safest trajectory in an image and print its answer.

    Relies on the module-level ``processor`` and ``model`` objects created by the
    loading cell earlier in the notebook.

    Args:
        image_path: Image reference placed in the ``"image"`` entry of the user
            message (the notebook passes a local file path).
        prompt: Text instruction sent alongside the image in the user turn.
        system_prompt: Text of the system turn that frames the assistant's role.
        max_new_tokens: Upper bound on the number of tokens generated for the reply.

    Returns:
        None. The first decoded reply is written to stdout.
    """
    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": system_prompt}],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": image_path},
                {"type": "text", "text": prompt},
            ],
        },
    ]

    # Preparation for inference: render the chat template, tokenize, and return
    # PyTorch tensors, then move them to the model's device.
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # Inference: generation of the output.
    generated_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)

    # Each generated sequence starts with its prompt tokens; slice them off so
    # only the newly generated reply is decoded.
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    print(output_text[0])

In [8]:
# Evaluate the candidate-trajectory overlay for the pedestrian scene.
choose_trajectory(image_path="./candidates/pedestrians.jpg")

Based on the provided image, here is an analysis of the trajectories and the safest path for a car:

-   **Green Arrow (Left):** This trajectory shows a car moving from the left side of the image towards the center, following the path of the pedestrian. This path is relatively clear and appears to be a direct, predictable route. However, it's not the most optimal path as it may be obstructed by the car in the foreground on the left.

-   **Red Arrow (Center):** This trajectory indicates a car moving from the left towards the center, but it is positioned in a way that it could be in the path of the pedestrian. This path is less safe because it could potentially cause a collision or require the car to make a sharp turn, which is not ideal.

-   **Blue Arrow (Right):** This trajectory shows a car moving from the right side of the image towards the center. This path is the most direct and safest, as it avoids the pedestrian and is not obstructed by any other vehicles. It is also the most p