In [None]:
import numpy as np
import torch

from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor, YOLOEVPDetectPredictor

# Initialize a YOLOE model
model = YOLOE("yoloe-11s-seg.pt")

# Define visual prompts using bounding boxes and their corresponding class IDs.
# Each box highlights an example of the object you want the model to detect.
visual_prompts = dict(
    bboxes=np.array(
        [
            [221.52, 405.8, 344.98, 857.54],  # Box enclosing person
            [120, 425, 160, 445],  # Box enclosing glasses
        ],
    ),
    cls=np.array(
        [
            0,  # ID to be assigned for person
            1,  # ID to be assigned for glassses
        ]
    ),
)

# Run inference on an image, using the provided visual prompts as guidance
results = model.predict(
    "bus.jpg",
    visual_prompts=visual_prompts,
    predictor=YOLOEVPDetectPredictor,
)


vpe = []
model.predictor.prompts = visual_prompts.copy()
vpe.append(model.predictor.get_vpe("bus.jpg"))

# add more reference images for same objects 
model.predictor.prompts = visual_prompts.copy()
vpe.append(model.predictor.get_vpe("bus.jpg"))

# Normalize the visual prompt embeddings and set them for the model
vpe = torch.nn.functional.normalize(torch.cat(vpe).mean(0, keepdims=True), dim=-1, p=2)

model.set_classes(["object0"], torch.nn.functional.normalize(model.predictor.vpe.mean(dim=0, keepdim=True), dim=-1, p=2))

# You can then run predictions without passing visual prompts:
results = model("zidane.jpg")

Ultralytics 8.3.175  Python-3.10.18 torch-2.8.0+cu126 CUDA:0 (NVIDIA GeForce RTX 3090 Ti, 24564MiB)
YOLOe-11s-seg summary (fused): 137 layers, 13,693,398 parameters, 1,857,958 gradients

image 1/1 c:\Users\jordan\Documents\GitHub\CoralNet-Toolbox\notebooks\bus.jpg: 640x480 3 object0s, 2 object1s, 28.2ms
Speed: 1.9ms preprocess, 28.2ms inference, 1.9ms postprocess per image at shape (1, 3, 640, 480)


AttributeError: 'YOLOEVPDetectPredictor' object has no attribute 'vpe'

In [21]:
model.predictor.get_vpe()

TypeError: YOLOEVPDetectPredictor.get_vpe() missing 1 required positional argument: 'source'

In [15]:
# You can then run predictions without passing visual prompts:
results = model("zidane.jpg")




RuntimeError: shape '[1, 66, -1]' is invalid for input of size 921600

In [None]:
import numpy as np
import torch

from ultralytics import YOLOE
from ultralytics.models.yolo.yoloe import YOLOEVPSegPredictor, YOLOEVPDetectPredictor

# Initialize a YOLOE segmentation model
model = YOLOE("yoloe-11s-seg.pt")

# --- Step 1: Define prompts for reference images ---
# Prompts for the first reference image
visual_prompts_1 = dict(
    bboxes=np.array([
        [221.52, 405.8, 344.98, 857.54],  # A person
    ]),
    cls=np.array([0])  # Class ID for person
)

# Prompts for a second reference image (using bus.jpg again for simplicity)
visual_prompts_2 = dict(
    bboxes=np.array([
        [120, 425, 160, 445],  # A different object, e.g., glasses
    ]),
    cls=np.array([1]) # Class ID for glasses
)

# --- Step 2: Manually create a predictor instance ---
results = model.predict(
    "bus.jpg",
    visual_prompts=visual_prompts,
    predictor=YOLOEVPSegPredictor,
)

# --- Step 3: Generate and collect VPEs from reference images ---
vpe_list = []
print("Generating VPEs from reference images...")

# Generate VPE for the first image
model.predictor.set_prompts(visual_prompts_1)
vpe1 = model.predictor.get_vpe("bus.jpg")
vpe_list.append(vpe1)
print("Generated VPE from the first prompt.")

# Generate VPE for the second image
model.predictor.set_prompts(visual_prompts_2)
vpe2 = model.predictor.get_vpe("bus.jpg")
vpe_list.append(vpe2)
print("Generated VPE from the second prompt.")

# --- Step 4: Aggregate the VPEs ---
if vpe_list:
    # Combine the list of VPE tensors and average them
    final_vpe = torch.cat(vpe_list).mean(dim=0, keepdim=True)
    
    # Normalize the final embedding
    final_vpe = torch.nn.functional.normalize(final_vpe, p=2, dim=-1)
    print("\nSuccessfully combined VPEs into a single, normalized embedding.")

    # --- Step 5: Run inference using the aggregated VPE ---
    
    # Directly set the final tensor as the prompt for the predictor.
    # The `inference` method is designed to use this tensor.
    model.predictor.set_prompts(final_vpe)
    # Or?
    model.set_classes(["person", "bus"], vpe)

    print("Running inference on a new target image ('zidane.jpg')...")
    # Call predict. The predictor will now use the `final_vpe` we just set.
    results = model.predict("zidane.jpg")
    
    print("Inference complete. Results are generated.")
    # You can now process or display the 'results'.
    results[0].show()

else:
    print("No VPEs were generated.")

Ultralytics 8.3.175  Python-3.10.18 torch-2.8.0+cu126 CUDA:0 (NVIDIA GeForce RTX 3090 Ti, 24564MiB)
YOLOe-11s-seg summary (fused): 137 layers, 13,693,398 parameters, 1,857,958 gradients

image 1/1 c:\Users\jordan\Documents\GitHub\CoralNet-Toolbox\notebooks\bus.jpg: 640x480 3 object0s, 2 object1s, 9.7ms
Speed: 2.0ms preprocess, 9.7ms inference, 7.8ms postprocess per image at shape (1, 3, 640, 480)
Generating VPEs from reference images...
Generated VPE from the first prompt.
Generated VPE from the second prompt.

Successfully combined VPEs into a single, normalized embedding.
Running inference on a new target image ('zidane.jpg')...



RuntimeError: shape '[1, 66, -1]' is invalid for input of size 921600

In [42]:
model.set_classes(["person", "bus"], vpe)

TypeError: 'bool' object is not callable

In [40]:
model.is_fused = False

In [41]:
model.is_fused()

TypeError: 'bool' object is not callable