In [None]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Zero-shot object detection with Grounding DINO: load the model, fetch one
# sample image, run inference for a list of free-text labels, and print every
# detection that survives the score thresholds.
model_id = "IDEA-Research/grounding-dino-tiny"
# Fall back to CPU so the cell also runs on runtimes without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors here instead of as an opaque PIL
# "cannot identify image file" failure further down.
response = requests.get(image_url, stream=True, timeout=30)
response.raise_for_status()
# convert("RGB") forces the lazily-opened image to be decoded from the stream
# now and guarantees three channels regardless of the source file's mode.
image = Image.open(response.raw).convert("RGB")

# Check for cats and remote controls (one inner list of labels per image).
text_labels = [["a cat", "a remote control"]]

inputs = processor(images=image, text=text_labels, return_tensors="pt").to(device)
# Inference only: no gradients are needed.
with torch.no_grad():
    outputs = model(**inputs)

results = processor.post_process_grounded_object_detection(
    outputs,
    inputs.input_ids,
    box_threshold=0.4,
    text_threshold=0.3,
    # PIL reports size as (width, height); reverse it to (height, width).
    target_sizes=[image.size[::-1]],
)

# A single image was passed in, so only the first result entry is relevant.
result = results[0]
for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
    box = [round(x, 2) for x in box.tolist()]
    print(f"Detected {label} with confidence {round(score.item(), 3)} at location {box}")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/457 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.24k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.64k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/689M [00:00<?, ?B/s]

Detected a cat with confidence 0.479 at location [344.7, 23.11, 637.18, 374.27]
Detected a cat with confidence 0.438 at location [12.27, 51.91, 316.86, 472.43]
Detected a remote control with confidence 0.476 at location [38.59, 70.01, 176.78, 118.17]




In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
# Visualize the detections: draw each predicted box and its "label: score"
# caption on top of the input image.
# Relies on `results` and `image` produced by the detection cell.
result = results[0]

# Image.convert returns a converted copy, so the original `image` is left
# untouched and no separate .copy() call is needed.
draw = image.convert("RGB")

fig, ax = plt.subplots(1, 1, figsize=(8, 6))
ax.imshow(draw)

# One rectangle and one caption per detection; with no detections the loop is
# skipped and the plain image is shown.
for box, score, label in zip(result["boxes"], result["scores"], result["labels"]):
    # Boxes are unpacked as corner coordinates (x1, y1, x2, y2), while
    # Rectangle expects an anchor point plus width and height.
    x1, y1, x2, y2 = [round(x) for x in box.tolist()]
    width = x2 - x1
    height = y2 - y1
    rect = patches.Rectangle((x1, y1), width, height, linewidth=2, edgecolor="red", facecolor="none")
    ax.add_patch(rect)
    # Draw through the Axes object (not the pyplot state machine) so the
    # caption always lands on this figure; place it slightly above the box.
    ax.text(x1, y1 - 5, f"{label}: {score.item():.2f}", color="red", fontsize=10, weight="bold")

ax.axis("off")
plt.show()