# Grounding DINO Test Notebook
Test Grounding DINO on image frames to verify bounding box output for text descriptions.

## 1. Installation (run once)
```bash
git clone https://github.com/IDEA-Research/GroundingDINO.git
cd GroundingDINO
# Install PyTorch first: it must already be present when GroundingDINO is built
pip install torch torchvision opencv-python
pip install -e .

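# Download weights (run from the directory that contains GroundingDINO/,
# so the file lands where WEIGHTS_PATH in the configuration cell expects it)
cd ..
mkdir -p weights
wget -O weights/groundingdino_swint_ogc.pth https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth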
```

In [None]:
# Imports
import cv2
import matplotlib.pyplot as plt
import numpy as np
import torch
from PIL import Image

from groundingdino.util.inference import load_model, load_image, predict, annotate

In [None]:
# Configuration - UPDATE THESE PATHS
# Both values are relative paths, so they are resolved against the
# directory the notebook kernel is running in.
CONFIG_PATH = "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py"
WEIGHTS_PATH = "weights/groundingdino_swint_ogc.pth"

# Load the model a single time; every later cell (and detect_with_text)
# reuses this module-level `model` object.
model = load_model(CONFIG_PATH, WEIGHTS_PATH)
print("Model loaded successfully!")

In [None]:
# Screenshot to run detection on - UPDATE THIS
IMAGE_PATH = "../../img/debug_screenshot.png"

# Caption describing what to detect; each phrase is closed with " ."
TEXT_PROMPT = "the player character . health bar . inventory . button ."

# Cut-offs forwarded to predict() as box_threshold / text_threshold
BOX_THRESHOLD, TEXT_THRESHOLD = 0.35, 0.25

In [None]:
# load_image() hands back two objects: `image_source` is the array that is
# displayed and annotated in this notebook, `image` is what predict() receives.
image_source, image = load_image(IMAGE_PATH)
print(f"Image shape: {image_source.shape}")

# Show the untouched frame as a reference for the annotated result later on
source_fig, source_ax = plt.subplots(figsize=(12, 8))
source_ax.imshow(image_source)
source_ax.set_title("Original Image")
source_ax.axis('off')
plt.show()

In [None]:
# Run inference
# predict() places the model and image on `device`, and its default is "cuda".
# Pick the device explicitly so the cell also runs on machines without a GPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

boxes, logits, phrases = predict(
    model=model,
    image=image,
    caption=TEXT_PROMPT,
    box_threshold=BOX_THRESHOLD,
    text_threshold=TEXT_THRESHOLD,
    device=device,
)

# Raw detector output: one box, one score and one phrase per detection
print(f"Detected {len(boxes)} objects")
print("\nBoxes (normalized cx, cy, w, h):")
print(boxes)
print("\nConfidence scores:")
print(logits)
print("\nMatched phrases:")
print(phrases)

In [None]:
# Turn each normalized (cx, cy, w, h) box into pixel corner coordinates
h, w = image_source.shape[:2]

pixel_boxes = []
for phrase, (cx, cy, bw, bh) in zip(phrases, boxes):
    # Half extents take us from the box centre to its edges
    half_w, half_h = bw / 2, bh / 2
    x1 = int((cx - half_w) * w)
    y1 = int((cy - half_h) * h)
    x2 = int((cx + half_w) * w)
    y2 = int((cy + half_h) * h)
    pixel_boxes.append((x1, y1, x2, y2))
    print(f"{phrase}: ({x1}, {y1}) -> ({x2}, {y2})")

# Second listing: just the raw tuples, one per line
print("\nPixel coordinates (x1, y1, x2, y2):")
for box in pixel_boxes:
    print(box)

In [None]:
# Draw the detections (normalized boxes, scores, phrases) on the source frame
annotated_frame = annotate(
    image_source=image_source,
    boxes=boxes,
    logits=logits,
    phrases=phrases,
)

# The annotated frame is BGR; reverse the channel axis to get RGB for matplotlib
annotated_frame_rgb = annotated_frame[:, :, ::-1]

annotated_fig, annotated_ax = plt.subplots(figsize=(14, 10))
annotated_ax.imshow(annotated_frame_rgb)
annotated_ax.set_title(f"Detected: {phrases}")
annotated_ax.axis('off')
plt.show()

In [None]:
# Save annotated image
output_path = "annotated_output.jpg"

# cv2.imwrite signals failure through its boolean return value instead of
# raising, so check it before claiming the file was written.
if not cv2.imwrite(output_path, annotated_frame):
    raise OSError(f"Could not write annotated image to: {output_path}")
print(f"Saved annotated image to: {output_path}")

## Helper Function for Easy Reuse

In [None]:
def detect_with_text(image_path, text_prompt, box_threshold=0.35, text_threshold=0.25, show=True, device=None):
    """
    Detect objects in an image using a text prompt.

    Relies on the notebook-level `model` created by the model-loading cell.

    Args:
        image_path: Path of the image file handed to load_image().
        text_prompt: Caption passed to predict(), e.g. "button . text label ."
        box_threshold: Forwarded to predict() as box_threshold.
        text_threshold: Forwarded to predict() as text_threshold.
        show: When True and at least one box was found, plot the annotated image.
        device: Device string forwarded to predict(). None (the default) selects
            "cuda" when torch reports an available GPU and "cpu" otherwise.

    Returns:
        pixel_boxes: List of (x1, y1, x2, y2) tuples in pixel coordinates
        phrases: List of matched text phrases
        logits: Confidence scores for each detection
    """
    if device is None:
        # predict() defaults to "cuda", which fails on CPU-only machines
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # Load image
    image_source, image = load_image(image_path)
    h, w = image_source.shape[:2]

    # Run inference
    boxes, logits, phrases = predict(
        model=model,
        image=image,
        caption=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        device=device,
    )

    # Convert normalized (cx, cy, w, h) boxes to pixel corner coordinates
    pixel_boxes = []
    for cx, cy, bw, bh in boxes:
        x1 = int((cx - bw / 2) * w)
        y1 = int((cy - bh / 2) * h)
        x2 = int((cx + bw / 2) * w)
        y2 = int((cy + bh / 2) * h)
        pixel_boxes.append((x1, y1, x2, y2))

    # Optionally display; skipped when nothing was detected
    if show and len(boxes) > 0:
        annotated = annotate(image_source, boxes, logits, phrases)
        fig, ax = plt.subplots(figsize=(12, 8))
        # Annotated frame is BGR; reverse the channel axis for matplotlib
        ax.imshow(annotated[..., ::-1])
        ax.set_title(f"Detected: {phrases}")
        ax.axis('off')
        plt.show()

    return pixel_boxes, phrases, logits.tolist()

In [None]:
# Test the helper function
# The results get their own names: rebinding `boxes` / `phrases` here would
# replace the normalized detections from the inference cell with pixel tuples
# and break the conversion and annotation cells when they are re-run.
ui_boxes, ui_phrases, ui_scores = detect_with_text(
    IMAGE_PATH,  # same screenshot as above; follows the configured path
    "static UI element . button . text label ."
)

for box, phrase, score in zip(ui_boxes, ui_phrases, ui_scores):
    print(f"{phrase} (conf: {score:.2f}): {box}")