# Grounding DINO - Batched Half Precision Inference

# Prepare Environment

In [None]:
import torch
from PIL import Image
import io
import os
import supervision as sv
import numpy as np
import requests
import cv2

# Grounding DINO
from groundingdino.util.inference import BatchedModel
import torchvision.transforms.functional as F
from huggingface_hub import hf_hub_download

# If you have multiple GPUs, you can set the GPU to use here.
# The default is to use the first GPU, which is usually GPU 0.
# NOTE(review): this assignment runs after `import torch` above; it presumably
# only takes effect if CUDA has not been initialized yet -- confirm, or move
# it before the torch import.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load Grounding DINO model

# Load demo image

In [None]:
def download_image(url, image_file_path):
    """Download the image at ``url`` and save it to ``image_file_path``.

    The response body is decoded with PIL and re-saved, so the payload must
    be something ``Image.open`` can read.

    Args:
        url: Address of the image to fetch (4 second request timeout).
        image_file_path: Destination path handed to ``Image.save``.

    Raises:
        AssertionError: If the HTTP status code is not ``requests.codes.ok``.
    """
    r = requests.get(url, timeout=4.0)
    if r.status_code != requests.codes.ok:
        # Raise explicitly instead of `assert False`: assert statements are
        # removed under `python -O`, which would let a failed download fall
        # through to Image.open() on an error page.
        raise AssertionError('Status code error: {}.'.format(r.status_code))

    with Image.open(io.BytesIO(r.content)) as im:
        im.save(image_file_path)

    print('Image downloaded from url: {} and saved to: {}.'.format(url, image_file_path))

def load_image(image_path):
    """Load an image file as an RGB array plus its tensor form.

    Args:
        image_path: Path of the image file to open with PIL.

    Returns:
        A ``(image, image_tensor)`` tuple: ``image`` is the RGB image as a
        NumPy array, and ``image_tensor`` is that array passed through
        torchvision's ``to_tensor`` (presumably CHW float in [0, 1] -- see the
        torchvision docs to confirm).
    """
    # Open inside a context manager so the underlying file handle is closed
    # as soon as the pixel data has been converted; convert() returns a new
    # image that no longer depends on the open file.
    with Image.open(image_path) as im:
        image_source = im.convert("RGB")
    image = np.asarray(image_source)
    image_tensor = F.to_tensor(image)
    return image, image_tensor

# Path of the demo image that the rest of the notebook runs on.
local_image_path = "assets/demo4.jpg"
# Optional: fetch the image first. `image_url` is not defined in the cells
# shown here, so assign it before uncommenting the next line.
#download_image(image_url, local_image_path)
image_source, image_tensor = load_image(local_image_path)
# Last expression of the cell: rebuilds a PIL image from the RGB array.
Image.fromarray(image_source)

# Run Grounding DINO for detection

In [None]:
# Fetch the Grounding DINO config and checkpoint from the Hugging Face Hub.
# Alternatively, download the files yourself and point `cache_config_file`
# and `cache_file` at the local copies.
ckpt_repo_id = "ShilongLiu/GroundingDINO"
ckpt_filename = "groundingdino_swint_ogc.pth"
ckpt_config_filename = "GroundingDINO_SwinT_OGC.cfg.py"
device = "cuda"

# The results are passed to BatchedModel below as the config / checkpoint
# paths; presumably each call returns the local path of the cached file --
# confirm against the huggingface_hub docs.
cache_config_file = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_config_filename)
cache_file = hf_hub_download(repo_id=ckpt_repo_id, filename=ckpt_filename)


## Single Precision

In [None]:
# Number of images in the batch and the detection thresholds forwarded to the
# model call in the following cells.
batch = 2
box_threshold = 0.3
text_threshold = 0.25
iou_threshold = 0.5

# Batch of prompts: one list of class phrases per batch entry (the display
# loop at the end pairs each list with one set of detections).
text_prompt = [
    ["Black dog", "Beige dog"],
    ["Dog", "Stick"],
]

dtype = "float32"

# Repeat image BATCH number of times.
# Start from a single 3-D image even when this cell is re-run: after the first
# run `image_tensor` is already 4-D, and adding another leading axis would
# make the 4-argument expand() below fail.
image_tensor = image_tensor if image_tensor.ndim == 3 else image_tensor[0]
image_tensor = image_tensor.to(device=device).to(dtype=getattr(torch, dtype))
# expand() broadcasts the singleton batch axis without copying the image.
image_tensor = image_tensor[None, ...].expand(batch, -1, -1, -1)

# Building GroundingDINO inference model
grounding_dino_model = BatchedModel(
    model_config_path=cache_config_file,
    model_checkpoint_path=cache_file,
    device=device,
    dtype=dtype,
)


In [None]:
%%timeit -n 10
# Timed body: one batched forward pass plus moving boxes and confidences to
# NumPy on the host.
# NOTE(review): the .cpu() copies presumably force the GPU work to finish, so
# the measurement should cover the full inference -- confirm.
with torch.no_grad():
    bbox_batch, conf_batch, class_id_batch  = grounding_dino_model(
        image_batch=image_tensor,
        text_prompts=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        nms_threshold=iou_threshold
    )
    # One array per batch entry; class_id_batch is left as returned.
    bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]
    conf_batch = [conf.cpu().numpy() for conf in conf_batch]

## Half Precision

In [None]:
# Switch the benchmark to half precision.
dtype = "float16"

# Cast the input (as left by the previous cells) to the new dtype on the
# target device in a single call.
image_tensor = image_tensor.to(device=device, dtype=getattr(torch, dtype))

# Rebuild the GroundingDINO inference model from the same config and
# checkpoint, now with the half-precision dtype.
grounding_dino_model = BatchedModel(
    model_checkpoint_path=cache_file,
    model_config_path=cache_config_file,
    dtype=dtype,
    device=device,
)

In [None]:
%%timeit -n 10
# Timed body: the same batched forward pass as the single-precision benchmark,
# now using the model and input set up in the half-precision cell.
# NOTE(review): the .cpu() copies presumably force the GPU work to finish, so
# the measurement should cover the full inference -- confirm.
with torch.no_grad():
    bbox_batch, conf_batch, class_id_batch  = grounding_dino_model(
        image_batch=image_tensor,
        text_prompts=text_prompt,
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        nms_threshold=iou_threshold
    )
    # One array per batch entry; class_id_batch is left as returned.
    bbox_batch = [bbox.cpu().numpy() for bbox in bbox_batch]
    conf_batch = [conf.cpu().numpy() for conf in conf_batch]

# Display result

In [None]:
# Run the model once more outside %%timeit so the results are bound to
# notebook-level names for the display cell below.
with torch.no_grad():
    bbox_batch, conf_batch, class_id_batch = grounding_dino_model(
        text_prompts=text_prompt,
        image_batch=image_tensor,
        nms_threshold=iou_threshold,
        text_threshold=text_threshold,
        box_threshold=box_threshold,
    )
    # Move boxes first, then confidences, to host NumPy arrays (one array per
    # batch entry); class_id_batch is left as returned by the model.
    bbox_batch, conf_batch = (
        [tensor.cpu().numpy() for tensor in per_image]
        for per_image in (bbox_batch, conf_batch)
    )

In [None]:
from IPython.display import display
def annotate(image_source, boxes, logits, phrases) -> np.ndarray:
    """Draw labelled detection boxes on an image.

    Args:
        image_source: Image array in RGB channel order.
        boxes: Box coordinates passed to ``sv.Detections`` as ``xyxy``.
        logits: One score per box, shown with two decimals in the label.
        phrases: One text label per box.

    Returns:
        The annotated image with its last (channel) axis reversed back from
        the BGR order used for drawing.
    """
    captions = [f"{name} {score:.2f}" for name, score in zip(phrases, logits)]
    # Draw on a BGR conversion of the input image.
    bgr_canvas = cv2.cvtColor(image_source, cv2.COLOR_RGB2BGR)
    drawn = sv.BoxAnnotator().annotate(
        scene=bgr_canvas,
        detections=sv.Detections(xyxy=boxes),
        labels=captions,
    )
    # Reverse the channel axis so callers get RGB again.
    return drawn[..., ::-1]


# Render one annotated copy of the demo image per batch entry. The loop index
# was never used, so iterate over the zipped results directly.
for bbox, conf, class_id, class_label in zip(bbox_batch, conf_batch, class_id_batch, text_prompt):
    annotated_frame = annotate(
        image_source=image_source,
        boxes=bbox,
        logits=conf,
        # Map each detection's class index to its prompt phrase.
        # NOTE(review): assumes class_id is usable as a NumPy integer index
        # (e.g. a CPU integer array/tensor) -- confirm against BatchedModel.
        phrases=np.array(class_label)[class_id],
    )

    display(Image.fromarray(annotated_frame))