# Annotation from prompt with Grounding DINO

#### Course: Deep Neural Engineering (IM1102)
#### Group: Ellen Cordemans, Ilse Harmers & Sem Pepels

The code in this notebook is partially based on the Hugging Face documentation for Grounding DINO:
https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/grounding-dino.md

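The font file `KFOlCnqEu92Fr1MmEU9vAw.ttf`, used to draw the box labels, must be downloaded from https://fonts.gstatic.com/s/roboto/v30/KFOlCnqEu92Fr1MmEU9vAw.ttf and placed in the notebook's working directory.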

---

In [12]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import torch
from PIL import Image
from roboflow import Roboflow
from torchvision import io, utils
from torchvision import tv_tensors
from torchvision.transforms import v2
from torchvision.transforms.v2 import functional as F
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

Directory structure:

```
+ Y:/ai/projects/IM1102
  + run1
  + ds/grounding_dino_vanilla
      data.yaml
      *.jpg
      *.txt
      + output
```

In [13]:
# All locations are derived from a single root so that moving the project only
# requires changing `project_root`.
project_root = Path("Y:/ai/projects/IM1102")
# NOTE(review): `yrun_dir` looks like a typo for `run_dir`; the name is kept
# because other cells may reference it.
yrun_dir = project_root / "run1"
# Folder holding data.yaml, the input *.jpg images and the generated *.txt labels.
dataset_dir = project_root / "ds" / "grounding_dino_vanilla"

In [14]:
# Pretrained checkpoint identifier passed to `from_pretrained`.
# Alternative checkpoint: "IDEA-Research/grounding-dino-tiny"
model_id = "IDEA-Research/grounding-dino-base"
# Device on which the model and its inputs are placed.
device = "cpu"

# The processor prepares the image/text inputs and post-processes the outputs.
processor = AutoProcessor.from_pretrained(model_id)
model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id)
model = model.to(device)

In [15]:
# The Roboflow API key is read from the environment so that it is never stored
# in the notebook (or in its saved outputs).
roboflow_api_key = os.environ.get("ROBOFLOW_API_KEY")
if not roboflow_api_key:
    raise RuntimeError(
        "Set the ROBOFLOW_API_KEY environment variable before running this cell."
    )

rf = Roboflow(api_key=roboflow_api_key)
# Project that receives the annotated images.
project = rf.workspace().project("facadeparsingtest")

loading Roboflow workspace...
loading Roboflow project...


In [16]:
def upload_annotated_image(imgfn, lblfn):
    """Upload one image together with its annotation file to the Roboflow project.

    Args:
        imgfn: Path of the image to upload.
        lblfn: Path of the annotation (label) file belonging to the image.

    The label map is always ``data.yaml`` in ``dataset_dir``. It is passed as a
    string on purpose: per the original author's note, it is only opened as a
    file when it is given as a string.
    """
    labelmap_path = str(dataset_dir / "data.yaml")
    project.single_upload(
        image_path=imgfn,
        annotation_path=lblfn,
        annotation_labelmap=labelmap_path,
    )

In [19]:
def run_model(imgfn, lblfn, outimgfn):
    """Annotate one image with Grounding DINO and save labels plus a preview.

    Runs zero-shot detection on ``imgfn`` with the prompts "0", "door",
    "front yard" and "window", writes the detections to ``lblfn`` as sorted
    YOLO-style lines (``class x_center y_center width height``, coordinates
    divided by the image width/height) and saves a copy of the image with the
    detected boxes drawn on it to ``outimgfn``.

    Detections whose text label is not exactly one of the prompts (the model
    can return empty or merged phrases) are drawn in black on the preview but
    are left out of the label file, because they have no class id.

    Args:
        imgfn: Path of the input image.
        lblfn: Path of the label file to write (overwritten if it exists).
        outimgfn: Path of the preview image to write; its parent folder is
            created when missing.

    Returns:
        The tuple ``(imgfn, lblfn)``, unchanged, for the subsequent upload.
    """
    # Class ids as used in the label files; "0" is kept as the first prompt so
    # that the remaining prompts keep their ids 1..3.
    class2name = {'0': '0', '1': 'door', '2': 'front yard', '3': 'window'}
    name2class = {name: cls for cls, name in class2name.items()}
    name2color = {'door': 'green', 'window': 'red', 'front yard': 'blue'}
    textlabels = [list(class2name.values())]

    # Read the pixels and release the file handle right away. Converting to RGB
    # keeps the drawing step working for images stored in other modes.
    with Image.open(imgfn) as src:
        image = src.convert("RGB")
    w, h = image.size

    inputs = processor(images=image, text=textlabels, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model(**inputs)
    results = processor.post_process_grounded_object_detection(
        outputs, inputs.input_ids, threshold=0.4, text_threshold=0.3, target_sizes=[(h, w)])
    result = results[0]

    colors = []
    lines = []
    for box, label in zip(result["boxes"], result["text_labels"]):
        # Every box gets a colour so the preview shows all detections.
        colors.append(name2color.get(label, 'black'))
        class_id = name2class.get(label)
        if class_id is None:
            # Unknown phrase: no class id exists, so it cannot be written as a label.
            continue
        x1, y1, x2, y2 = box.tolist()
        box_w = x2 - x1
        box_h = y2 - y1
        lines.append(' '.join([
            class_id,
            "{:.6f}".format((x1 + box_w / 2) / w),
            "{:.6f}".format((y1 + box_h / 2) / h),
            "{:.6f}".format(box_w / w),
            "{:.6f}".format(box_h / h),
        ]))
    Path(lblfn).write_text("\n".join(sorted(lines)))

    viz = utils.draw_bounding_boxes(F.pil_to_tensor(image), boxes=result['boxes'], labels=result['text_labels'],
                                    font='KFOlCnqEu92Fr1MmEU9vAw.ttf', font_size=25, colors=colors, width=4)
    # Saving fails when the target folder does not exist yet, so create it first.
    Path(outimgfn).parent.mkdir(parents=True, exist_ok=True)
    F.to_pil_image(viz).save(outimgfn)
    return imgfn, lblfn

In [21]:
# Annotate every JPEG in the dataset folder and upload it, with its generated
# label file, to Roboflow.
output_dir = dataset_dir / "output"
# Create the preview folder up front so saving the annotated images cannot
# fail on a missing directory.
output_dir.mkdir(parents=True, exist_ok=True)

flist = sorted(dataset_dir.glob('*.jpg'))
for imgfn in flist:
    print(imgfn)
    lblfn    = dataset_dir / (imgfn.stem + '.txt')
    outimgfn = output_dir / (imgfn.stem + '.png')
    imgfn, lblfn = run_model(imgfn, lblfn, outimgfn)
    upload_annotated_image(imgfn, lblfn)

Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\40294018_337_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\42087266_279_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\42132455_415_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\42340258_879_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\42340258_880_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\42785832_417_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43407668_062_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43429591_815_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43491751_938_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43545600_929_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43610079_438_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43625880_858_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_dino_vanilla\43626026_723_1440x960.jpg
Y:\ai\projects\IM1102\ds\grounding_din