# **Grounding DINO - Abel**

### Imports and config

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Mount Google Drive; the dataset is read from /content/drive/MyDrive/... further below.
from google.colab import drive
drive.mount('/content/drive')

# Force the reported preferred encoding to UTF-8.
# NOTE(review): presumably a workaround for Colab `!` shell commands failing on a
# non-UTF-8 locale -- confirm it is still needed.
import locale
locale.getpreferredencoding = lambda: "UTF-8"

# NOTE(review): packages are installed without version pins, so a fresh runtime may
# pull different versions than the ones recorded in the output below.
!pip install supervision fastapi kaleido python-multipart uvicorn

import numpy as np

import os, cv2, bisect
import supervision as sv
import torch
import torch.nn as nn

from torch.utils.data import DataLoader
from torchsummary import summary
from torchvision import transforms
import torch.optim as optim

from PIL import Image

# Only the torch RNG is seeded here; numpy / Python `random` are not.
random_seed = 42
torch.manual_seed(random_seed)

# Use the first CUDA device when available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Working directory at start-up; used below as the root for the GroundingDINO clone and weights.
HOME = os.getcwd()
print(HOME)

Mounted at /content/drive
Collecting supervision
  Downloading supervision-0.17.1-py3-none-any.whl (77 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m77.5/77.5 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting fastapi
  Downloading fastapi-0.109.0-py3-none-any.whl (92 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.0/92.0 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting kaleido
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting python-multipart
  Downloading python_multipart-0.0.6-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.7/45.7 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting uvicorn
  Downloading uvicorn-0.26.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m 

## **Grounding DINO Setup**

**Download**

In [None]:
# Clone GroundingDINO under HOME and install it in editable mode.
# NOTE(review): this cell is not idempotent -- on a re-run `git clone` and `mkdir`
# report errors because the targets already exist, and `wget` runs again.
%cd {HOME}
!git clone https://github.com/IDEA-Research/GroundingDINO.git
%cd {HOME}/GroundingDINO
!pip install -q -e .
!pip install -q roboflow

# Model config file shipped inside the cloned repository.
CONFIG_PATH = os.path.join(HOME, "GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py")
print(CONFIG_PATH, "; exist:", os.path.isfile(CONFIG_PATH))

# Download the pretrained checkpoint into HOME/weights.
%cd {HOME}
!mkdir {HOME}/weights
%cd {HOME}/weights

!wget -q https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth

WEIGHTS_NAME = "groundingdino_swint_ogc.pth"
WEIGHTS_PATH = os.path.join(HOME, "weights", WEIGHTS_NAME)
print(WEIGHTS_PATH, "; exist:", os.path.isfile(WEIGHTS_PATH))

# Import from inside the repository directory, after the editable install above.
%cd {HOME}/GroundingDINO
from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
from groundingdino.util.utils import get_phrases_from_posmap

/content
Cloning into 'GroundingDINO'...
remote: Enumerating objects: 421, done.[K
remote: Counting objects: 100% (182/182), done.[K
remote: Compressing objects: 100% (58/58), done.[K
remote: Total 421 (delta 136), reused 131 (delta 124), pack-reused 239[K
Receiving objects: 100% (421/421), 12.85 MiB | 24.06 MiB/s, done.
Resolving deltas: 100% (214/214), done.
/content/GroundingDINO
  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m254.7/254.7 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.9/69.9 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.3/158.3 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m178.7/178.

**Load**

In [None]:
%cd {HOME}/GroundingDINO
from groundingdino.util.inference import load_model, load_image, predict, annotate, Model
from groundingdino.util.utils import get_phrases_from_posmap

/content/GroundingDINO


## **Data**

In [None]:
classes = ['acorn', 'axe', 'backpack', 'badger', 'bag', 'barrel', 'basket', 'bear', 'bed', 'bee', 'bell', 'bench', 'bird', 'birdcage', 'boar', 'boat', 'book', 'bottle', 'bow', 'bowl', 'box', 'bridge', 'broom', 'brush', 'bucket', 'building', 'butterfly', 'camel', 'campfire', 'candle', 'cane', 'cannon', 'car', 'cat', 'cello', 'chair', 'clock', 'couch', 'cow', 'cradle', 'crown', 'cup', 'curtain', 'deer', 'diningTable', 'dog', 'doghouse', 'donkey', 'door', 'dragon', 'drum', 'egg', 'elephant', 'ermine', 'feather', 'female', 'fence', 'fireplace', 'fish', 'fishingRod', 'flag', 'flower', 'flute', 'fox', 'frog', 'glasses', 'globe', 'goat', 'gun', 'hammer', 'hat', 'hedgehog', 'helmet', 'horse', 'hotAirBalloon', 'inkpot', 'insect', 'jackal', 'jar', 'jug', 'kettle', 'kite', 'knife', 'ladder', 'lamp', 'lifebuoy', 'lion', 'lizard', 'lobster', 'male', 'map', 'marmot', 'melon', 'monkey', 'moon', 'musicSheet', 'nest', 'net', 'painting', 'paintingStand', 'pan', 'pear', 'pen', 'penguin', 'piano', 'pickaxe', 'pig', 'pineapple', 'pipe', 'plant', 'plate', 'pot', 'pottedPlant', 'rabbit', 'rake', 'rat', 'rhino', 'sausage', 'saw', 'scale', 'scissors', 'scorpion', 'seal', 'shark', 'sheep', 'shield', 'shovel', 'sieve', 'skate', 'snail', 'snake', 'spear', 'spoon', 'sportsBall', 'squirrel', 'star', 'stool', 'stroller', 'suitcase', 'sun', 'sunflower', 'sword', 'teachingBoard', 'teapot', 'tent', 'tie', 'tiger', 'train', 'tree', 'trumpet', 'tub', 'turtle', 'umbrella', 'vase', 'violin', 'wagon', 'walnut', 'weight', 'whip', 'windmill', 'window', 'wineGlass', 'wolf', 'zebra']
# Dataset splits to load; only 'train' is active, the full list is kept for reference.
# datasets = ['train', 'valid', 'test']
datasets = ['train']
# Maps split name -> list of samples, filled by the loading loop below.
data = {}

# PIL image -> tensor transform.
# NOTE(review): not referenced by any active code visible in this notebook
# (only by commented-out lines) -- confirm before removing.
transform_img_to_tensor = transforms.Compose([
    transforms.ToTensor(),
])

# Build data[split] = [(image_tensor, class_ids, boxes), ...] from the label files.
# Each label line is parsed as "<class_id> <box values...>"; the first number becomes
# the class id and the remaining numbers the box row.
# NOTE(review): the box values look like normalized (cx, cy, w, h) -- confirm against
# the dataset export format.
for dataset in datasets:
    directory_images = '/content/drive/MyDrive/Delft/capstone data/1.0_Children_Books/'+dataset+'/images'
    directory_labels = '/content/drive/MyDrive/Delft/capstone data/1.0_Children_Books/'+dataset+'/labels'

    files_images = os.listdir(directory_images)
    # os.listdir() returns entries in arbitrary order; sort so that the [:100]
    # subset below is the same on every run.
    files_labels = sorted(os.listdir(directory_labels))

    data[dataset] = []

    # Only the first 100 label files are loaded to keep experiments fast.
    for idx, label in enumerate(files_labels[:100]):
        # The image shares the label file's base name; splitext avoids assuming
        # a fixed-length extension.
        img_label = os.path.splitext(label)[0]
        img = os.path.join(directory_images, img_label+'.jpg')
        # load_image returns (source image, transformed tensor); keep the tensor.
        _, img = load_image(img)

        # `with` closes the file handle; blank lines (e.g. a trailing newline)
        # are skipped instead of crashing on float('').
        with open(os.path.join(directory_labels, label)) as label_file:
            f = [[float(j) for j in line.split()] for line in label_file if line.strip()]

        y_labels_int = torch.LongTensor([int(i[0]) for i in f])
        y_bboxes = torch.tensor([i[1:] for i in f])

        data[dataset].append((img, y_labels_int, y_bboxes))

**Create data loaders**

In [None]:
def custom_collate(batch):
    """Regroup a batch of samples into per-field lists without stacking.

    Each sample is indexable as (image, class ids, boxes). Because the
    number of boxes differs per image, the fields are kept as plain Python
    lists rather than being stacked into one tensor.

    Returns:
        [images, class_id_tensors, box_tensors], each a list with one entry
        per sample in `batch`.
    """
    # Field 0 = image, 1 = integer class ids, 2 = bounding boxes.
    return [[sample[field] for sample in batch] for field in range(3)]

    # data = [item[0] for item in batch]
    # target = [item[1] for item in batch]
    # target = torch.LongTensor(target)
    # return [data, target]

# train() only reads element [0] of every batch field, so it effectively
# processes one image per step; keep batch_size at 1 unless train() is changed.
batch_size = 1

# custom_collate keeps the per-image tensors in lists instead of stacking them.
train_loader = DataLoader(data['train'], batch_size, shuffle = False, pin_memory = True, collate_fn=custom_collate)
# Validation / test loaders are disabled because only the 'train' split is loaded above.
# val_loader = DataLoader(data['valid'], batch_size*2, pin_memory = True)
# test_loader = DataLoader(data['test'], batch_size*2, pin_memory = True)

## **Train, validation, test helper functions**


### Train function

**Grounding DINO (GD) uses L1 and GIoU losses for bounding-box regression**

In [None]:
crossentropy_loss = nn.CrossEntropyLoss()


def _match_boxes_by_iou(y_bboxes, y_hat_bboxes, IoU_threshold):
    """Pair every ground-truth box with its best-overlapping predicted box.

    For each row of `y_bboxes`, the prediction with the highest `calc_IoU`
    value strictly above `IoU_threshold` is selected (the first one wins on
    ties). Ground-truth boxes without such a prediction are left out.

    Returns:
        (true_order, pred_order): two equally long lists of row indices into
        `y_bboxes` and `y_hat_bboxes` respectively.
    """
    true_order, pred_order = [], []
    for true_idx in range(len(y_bboxes)):
        best_iou, best_pred_idx = None, None
        for pred_idx in range(len(y_hat_bboxes)):
            iou = float(calc_IoU(y_bboxes[true_idx], y_hat_bboxes[pred_idx]))
            if iou > IoU_threshold and (best_iou is None or iou > best_iou):
                best_iou, best_pred_idx = iou, pred_idx
        if best_pred_idx is not None:
            true_order.append(true_idx)
            pred_order.append(best_pred_idx)
    return true_order, pred_order


def train(train_loader, net, optimizer, IoU_threshold=0.6, beta=1e3):
    """Run one training pass over `train_loader` and return the mean loss.

    Every loader item is treated as a single image (only element [0] of each
    batch field is used). Per image the function:
      1. builds a caption from the ground-truth class names and runs `net`;
      2. drops ground-truth entries whose class name `net` did not predict;
      3. pairs each remaining ground-truth box with its best prediction
         (highest IoU above `IoU_threshold`);
      4. optimizes `beta * MSE(boxes) + cross_entropy(logits, class ids)`.

    Items without predictions, without any matched pair, or with a
    non-finite loss are skipped and do not contribute to the average.

    Relies on the notebook globals `classes`, `device`, `calc_IoU` and
    `crossentropy_loss`.

    Args:
        train_loader: iterable yielding [images, class_id_tensors, box_tensors].
        net: model called as net(image, caption) and returning
            (boxes, logits, phrases, logits_raw).
        optimizer: optimizer over the trainable parameters of `net`.
        IoU_threshold: minimum IoU for a prediction to be matched.
        beta: weight applied to the box regression loss.

    Returns:
        Mean loss over the items that produced an optimizer step, or None
        when no item did.
    """
    # Debug aid: anomaly detection gives better autograd error messages but
    # slows training down noticeably.
    torch.autograd.set_detect_anomaly(True)
    loss_list = []

    for item_idx, batch in enumerate(train_loader):
        optimizer.zero_grad()
        print('ITEM {} IN LOADER     '.format(item_idx)+'-'*70)

        # -------------------------
        # |   DATA FROM LOADER    |
        # -------------------------

        image_tensor, y_labels_int, y_bboxes = (field[0] for field in batch)
        y_labels_str = [classes[label] for label in y_labels_int]

        print('y_labels_str = {}'.format(y_labels_str))

        image_tensor = image_tensor.to(device)

        # -------------------------
        # |   RUN FORWARD PASS    |
        # -------------------------

        # dict.fromkeys de-duplicates while keeping first-occurrence order, so
        # the caption is identical across sessions (set() order is not).
        caption = ". ".join(dict.fromkeys(y_labels_str))+'.'

        y_hat_bboxes, y_hat_logits, y_hat_labels_str, y_hat_logits_raw = net(image_tensor, caption)

        print('y_hat_labels_str = {}'.format(y_hat_labels_str))

        if not y_hat_labels_str:
            continue

        # -------------------------
        # |   PROCESS OUTPUTS     |
        # -------------------------

        # Keep only the ground-truth entries whose class name was predicted.
        predicted_names = set(y_hat_labels_str)
        keep = [k for k, name in enumerate(y_labels_str) if name in predicted_names]
        keep_idx = torch.tensor(keep, dtype=torch.long)
        y_labels_str = [y_labels_str[k] for k in keep]
        y_labels_int = y_labels_int.index_select(0, keep_idx)
        y_bboxes = y_bboxes.index_select(0, keep_idx)

        # NOTE(review): calc_IoU indexes boxes as corner coordinates, while the
        # boxes here look like normalized (cx, cy, w, h) -- confirm the format,
        # since it determines whether IoU_threshold is meaningful.
        # Only the matched indices are needed, so no autograd graph is built.
        with torch.no_grad():
            final_y_order_list, final_y_hat_order_list = _match_boxes_by_iou(
                y_bboxes, y_hat_bboxes, IoU_threshold)

        # Without this guard the *_v2 tensors below would be undefined (first
        # item) or stale leftovers from a previous item whose graph was already
        # freed by backward().
        if not final_y_order_list:
            print('No prediction matched a ground-truth box above the IoU threshold; skipping item.')
            continue

        true_idx = torch.tensor(final_y_order_list, dtype=torch.long)
        pred_idx = torch.tensor(final_y_hat_order_list, dtype=torch.long).to(device)

        y_labels_int_v2 = y_labels_int.index_select(0, true_idx).to(device)
        y_labels_str_v2 = [y_labels_str[k] for k in final_y_order_list]
        y_bboxes_v2 = y_bboxes.index_select(0, true_idx).to(device)

        y_hat_labels_str_v2 = [y_hat_labels_str[k] for k in final_y_hat_order_list]
        y_hat_bboxes_v2 = y_hat_bboxes.index_select(0, pred_idx).to(device)
        y_hat_logits_raw_v2 = y_hat_logits_raw.index_select(0, pred_idx).to(device)

        print('[Values V2]')
        print('y_labels_str_v2 = {}'.format(y_labels_str_v2))
        print('y_labels_int_v2 = {}'.format(y_labels_int_v2))
        print('y_bboxes_v2 = {}'.format(y_bboxes_v2))

        print('y_hat_labels_str_v2 = {}'.format(y_hat_labels_str_v2))
        print('y_hat_bboxes_v2 = {}'.format(y_hat_bboxes_v2))
        print('y_hat_logits_raw_v2 = {}'.format(y_hat_logits_raw_v2))

        # -------------------------
        # |    CALCULATE LOSS     |
        # -------------------------

        loss_box = torch.mean((y_bboxes_v2-y_hat_bboxes_v2)**2)*beta
        # NOTE(review): the cross-entropy targets are dataset class ids, while the
        # logits columns presumably correspond to caption token positions --
        # confirm this is the intended classification target.
        loss_cls = crossentropy_loss(y_hat_logits_raw_v2, y_labels_int_v2)
        loss = loss_box + loss_cls

        print('loss = loss_box*{} + loss_cls = {} = {} + {}'.format(beta, loss, loss_box, loss_cls))

        # Skip NaN *and* inf losses: back-propagating either would corrupt the weights.
        if not torch.isfinite(loss):
            continue

        loss.backward()

        # Inspect the gradient of the network that is being trained (`net`),
        # not whatever global happens to be called `model`.
        print('MODEL PARAMETER GRADIENT CHECK: {}'.format(list(net.parameters())[-1].grad))

        optimizer.step()

        loss_list.append(loss.item())

    if loss_list:
        return sum(loss_list)/len(loss_list)
    return None

##### Train function 22:32 17/01/2024

In [None]:
# crossentropy_loss = nn.CrossEntropyLoss()

# def train(train_loader, net, optimizer, IoU_threshold=0.6):
#     loss_list = []

#     for i, d in enumerate(train_loader):
#         print('\nITERATION IN LOADER: {}'.format(i)+'-'*40)

#         # -------------------------
#         # |   DATA FROM LOADER    |
#         # -------------------------

#         image_tensor, y_labels_int, y_bboxes = d
#         image_tensor, y_labels_int, y_bboxes = image_tensor[0], y_labels_int[0], y_bboxes[0]
#         y_labels_str = [classes[i] for i in y_labels_int]

#         image_tensor = image_tensor.to(device)

#         # print('\n[Y VALUES]')
#         # print('y_labels_str = {}'.format(y_labels_str))
#         # print('y_labels_int = {}'.format(y_labels_int))
#         # print('y_bboxes = {}'.format(y_bboxes))

#         # image_tensor, y_labels_int, y_bboxes = image_tensor.to(device), y_labels_int.to(device), y_bboxes.to(device)

#         # -------------------------
#         # |   RUN FORWARD PASS    |
#         # -------------------------

#         optimizer.zero_grad()

#         caption = ". ".join(list(set(y_labels_str)))+'.'
#         print('PROVIDED CAPTION: {}'.format(caption))

#         y_hat_bboxes, y_hat_logits, y_hat_labels_str, y_hat_logits_raw = net(image_tensor, caption)

#         # -------------------------
#         # |   PROCESS OUTPUTS     |
#         # -------------------------

#         # Check if GD found all classes, remove not found classes by GD from true labels list y_labels_str
#         for i in y_labels_str:
#             # print('{} in {} is {}'.format(i, y_hat_labels_str, i in y_hat_labels_str))
#             if not i in y_hat_labels_str:
#                 index = y_labels_str.index(i)
#                 # y_labels_str.remove(i)
#                 y_labels_str = y_labels_str[0:index] + y_labels_str[index+1:]
#                 y_labels_int = torch.cat([y_labels_int[0:index], y_labels_int[index+1:]])
#                 y_bboxes = torch.cat([y_bboxes[0:index], y_bboxes[index+1:]])


#         # print('\n[Y VALUES AFTER DELETION]\ny_labels_str = {}\ny_labels_int = {}\ny_bboxes = {}'.format(y_labels_str, y_labels_int, y_bboxes))
#         # print('\n[BEFORE IOU FILTER]')
#         # print('y_hat_labels_str: {}'.format(y_hat_labels_str))
#         # print('y_hat_bboxes: {}'.format(y_hat_bboxes))

#         # Convert list[tensor, ...] to tensor[list, ...]
#         y_hat_bboxes = torch.tensor([i.tolist() for i in y_hat_bboxes])
#         y_hat_logits_raw = torch.tensor([i.tolist() for i in y_hat_logits_raw])

#         # Check GD prediction belongs to which true label
#         # Loop over true labels and link the prediction with the highest IoU

#         y_hat_labels_str_v2 = []
#         y_hat_IoU_v2 = []
#         y_hat_bboxes_v2 = []
#         y_hat_logits_v2 = []
#         y_hat_logits_raw_v2 = []

#         y_labels_str_v2 = []
#         y_labels_int_v2 = []
#         y_bboxes_v2 = []

#         for idx, true in enumerate(y_labels_str):
#             # print('\nTrue label: {} - {}'.format(true, idx))

#             best_IoU = 0
#             y_hat_best_label_str = ''
#             y_hat_best_corresponding_bbox = []
#             y_hat_best_corresponding_logits = []
#             y_hat_best_corresponding_logits_raw = []

#             y_best_corresponding_bbox = []

#             for idx2, pred in enumerate(y_hat_labels_str):
#                 IoU = calc_IoU(y_hat_bboxes[idx2], y_bboxes[idx])
#                 # print('Checking with prediction: {} - {}, IoU: {}'.format(pred, idx2, IoU))
#                 if IoU > best_IoU and IoU > IoU_threshold:
#                     best_IoU = IoU
#                     y_hat_best_label_str = pred
#                     y_hat_best_corresponding_bbox = y_hat_bboxes[idx2].tolist()
#                     y_hat_best_corresponding_logits = y_hat_logits[idx2].tolist()
#                     y_hat_best_corresponding_logits_raw = y_hat_logits_raw[idx2].tolist()

#                     y_best_corresponding_bbox = y_bboxes[idx].tolist()
#                     # print('[NEW BEST ]')

#             if best_IoU != 0:
#                 y_hat_IoU_v2.append(best_IoU)
#                 y_hat_labels_str_v2.append(y_hat_best_label_str)
#                 y_hat_bboxes_v2.append(y_hat_best_corresponding_bbox)
#                 y_hat_logits_v2.append(y_hat_best_corresponding_logits)
#                 y_hat_logits_raw_v2.append(y_hat_best_corresponding_logits_raw)

#                 y_labels_str_v2.append(true)
#                 y_labels_int_v2.append(y_labels_int[idx])
#                 y_bboxes_v2.append(y_best_corresponding_bbox)
#             else:
#                 pass
#                 # print('NO IOU ABOVE THRESHOLD {}'.format(IoU_threshold))

#         y_hat_bboxes_v2 = torch.tensor(y_hat_bboxes_v2).to(device)
#         y_hat_logits_v2 = torch.tensor(y_hat_logits_v2).to(device)
#         y_hat_logits_raw_v2 = torch.tensor(y_hat_logits_raw_v2).to(device)

#         y_labels_int_v2 = torch.tensor([i.item() for i in y_labels_int_v2]).to(device)
#         y_bboxes_v2 = torch.tensor(y_bboxes_v2).to(device)

#         IoUs = [i.item() for i in y_hat_IoU_v2]

#         # print('\n[AFTER IOU FILTER]')
#         # print('\n[Y VALUES V2]')
#         # print('y_labels_str_v2 = {}'.format(y_labels_str_v2))
#         # print('y_labels_int_v2 = {}'.format(y_labels_int_v2))
#         # print('IoUs = {}'.format(IoUs))
#         # print('y_bboxes_v2 = {}'.format(y_bboxes_v2))

#         # print('\n[Y-HAT VALUES V2]')
#         # print('y_hat_labels_str_v2 = {}'.format(y_hat_labels_str_v2))
#         # print('y_hat_bboxes_v2 = {}'.format(y_hat_bboxes_v2))
#         # print('y_hat_logits_raw_v2 shape = {}'.format(y_hat_logits_raw_v2.shape))
#         # # print('\nlen(y_hat_logits_raw_v2): {}'.format(len(y_hat_logits_raw_v2)))
#         # print('len(y_labels_int) = {}'.format(len(y_labels_int)))


#         # -------------------------
#         # |    CALCULATE LOSS     |
#         # -------------------------

#         loss_box = torch.mean((y_bboxes_v2-y_hat_bboxes_v2)**2)
#         loss_cls = crossentropy_loss(y_hat_logits_raw_v2, y_labels_int_v2)
#         loss = loss_box + loss_cls

#         print('loss = loss_box + loss_cs = {}'.format(loss))

#         if not torch.isnan(loss):
#             loss.requires_grad = True
#             loss.backward()
#             optimizer.step()

#             loss_list.append(loss.item())

#         # _, predicted = torch.max(y_cls.data, 1)
#         # total += labels.size(0)
#         # correct += (predicted == labels).sum().item()

#         # error_boxes  += torch.mean(torch.abs(y_bboxes-bboxes))

#     return sum(loss_list)/len(loss_list)#, correct / total, error_boxes / len(train_loader)

### Validation function

In [None]:
# def val(val_loader, net, beta):
#   avg_loss = 0
#   correct = 0
#   total = 0
#   error_boxes = 0

#   with torch.no_grad():
#     for data in val_loader:
#         inputs, labels, bboxes = data

#         inputs, labels, bboxes = inputs.to(device), labels.to(device), bboxes.to(device)

#         boxes, logits, phrases = net(inputs)

#         loss_box = torch.mean((y_box-bboxes)**2)*beta
#         loss_cls = crossentropy_loss(y_cls, labels)

#         loss = loss_box + loss_cls

#         avg_loss += loss

#         _, predicted = torch.max(y_cls.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

#         error_boxes  += torch.mean(torch.abs(y_box-bboxes))


#   return avg_loss/len(val_loader), correct / total, error_boxes / len(val_loader)

## **Grounding DINO Implementation**

### Custom model - Grounding DINO V2

In [None]:
class GroundingDINOV2(nn.Module):
    """nn.Module wrapper around the pretrained GroundingDINO model.

    The wrapped model is loaded from CONFIG_PATH / WEIGHTS_PATH. A forward
    call takes one image tensor and a caption and returns the boxes, scores
    and phrases of the queries whose best token score exceeds `box_threshold`.
    """

    def __init__(self, box_threshold=0.35, text_threshold=0.25):
        super().__init__()

        # box_threshold filters queries; text_threshold selects the tokens
        # that make up a predicted phrase.
        self.box_threshold = box_threshold
        self.text_threshold = text_threshold

        self.basemodel = load_model(CONFIG_PATH, WEIGHTS_PATH)

    def basemodel_forward(self, image, caption, remove_combined=False):
        """Run the wrapped model on a single image and post-process its output.

        Args:
            image: image tensor without batch dimension (one is added here).
            caption: text prompt, e.g. "chair. dog.".
            remove_combined: when True, each phrase is restricted to the
                caption segment (between separator tokens) that contains the
                query's highest-scoring token.

        Returns:
            (boxes, scores, phrases, logits_raw) for the kept queries, where
            `scores` is the per-query maximum token score.
        """
        outputs = self.basemodel(image[None], captions=[caption])

        token_scores = outputs["pred_logits"].sigmoid()[0]  # (nq, 256) per the original comment
        query_boxes = outputs["pred_boxes"][0]  # (nq, 4)

        # Keep the queries whose strongest token score clears the box threshold.
        keep = token_scores.max(dim=1)[0] > self.box_threshold
        logits = token_scores[keep]
        boxes = query_boxes[keep]

        # NOTE(review): despite the name, `logits_raw` holds the *sigmoid* scores
        # (same values as `logits`), not the pre-sigmoid model output -- confirm
        # this is what the training loss expects.
        logits_raw = token_scores[keep]

        tokenizer = self.basemodel.tokenizer
        tokenized = tokenizer(caption)

        if not remove_combined:
            phrases = []
            for row in logits:
                phrase = get_phrases_from_posmap(row > self.text_threshold, tokenized, tokenizer)
                phrases.append(phrase.replace('.', ''))
        else:
            # Positions of separator tokens in the caption.
            # NOTE(review): 101 / 102 / 1012 are presumably the BERT ids of
            # [CLS], [SEP] and '.' -- confirm against the tokenizer vocabulary.
            sep_idx = [
                pos
                for pos in range(len(tokenized['input_ids']))
                if tokenized['input_ids'][pos] in [101, 102, 1012]
            ]

            phrases = []
            for row in logits:
                # Locate the separators surrounding the highest-scoring token.
                insert_idx = bisect.bisect_left(sep_idx, row.argmax())
                right_idx = sep_idx[insert_idx]
                left_idx = sep_idx[insert_idx - 1]
                phrase = get_phrases_from_posmap(
                    row > self.text_threshold, tokenized, tokenizer, left_idx, right_idx
                )
                phrases.append(phrase.replace('.', ''))

        return boxes, logits.max(dim=1)[0], phrases, logits_raw

    def forward(self, image, caption='chair.'):
        """Delegate to `basemodel_forward` with `remove_combined` left at its default."""
        return self.basemodel_forward(image, caption)

### Run Grounding DINO V2

#### Run function

In [None]:
def run(model, epochs, lr):
    """Train `model` for `epochs` passes over the global `train_loader`.

    An Adam optimizer is created over `model.parameters()`; each epoch calls
    the notebook's `train` function and records its return value.

    Args:
        model: the network to train; moved to the global `device`.
        epochs: number of passes over `train_loader`.
        lr: Adam learning rate.

    Returns:
        List with one entry per epoch: the mean training loss reported by
        `train` (None when no item of that epoch produced a loss).
    """
    print('Starting run function')
    model = model.to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)

    train_losses = []

    # range(epochs), not range(epochs+1): exactly `epochs` passes are run.
    for epoch in range(epochs):
        model.train()
        train_loss = train(train_loader, model, optimizer)
        train_losses.append(train_loss)

        print("Epoch: {} | Train loss: {}".format(epoch, train_loss))

    return train_losses

#### IoU Function

In [None]:
def _cxcywh_to_xyxy(box):
    """Convert a (center_x, center_y, width, height) box to corners (x1, y1, x2, y2)."""
    cx, cy, w, h = box[0], box[1], box[2], box[3]
    return (cx - w / 2, cy - h / 2, cx + w / 2, cy + h / 2)


def calc_IoU(boxA, boxB, box_format="xyxy", pixel_offset=1):
    """Intersection-over-union of two axis-aligned boxes.

    With the defaults the boxes are read as inclusive integer pixel corners
    (x1, y1, x2, y2), where a box spanning pixels 0..9 is 10 wide -- hence the
    +1 on every side length. For normalized or otherwise continuous
    coordinates pass `pixel_offset=0`; for boxes given as
    (center_x, center_y, width, height) pass `box_format="cxcywh"`.

    Args:
        boxA, boxB: indexable boxes with four numeric entries.
        box_format: "xyxy" (default) or "cxcywh".
        pixel_offset: value added to every side length (1 for inclusive
            pixel indices, 0 for continuous coordinates).

    Returns:
        The IoU value; 0.0 when the union area is zero.

    Raises:
        ValueError: if `box_format` is not "xyxy" or "cxcywh".
    """
    if box_format == "cxcywh":
        boxA, boxB = _cxcywh_to_xyxy(boxA), _cxcywh_to_xyxy(boxB)
    elif box_format != "xyxy":
        raise ValueError("box_format must be 'xyxy' or 'cxcywh', got {!r}".format(box_format))

    # Corners of the intersection rectangle.
    xA = max(boxA[0], boxB[0])
    yA = max(boxA[1], boxB[1])
    xB = min(boxA[2], boxB[2])
    yB = min(boxA[3], boxB[3])

    # Clamp at 0 so disjoint boxes yield an empty intersection.
    interArea = max(0, xB - xA + pixel_offset) * max(0, yB - yA + pixel_offset)

    boxAArea = (boxA[2] - boxA[0] + pixel_offset) * (boxA[3] - boxA[1] + pixel_offset)
    boxBArea = (boxB[2] - boxB[0] + pixel_offset) * (boxB[3] - boxB[1] + pixel_offset)

    union = boxAArea + boxBArea - interArea
    # Two zero-area boxes (only possible with pixel_offset=0) have no defined IoU.
    if union == 0:
        return 0.0

    return interArea / float(union)

In [None]:
# Scratch check of the IoU-based matching on hand-copied boxes.
# NOTE: IoU_threshold is defined here but not used by the loop below.
IoU_threshold = 0.5

y_labels_str = ['bottle', 'painting', 'dog', 'window']

# NOTE(review): in several rows the third value is smaller than the first, so
# these look like (cx, cy, w, h) boxes, whereas calc_IoU by default indexes its
# inputs as (x1, y1, x2, y2) corners -- confirm the format before trusting the
# IoU values printed below.
y_bboxes = torch.tensor([[0.4808, 0.4615, 0.0745, 0.1466],
    [0.5072, 0.2332, 0.1731, 0.1899],
    [0.3245, 0.7188, 0.4736, 0.3149],
    [0.8582, 0.2812, 0.2572, 0.2933]])


y_hat_labels_str = ['window', 'bottle', 'dog', 'dog', 'painting', 'painting']
y_hat_bboxes= torch.tensor([[0.8576, 0.2888, 0.2683, 0.3153],
    [0.4809, 0.4581, 0.0633, 0.1276],
    [0.3189, 0.7199, 0.4825, 0.3303],
    [0.2569, 0.5575, 0.3608, 0.2712],
    [0.5081, 0.2378, 0.1699, 0.1899],
    [0.5000, 0.5017, 0.9996, 0.8149]])


# For every true label, scan all predictions and track the highest IoU so far.
for idx, true in enumerate(y_labels_str):
    print('\nTrue label: {} - {}'.format(true, idx))

    best_IoU = 0
    best_cor_bbox = 0
    # best_cor_logits is initialised but never updated in this cell.
    best_cor_logits = 0

    for idx2, pred in enumerate(y_hat_labels_str):
        IoU = calc_IoU(y_hat_bboxes[idx2], y_bboxes[idx])
        if IoU > best_IoU:
            best_IoU = IoU
            best_cor_bbox = y_hat_bboxes[idx2]
            # Printed only when a new best is found, so the last line per label is the match.
            print('Checking with prediction: {} - {}, IoU: {}'.format(pred, idx2, IoU))


True label: bottle - 0
Checking with prediction: window - 0, IoU: 0.21861384809017181
Checking with prediction: bottle - 1, IoU: 0.9491409659385681

True label: painting - 1
Checking with prediction: window - 0, IoU: 0.3671414256095886
Checking with prediction: bottle - 1, IoU: 0.5686933994293213
Checking with prediction: painting - 4, IoU: 0.9890643954277039

True label: dog - 2
Checking with prediction: window - 0, IoU: 0.2840963304042816
Checking with prediction: bottle - 1, IoU: 0.2845150828361511
Checking with prediction: dog - 2, IoU: 0.9609138369560242

True label: window - 3
Checking with prediction: window - 0, IoU: 0.9439011812210083


#### Run model

In [None]:
model = GroundingDINOV2()
model = model.to(device)

# Snapshot the initial weights as independent copies. state_dict() returns
# references to the live tensors, which the optimizer updates in place, so
# without clone() a later comparison against the "initial" values could never
# detect a change.
initial_parameters = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}

all_parameters = model.parameters()

# Freeze everything first ...
for param in model.parameters():
    param.requires_grad = False

# ... then unfreeze only the parameters after index 827 in named_parameters() order.
# NOTE(review): 827 is tied to the parameter ordering of this specific checkpoint /
# library version -- confirm which layers this is meant to select.
for idx, (name, param) in enumerate(model.named_parameters()):
    if idx > 827:
        param.requires_grad = True

    print('{} - {}: {}'.format(idx, name, param.requires_grad))

final text_encoder_type: bert-base-uncased


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

0 - basemodel.transformer.level_embed: False
1 - basemodel.transformer.encoder.layers.0.self_attn.sampling_offsets.weight: False
2 - basemodel.transformer.encoder.layers.0.self_attn.sampling_offsets.bias: False
3 - basemodel.transformer.encoder.layers.0.self_attn.attention_weights.weight: False
4 - basemodel.transformer.encoder.layers.0.self_attn.attention_weights.bias: False
5 - basemodel.transformer.encoder.layers.0.self_attn.value_proj.weight: False
6 - basemodel.transformer.encoder.layers.0.self_attn.value_proj.bias: False
7 - basemodel.transformer.encoder.layers.0.self_attn.output_proj.weight: False
8 - basemodel.transformer.encoder.layers.0.self_attn.output_proj.bias: False
9 - basemodel.transformer.encoder.layers.0.norm1.weight: False
10 - basemodel.transformer.encoder.layers.0.norm1.bias: False
11 - basemodel.transformer.encoder.layers.0.linear1.weight: False
12 - basemodel.transformer.encoder.layers.0.linear1.bias: False
13 - basemodel.transformer.encoder.layers.0.linear2.weig

In [None]:
# Training hyperparameters passed to run().
epochs = 10
lr = 0.0001

# run() returns the list of per-epoch training losses collected from train().
train_losses = run(model, epochs, lr)

Starting run function
ITEM 0 IN LOADER     ----------------------------------------------------------------------
y_labels_str = ['male']
y_hat_labels_str = ['male']
[Values V2]
y_labels_str_v2 = ['male']
y_labels_int_v2 = tensor([89], device='cuda:0')
y_bboxes_v2 = tensor([[0.4916, 0.5000, 0.8798, 0.9447]], device='cuda:0')
y_hat_labels_str_v2 = ['male']
y_hat_bboxes_v2 = tensor([[0.5028, 0.4981, 0.9207, 0.9749]], device='cuda:0',
       grad_fn=<IndexSelectBackward0>)
y_hat_logits_raw_v2 = tensor([[0.0078, 0.7724, 0.0198, 0.0078, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000,
         0.0000, 

RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.

In [None]:
# Compare the current weights against the values captured before training.
# NOTE(review): state_dict() values reference the live parameter tensors, so this
# check is only meaningful if `initial_parameters` holds independent copies --
# verify how it was captured.
current_parameters = model.state_dict()

parameters_updated = False
for key in initial_parameters.keys():
    # Stop at the first entry whose values differ.
    if not torch.equal(initial_parameters[key], current_parameters[key]):
        parameters_updated = True
        break

print('Parameters updated: {}'.format(parameters_updated))

## **Extras**

#### Manual runs

In [None]:
# model = GroundingDINOV2()
# model = model.to(device)

# image = data['train'][2][1]
# image = image.to(device)

# true_classes = [classes[int(i[0])] for i in data['train'][2][2]]
# caption = ". ".join(true_classes)

# with torch.no_grad():
#     print(model(image, caption))

In [None]:
# n = 1

# image = data['train'][n][1]
# image = image.to(device)

# true_classes = [classes[int(i[0])] for i in data['train'][n][2]]
# caption = ". ".join(true_classes)

# with torch.no_grad():
#     print(model(image, caption))

In [None]:
# import torch
# import torch.nn.functional as F

# # Suppose we have ground truth labels and predicted logits for three samples and three classes
# ground_truth_labels = torch.tensor([2, 0, 1])  # Actual class labels (0: Class A, 1: Class B, 2: Class C)
# predicted_logits = torch.tensor([[1.0, 2.0, -1.0], [-1.0, 0.5, 2.0], [0.0, 1.0, -2.0]])

# # Calculate cross-entropy loss using PyTorch's F.cross_entropy
# loss = F.cross_entropy(predicted_logits, ground_truth_labels)

# print("Cross-Entropy Loss:", loss.item())

#### Old Version

In [None]:
# class GroundingDINOV2(nn.Module):
#     def __init__(self, box_threshold=0.35, text_threshold=0.25):
#         super(GroundingDINOV2, self).__init__()

#         self.box_threshold = box_threshold
#         self.text_threshold = text_threshold

#         self.DINO = Model(model_config_path=CONFIG_PATH, model_checkpoint_path=WEIGHTS_PATH)

#     def forward(self, image, image_classes):
#         image = image.reshape(416,416,3)*255
#         image = image.astype(np.uint8)
#         x = self.DINO.predict_with_classes(
#             image=image,
#             classes=image_classes,
#             box_threshold=self.box_threshold,
#             text_threshold=self.text_threshold
#         )

#         return x

In [None]:
# # directory_images = '/content/drive/MyDrive/Delft/capstone data/1.0_Children_Books/'+'train'+'/images'
# # IMAGE_PATH = directory_images+'/'+data['train'][2][0]+'.jpg'

# model = GroundingDINOV2()
# model(data['train'][2][1], [classes[int(i[0])] for i in data['train'][2][2]])

In [None]:
# directory_images = '/content/drive/MyDrive/Delft/capstone data/1.0_Children_Books/'+'train'+'/images'

# IMAGE_PATH = directory_images+'/'+data['train'][0][0]+'.jpg'
# # IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

# TEXT_PROMPT = "chair"
# BOX_TRESHOLD = 0.35
# TEXT_TRESHOLD = 0.25

# image_source, image = load_image(IMAGE_PATH)
# image

# im = Image.fromarray((x * 255).astype(np.uint8))


### Grounding DINO Manual Pass

In [None]:
# manual_pass_model = load_model(CONFIG_PATH, WEIGHTS_PATH)

# directory_images = '/content/drive/MyDrive/Delft/capstone data/1.0_Children_Books/'+'train'+'/images'

# IMAGE_PATH = directory_images+'/'+data['train'][2][0]+'.jpg'
# # IMAGE_PATH = os.path.join(HOME, "data", IMAGE_NAME)

# TEXT_PROMPT = "painting. male. female."
# BOX_TRESHOLD = 0.35
# TEXT_TRESHOLD = 0.25

# image_source, image = load_image(IMAGE_PATH)

# print('Size loaded image by Grounding DINO: {}'.format(image.shape))

# boxes, logits, phrases = predict(
#     model=manual_pass_model,
#     image=image,
#     caption=TEXT_PROMPT,
#     box_threshold=BOX_TRESHOLD,
#     text_threshold=TEXT_TRESHOLD
# )

# annotated_frame = annotate(image_source=image_source, boxes=boxes, logits=logits, phrases=phrases)

# %matplotlib inline
# sv.plot_image(annotated_frame, (16, 16))