In [None]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git
!gdown 1xijq32XfEm6FPhUb7RsZYWHc2UuwVkiq
!tar -xf /content/refcocog.tar.gz
!pip install -qr https://raw.githubusercontent.com/ultralytics/yolov5/master/requirements.txt

In [None]:
import json
import clip
import torch
import pandas
import numpy as np

from torch.utils.data import Dataset, DataLoader
from typing import Sequence, Union

from PIL import Image, ImageDraw


class RefcocogDataset(Dataset):
    """Dataset of RefCOCOg referring expressions paired with their target boxes.

    Annotations are read from ``<base_path>/annotations/refs(umd).p`` and
    ``<base_path>/annotations/instances.json`` and joined on the annotation
    id. Images are opened on access from ``<base_path>/images/``.

    Each item is a tuple ``(sample, bbox)``: ``sample`` is a dict with the
    keys ``'idx'``, ``'image'`` and ``'sentences'``; ``bbox`` is the ``bbox``
    field of the matching instance annotation (``showImage`` draws it as
    ``[x, y, width, height]``).

    Args:
        base_path: Directory containing the ``annotations`` and ``images`` folders.
        split: Optional split name (compared lower-cased against the ``split``
            column). ``None`` keeps every annotation.
        transform: Optional callable applied to the PIL image in ``__getitem__``.
        tokenization: Optional callable applied to every sentence in ``__getitem__``.
    """

    def __init__(self, base_path, split=None, transform=None, tokenization=None):
        annotation_path = base_path + "/annotations/"

        self.IMAGES_PATH = base_path + "/images/"
        self.transform = transform
        self.tokenization = tokenization

        # NOTE(review): unpickling can execute arbitrary code; only load
        # annotation files that come from a trusted source.
        tmp_annotations = pandas.read_pickle(annotation_path + "refs(umd).p")

        # Context manager so the JSON file handle is closed deterministically.
        with open(annotation_path + "instances.json", "r") as instances_file:
            tmp_instances = json.load(instances_file)

        annotations_dt = pandas.DataFrame.from_records(tmp_annotations) \
            .filter(items=["image_id", "split", "sentences", "ann_id"])

        instances_dt = pandas.DataFrame.from_records(tmp_instances['annotations'])

        # Attach bbox/area to every referring expression through its annotation id.
        self.annotations = annotations_dt \
            .merge(instances_dt[["id", "bbox", "area"]], left_on="ann_id", right_on="id") \
            .drop(columns="id")

        if split is not None:
            self.annotations = self.__get_annotations_by_split(split.lower())

    def __row_from_sample(self, sample):
        """Return the annotation row addressed by the first index of a batched sample."""
        row_position = sample['idx'][0].item()
        return self.annotations.iloc[row_position]

    def getImage(self, sample):
        """Open the untransformed image belonging to a batched sample.

        ``sample['idx'][0]`` must expose ``.item()``; it is used as the
        positional row of the annotation table.
        """
        item = self.__row_from_sample(sample)
        return self.__getimage(item.image_id)

    def getSentences(self, sample):
        """Return the prompt strings (``"a photo of ..."``) for a batched sample."""
        item = self.__row_from_sample(sample)
        return self.__extract_sentences(item.sentences)

    def showImage(self, train_features, train_bbox):
        """Display the sample's image with ``train_bbox`` outlined in red.

        ``train_bbox`` is read as ``[x, y, width, height]``; every element
        must expose ``.item()``.
        """
        img = self.getImage(train_features)
        draw = ImageDraw.Draw(img)
        left, top = train_bbox[0].item(), train_bbox[1].item()
        right, bottom = left + train_bbox[2].item(), top + train_bbox[3].item()
        draw.rectangle([(left, top), (right, bottom)], outline="red")
        img.show()

    def __get_annotations_by_split(self, split):
        """Keep only the rows whose ``split`` column equals ``split``."""
        return self.annotations[self.annotations.split == split].reset_index()

    def __getimage(self, image_id):
        """Open ``COCO_train2014_<image_id zero-padded to 12 digits>.jpg``."""
        return Image.open(self.IMAGES_PATH + "COCO_train2014_" + str(image_id).zfill(12) + ".jpg")

    def __extract_sentences(self, sentences):
        """Turn every sentence record into the prompt ``"a photo of <sent>"``."""
        return [f"a photo of {s['sent']}" for s in sentences]

    def __tokenize_sents(self, sentences):
        """Apply ``self.tokenization`` to each prompt string."""
        return [self.tokenization(s) for s in sentences]

    def __len__(self):
        return self.annotations.shape[0]

    def __getitem__(self, idx):
        item = self.annotations.iloc[idx]
        image = self.__getimage(item.image_id)
        sentences = self.__extract_sentences(item.sentences)

        if self.transform:
            image = self.transform(image)

        if self.tokenization:
            sentences = self.__tokenize_sents(sentences)

        # 'idx' travels with the sample so getImage/getSentences can find the row again.
        sample = {'idx': idx, 'image': image, 'sentences': sentences}

        return sample, item.bbox

In [None]:
# Only the preprocessing transform is kept here; the model returned alongside it is discarded.
_, preprocess = clip.load("ViT-B/16")
# Test split of RefCOCOg: images go through `preprocess`, sentences through clip.tokenize.
test_dataset = RefcocogDataset("refcocog", split="test", transform=preprocess, tokenization=clip.tokenize)

# No batch_size is passed, so the DataLoader default applies; shuffle=False keeps dataset order.
test_loader = DataLoader(test_dataset, shuffle=False)

100%|████████████████████████████████████████| 335M/335M [00:02<00:00, 128MiB/s]


In [None]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# YOLOv5x detector fetched through torch.hub; YoloBBoxes uses it to propose candidate boxes.
modelYOLO = torch.hub.load('ultralytics/yolov5', 'yolov5x')
# CLIP ViT-B/16 model and its image preprocessing transform, loaded onto `device`.
modelCLIP, preprocessCLIP = clip.load("ViT-B/16", device=device)

Downloading: "https://github.com/ultralytics/yolov5/zipball/master" to /root/.cache/torch/hub/master.zip
YOLOv5 🚀 2023-9-1 Python-3.10.12 torch-2.0.1+cu118 CUDA:0 (Tesla T4, 15102MiB)

Downloading https://github.com/ultralytics/yolov5/releases/download/v7.0/yolov5x.pt to yolov5x.pt...
100%|██████████| 166M/166M [00:01<00:00, 117MB/s]

Fusing layers... 
YOLOv5x summary: 444 layers, 86705005 parameters, 0 gradients
Adding AutoShape... 


In [None]:
# extract objects from image using YOLO
def YoloBBoxes(img, modelYOLO):
    """Run the detector on ``img`` and return its detections as a DataFrame.

    Args:
        img: Image handed unchanged to ``modelYOLO``.
        modelYOLO: Callable whose result exposes ``.pandas().xyxy[0]`` as a
            DataFrame (CropImage expects xmin/ymin/xmax/ymax columns in it).

    Returns:
        The detection frame with a fresh 0..n-1 index (the previous index is
        kept as an ``index`` column) plus two placeholder columns filled with
        NaN: ``tconfidence`` (float) and ``crop`` (object dtype).
    """
    result = modelYOLO(img)
    bbox = result.pandas().xyxy[0].reset_index()
    bbox["tconfidence"] = np.nan
    # 'crop' later receives image objects, so it must be an object column:
    # writing a non-numeric value into a float64 column relies on silent
    # upcasting, which pandas has deprecated.
    bbox["crop"] = pandas.Series(np.nan, index=bbox.index, dtype=object)
    return bbox


# retrieve image crops with coordinates from YOLO
def CropImage(image, boxs):
    """Cut one crop out of ``image`` for every row of ``boxs``.

    Args:
        image: Object with a ``crop((xmin, ymin, xmax, ymax))`` method
            (a PIL image in this notebook).
        boxs: DataFrame with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns.
            It is modified in place: each crop is stored in the ``crop``
            column of its row.

    Returns:
        List of crops in row order; empty when ``boxs`` has no rows.
    """
    # Crops are arbitrary objects, so the destination column must have object
    # dtype; create or convert it up front rather than relying on pandas
    # upcasting a float column on the first assignment.
    if 'crop' not in boxs.columns:
        boxs['crop'] = pandas.Series(np.nan, index=boxs.index, dtype=object)
    elif boxs['crop'].dtype != object:
        boxs['crop'] = boxs['crop'].astype(object)

    crops = []

    for index, row in boxs.iterrows():
        box = (
            row['xmin'],
            row['ymin'],
            row['xmax'],
            row['ymax'],
        )
        crop = image.crop(box)
        crops.append(crop)

        boxs.at[index, 'crop'] = crop

    return crops


# compute similarity between clip embeddings
def computeSimilarity(image, sentences, modelCLIP, preprocessCLIP):
    """Average cosine similarity between one image and several sentences.

    Args:
        image: Input accepted by ``preprocessCLIP``.
        sentences: Non-empty sequence of tokenized sentences; ``sent[0]`` of
            each entry is passed to ``modelCLIP.encode_text``.
        modelCLIP: Model exposing ``encode_image`` and ``encode_text``.
        preprocessCLIP: Callable turning ``image`` into a tensor; a batch
            dimension is added in front before encoding.

    Returns:
        NumPy array holding the mean of the per-sentence
        ``text_features @ image_features.T`` products of the L2-normalised
        embeddings.

    Raises:
        ValueError: If ``sentences`` is empty.
    """
    if len(sentences) == 0:
        raise ValueError("computeSimilarity requires at least one sentence")

    # Run on the device the model lives on instead of depending on a
    # notebook-level global; fall back to the CPU for parameter-less models.
    first_param = next(iter(modelCLIP.parameters()), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    similarities = []

    with torch.no_grad():
        # The image embedding does not depend on the sentence: compute it once.
        image_input = torch.unsqueeze(preprocessCLIP(image).to(model_device), dim=0)
        image_features = modelCLIP.encode_image(image_input).float()
        image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        image_matrix = image_features.cpu().numpy().T

        for sent in sentences:
            text_features = modelCLIP.encode_text(sent[0].to(model_device)).float()
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            similarities.append(text_features.cpu().numpy() @ image_matrix)

    return sum(similarities) / len(similarities)


# overlap area of two axis-aligned boxes, used by the IoU computation
def computeIntersection(fx1, fy1, fx2, fy2, sx1, sy1, sx2, sy2):
    """Return the overlap area of two boxes given as (x1, y1, x2, y2) corners.

    The first box is (fx1, fy1, fx2, fy2), the second (sx1, sy1, sx2, sy2).
    The result is 0 when the boxes do not overlap on both axes.
    """
    overlap_w = min(fx2, sx2) - max(fx1, sx1)
    overlap_h = min(fy2, sy2) - max(fy1, sy1)

    # Both extents must be non-negative for the boxes to share any area.
    if not ((overlap_w >= 0) and (overlap_h >= 0)):
        return 0

    return overlap_w * overlap_h


# accuracy is IoU between two bboxes
def computeAccuracy(bboxes, index, label):
    """Intersection over union between one predicted box and the ground truth.

    Args:
        bboxes: Table with ``xmin``, ``ymin``, ``xmax`` and ``ymax`` columns.
        index: Key of the predicted row, used as ``bboxes[column][index]``.
        label: Ground-truth box ``[x, y, width, height]``; every element must
            expose ``.item()``.

    Returns:
        The IoU of the two boxes, or ``0.0`` when their union has no area.
    """
    x_min, y_min = bboxes['xmin'][index], bboxes['ymin'][index]
    x_max, y_max = bboxes['xmax'][index], bboxes['ymax'][index]
    x, y, w, h = label[0].item(), label[1].item(), label[2].item(), label[3].item()

    intersection = computeIntersection(x_min, y_min, x_max, y_max, x, y, x+w, y+h)

    area1 = (x_max - x_min) * (y_max - y_min)
    area2 = w * h
    union = area1 + area2 - intersection

    # Two degenerate (zero-area) boxes would otherwise divide by zero and
    # inject NaN into the caller's running mean.
    if not union > 0:
        return 0.0

    return intersection / union

In [None]:
def baseline(loader, dataset, modelYOLO, modelCLIP, preprocessCLIP):
    """Evaluate the YOLO + CLIP baseline and return its mean IoU.

    For every sample the detector proposes boxes, each box is cropped and
    scored against the sample's sentences with CLIP, and the best-scoring box
    is compared with the ground truth. Samples without any detection count
    as IoU 0. One progress line is printed per sample.

    Args:
        loader: Iterable yielding ``(sample, bbox)`` pairs; ``sample`` must be
            accepted by ``dataset.getImage`` and contain a ``'sentences'`` entry.
        dataset: Dataset providing ``getImage`` and ``len()``.
        modelYOLO: Detector passed to ``YoloBBoxes``.
        modelCLIP: CLIP model passed to ``computeSimilarity``.
        preprocessCLIP: CLIP image preprocessing passed to ``computeSimilarity``.

    Returns:
        Mean IoU over all samples, or ``0.0`` when the loader yields nothing.
    """
    n_samples = 0
    tot_accuracy = 0

    for data_features, data_bbox in loader:
        # Open the image once and reuse it for detection and cropping.
        image = dataset.getImage(data_features)
        bboxes = YoloBBoxes(image, modelYOLO) # extract bboxes from image using YOLO
        crops = CropImage(image, bboxes) # retrieve crops for each object

        # determine which crop/object has highest similarity
        # crop with highest similarity is the predicted object
        # compute IoU between predicted object and ground truth
        if len(crops) > 0:
            # Cosine similarity can be negative, so start below every possible
            # score; starting at 0 would keep crop 0 whenever all scores are <= 0.
            highest_similarity = float("-inf")
            index_pred = 0

            for i, crop in enumerate(crops):
                similarity = computeSimilarity(crop, data_features['sentences'], modelCLIP, preprocessCLIP)

                if similarity > highest_similarity:
                    highest_similarity = similarity
                    index_pred = i

            accuracy = computeAccuracy(bboxes, index_pred, data_bbox)
        else:
            accuracy = 0

        tot_accuracy += accuracy
        n_samples += 1
        print(f'Image {n_samples:^6}/{len(dataset):^6}\t{accuracy:.4f}')

    # An empty loader has no mean; report 0.0 instead of dividing by zero.
    if n_samples == 0:
        return 0.0

    return tot_accuracy/n_samples

In [None]:
# Run the baseline over the whole test split and report the mean IoU (baseline also prints one line per sample).
print(f'Mean accuracy IoU: {baseline(test_loader, test_dataset, modelYOLO, modelCLIP, preprocessCLIP):.8f}')

[1;30;43mOutput streaming troncato alle ultime 5000 righe.[0m
Image   25  / 5023 	0.1521
Image   26  / 5023 	0.1293
Image   27  / 5023 	0.0445
Image   28  / 5023 	0.9481
Image   29  / 5023 	0.9482
Image   30  / 5023 	0.9250
Image   31  / 5023 	0.0000
Image   32  / 5023 	0.9158
Image   33  / 5023 	0.9558
Image   34  / 5023 	0.0993
Image   35  / 5023 	0.0000
Image   36  / 5023 	0.9663
Image   37  / 5023 	0.9669
Image   38  / 5023 	0.8715
Image   39  / 5023 	0.2205
Image   40  / 5023 	0.8821
Image   41  / 5023 	0.0220
Image   42  / 5023 	0.0000
Image   43  / 5023 	0.9309
Image   44  / 5023 	0.9230
Image   45  / 5023 	0.9748
Image   46  / 5023 	0.8104
Image   47  / 5023 	0.8614
Image   48  / 5023 	0.8428
Image   49  / 5023 	0.0000
Image   50  / 5023 	0.9546
Image   51  / 5023 	0.9390
Image   52  / 5023 	0.9667
Image   53  / 5023 	0.2037
Image   54  / 5023 	0.1862
Image   55  / 5023 	0.0243
Image   56  / 5023 	0.9638
Image   57  / 5023 	0.9649
Image   58  / 5023 	0.0000
Image   59  / 5023