In [2]:
import os
import cv2
import torch
import numpy as np
from detectron2.config import get_cfg
from detectron2.engine import DefaultPredictor
from detectron2.modeling import build_model
from detectron2.checkpoint import DetectionCheckpointer
from detectron2.data import MetadataCatalog
from torchvision.transforms.functional import to_tensor
import torch.nn.functional as F
from datasets import load_dataset
import pickle

In [3]:
def setup_detectron2_model(config_file, model_weights, device,
                           score_thresh=0.5, roi_heads_score_thresh=None):
    """Build a detectron2 model in eval mode and load its checkpoint.

    Args:
        config_file: Path of the yaml config merged on top of detectron2's
            default config.
        model_weights: Checkpoint path/URI; stored in ``cfg.MODEL.WEIGHTS``
            and handed to ``DetectionCheckpointer.load``.
        device: Device string stored in ``cfg.MODEL.DEVICE`` (e.g. "cuda").
        score_thresh: Value written to ``cfg.MODEL.RETINANET.SCORE_THRESH_TEST``.
            Defaults to 0.5, the value that used to be hard-coded here.
        roi_heads_score_thresh: Optional value for
            ``cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST``. ``None`` (default)
            leaves the value from the config file untouched.

    Returns:
        The model produced by ``build_model(cfg)``, switched to eval mode,
        with the checkpoint loaded.
    """
    cfg = get_cfg()
    cfg.merge_from_file(config_file)

    # NOTE(review): the RETINANET key is the RetinaNet test threshold. For
    # R-CNN style configs (e.g. Faster R-CNN) detectron2's config reference
    # lists ROI_HEADS.SCORE_THRESH_TEST as the detection threshold, so
    # `score_thresh` alone does not filter their detections. Pass
    # `roi_heads_score_thresh` to control those.
    cfg.MODEL.RETINANET.SCORE_THRESH_TEST = score_thresh
    if roi_heads_score_thresh is not None:
        cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = roi_heads_score_thresh

    cfg.MODEL.WEIGHTS = model_weights
    cfg.MODEL.DEVICE = device

    model = build_model(cfg)
    model.eval()
    # build_model only constructs the architecture; the weights are loaded here.
    DetectionCheckpointer(model).load(cfg.MODEL.WEIGHTS)
    return model

In [8]:
def extract_roi_features(image, model, image_format="RGB"):
    """Return the mean box-pooled feature of the objects detected in one image.

    The image is run through the model's backbone, proposal generator and
    ROI heads; the predicted boxes are pooled with ``roi_heads.box_pooler``
    and the pooled features are averaged over the detections (dim 0), so the
    result has the same size no matter how many boxes were found.

    Args:
        image: A single image as an HxWxC (or HxW) array-like with uint8
            pixel values in 0-255, e.g. ``np.array(pil_image)``.
        model: A detectron2 R-CNN style model in eval mode exposing
            ``device``, ``preprocess_image``, ``backbone``,
            ``proposal_generator`` and ``roi_heads``.
        image_format: Channel order of ``image``, "RGB" or "BGR". When the
            model declares the opposite order through ``model.input_format``
            the channels are reversed before inference.

    Returns:
        A CPU tensor: the pooled ROI features averaged over all detections.
        If the model detects nothing, the feature pooled from the
        whole-image box is returned instead of NaNs.
    """
    pixels = np.asarray(image)
    if pixels.ndim == 2:
        # Grayscale input: add the channel axis so the layout is always HWC.
        pixels = pixels[:, :, None]

    # Models built from a detectron2 config record the channel order they
    # were trained with (cfg.INPUT.FORMAT, "BGR" by default). Feeding the
    # other order swaps red and blue, so reverse the channels on a mismatch.
    model_format = getattr(model, "input_format", None)
    if pixels.shape[2] == 3 and {image_format, model_format} == {"RGB", "BGR"}:
        pixels = pixels[:, :, ::-1]

    # HWC -> CHW without a float round trip; preprocess_image normalizes later.
    chw = torch.as_tensor(np.ascontiguousarray(pixels.transpose(2, 0, 1)))
    chw = chw.to(model.device)
    height, width = chw.shape[-2:]

    with torch.no_grad():
        batch = model.preprocess_image(
            [{"image": chw, "height": height, "width": width}]
        )
        features = model.backbone(batch.tensor)
        proposals, _ = model.proposal_generator(batch, features)
        instances, _ = model.roi_heads(batch, features, proposals)

        boxes = [inst.pred_boxes for inst in instances]
        if len(boxes[0]) == 0:
            # No detections: averaging zero rows would give NaN. Pool the
            # full image extent instead so the embedding stays finite.
            # Local import: Boxes is only needed on this fallback path.
            from detectron2.structures import Boxes

            whole_image = torch.tensor(
                [[0.0, 0.0, float(width), float(height)]],
                device=batch.tensor.device,
            )
            boxes = [Boxes(whole_image)]

        # Use the feature levels the box head was configured with; fall back
        # to the previously hard-coded FPN levels if the attribute is absent.
        level_names = getattr(
            model.roi_heads, "box_in_features", ["p2", "p3", "p4", "p5"]
        )
        roi_features = model.roi_heads.box_pooler(
            [features[name] for name in level_names], boxes
        )

        # Average over the box dimension to get a fixed-size output.
        mean_roi_features = torch.mean(roi_features, dim=0)

    return mean_roi_features.cpu()

In [9]:
# Load the "cifar100" dataset through the Hugging Face `datasets` library;
# the result is indexed by split name ("train" / "test") further down.
cifar100_data = load_dataset("cifar100")

Found cached dataset cifar100 (/root/.cache/huggingface/datasets/cifar100/cifar100/1.0.0/f365c8b725c23e8f0f8d725c3641234d9331cd2f62919d1381d1baa5b3ba3142)


  0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Unpack the train and test splits into their own names.
train_data, test_data = (cifar100_data[split] for split in ("train", "test"))

In [11]:
# Config file and matching checkpoint URI for Faster R-CNN R50-FPN (3x schedule).
config_file = "faster_rcnn_R_50_FPN_3x.yaml"
model_weights = "detectron2://COCO-Detection/faster_rcnn_R_50_FPN_3x/137849458/model_final_280758.pkl"

# Run on the GPU when CUDA is usable, otherwise stay on the CPU.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"

model = setup_detectron2_model(config_file, model_weights, device)

In [12]:
# Embed every training image and save the embeddings together with both
# label sets in a single pickle file.
print("Generating " + str(len(train_data)) + " embeddings")

visual_embeddings, fine_labels, coarse_labels = [], [], []

for idx, img_data in enumerate(train_data):
    # Progress marker on every 1000th sample.
    if not idx % 1000:
        print("On idx " + str(idx))

    fine_labels.append(img_data["fine_label"])
    coarse_labels.append(img_data["coarse_label"])

    image = np.array(img_data["img"])
    embedding = extract_roi_features(image, model)
    visual_embeddings.append(embedding)

cifar100_train_embeddings = dict(
    embeddings=visual_embeddings,
    fine_labels=fine_labels,
    coarse_labels=coarse_labels,
)

with open("cifar100-train-embeddings.pkl", "wb") as f:
    pickle.dump(cifar100_train_embeddings, f)

Generating 50000 embeddings
On idx 0


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


On idx 1000
On idx 2000
On idx 3000
On idx 4000
On idx 5000
On idx 6000
On idx 7000
On idx 8000
On idx 9000
On idx 10000
On idx 11000
On idx 12000
On idx 13000
On idx 14000
On idx 15000
On idx 16000
On idx 17000
On idx 18000
On idx 19000
On idx 20000
On idx 21000
On idx 22000
On idx 23000
On idx 24000
On idx 25000
On idx 26000
On idx 27000
On idx 28000
On idx 29000
On idx 30000
On idx 31000
On idx 32000
On idx 33000
On idx 34000
On idx 35000
On idx 36000
On idx 37000
On idx 38000
On idx 39000
On idx 40000
On idx 41000
On idx 42000
On idx 43000
On idx 44000
On idx 45000
On idx 46000
On idx 47000
On idx 48000
On idx 49000


In [13]:
# Same procedure for the test split. The three accumulator names are rebound
# to fresh lists here, so they no longer refer to the training results.
print("Generating " + str(len(test_data)) + " embeddings")

visual_embeddings = list()
fine_labels = list()
coarse_labels = list()

for idx, img_data in enumerate(test_data):
    # Report progress once per 1000 images.
    if idx % 1000 == 0:
        print("On idx " + str(idx))

    image = np.array(img_data["img"])
    embedding = extract_roi_features(image, model)

    visual_embeddings += [embedding]
    fine_labels += [img_data["fine_label"]]
    coarse_labels += [img_data["coarse_label"]]

cifar100_test_embeddings = dict(
    zip(
        ("embeddings", "fine_labels", "coarse_labels"),
        (visual_embeddings, fine_labels, coarse_labels),
    )
)

with open("cifar100-test-embeddings.pkl", "wb") as f:
    pickle.dump(cifar100_test_embeddings, f)

Generating 10000 embeddings
On idx 0
On idx 1000
On idx 2000
On idx 3000
On idx 4000
On idx 5000
On idx 6000
On idx 7000
On idx 8000
On idx 9000


In [18]:
# Inspect the coarse labels collected for the test split.
# NOTE(review): this renders the entire list (one entry per test image);
# consider slicing, e.g. [:10], to keep the saved notebook output small.
cifar100_test_embeddings['coarse_labels']

[10,
 10,
 0,
 4,
 10,
 2,
 11,
 7,
 10,
 4,
 10,
 12,
 19,
 19,
 5,
 8,
 2,
 8,
 2,
 4,
 2,
 10,
 12,
 15,
 11,
 3,
 5,
 18,
 18,
 2,
 6,
 3,
 19,
 15,
 17,
 1,
 18,
 14,
 6,
 9,
 12,
 7,
 10,
 1,
 17,
 5,
 19,
 18,
 12,
 9,
 0,
 7,
 3,
 4,
 14,
 1,
 10,
 17,
 8,
 19,
 13,
 15,
 11,
 16,
 7,
 14,
 8,
 2,
 12,
 2,
 12,
 18,
 1,
 1,
 1,
 1,
 2,
 3,
 13,
 5,
 10,
 15,
 2,
 7,
 15,
 19,
 15,
 10,
 14,
 1,
 5,
 7,
 15,
 11,
 18,
 11,
 16,
 12,
 9,
 14,
 13,
 4,
 17,
 0,
 8,
 19,
 9,
 9,
 4,
 17,
 6,
 6,
 15,
 4,
 3,
 10,
 18,
 8,
 4,
 17,
 19,
 3,
 11,
 2,
 10,
 6,
 11,
 16,
 2,
 14,
 14,
 3,
 1,
 16,
 10,
 1,
 10,
 12,
 17,
 10,
 0,
 14,
 17,
 9,
 7,
 0,
 6,
 5,
 4,
 18,
 18,
 13,
 7,
 17,
 13,
 12,
 18,
 6,
 7,
 0,
 16,
 18,
 5,
 3,
 0,
 17,
 2,
 13,
 4,
 4,
 3,
 9,
 0,
 4,
 0,
 4,
 1,
 1,
 7,
 5,
 5,
 12,
 5,
 16,
 3,
 3,
 5,
 0,
 14,
 4,
 9,
 7,
 13,
 18,
 6,
 7,
 18,
 12,
 7,
 5,
 7,
 15,
 9,
 10,
 7,
 10,
 15,
 11,
 16,
 15,
 1,
 14,
 10,
 18,
 8,
 17,
 14,
 1,
 3,
 19,
 6,
 15,
 9,
