In [1]:
from IPython.display import Image, display
import PIL.Image
import io
import torch
import numpy as np
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils
from transformers import VisualBertForQuestionAnswering, BertTokenizerFast

In [2]:
from datasets import load_dataset, Image
import pickle
import os
import cv2
import torch
import numpy as np
from torchvision import transforms

In [3]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"


In [5]:
# Build the Faster R-CNN feature extractor together with its image preprocessor.
frcnn_checkpoint = "unc-nlp/frcnn-vg-finetuned"

frcnn_cfg = Config.from_pretrained(frcnn_checkpoint)
# Record the target device in the config before the model is instantiated.
frcnn_cfg.model.DEVICE = device

frcnn = GeneralizedRCNN.from_pretrained(frcnn_checkpoint, config=frcnn_cfg)
frcnn = frcnn.to(device)

image_preprocess = Preprocess(frcnn_cfg)

# VisualBERT question-answering components (currently disabled):
# bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
# visualbert_vqa = VisualBertForQuestionAnswering.from_pretrained("uclanlp/visualbert-vqa")

loading configuration file cache
loading weights file https://cdn.huggingface.co/unc-nlp/frcnn-vg-finetuned/pytorch_model.bin from cache at /root/.cache/torch/transformers/57f6df6abe353be2773f2700159c65615babf39ab5b48114d2b49267672ae10f.77b59256a4cf8343ae0f923246a81489fc8d82f98d082edc2d2037c977c0d9d0
All model checkpoint weights were used when initializing GeneralizedRCNN.

All the weights of GeneralizedRCNN were initialized from the model checkpoint at unc-nlp/frcnn-vg-finetuned.
If your task is similar to the task the model of the checkpoint was trained on, you can already use GeneralizedRCNN for predictions without further training.


In [7]:
# Load the CIFAR-10 dataset through the Hugging Face `datasets` library.
cifar10_data = load_dataset(path="cifar10")

Found cached dataset cifar10 (/root/.cache/huggingface/datasets/cifar10/plain_text/1.0.0/447d6ec4733dddd1ce3bb577c7166b986eaa4c538dcd9e805ba61f35674a9de4)


  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Pull out the two dataset splits and create a torchvision ToTensor transform.
train_data, test_data = (cifar10_data[split] for split in ("train", "test"))
to_tensor = transforms.ToTensor()

In [None]:
# Extract Faster R-CNN ROI features for every image of the CIFAR-10 train split.
#
# Results are collected in two parallel lists:
#   visual_embeddings[i] -- the "roi_features" entry returned by frcnn for image i
#   labels[i]            -- the integer class label of image i
#
# NOTE: the test-split cell reassigns `visual_embeddings` and `labels`, so the
# train results must be saved before that cell is executed.
print(f"Generating {len(train_data)} embeddings")

visual_embeddings = []
labels = []

# Gradients are never needed for feature extraction; no_grad() guarantees that
# the tensors kept in the list do not retain autograd graphs.
with torch.no_grad():
    for idx, img_data in enumerate(train_data):
        if idx % 1000 == 0:
            print(f"On idx {idx}")
        image = np.array(img_data["img"])
        # run frcnn
        images, sizes, scales_yx = image_preprocess(image)
        output_dict = frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=frcnn_cfg.max_detections,
            return_tensors="pt",
        )
        features = output_dict.get("roi_features")
        if features is None:
            # Fail at the offending image instead of silently storing None and
            # only discovering the corrupt entry after the whole run.
            raise ValueError(f"frcnn returned no 'roi_features' for train image {idx}")

        # Keep a detached host-memory copy so accumulated features do not pile
        # up on the accelerator and the pickled file can be loaded without one.
        visual_embeddings.append(features.detach().cpu())
        labels.append(img_data["label"])


In [None]:
# Bundle the train-split features with their labels and write them to disk.
cifar10_train_embeddings = dict(embeddings=visual_embeddings, labels=labels)

with open("cifar10-train-embeddings.pkl", mode="wb") as pkl_file:
    pickle.dump(cifar10_train_embeddings, pkl_file)

In [9]:
# Extract Faster R-CNN ROI features for every image of the CIFAR-10 test split.
#
# Results are collected in two parallel lists:
#   visual_embeddings[i] -- the "roi_features" entry returned by frcnn for image i
#   labels[i]            -- the integer class label of image i
#
# NOTE: `visual_embeddings` and `labels` are re-initialised here, which
# overwrites any train-split results still held under the same names.
print(f"Generating {len(test_data)} embeddings")

visual_embeddings = []
labels = []

# Inference only: no_grad() guarantees that the tensors kept in the list do not
# retain autograd graphs.
with torch.no_grad():
    for idx, img_data in enumerate(test_data):
        if idx % 1000 == 0:
            print(f"On idx {idx}")
        image = np.array(img_data["img"])
        # run frcnn
        images, sizes, scales_yx = image_preprocess(image)
        output_dict = frcnn(
            images,
            sizes,
            scales_yx=scales_yx,
            padding="max_detections",
            max_detections=frcnn_cfg.max_detections,
            return_tensors="pt",
        )
        features = output_dict.get("roi_features")
        if features is None:
            # Fail at the offending image instead of silently storing None and
            # only discovering the corrupt entry after the whole run.
            raise ValueError(f"frcnn returned no 'roi_features' for test image {idx}")

        # Keep a detached host-memory copy so accumulated features do not pile
        # up on the accelerator and the pickled file can be loaded without one.
        visual_embeddings.append(features.detach().cpu())
        labels.append(img_data["label"])


Generating 10000 embeddings
On idx 0


  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]


On idx 1000
On idx 2000
On idx 3000
On idx 4000
On idx 5000
On idx 6000
On idx 7000
On idx 8000
On idx 9000


In [10]:
# Bundle the test-split features with their labels and write them to disk.
cifar10_test_embeddings = dict(embeddings=visual_embeddings, labels=labels)

with open("cifar10-test-embeddings.pkl", mode="wb") as pkl_file:
    pickle.dump(cifar10_test_embeddings, pkl_file)