In [30]:
import torch.nn as nn


In [1]:
import requests
import PIL.Image
import torch
from transformers import (
    OwlViTProcessor,
    OwlViTForObjectDetection,
    OwlViTVisionModel
)
from typing import Sequence, List, Tuple
import torch.nn as nn


class OwlVit(object):
    def __init__(self, threshold=0.1):
        self.processor = OwlViTProcessor.from_pretrained("google/owlvit-base-patch32")
        self.model = OwlViTForObjectDetection.from_pretrained("google/owlvit-base-patch32")
        self.threshold = threshold

    def predict(self, image: PIL.Image.Image, texts: Sequence[str]):
        inputs = self.processor(text=texts, images=image, return_tensors="pt")
        outputs = self.model(**inputs)
        target_sizes = torch.Tensor([image.size[::-1]])
        results = self.processor.post_process_object_detection(outputs=outputs, target_sizes=target_sizes, threshold=self.threshold)
        i = 0
        boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"]
        detections = []
        for box, score, label in zip(boxes, scores, labels):
            detection = {"bbox": box.tolist(), "score": float(score), "label": int(label), "text": texts[label]}
            detections.append(detection)
        return detections

In [60]:
owlvit = OwlVit()

In [1]:
import torch

In [None]:
class ModuleProfiler(object):
    pass


In [61]:
class ModuleRecorder(object):

    def __init__(self, module: nn.Module):
        self._module = module
        self._input = None
        self._output = None
        self._hook = None

    def _on_forward(self, module, input, output):
        self._input = input
        self._output = output

    def attach(self):
        if self._hook is not None:
            raise RuntimeError("Hook already attached.")
        self._hook = self._module.register_forward_hook(self._on_forward)

    def detach(self):
        if self._hook is not None:
            self._hook.remove()
            self._hook = None

    def __enter__(self, *args, **kwargs):
        self.attach()

    def __exit__(self, *args, **kwargs):
        self.detach()

    def get_input(self):
        return self._input
    
    def get_output(self):
        return self._output


In [72]:
image = PIL.Image.open("../assets/dogs.jpg")

In [73]:
vision_recorder = ModuleRecorder(owlvit.model.owlvit.vision_model.encoder)

In [None]:
owlvit.model.

In [74]:
vision_recorder.attach()

In [76]:
out = owlvit.predict(image, texts=["A dog", "A cat"])

In [81]:
import torch
import torchvision.models as models
from torch.profiler import profile, record_function, ProfilerActivity

In [82]:

import time

In [86]:
count = 10
t0 = time.perf_counter_ns()

for i in range(count):
    # out = owlvit.predict(image, texts=["A dog", "A cat"])
    owlvit.model.owlvit.vision_model(torch.randn(1, 3, 768, 768))
torch.cuda.current_stream().synchronize()
t1 = time.perf_counter_ns()
dt = (t1 - t0) / 1e9
print(count / dt)

2.708129982561073


In [89]:
a = 24*24*3*768*32*32

In [90]:
b = ((768//2)**2)*3*64*3*3

In [91]:
a/b

5.333333333333333

In [77]:
vision_recorder.get_output().last_hidden_state.shape

torch.Size([1, 577, 768])

In [78]:
(768//32)**2 + 1

577

In [58]:
help(owlvit.model.owlvit.vision_model.register_forward_hook)

Help on method register_forward_hook in module torch.nn.modules.module:

register_forward_hook(hook: Callable[..., NoneType]) -> torch.utils.hooks.RemovableHandle method of transformers.models.owlvit.modeling_owlvit.OwlViTVisionTransformer instance
    Registers a forward hook on the module.
    
    The hook will be called every time after :func:`forward` has computed an output.
    It should have the following signature::
    
        hook(module, input, output) -> None or modified output
    
    The input contains only the positional arguments given to the module.
    Keyword arguments won't be passed to the hooks and only to the ``forward``.
    The hook can modify the output. It can modify the input inplace but
    it will not have effect on forward since this is called after
    :func:`forward` is called.
    
    Returns:
        :class:`torch.utils.hooks.RemovableHandle`:
            a handle that can be used to remove the added hook by calling
            ``handle.remove()``


In [56]:
vision_recorder.get_output()

BaseModelOutputWithPooling(last_hidden_state=tensor([[[ 20.6480, -13.1593,   9.6784,  ..., -20.6196,  15.2642,  18.7416],
         [ -1.3909,   1.0352,   0.3451,  ...,  -0.1778,   0.2200,   0.7068],
         [ -0.3303,   1.6497,  -0.3364,  ...,  -0.5335,   0.3259,   1.2121],
         ...,
         [  1.0217,   2.0309,   0.1072,  ...,  -0.7548,   0.8171,   2.2222],
         [  1.1874,   1.4332,  -0.7798,  ...,   0.0555,  -0.2738,   1.0362],
         [  1.0650,  -1.1717,  -2.4873,  ...,  -0.5335,   0.9282,  -0.1461]]],
       grad_fn=<AddBackward0>), pooler_output=tensor([[ 1.3095e+00, -1.0579e+00,  6.8421e-01, -5.0445e-01, -5.7594e-01,
          1.0334e+00,  1.2500e+00,  1.0735e+00, -1.3173e+00,  1.9208e+00,
         -1.4657e+00, -9.8172e-01,  1.2634e+00,  1.2387e+00,  1.0493e+00,
          3.2842e-01, -1.4635e+00,  8.4521e-01,  1.3846e+00,  9.9562e-01,
          6.1305e-01,  3.5310e-01,  1.1976e+00,  1.1578e+00,  2.5607e-01,
          1.0382e+00,  1.1112e+00,  9.9394e-01, -9.6237e-01, 

In [27]:
owlvit.processor

OwlViTProcessor:
- image_processor: OwlViTImageProcessor {
  "crop_size": {
    "height": 768,
    "width": 768
  },
  "do_center_crop": false,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "OwlViTFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "OwlViTImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "OwlViTProcessor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 768,
    "width": 768
  }
}

- tokenizer: CLIPTokenizerFast(name_or_path='google/owlvit-base-patch32', vocab_size=49408, model_max_length=16, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, s

In [28]:
owlvit.model.owlvit.vision_model

OwlViTVisionTransformer(
  (embeddings): OwlViTVisionEmbeddings(
    (patch_embedding): Conv2d(3, 768, kernel_size=(32, 32), stride=(32, 32), bias=False)
    (position_embedding): Embedding(577, 768)
  )
  (pre_layernorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (encoder): OwlViTEncoder(
    (layers): ModuleList(
      (0): OwlViTEncoderLayer(
        (self_attn): OwlViTAttention(
          (k_proj): Linear(in_features=768, out_features=768, bias=True)
          (v_proj): Linear(in_features=768, out_features=768, bias=True)
          (q_proj): Linear(in_features=768, out_features=768, bias=True)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): OwlViTMLP(
          (activation_fn): QuickGELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
  

In [22]:
owlvit.model.owlvit.vision_model.encoder.layers

ModuleList(
  (0): OwlViTEncoderLayer(
    (self_attn): OwlViTAttention(
      (k_proj): Linear(in_features=768, out_features=768, bias=True)
      (v_proj): Linear(in_features=768, out_features=768, bias=True)
      (q_proj): Linear(in_features=768, out_features=768, bias=True)
      (out_proj): Linear(in_features=768, out_features=768, bias=True)
    )
    (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (mlp): OwlViTMLP(
      (activation_fn): QuickGELUActivation()
      (fc1): Linear(in_features=768, out_features=3072, bias=True)
      (fc2): Linear(in_features=3072, out_features=768, bias=True)
    )
    (layer_norm2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (1): OwlViTEncoderLayer(
    (self_attn): OwlViTAttention(
      (k_proj): Linear(in_features=768, out_features=768, bias=True)
      (v_proj): Linear(in_features=768, out_features=768, bias=True)
      (q_proj): Linear(in_features=768, out_features=768, bias=True)
      (out_proj): 