## CLIP

Inference using OpenAI's [CLIP](https://github.com/openai/CLIP) repository

In [None]:
import clip
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Prefer the GPU when PyTorch reports one as available; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the ViT-B/32 checkpoint onto the chosen device. The call yields two
# objects: the model itself and the accompanying `preprocess` object.
model, preprocess = clip.load("ViT-B/32", device=device)

In [None]:
# Tokenize the caption and place the token tensor on the same device as the model.
text = clip.tokenize(["a dog is sitting on a bench in the park"]).to(device)

# Inference only: run the text encoder with autograd disabled so that no
# computation graph is recorded for the resulting features.
with torch.no_grad():
    text_features = model.encode_text(text)

  attn_output = scaled_dot_product_attention(q, k, v, attn_mask, dropout_p, is_causal)


In [None]:
# Display the shape of the encoded caption features.
text_features.shape

torch.Size([1, 512])

## DETR

DETR Inference through the HuggingFace `transformers` library

In [7]:
from transformers import DetrImageProcessor, DetrForObjectDetection
from PIL import Image, ImageDraw
import requests
import torch

In [2]:
# Load the image processor and the model from the same checkpoint, so the
# preprocessing configuration always comes from the checkpoint the weights
# belong to instead of relying on the library's constructor defaults.
DETR_CHECKPOINT = "facebook/detr-resnet-50"
image_processor = DetrImageProcessor.from_pretrained(DETR_CHECKPOINT)
detr_model = DetrForObjectDetection.from_pretrained(DETR_CHECKPOINT)

Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [10]:
IMAGE_URLS = [
    "https://farm5.staticflickr.com/4029/4669549715_7db3735de0_z.jpg",
    "https://farm3.staticflickr.com/2050/2407157255_5ac59d6ebc_z.jpg",
]


def fetch_image(url, timeout=30):
    """Download ``url`` and return it as a fully loaded PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The decoded ``PIL.Image.Image``.

    Raises:
        requests.HTTPError: If the server answers with an error status. Without
            this check the error page would be handed to PIL and fail with a
            much less helpful message.
    """
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        image = Image.open(response.raw)
        # Image.open is lazy: force the pixel data to be read while the
        # connection is still open, because the `with` block closes it on exit.
        image.load()
    return image


images = [fetch_image(url) for url in IMAGE_URLS]

# Convert the whole batch of images into PyTorch tensors for the model.
processed_image = image_processor.preprocess(images, return_tensors="pt")
processed_image

{'pixel_values': tensor([[[[-1.5014, -1.5185, -1.5699,  ...,  0.0000,  0.0000,  0.0000],
          [-1.5014, -1.5185, -1.5699,  ...,  0.0000,  0.0000,  0.0000],
          [-1.5528, -1.5528, -1.5699,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-1.5185, -1.5185, -1.5014,  ...,  0.0000,  0.0000,  0.0000],
          [-1.4500, -1.4672, -1.4672,  ...,  0.0000,  0.0000,  0.0000],
          [-1.3987, -1.4158, -1.4329,  ...,  0.0000,  0.0000,  0.0000]],

         [[-1.4055, -1.4230, -1.4755,  ...,  0.0000,  0.0000,  0.0000],
          [-1.4055, -1.4230, -1.4755,  ...,  0.0000,  0.0000,  0.0000],
          [-1.4580, -1.4580, -1.4755,  ...,  0.0000,  0.0000,  0.0000],
          ...,
          [-1.4230, -1.4230, -1.4055,  ...,  0.0000,  0.0000,  0.0000],
          [-1.3529, -1.3704, -1.3704,  ...,  0.0000,  0.0000,  0.0000],
          [-1.3004, -1.3179, -1.3354,  ...,  0.0000,  0.0000,  0.0000]],

         [[-1.1770, -1.1944, -1.2467,  ...,  0.0000,  0.0000,  0.0000],
          [-1

In [8]:
# Switch the model to evaluation mode before running inference.
detr_model.eval()
# Run the forward pass without gradient tracking.
with torch.no_grad():
    # The processed batch is unpacked into keyword arguments for the model call.
    output = detr_model(**processed_image)

In [11]:
# Convert the raw model output into one dict per image with scores, labels and boxes.
# No target sizes are passed here; the boxes displayed below are fractional values,
# and a later cell multiplies them by the image width/height before drawing.
post_processed = image_processor.post_process_object_detection(output)
post_processed

[{'scores': tensor([0.8315, 0.9965, 0.9988, 0.9770, 0.9976]),
  'labels': tensor([32,  2, 18, 15,  1]),
  'boxes': tensor([[ 4.4946e-01,  4.1292e-01,  4.6387e-01,  4.4605e-01],
          [ 3.8136e-01,  4.5823e-01,  4.9195e-01,  7.0207e-01],
          [ 8.9242e-01,  6.1414e-01,  9.8079e-01,  6.6543e-01],
          [-1.6114e-04,  7.6959e-01,  6.3832e-01,  9.4864e-01],
          [ 3.8834e-01,  3.6991e-01,  4.9615e-01,  6.6282e-01]])},
 {'scores': tensor([0.5345, 0.8457, 0.5441, 0.9088, 0.6941, 0.7361, 0.9940, 0.9704, 0.8768,
          0.9809, 0.9995, 0.8063, 0.6922, 0.8574, 0.9768, 0.5396, 0.9695]),
  'labels': tensor([27, 10, 10, 77,  1,  1, 77,  1, 10,  1,  1, 27,  1,  1,  1,  1,  1]),
  'boxes': tensor([[-2.1952e-04,  8.1461e-01,  1.6846e-01,  9.9997e-01],
          [ 2.6563e-01,  1.0263e-01,  3.0258e-01,  2.1878e-01],
          [ 1.0063e-01,  3.3650e-04,  2.9782e-01,  7.4236e-02],
          [ 6.2298e-01,  7.2955e-01,  6.6983e-01,  8.2029e-01],
          [ 8.2300e-01,  3.2762e-01,  9.9

### Inference Visualization

In [17]:
# Inspect the detection at index 1 of the first image.
first_image_labels = post_processed[0]["labels"]
pred_label = first_image_labels[1].item()

# Translate the numeric class id into its name via the model config.
class_name = detr_model.config.id2label[pred_label]
print("Predicted class is", class_name)

Predicted class is bicycle


In [18]:
# The drawing context is bound to images[0], the same object displayed later on.
image_draw = ImageDraw.Draw(images[0])

# Box of the detection at index 1, stored as fractional [x0, y0, x1, y1].
x0, y0, x1, y1 = post_processed[0]["boxes"][1].tolist()

# Scale x coordinates by the image width and y coordinates by its height.
img_width, img_height = images[0].size
rect = [x0 * img_width, y0 * img_height, x1 * img_width, y1 * img_height]

In [19]:
# Outline the scaled box in blue with a line width of 2.
image_draw.rectangle(rect, outline="blue", width=2)

In [20]:
# End the cell with the image itself so Jupyter renders it inline in the notebook
# output. Image.show() instead hands the picture to an external viewer program,
# which is of no use on a remote or headless kernel and leaves nothing in the
# saved notebook.
images[0]

## Huggingface CLIP

Inference using the CLIP model provided by the Hugging Face `transformers` library (should yield similar results to using OpenAI's repo directly, but the output is richer)

In [None]:
from transformers import CLIPModel, CLIPProcessor
import requests

In [None]:
# Both objects come from the same checkpoint, so name it once.
# Note that `model` rebinds the name used for the OpenAI CLIP model in the
# first section of this notebook.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)



In [None]:
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# Download with a timeout so a stalled server cannot hang the notebook, and
# fail loudly on an HTTP error instead of handing an error page to PIL.
with requests.get(url, stream=True, timeout=30) as response:
    response.raise_for_status()
    image = Image.open(response.raw)
    # Image.open is lazy: read the pixel data now, while the connection is
    # still open, because leaving the `with` block closes it.
    image.load()

# Candidate captions that will be scored against the image.
texts = ["image of a bird", "image of a cat", "image of a dog", "image containing one or more cats"]

In [None]:
# Encode only the image: run it through the processor's image pipeline first.
inputs = processor.image_processor(image, return_tensors="pt")

# Inference only, so skip autograd bookkeeping (same pattern as the DETR
# forward pass earlier in this notebook).
with torch.no_grad():
    image_features = model.get_image_features(**inputs)

image_features.shape

torch.Size([1, 512])

In [None]:
# Build joint inputs from the captions and the image; padding=True is requested
# because the captions differ in length. This rebinds `inputs`, which previously
# held the image-only tensors.
inputs = processor(text=texts, images=image, padding=True, return_tensors="pt")

In [None]:
# Full forward pass over the joint text + image inputs.
# NOTE(review): this runs with gradient tracking enabled (the logits shown in the
# next cell carry a grad_fn); wrap it in torch.no_grad() if only inference is needed.
response = model(**inputs)

In [None]:
# Text-to-image logits; the displayed tensor has one row per entry in `texts`
# and a single column for the one input image.
response.logits_per_text

tensor([[18.0458],
        [24.8410],
        [20.0701],
        [28.1160]], grad_fn=<MulBackward0>)

In [None]:
# Shape of the text model's last hidden state for the batch of captions.
response.text_model_output.last_hidden_state.shape

torch.Size([4, 8, 512])

In [None]:
# `inputs` also carries the image tensor; drop "pixel_values" so that only the
# remaining entries are forwarded to the text feature extractor.
text_inputs = {name: value for name, value in inputs.items() if name != "pixel_values"}

# Inference only: no gradients are needed for the extracted features.
with torch.no_grad():
    text_features = model.get_text_features(**text_inputs)