In [1]:
import torch
import numpy as np
from transformers import CLIPTokenizer, CLIPTextModel, CLIPProcessor, CLIPVisionModel, AutoModel, AutoProcessor

NOTE: Redirects are currently not supported in Windows or MacOs.


In [2]:
model_name = "openai/clip-vit-base-patch32"
model = AutoModel.from_pretrained(model_name)
processor = AutoProcessor.from_pretrained(model_name)

In [3]:
from PIL import Image
images = [Image.open("/Users/vladislavlialin/Downloads/top-text-bottpm.jpeg")]*2
captions = ["cap1", "not a a cat"]

inputs = processor(images=images, text=captions, return_tensors="pt", padding=True)

In [4]:
np.array(images[0]).shape

(600, 600, 3)

In [5]:
out = model(**inputs)
out.keys()

odict_keys(['logits_per_image', 'logits_per_text', 'text_embeds', 'image_embeds', 'text_model_output', 'vision_model_output'])

In [6]:
out.text_embeds.shape, out.image_embeds.shape

(torch.Size([2, 512]), torch.Size([2, 512]))

In [7]:
out.logits_per_image.to(device="cpu", dtype=torch.float32)

tensor([[23.2729, 19.9196],
        [23.2729, 19.9196]], grad_fn=<PermuteBackward0>)

In [8]:
import torch
from transformers import AutoModel, AutoProcessor


class ClipScore:
    def __init__(self, device="cpu", dtype=torch.float32):
        model_name = "openai/clip-vit-base-patch32"
        # "openai/clip-vit-large-patch14" vs "openai/clip-vit-base-patch32" ?
        self.dtype = dtype
        self.device = device
        self.model = AutoModel.from_pretrained(model_name)
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model.eval()

    @torch.no_grad()
    def compute(self, *, captions, images, batch_size=1):
        assert len(captions) == len(images)
        self.model.to(self.device, dtype=self.dtype)

        score_sum = torch.tensor(0, device=self.device, dtype=torch.float32)  # we want to keep the score in float32 to increase precision

        for i in range(0, len(captions), batch_size):
            batch_captions = captions[i : i + batch_size]
            batch_images = images[i : i + batch_size]

            inputs = self.processor(images=batch_images, text=batch_captions, return_tensors="pt", padding=True).to(self.device)
            inputs["pixel_values"] = inputs["pixel_values"].to(self.dtype)

            model_outputs = self.model(**inputs)

            text_features = model_outputs.text_embeds
            image_features = model_outputs.image_embeds

            # compute cosine similarity between image and text features
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

            score_sum += (text_features * image_features).sum().float()

        self.model.to(torch.device("cpu"), dtype=torch.float32)

        return score_sum / len(captions)


In [9]:
score = ClipScore()

score.compute(captions=captions, images=images, batch_size=2)

tensor(0.2160)

In [10]:
score = ClipScore()

score.compute(captions=captions, images=[np.array(i) for i in images], batch_size=2)

tensor(0.2160)

In [14]:
np.array([np.array(i) for i in images]).shape

(2, 600, 600, 3)

In [15]:
score = ClipScore()

score.compute(captions=captions, images=[torch.tensor(np.array(i)) for i in images], batch_size=2)

tensor(0.2160)

In [23]:
# score = ClipScore()

image_tensor = torch.tensor(np.stack([np.array(i) for i in images], axis=0))  # [2, 600, 600, 3]
image_tensor = image_tensor.permute(0, 3, 1, 2)  # [2, 3, 600, 600]
score.compute(captions=captions, images=image_tensor, batch_size=2)

TypeError: Cannot handle this data type: (1, 1, 600, 600), |u1

In [21]:
image_tensor.shape

torch.Size([2, 600, 600, 3])

In [24]:
model_name = "openai/clip-vit-base-patch32"
processor = AutoProcessor.from_pretrained(model_name)

In [26]:
?processor

[0;31mSignature:[0m      [0mprocessor[0m[0;34m([0m[0mtext[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mimages[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0mreturn_tensors[0m[0;34m=[0m[0;32mNone[0m[0;34m,[0m [0;34m**[0m[0mkwargs[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mType:[0m           CLIPProcessor
[0;31mString form:[0m   
CLIPProcessor:
           - feature_extractor: CLIPFeatureExtractor {
           "crop_size": 224,
           "do_center_crop" <...> , rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'})
[0;31mFile:[0m           ~/miniconda3/lib/python3.8/site-packages/transformers/models/clip/processing_clip.py
[0;31mDocstring:[0m     
Constructs a CLIP processor which wraps a CLIP feature extractor and a CLIP tokenizer into a single processor.

[`CLIPProcessor`] offers all the functionalities of [`CLIPFeatureExtractor`] and [`CLIPTokenizerFast`]. See the
[`~CLIPProcessor.__call__`] and [`~CLIPProcessor

In [25]:
processor(images=image_tensor, text=captions, return_tensors="pt", padding=True)

TypeError: Cannot handle this data type: (1, 1, 600, 600), |u1