In [2]:
import torch
import clip
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

image = preprocess(Image.open("./data/CLIP.png")).unsqueeze(0).to(device)
text = clip.tokenize(["a diagram", "a dog", "a cat"]).to(device)

with torch.no_grad():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    
    logits_per_image, logits_per_text = model(image, text)
    probs = logits_per_image.softmax(dim=-1).cpu().numpy()

print("Label probs:", probs)  # prints: [[0.9927937  0.00421068 0.00299572]]

Label probs: [[0.9927   0.004185 0.002968]]


In [26]:
logits_per_image

tensor([[25.5625, 20.0938, 19.7500]], device='cuda:0', dtype=torch.float16)

In [27]:
logits_per_text

tensor([[25.5625],
        [20.0938],
        [19.7500]], device='cuda:0', dtype=torch.float16)

In [23]:
probs.sum()

1.0

In [9]:
image_features.size()

torch.Size([1, 512])

In [12]:
text_features.T.size()

torch.Size([512, 3])

In [39]:
import numpy as np

In [41]:
out = torch.matmul(image_features/image_features.norm(dim=1, keepdim=True),((text_features/text_features.norm(dim=1, keepdim=True)).T))*(np.log(1/0.07))

In [42]:
out

tensor([[0.6797, 0.5342, 0.5254]], device='cuda:0', dtype=torch.float16)

In [32]:
out/(1**(0.5))

tensor([[31.6250, 23.8125, 22.6719]], device='cuda:0', dtype=torch.float16)

In [25]:
torch.softmax(out,dim=-1)

tensor([[9.9951e-01, 4.0436e-04, 1.2922e-04]], device='cuda:0',
       dtype=torch.float16)