In [4]:
import torch
import clip
from transformers import CLIPProcessor, CLIPModel
import time
import numpy as np
from PIL import Image
import os.path as path

# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
# NOTE(review): none of the cells shown here move the model or its inputs
# onto `device` - confirm whether that is intended.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
device

'cuda'

In [5]:
# Load the pretrained CLIP checkpoint. The processor prepares text/image inputs
# and the model turns them into embeddings (see the embedding cells below).
model_id = "openai/clip-vit-base-patch32"
processor, model = (
    loader.from_pretrained(model_id) for loader in (CLIPProcessor, CLIPModel)
)

In [7]:
# Pick the test image. Its numeric id is the last "_"-separated field of the
# file name once the extension has been stripped.
image_name_test = "abstract_v002_val2015_000000020000.png"
image_id_test = int(path.splitext(image_name_test)[0].rsplit("_", 1)[-1])

# Images are read from a folder relative to the notebook's working directory.
dataset_path = "Images"
image = Image.open(path.join(dataset_path, image_name_test))
image.show()

In [8]:
# Look up the multiple-choice question attached to the selected image.
import json

# The context manager closes the file even when parsing raises; JSON is UTF-8,
# so the encoding is stated explicitly instead of relying on the platform default.
with open('Annotations/MultipleChoice_abstract_v002_val2015_questions.json', encoding="utf-8") as f:
    data = json.load(f)

# Use the first question whose image_id matches the selected image.
for q in data['questions']:
    image_id = q["image_id"]
    if image_id == image_id_test:
        multiple_choices = q["multiple_choices"]
        question = q["question"]
        break
else:
    # Fail loudly here rather than letting later cells run with undefined or
    # stale `question` / `multiple_choices` left over from a previous execution.
    raise LookupError(f"No question found for image_id {image_id_test}")

print("Question: ", question)
print("Multiple choices:", multiple_choices)

Question:  What color is the fire?
Multiple choices: ['white', 'red', 'yellow', 'red and orange', 'red orange', 'by bookshelf', '1', 'brown', 'blue', '4', 'yes', '3', 'orange', '2', 'no', 'foot rest', 'no cat', 'butt']


**Question embedding**

In [9]:
# Tokenise the question and embed it with the model's text encoder.
question_tokens = processor(text=question, images=None, return_tensors="pt", padding=True)
print(question_tokens)

# Inference only: no_grad skips building an autograd graph for the forward
# pass, so the result can be converted to NumPy directly.
with torch.no_grad():
    question_features = model.get_text_features(**question_tokens)

# A single question was passed in, so keep the first (only) row.
question_emb = question_features.cpu().numpy()[0]
print(len(question_emb))

{'input_ids': tensor([[49406,   768,  3140,   533,   518,  1769,   286, 49407]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1]])}


tensor([[49406,   768,  3140,   533,   518,  1769,   286, 49407,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0]], dtype=torch.int32)

**Image embedding**

In [178]:
# Preprocess the image into the pixel tensor the model expects.
img = processor(text=None, images=image, return_tensors="pt")["pixel_values"]

# Inference only: no_grad skips building an autograd graph for the forward
# pass. The tensor is passed by keyword so the call does not depend on
# positional parameter order.
with torch.no_grad():
    image_features = model.get_image_features(pixel_values=img)

# A single image was passed in, so keep the first (only) row.
image_emb = image_features.cpu().numpy()[0]
print(len(image_emb))

512


**Possible answers embedding**

In [179]:
# Tokenise every candidate answer as one padded batch.
multiple_choices_tokens = processor(text=multiple_choices, images=None, return_tensors="pt", padding=True)
print(multiple_choices_tokens["input_ids"].size())

# Inference only: no_grad skips building an autograd graph for the forward
# pass, so the result can be converted to NumPy directly.
with torch.no_grad():
    multiple_choices_features = model.get_text_features(**multiple_choices_tokens)

# One embedding row per candidate answer, in the same order as `multiple_choices`.
multiple_choices_emb = multiple_choices_features.cpu().numpy()
print(multiple_choices_emb)

torch.Size([18, 4])
[[ 0.11625315 -0.02649451 -0.03693745 ... -0.06846023 -0.2037456
   0.6314057 ]
 [ 0.13425508  0.0152376  -0.18170106 ... -0.06541064  0.1776331
   0.26445475]
 [-0.19172315 -0.12251318  0.12002026 ... -0.15095477 -0.43429744
  -0.38816124]
 ...
 [ 0.11757565  0.13806969  0.2414751  ... -0.48215938 -0.02207426
   0.45251638]
 [ 0.14271183 -0.19830242  0.14422156 ... -0.10659659 -0.3477292
   0.15352668]
 [ 0.04081094  0.27597252  0.35765576 ... -0.7412758   0.13738537
   0.42976072]]


**Comparing**

In [180]:
from sklearn.metrics.pairwise import cosine_similarity

# Fuse the image and question embeddings with an element-wise product and
# scale the result to unit length.
ComparableVector = image_emb * question_emb
ComparableVector = ComparableVector / np.linalg.norm(ComparableVector)

# Score every candidate answer against the fused vector in one call.
# cosine_similarity normalises its inputs itself, so the answer embeddings do
# not need a manual per-row normalisation (the earlier loop divided by norms
# that were already 1). It expects 2-D inputs, hence the reshape to one row;
# the result has shape (1, n_choices).
similarities = list(
    cosine_similarity(ComparableVector.reshape(1, -1), multiple_choices_emb)[0]
)
print(similarities)

# The predicted answer is the candidate with the highest cosine similarity.
argmax = np.argmax(similarities)
print(multiple_choices[argmax])

[0.29649082, 0.28999376, 0.3011317, 0.3811985, 0.3887719, 0.30011657, 0.3709505, 0.35827, 0.35228503, 0.3408517, 0.34658712, 0.3653583, 0.26324734, 0.2699174, 0.3405001, 0.29251245, 0.31902406, 0.36002368]
yes
