In [1]:
!pip install ftfy regex tqdm
!pip install git+https://github.com/openai/CLIP.git

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-feps4jcf
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-feps4jcf
  Resolved https://github.com/openai/CLIP.git to commit a9b1bf5920416aaeaec965c25dd9e8f98c864f16
  Preparing metadata (setup.py) ... [?25l[?25hdone


In [2]:
import numpy as np
import torch
from pkg_resources import packaging
import clip
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Torch version:", torch.__version__)

# Load the RN50 CLIP model straight onto `device`. The previous unconditional
# model.cuda() raised on CPU-only machines even though `device` already
# accounted for that case.
model, preprocess = clip.load("RN50", device=device)
model.eval()  # inference only: put the model in evaluation mode

input_resolution = model.visual.input_resolution
context_length = model.context_length
vocab_size = model.vocab_size

print("Model parameters:", f"{sum(p.numel() for p in model.parameters()):,}")
print("Input resolution:", input_resolution)
print("Context length:", context_length)
print("Vocab size:", vocab_size)

# Last expression of the cell: display the image preprocessing pipeline.
preprocess

Torch version: 2.0.1+cu118
Model parameters: 102,007,137
Input resolution: 224
Context length: 77
Vocab size: 49408


Compose(
    Resize(size=224, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    <function _convert_image_to_rgb at 0x7f35eab50ee0>
    ToTensor()
    Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
)

In [7]:
import pandas as pd
import os
import gdown
import tarfile
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader

class RefCOCOg(Dataset):
  """RefCOCOg annotations paired with CLIP features.

  On construction the dataset archive is downloaded (via gdown) and extracted
  into ``data_dir`` if it is not already there, and the annotation table is
  filtered down to the requested split. Indexing an item returns the
  element-wise product of the CLIP image embedding and the CLIP text
  embeddings of that item's sentences, divided by the norm of the product.

  The class relies on the notebook-level globals ``model``, ``preprocess``,
  ``device`` and the ``clip`` module.
  """

  # Google Drive file id handed to gdown.
  FILE_ID = '1wyyksgdLwnRMC9pQ-vjJnNUn47nWhyMD'
  # File name of the downloaded archive inside data_dir.
  ARCHIVE_NAME = 'refcocog.tar.gz'
  # Directory (inside data_dir) that the archive is expected to extract to.
  NAME = 'refcocog'
  # Pickled annotation file, relative to the extracted directory.
  ANNOTATIONS = 'annotations/refs(umd).p'
  # Image directory, relative to the extracted directory.
  IMAGES = 'images'
  # Image file name template; the placeholder receives the zero-padded image id.
  IMAGE_NAME = 'COCO_train2014_{}.jpg'

  def __init__(self, data_dir, split, transform=None):
      """Prepare the dataset on disk and load the annotations of one split.

      Args:
          data_dir: Directory that holds (or will hold) the archive and the
              extracted dataset.
          split: Value of the annotation ``split`` column to keep
              (e.g. ``'val'``).
          transform: Optional callable applied to the PIL image before it is
              encoded. When ``None`` the global CLIP ``preprocess`` is used.
      """
      self.data_dir = data_dir
      self.split = split
      self.transform = transform
      self._check_dataset()
      self._filter_annotation(os.path.join(self.data_dir, self.NAME, self.ANNOTATIONS))

  def _check_dataset(self):
      """Make sure the extracted dataset exists, downloading/extracting if needed."""
      dataset_dir = os.path.join(self.data_dir, self.NAME)
      if os.path.exists(dataset_dir):
          # Nothing to do; in particular do not re-download the archive.
          print('Dataset already extracted')
          return

      archive_path = os.path.join(self.data_dir, self.ARCHIVE_NAME)
      if not os.path.exists(archive_path):
          # makedirs also handles nested, not-yet-existing data directories.
          os.makedirs(self.data_dir, exist_ok=True)
          print('Downloading dataset...')
          # Write the archive exactly where the extraction step looks for it;
          # without `output` gdown saves into the current working directory.
          gdown.download(id=self.FILE_ID, output=archive_path)

      print('Extracting dataset...')
      with tarfile.open(archive_path, 'r:gz') as tar:
          # NOTE(review): the archive is downloaded content; extractall trusts
          # the member paths it contains. Consider an extraction filter on
          # Python versions that support one.
          tar.extractall(path=self.data_dir)

  def __len__(self):
      """Return the number of annotation rows in the selected split."""
      return len(self.annotation)

  def __getitem__(self, idx):
      """Return the fused CLIP features for the ``idx``-th annotation row."""
      # Positional lookup: the filtered frame keeps its original index labels.
      raw = self.annotation.iloc[idx]
      image = self._get_image(raw)
      sentences = self._get_sentences(raw)
      return self._get_vectors(image, sentences)

  def _get_image(self, raw):
      """Open the image referenced by an annotation row and return it as a PIL image."""
      # File names carry the image id zero-padded to 12 digits.
      image_id = str(raw['image_id']).zfill(12)
      image_path = os.path.join(self.data_dir, self.NAME, self.IMAGES, self.IMAGE_NAME.format(image_id))
      return Image.open(image_path)

  def _get_sentences(self, raw):
      """Return the list of raw sentence strings of an annotation row."""
      return [sentence['raw'] for sentence in raw['sentences']]

  def _filter_annotation(self, path):
      """Load the pickled annotations at ``path`` and keep only rows of ``self.split``."""
      # NOTE(review): unpickling can execute arbitrary code. This file comes
      # from the downloaded archive, so only use archives from a trusted source.
      annotation = pd.DataFrame(pd.read_pickle(path))
      self.annotation = annotation[annotation['split'] == self.split]

  def _get_vectors(self, image, sentences):
      """Encode an image and its sentences with CLIP and fuse the embeddings.

      Args:
          image: PIL image as returned by ``_get_image``.
          sentences: List of strings as returned by ``_get_sentences``.

      Returns:
          A CPU tensor with one row per sentence: the image embedding multiplied
          element-wise with each text embedding, divided by the norm of the
          whole product.
      """
      # TODO: check multiple sentences
      # Honour the transform given to the constructor; fall back to CLIP's own
      # preprocessing pipeline when none was supplied.
      transform = self.transform if self.transform is not None else preprocess
      image = transform(image).unsqueeze(0).to(device)  # add a batch dimension of 1
      text = clip.tokenize(sentences).to(device)
      with torch.no_grad():
          image_features = model.encode_image(image)
          text_features = model.encode_text(text)
      print(f"Image shape: {image_features.shape}, Text shape:{text_features.shape}")
      # The single image row broadcasts against every sentence row.
      product = np.multiply(image_features.cpu(), text_features.cpu())
      # NOTE(review): the norm is taken over the whole product (all sentences
      # together), not per sentence, and the number of output rows varies with
      # the number of sentences - confirm both are intended before batching
      # items with a DataLoader.
      out = product / np.linalg.norm(product)
      print(f"Output shape: {out.shape}")
      return out

In [8]:
# Smoke test: build the validation split from the current directory and print
# the fused feature tensor of the first five items.
dataset = RefCOCOg(data_dir='.', split='val')
for i in range(5):
  x = dataset[i]
  print(x)

Dataset already extracted
Image shape: torch.Size([1, 1024]), Text shape:torch.Size([2, 1024])
Output shape: torch.Size([2, 1024])
tensor([[ 2.9540e-04,  8.5831e-04,  2.9221e-03,  ..., -2.7275e-04,
         -5.6171e-04, -1.9217e-03],
        [-4.7755e-04,  1.4048e-03,  2.5578e-03,  ..., -8.1658e-05,
         -1.2970e-04, -3.8681e-03]], dtype=torch.float16)
Image shape: torch.Size([1, 1024]), Text shape:torch.Size([2, 1024])
Output shape: torch.Size([2, 1024])
tensor([[-0.0016, -0.0002, -0.0037,  ..., -0.0004, -0.0003, -0.0104],
        [-0.0014, -0.0003, -0.0053,  ...,  0.0002, -0.0002,  0.0007]],
       dtype=torch.float16)
Image shape: torch.Size([1, 1024]), Text shape:torch.Size([2, 1024])
Output shape: torch.Size([2, 1024])
tensor([[-6.6996e-04,  1.7273e-04,  2.8372e-04,  ..., -2.2161e-04,
          8.2541e-04, -2.2030e-03],
        [-8.0252e-04, -3.8028e-04, -1.6422e-03,  ...,  1.4365e-05,
          2.4033e-03,  4.4322e-04]], dtype=torch.float16)
Image shape: torch.Size([1, 1024])