## Precompute CLIP embeddings on the THINGS dataset

In [1]:
import torch
import open_clip
from PIL import Image
from torchvision import transforms
import os
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


### CLIP

In [2]:
MODEL_NAME = "ViT-L-14"
PRETRAINED_DATASET = "openai"  # Use "laion2b_s32b_b82k" for OpenCLIP versions

device = "cuda" if torch.cuda.is_available() else "cpu"

# create_model_and_transforms returns (model, preprocess_train, preprocess_val).
# Embeddings must be computed with the deterministic *validation* transform;
# the training transform applies random crops, so it would make the stored
# embeddings differ from run to run.
model, _, preprocess = open_clip.create_model_and_transforms(MODEL_NAME, pretrained=PRETRAINED_DATASET)

# The text tokenizer is not part of the tuple above; it is built separately.
tokenizer = open_clip.get_tokenizer(MODEL_NAME)

model.to(device)
# Inference only: switch off dropout / batch-norm updates. Left as the last
# expression so the notebook still displays the model architecture.
model.eval()



CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-23): 24 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1024, out_features=1024, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1024, out_features=4096, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=4096, out_features=1024, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1024,), eps=1e-05, elementwi

### DATA
The `_image_database_things` image archive can be downloaded from the OSF page of the THINGS dataset: https://osf.io/jum2f/
It is around 4.7GB compressed and 5.1GB uncompressed.

In [19]:
# Root folder that is walked recursively for .jpg/.jpeg/.png images.
IMAGE_DIR = "./_image_database_things/object_images"
# Destination of the torch.save() call that stores the embeddings dictionary.
OUTPUT_FILE = "reduced_things_clip_embeddings.pt"

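### PROCESS ALL IMAGES

Note: as written, the cell below embeds only one image per category folder, which is why the output file is named `reduced_...`. Lift that limit to embed the full dataset.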

In [20]:
# Embed the images under IMAGE_DIR with the CLIP image encoder and save a
# {"<folder>/<filename>": tensor of shape (1, embedding_dim)} dictionary.
batch_size = 32
# Maximum number of images embedded per folder. 1 produces the "reduced" file;
# set to None to embed every image.
max_images_per_category = 1
image_extensions = (".jpg", ".jpeg", ".png")
embeddings = {}

# Materialise the walk first so tqdm gets an exact total (os.walk also yields
# IMAGE_DIR itself, which len(os.listdir(IMAGE_DIR)) does not count). Sorting
# makes the traversal -- and therefore which images the cap keeps --
# reproducible, since os.walk order is otherwise filesystem-dependent.
walk_entries = []
for root, dirs, files in os.walk(IMAGE_DIR):
    dirs.sort()  # in-place sort steers the order in which os.walk descends
    walk_entries.append((root, sorted(files)))

for root, files in tqdm(walk_entries, desc="Processing categories"):
    # The containing folder name is the first half of every key.
    category = os.path.basename(os.path.normpath(root))
    image_names = [name for name in files if name.lower().endswith(image_extensions)]
    image_names = image_names[:max_images_per_category]  # [:None] keeps all

    # process batches
    for start in range(0, len(image_names), batch_size):
        batch_names = image_names[start:start + batch_size]

        # Track which files actually loaded so embeddings stay aligned with
        # their keys even when some images in the batch are unreadable.
        images = []
        loaded_names = []
        for name in batch_names:
            img_path = os.path.join(root, name)
            try:
                with Image.open(img_path) as image:
                    images.append(preprocess(image.convert("RGB")))
                loaded_names.append(name)
            except Exception as e:
                # Best effort: report the bad file and carry on with the rest.
                print(f"Error processing {img_path}: {e}")

        if not images:
            # Every image in this batch failed; torch.stack([]) would raise.
            continue

        with torch.no_grad():
            batch_embeddings = model.encode_image(torch.stack(images).to(device)).cpu()

        for name, embedding in zip(loaded_names, batch_embeddings):
            embeddings[f"{category}/{name}"] = embedding.unsqueeze(0)

print(f"There are {len(embeddings)} embeddings")
torch.save(embeddings, OUTPUT_FILE)
print(f"Saved {len(embeddings)} embeddings to {OUTPUT_FILE}")
print(f"The file is {os.path.getsize(OUTPUT_FILE) / 1024 / 1024:.2f} MB")

Processing categories: 1855it [00:31, 58.22it/s]                          

There are 1854 embeddings
Saved 1854 embeddings to reduced_things_clip_embeddings.pt
The file is 6.01 MB





### VERIFYING EMBEDDINGS

In [17]:
# Reload the saved dictionary to confirm it round-trips. map_location="cpu"
# keeps this cell usable on machines without a GPU.
embeddings = torch.load(OUTPUT_FILE, map_location="cpu")
assert embeddings, f"No embeddings found in {OUTPUT_FILE}"

sample_keys = list(embeddings.keys())[:10]
print(sample_keys)
# Inspect the first stored key rather than a hard-coded filename: which images
# end up in the file depends on how the embedding cell was configured.
print("Embeddings have a shape of", embeddings[sample_keys[0]].shape)

['photo_booth/photo_booth_09s.jpg', 'photo_booth/photo_booth_02s.jpg', 'photo_booth/photo_booth_12s.jpg', 'photo_booth/photo_booth_11s.jpg', 'photo_booth/photo_booth_03s.jpg', 'photo_booth/photo_booth_13s.jpg', 'photo_booth/photo_booth_05s.jpg', 'photo_booth/photo_booth_06s.jpg', 'photo_booth/photo_booth_10s.jpg', 'photo_booth/photo_booth_15s.jpg']
Embeddings have a shape of torch.Size([1, 768])
