# Base Neural Network
How does CLIP compare to other Neural Network Models for Image Classification?

In [1]:
import os

In [2]:
image_dir = "data/"
all_images = [x for x in os.listdir(image_dir) if x.endswith('.jpg')]
categories = set([x[:-8] for x in all_images])
category2images = {category : [] for category in categories}
for img in all_images:
    category2images[img[:-8]].append(img)

In [3]:
all_images

['bagel000.jpg',
 'bagel001.jpg',
 'bagel002.jpg',
 'bagel003.jpg',
 'bagel004.jpg',
 'bagel005.jpg',
 'bagel006.jpg',
 'bagel007.jpg',
 'bagel008.jpg',
 'bagel009.jpg',
 'bagel010.jpg',
 'bagel011.jpg',
 'bagel012.jpg',
 'bagel013.jpg',
 'bagel014.jpg',
 'bagel015.jpg',
 'bagel016.jpg',
 'bagel017.jpg',
 'bagel018.jpg',
 'bagel019.jpg',
 'bagel020.jpg',
 'bagel021.jpg',
 'bagel022.jpg',
 'bagel023.jpg',
 'bagel024.jpg',
 'bagel025.jpg',
 'bagel026.jpg',
 'bagel027.jpg',
 'bagel028.jpg',
 'bagel029.jpg',
 'bagel030.jpg',
 'bagel031.jpg',
 'bagel032.jpg',
 'bagel033.jpg',
 'bagel034.jpg',
 'bagel035.jpg',
 'bagel036.jpg',
 'bagel037.jpg',
 'bagel038.jpg',
 'bagel039.jpg',
 'bagel040.jpg',
 'bagel041.jpg',
 'bagel042.jpg',
 'bagel043.jpg',
 'bagel044.jpg',
 'bagel045.jpg',
 'bagel046.jpg',
 'bagel047.jpg',
 'bagel048.jpg',
 'bagel049.jpg',
 'bagel050.jpg',
 'bagel051.jpg',
 'bagel052.jpg',
 'bagel053.jpg',
 'bagel054.jpg',
 'bagel055.jpg',
 'bagel056.jpg',
 'bagel057.jpg',
 'bagel058.jpg

In [4]:
import requests
import torch
from PIL import Image
import matplotlib.pyplot as plt
from io import BytesIO
import numpy as np
from tqdm import tqdm
import pandas as pd
import os

In [5]:
# !pip install --upgrade diffusers[torch]
# !pip install transformers

Collecting diffusers[torch]
  Using cached diffusers-0.36.0-py3-none-any.whl.metadata (20 kB)
Collecting importlib_metadata (from diffusers[torch])
  Downloading importlib_metadata-8.7.0-py3-none-any.whl.metadata (4.8 kB)
Collecting huggingface-hub<2.0,>=0.34.0 (from diffusers[torch])
  Using cached huggingface_hub-1.2.1-py3-none-any.whl.metadata (13 kB)
Collecting regex!=2019.12.17 (from diffusers[torch])
  Downloading regex-2025.11.3-cp312-cp312-win_amd64.whl.metadata (41 kB)
Collecting safetensors>=0.3.1 (from diffusers[torch])
  Using cached safetensors-0.7.0-cp38-abi3-win_amd64.whl.metadata (4.2 kB)
Collecting accelerate>=0.31.0 (from diffusers[torch])
  Using cached accelerate-1.12.0-py3-none-any.whl.metadata (19 kB)
Collecting hf-xet<2.0.0,>=1.2.0 (from huggingface-hub<2.0,>=0.34.0->diffusers[torch])
  Using cached hf_xet-1.2.0-cp37-abi3-win_amd64.whl.metadata (5.0 kB)
Collecting shellingham (from huggingface-hub<2.0,>=0.34.0->diffusers[torch])
  Downloading shellingham-1.5.4-py

In [6]:
from diffusers import StableUnCLIPImg2ImgPipeline
from transformers import CLIPTextModelWithProjection, CLIPTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
# ──────────────────────────────────────────────────────────────
# 1.  Load unCLIP – vision side only (projection_dim = 1024)   ─
# ──────────────────────────────────────────────────────────────
pipe = StableUnCLIPImg2ImgPipeline.from_pretrained(
    "sd2-community/stable-diffusion-2-1-unclip",
    torch_dtype=torch.float32,
).to(device)

vision_encoder = pipe.image_encoder # keep as-is (1024-d)

Loading pipeline components...: 100%|████████████████████████████████████████████████████| 9/9 [00:01<00:00,  7.81it/s]


In [10]:
# ──────────────────────────────────────────────────────────────
# 2.  Swap in an OpenCLIP ViT-H/14 text branch (also 1024-d)  ─
# ──────────────────────────────────────────────────────────────
openclip_repo = "laion/CLIP-ViT-H-14-laion2B-s32B-b79K"     # projection_dim = 1024 :contentReference[oaicite:0]{index=0}
tokenizer = CLIPTokenizer.from_pretrained(openclip_repo)
text_encoder = CLIPTextModelWithProjection.from_pretrained(
    openclip_repo,
    torch_dtype=torch.float16
).to(device)

# optional: stuff them into the pipe so `pipe.tokenizer` etc. work
pipe.tokenizer, pipe.text_encoder = tokenizer, text_encoder

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [11]:
# ──────────────────────────────────────────────────────────────
# 3. Helpers                                              
# ──────────────────────────────────────────────────────────────
def embed_images(paths, batch_size=8):
    """Return (N,1024) image embeddings"""
    out, fe, enc = [], pipe.feature_extractor, pipe.image_encoder
    for i in range(0, len(paths), batch_size):
        imgs = [Image.open(p).convert("RGB") for p in paths[i:i + batch_size]]
        px   = fe(imgs, return_tensors="pt").pixel_values.to(enc.device, enc.dtype)
        with torch.no_grad():
            v = enc(px)[0]                              # (B,1024)
        out.append(v)
    return torch.cat(out)  # (N,1024)

def embed_texts(prompts, batch_size=64):
    """Return (N,1024) text embeddings"""
    vecs = []
    for i in range(0, len(prompts), batch_size):
        toks = tokenizer(prompts[i:i + batch_size],
                         padding=True, truncation=True, max_length=77,
                         return_tensors="pt").to(text_encoder.device)
        with torch.no_grad():
            t = text_encoder(**toks).text_embeds        # (B,1024)
        vecs.append(t)
    return torch.cat(vecs)  # (N,1024)

In [12]:
# ──────────────────────────────────────────────────────────────
# 4.  Sanity check on image-text similarity                                 
# ──────────────────────────────────────────────────────────────
img_vec = embed_images(["./data/mango_03s.jpg"])
txt_vec = embed_texts(["mango"])
print("cosine(mango image, \"mango\") →",
      (torch.nn.functional.normalize(img_vec, dim=-1) @ torch.nn.functional.normalize(txt_vec, dim=-1).T).item())        
# expect ≳ 0.3

FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\charl\\COGS118B\\COGS118B-final-project\\THINGS_images\\mango_03s.jpg'

In [None]:
# ──────────────────────────────────────────────────────────────
# 5.  Sanity check on image-image similarity                                 
# ──────────────────────────────────────────────────────────────
img_vec = embed_images(["./THINGS_images/mango_03s.jpg",
                       "./THINGS_images/cat_01b.jpg",
                       "./THINGS_images/dog_01b.jpg"])
img_vec2 = embed_images(["./THINGS_images/mango_01b.jpg",
                       "./THINGS_images/cat_04s.jpg",
                       "./THINGS_images/dog_06s.jpg"])

sims = torch.nn.functional.normalize(img_vec, dim=-1) @ torch.nn.functional.normalize(img_vec2, dim=-1).T 
sims = sims.detach().cpu().numpy()

In [None]:
fig, ax = plt.subplots()

# Display the data as an image (heatmap)
im = ax.imshow(sims, cmap='viridis')

# Loop over the data and place text annotations
for i in range(sims.shape[0]):
    for j in range(sims.shape[1]):
        ax.text(j, i, sims[i, j], ha='center', va='center', color='black')

# Add a colorbar for reference
plt.colorbar(im)
plt.yticks([0, 1, 2], ['mango1', 'cat1', 'dog1'])
plt.xticks([0, 1, 2], ['mango2', 'cat2', 'dog2'])

# Set title and display the plot
ax.set_title('Pairwise similarities between 6 different images')
plt.show()