<a href="https://colab.research.google.com/github/Himkeshtak/VLM-OpenCV-Course/blob/main/Embeddings_VLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from transformers import CLIPTokenizer  #
from transformers import CLIPProcessor
from transformers import CLIPModel

import torch
import torch.nn.functional as F
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from PIL import Image
import requests
from io import BytesIO




Load model

In [None]:
# Hugging Face Hub identifier of the CLIP checkpoint used by this notebook
model_name = "openai/clip-vit-base-patch32"

# Load the pretrained CLIP model for that checkpoint from the Hugging Face Hub
model = CLIPModel.from_pretrained(model_name)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

Tokenize Text Strings

In [None]:
# Build the tokenizer that belongs to the CLIP checkpoint named by `model_name`.
tokenizer = CLIPTokenizer.from_pretrained(model_name)

# Text prompts to embed.
text = ["a donut", "a cookie", "an airplane", "a cat"]

# Encode all prompts as one batch; padding=True pads shorter prompts so every
# sequence has the same length, and return_tensors="pt" yields PyTorch tensors.
inputs = tokenizer(text, padding=True, return_tensors="pt")

# Pull the token-ID tensor out of the encoded batch.
input_ids = inputs["input_ids"]

# Show the tensor's shape, then the raw token IDs.
print("Input IDs (shape):", input_ids.shape)
print(input_ids)

Input IDs (shape): torch.Size([4, 4])
tensor([[49406,   320, 18471, 49407],
        [49406,   320,  9367, 49407],
        [49406,   550, 16451, 49407],
        [49406,   320,  2368, 49407]])


Find Text Embeddings
The CLIP model takes in the tokenized strings and returns a 512-dimensional embedding for each string.

In [None]:
# Compute the text embeddings without tracking gradients (inference mode).
# torch.no_grad must be *called* to create the context manager; using the bare
# class in a `with` statement raises an error before the model is ever run.
with torch.no_grad():
  # Obtain the text embeddings (feature vectors) from the CLIP model
  text_embeddings = model.get_text_features(**inputs)

# Print the shape of the resulting text embeddings tensor:
# [number_of_texts, embedding_dimension]
print(text_embeddings.shape)


6. Calculate Cosine Similarity
Why did the two vectors start dating after math class?

Because when they calculated their cosine similarity, they realized they were practically aligned—it was love at first dot-product!

Cosine similarity measures how similar (aligned) two vectors are by looking only at the angle between them, not their length.

Imagine each vector as an arrow from the origin.
The cosine of the angle $\theta$ between the arrows is

$$
\text{cosine\_sim}(a, b) = \cos(\theta) = \frac{a \cdot b}{\lVert a \rVert \, \lVert b \rVert}
$$


A value of +1 means the arrows point in exactly the same direction (perfect similarity), 0 means they’re orthogonal (no similarity), and –1 means they point in opposite directions (complete dissimilarity).
Because it ignores magnitude, cosine similarity is ideal for comparing text or image embeddings where direction captures meaning and length may just scale with word count or pixel intensity.

The calculated cosine similarities between the strings are displayed as a color-coded matrix.


In [1]:
# text_embeddings is expected to have shape [n, d]:
#   n = number of text prompts, d = embedding dimension.
#
# Pairwise cosine similarity via broadcasting:
#   unsqueeze(1) -> [n, 1, d]
#   unsqueeze(0) -> [1, n, d]
# Reducing over the last axis (d) produces the n x n similarity matrix, which
# is then moved to the CPU and converted to a NumPy array for plotting.
cosine_similarity = (
    F.cosine_similarity(
        text_embeddings.unsqueeze(1),
        text_embeddings.unsqueeze(0),
        dim=-1,
    )
    .cpu()
    .numpy()
)

# One 6x4-inch figure holding a single axes for the heatmap.
fig, ax = plt.subplots(figsize=(6, 4))

# Draw the similarity matrix: every cell is annotated with its value rounded
# to two decimals, and both axes are ticked with the original prompts.
sns.heatmap(
    cosine_similarity,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    xticklabels=text,
    yticklabels=text,
    ax=ax,
)

# Title (font size 14) and axis captions.
ax.set_title("Cosine Similarity Matrix", fontsize=14)
ax.set_xlabel("Text Embeddings")
ax.set_ylabel("Text Embeddings")

# Render the figure.
plt.show()

NameError: name 'F' is not defined

Plot Images

In [None]:
#Utility for displaying images with labels
def plot_images(images, labels):
  """Display images side by side in one row, each titled with its label.

  Args:
    images: Sequence of images that ``Axes.imshow`` can draw (e.g. PIL images).
    labels: Sequence of titles, paired with ``images`` by position.

  Images and labels are paired with ``zip``, so surplus entries in the longer
  sequence are ignored. When ``images`` is empty a short message is printed
  and nothing is drawn.
  """
  n = len(images)  # Number of images to show
  if n == 0:
    # plt.subplots cannot create a grid with zero columns.
    print("No images to display.")
    return

  # squeeze=False always returns a 2-D array of axes; without it a single
  # image yields a bare Axes object that cannot be iterated over.
  fig, axes = plt.subplots(1, n, squeeze=False)

  # Walk the single row of axes together with each image and its label
  for ax, img, lbl in zip(axes[0], images, labels):
    ax.imshow(img)
    ax.set_title(lbl)
    ax.axis("off")

  plt.tight_layout()  # Adjust layout to prevent overlap
  plt.show()          # Show the images plot

Load and Display Images

In [2]:
# CLIP processor for the same checkpoint as the model; it is used later to
# preprocess the images before they are embedded.
processor = CLIPProcessor.from_pretrained(model_name)

# Label -> image URL lookup table.
image_urls = {
    "a donut": "https://learnopencv.com/wp-content/uploads/2025/03/donut.jpeg",
    "a cookie": "https://learnopencv.com/wp-content/uploads/2025/03/cookie.jpeg",
    "an airplane": "https://learnopencv.com/wp-content/uploads/2025/03/airplane.jpeg",
    "a cat": "https://learnopencv.com/wp-content/uploads/2025/03/cat.jpeg",
}

# The labels are the dictionary keys, in insertion order.
labels = list(image_urls)

# Define a robust function to load images from URLs
def load_image(url, timeout=30):
    """Download an image from ``url`` and return it as an RGB PIL image.

    Args:
        url: Address of the image to download.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The downloaded image, converted to RGB mode.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an error status.
    """
    headers = {'User-Agent': 'Mozilla/5.0'}        # Set headers to avoid blocking by web servers
    response = requests.get(url, headers=headers, timeout=timeout)  # Request the image from the URL
    response.raise_for_status()                    # Raise an error if the download fails
    # Open the downloaded image, convert it to RGB format, and return the PIL Image
    return Image.open(BytesIO(response.content)).convert("RGB")

# Collect the successfully loaded images together with their labels so the two
# lists always stay aligned, even when a download fails.
images = []
loaded_labels = []

# Loop through each label to load the associated image
for label in labels:
    try:
        img = load_image(image_urls[label])  # Load image from URL
    except requests.exceptions.RequestException as e:
        # If an image fails to load, report it and skip both image and label
        print(f"Failed to load {label}: {e}")
        continue
    images.append(img)
    loaded_labels.append(label)

# From here on `labels` names only the images that were actually loaded, so
# plot titles and heatmap ticks match the entries of `images` one-to-one.
labels = loaded_labels

# Display the loaded images in a single row using matplotlib
if images:
    plot_images(images, labels)
else:
    print("No images could be loaded; nothing to display.")


NameError: name 'CLIPProcessor' is not defined

9. Calculate Image Embeddings & Display Similarity

In [None]:
# Run the loaded images through the CLIP processor to get model-ready tensors.
image_inputs = processor(images=images, return_tensors="pt")

# Embed the images without tracking gradients.
with torch.no_grad():
  image_embeddings = model.get_image_features(**image_inputs)

# Shape is [number_of_images, embedding_dimension].
print(image_embeddings.shape)

# Pairwise cosine similarity between all image embeddings, obtained by
# broadcasting an [n, 1, d] view against a [1, n, d] view and reducing over d.
img_similarity = (
    F.cosine_similarity(
        image_embeddings.unsqueeze(1),
        image_embeddings.unsqueeze(0),
        dim=-1,
    )
    .cpu()
    .numpy()
)

# Heatmap of the image-image similarity matrix, ticked with the image labels.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    img_similarity,
    annot=True,
    xticklabels=labels,
    yticklabels=labels,
    cmap="coolwarm",
    ax=ax,
)
ax.set_xlabel("Image Embeddings")
ax.set_ylabel("Image Embeddings")
ax.set_title("CLIP Image-Image Similarity Heatmap")
plt.show()

10. Calculate Image-Text Similarity

In [None]:
# Compute the text-image similarity matrix.
# Broadcasting [n_text, 1, d] against [1, n_image, d] gives a matrix of shape
# [n_text, n_image]: entry (i, j) compares text prompt i with image j.
txt_image_similarity = F.cosine_similarity(text_embeddings[:, None, :], image_embeddings[None, :, :], dim=2).cpu().numpy()

# Plot heatmap.
# Heatmap rows run along the y-axis and columns along the x-axis, so the
# y-axis carries the text prompts and the x-axis carries the images.
plt.figure(figsize=(8, 6))
sns.heatmap(txt_image_similarity, annot=True, xticklabels=labels, yticklabels=text, cmap="coolwarm")
plt.xlabel("Image Embeddings")
plt.ylabel("Text Embeddings")
plt.title("CLIP Image-Text Similarity Heatmap")
plt.show()

11. Conclusion
Before you leave this notebook make sure you understand the following concepts.

Tokenization
Text embedding
Image embedding
Cosine similarity
That's all!