# Image Embedding Example

- [Image Similarity with Hugging Face Datasets and Transformers](https://huggingface.co/blog/image-similarity)

In [1]:
!pip install transformers requests
!pip install datasets torch
!pip install "pybind11>=2.12"
# !pip uninstall numpy -y
!pip install "numpy<2"
!pip install --upgrade jupyter ipywidgets

[0m

In [2]:
from transformers import AutoImageProcessor, AutoModel

model_ckpt = 'google/vit-base-patch16-224'
processor = AutoImageProcessor.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)
# model.eval()  # Set the model to evaluation mode

2024-07-09 15:49:37.218024: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-09 15:49:37.474944: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
from transformers import AutoImageProcessor, AutoModel
import torch
from PIL import Image
from torch.nn.functional import cosine_similarity
def preprocess_image(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt")
    return inputs

# Example usage
image_path1 = "buddy_face.jpg" #"buddy.jpg"
image_path2 = "IMG_6529_face.jpg"
inputs1 = preprocess_image(image_path1)
inputs2 = preprocess_image(image_path2)

with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

# The embeddings are typically taken from the last hidden state of the model
embeddings1 = outputs1.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding
embeddings2 = outputs2.last_hidden_state[:, 0, :]  # Extract [CLS] token embedding

# Get the embedding length (dimensionality)
embedding_length = embeddings1.shape[1]
print(f"Embedding length: {embedding_length}")

# Output the embeddings
print(f"Embeddings for image 1: {embeddings1}")
print(f"Embeddings for image 2: {embeddings2}")


from torch.nn.functional import cosine_similarity

similarity = cosine_similarity(embeddings1, embeddings2)
print(f"Similarity: {similarity.item()}")


Embedding length: 768
Embeddings for image 1: tensor([[ 3.4695e-01,  6.1617e-01, -1.6142e-01,  9.2766e-01, -1.3618e-01,
         -1.1370e+00, -2.1302e-01,  5.6286e-01,  6.3115e-02, -7.4667e-01,
         -3.6927e-01,  1.0740e+00,  4.3376e-01, -2.1565e+00,  1.0418e+00,
          1.1640e+00,  1.0452e-04,  6.7651e-02,  5.9292e-01, -1.1233e+00,
          3.5163e-01,  2.3281e+00, -3.9556e-01,  8.8211e-01, -1.3872e+00,
          2.7011e-01,  8.6785e-01,  6.2526e-01, -2.7973e-01, -4.4975e-01,
         -1.1738e+00, -2.1596e-01,  3.3536e-01, -5.9063e-01,  1.3016e+00,
         -3.0838e-01, -2.5397e+00, -6.6081e-01, -1.4386e-01, -8.0380e-01,
          1.1686e+00,  2.8393e-01, -2.2242e-01, -1.9076e+00,  6.7082e-01,
          1.5326e+00, -3.8423e-01, -2.1905e+00,  6.7262e-01, -2.9838e-01,
         -4.6584e-01,  2.1223e-01, -2.0196e-01, -1.9567e-01, -4.5268e-01,
          6.4930e-01,  4.3316e-01,  3.9301e-01, -1.3632e-01,  6.9296e-02,
         -8.7580e-02,  1.6303e+00,  6.2314e-01,  1.3681e+00, -6.71