In [1]:
!pip uninstall -y torch torchaudio fastai
!pip install torch==2.6.0 torchvision==0.21.0
!pip install transformers

from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import torch

Found existing installation: torch 2.5.1+cu124
Uninstalling torch-2.5.1+cu124:
  Successfully uninstalled torch-2.5.1+cu124
Found existing installation: torchaudio 2.5.1+cu124
Uninstalling torchaudio-2.5.1+cu124:
  Successfully uninstalled torchaudio-2.5.1+cu124
Found existing installation: fastai 2.7.18
Uninstalling fastai-2.7.18:
  Successfully uninstalled fastai-2.7.18
Collecting torch==2.6.0
  Downloading torch-2.6.0-cp311-cp311-manylinux1_x86_64.whl.metadata (28 kB)
Collecting torchvision==0.21.0
  Downloading torchvision-0.21.0-cp311-cp311-manylinux1_x86_64.whl.metadata (6.1 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch==2.6.0)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch==2.6.0)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch==2.6.0)
  Downloading nv

In [8]:
# Load the pretrained CLIP model and its matching processor.
# The checkpoint identifier is defined once so the model weights and the
# processor cannot drift apart when the checkpoint is changed.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

In [14]:
image_path = "/content/image.jpg"  # Path to your local image

# Image.open is lazy and keeps the underlying file open until the pixel data
# is read or the image is closed. Decode inside a context manager so the file
# handle is released, and keep a fully loaded in-memory RGB copy as `image`.
with Image.open(image_path) as source_image:
    image = source_image.convert("RGB")

In [12]:
# Candidate captions to be ranked against the image.
# Each entry is printed verbatim next to its score, so captions carry no
# leading/trailing whitespace (the last entry previously had a stray space).
text = [
    "A breathtaking view of a modern suspension bridge over deep blue waters",
    "A stunning red suspension bridge connects two lands under a clear blue sky",
    "A panoramic shot of a grand bridge stretching across the sea",
    "This magnificent bridge stands tall over the tranquil waters below",
    "The striking architecture of this suspension bridge is a true masterpiece",
    "The long and elegant bridge offering smooth travel across the water",
    "The towering red pillars of this suspension bridge create an iconic landmark",
]

In [15]:
# Score every caption in `text` against `image` with CLIP and report the
# caption with the highest probability.

# Check if CUDA is available, otherwise fall back to CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# Preprocess the image and text to match CLIP input requirements
inputs = processor(text=text, images=image, return_tensors="pt", padding=True)

# Move tensors to the same device as the model
inputs = {key: value.to(device) for key, value in inputs.items()}

# Forward pass; gradients are not needed for inference
with torch.no_grad():
    outputs = model(**inputs)
    logits_per_image = outputs.logits_per_image  # Image-text similarity scores
    logits_per_text = outputs.logits_per_text  # Text-image similarity scores

# Softmax to normalize similarity scores.
# NOTE: despite their names, these two tensors hold softmax probabilities,
# not embeddings; the names are kept so other cells that use them still work.
image_features = logits_per_image.softmax(dim=-1)  # For image-to-text similarity
# NOTE(review): logits_per_text is presumably (n_captions, n_images); with a
# single image a softmax over the last axis would be 1.0 everywhere -- confirm
# before relying on text_features.
text_features = logits_per_text.softmax(dim=-1)  # For text-to-image similarity

# Row 0 holds the scores of the (single) image against each caption.
caption_probs = image_features[0]

# Print similarity scores
print("Image to Text Similarity Scores:")
for caption, prob in zip(text, caption_probs.tolist()):
    print(f"{caption}: {prob:.4f}")

# Pick the best caption from the first image's row only. An argmax over the
# whole 2-D tensor returns an index into the flattened tensor, which is a
# valid caption index only when there is exactly one image. Converting to a
# plain int avoids indexing a Python list with a tensor.
best_caption_idx = int(caption_probs.argmax())
print("\nBest caption for the image:", text[best_caption_idx])

Image to Text Similarity Scores:
A breathtaking view of a modern suspension bridge over deep blue waters: 0.1376
A stunning red suspension bridge connects two lands under a clear blue sky: 0.5021
A panoramic shot of a grand bridge stretching across the sea: 0.1303
This magnificent bridge stands tall over the tranquil waters below: 0.0077
The striking architecture of this suspension bridge is a true masterpiece: 0.0204
The long and elegant bridge offering smooth travel across the water: 0.1850
 The towering red pillars of this suspension bridge create an iconic landmark: 0.0170

Best caption for the image: A stunning red suspension bridge connects two lands under a clear blue sky
