In [None]:
# The PyPI package named `clip` is not OpenAI's CLIP and is never imported by
# this notebook (the model code uses `open_clip`), so only torch and Pillow are
# installed here. `%pip` installs into the running kernel's environment.
%pip install torch Pillow



In [None]:
# Dependencies for the open_clip-based captioning code below. ftfy and
# open-clip-torch are pinned to the versions this notebook was last run with.
# `%pip` installs into the running kernel's environment.
%pip install torch torchvision ftfy==6.3.1 regex tqdm
%pip install open-clip-torch==2.30.0

Collecting ftfy
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: ftfy
Successfully installed ftfy-6.3.1
Collecting open-clip-torch
  Downloading open_clip_torch-2.30.0-py3-none-any.whl.metadata (31 kB)
Downloading open_clip_torch-2.30.0-py3-none-any.whl (1.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m56.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: open-clip-torch
Successfully installed open-clip-torch-2.30.0


In [None]:
# Required packages (installed in the cells above):
# pip install torch torchvision ftfy regex tqdm
# pip install open-clip-torch   (provides the `open_clip` module imported below)

import torch
import open_clip
from PIL import Image
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
import numpy as np

class ImageCaptioningSystem:
    """Rank a fixed pool of candidate captions against an image using CLIP.

    An image is embedded with the model's image encoder, every candidate
    caption with the text encoder, and the captions are ranked by a softmax
    over the scaled dot products of the L2-normalised embeddings. No text is
    generated: results are always drawn from ``candidate_captions``.
    """

    # Fixed scaling factor applied to the similarities before the softmax.
    LOGIT_SCALE = 100.0

    def __init__(self, model_name="ViT-B-32", pretrained="laion2b_s34b_b79k"):
        """Load the open_clip model, its preprocessing transform and tokenizer.

        Args:
            model_name: open_clip architecture name.
            pretrained: open_clip pretrained-weights tag for that architecture.
        """
        # Initialize CLIP model and preprocessing
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, _, self.preprocess = open_clip.create_model_and_transforms(
            model_name,
            pretrained=pretrained,
            device=self.device
        )
        # open_clip hands the model back in training mode; inference must run
        # in eval mode so layers such as BatchNorm / stochastic depth behave
        # deterministically.
        self.model.eval()
        self.tokenizer = open_clip.get_tokenizer(model_name)

        # Set of candidate captions (can be expanded)
        self.candidate_captions = [
            "a photo of a person",
            "a photo of an animal",
            "a landscape photo",
            "a photo of food",
            "a photo of a building",
            "an artistic image",
            "a portrait photo",
            "a nature scene",
            "an urban scene",
            "an indoor scene"
        ]

        # (captions tuple, text features) of the most recently encoded pool.
        self._text_cache = None

    def encode_image(self, image_path):
        """Encode image using CLIP's image encoder.

        Args:
            image_path: Path (or file object) accepted by ``PIL.Image.open``.

        Returns:
            L2-normalised image features for a batch of one image.

        Raises:
            OSError: If the file cannot be opened or decoded as an image.
        """
        # Close the underlying file as soon as the pixels are copied out;
        # convert() returns an independent in-memory image.
        with Image.open(image_path) as img:
            image = img.convert("RGB")
        image_input = self.preprocess(image).unsqueeze(0).to(self.device)
        with torch.no_grad():
            image_features = self.model.encode_image(image_input)
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)
        return image_features

    def encode_text(self, texts):
        """Encode text descriptions using CLIP's text encoder.

        Args:
            texts: List of caption strings.

        Returns:
            L2-normalised text features, one row per caption.
        """
        text_tokens = self.tokenizer(texts).to(self.device)
        with torch.no_grad():
            text_features = self.model.encode_text(text_tokens)
            text_features = text_features / text_features.norm(dim=-1, keepdim=True)
        return text_features

    def _candidate_text_features(self, captions):
        """Return text features for ``captions``, re-encoding only when the pool changed."""
        key = tuple(captions)
        # getattr: tolerate instances created without running __init__.
        cached = getattr(self, "_text_cache", None)
        if cached is None or cached[0] != key:
            cached = (key, self.encode_text(list(key)))
            self._text_cache = cached
        return cached[1]

    def get_image_caption(self, image_path, top_k=3):
        """Get the most likely captions for an image.

        Args:
            image_path: Path to the image to caption.
            top_k: Maximum number of captions to return. Values larger than
                the candidate pool are clamped to the pool size.

        Returns:
            A list of ``{'caption': str, 'confidence': float}`` dicts ordered
            from most to least likely. The confidences are softmax
            probabilities over the whole candidate pool. Empty if there are
            no candidate captions.
        """
        # Encode image
        image_features = self.encode_image(image_path)

        # Snapshot the pool so indices stay valid for this call.
        captions = list(self.candidate_captions)
        if not captions:
            return []

        # Encode all candidate captions (cached while the pool is unchanged)
        text_features = self._candidate_text_features(captions)

        # Calculate similarity scores
        similarity = (self.LOGIT_SCALE * image_features @ text_features.T).softmax(dim=-1)

        # Get top k matches; topk() raises if k exceeds the number of entries.
        k = min(top_k, len(captions))
        values, indices = similarity[0].topk(k)

        return [
            {'caption': captions[idx], 'confidence': float(value)}
            for value, idx in zip(values.tolist(), indices.tolist())
        ]

    def add_custom_captions(self, new_captions):
        """Add custom captions to the candidate pool.

        Args:
            new_captions: An iterable of caption strings, or a single caption
                string (which is added as one caption rather than being split
                into characters).
        """
        if isinstance(new_captions, str):
            new_captions = [new_captions]
        self.candidate_captions.extend(new_captions)

    def batch_process_images(self, image_paths, top_k=3):
        """Process multiple images and return their captions.

        Args:
            image_paths: Iterable of image paths.
            top_k: Maximum number of captions per image.

        Returns:
            A dict mapping each path to its caption list, or to an
            ``"Error processing image: ..."`` string if that image failed.
            One failing image does not stop the rest of the batch.
        """
        results = {}
        for path in image_paths:
            try:
                results[path] = self.get_image_caption(path, top_k=top_k)
            except Exception as e:
                results[path] = f"Error processing image: {str(e)}"
        return results

# Example usage
def main():
    """Demo driver: caption one image, then a small batch, printing all results."""
    # Build the captioner with its default model and caption pool.
    system = ImageCaptioningSystem()

    # Extend the pool with a few scene-specific candidates.
    system.add_custom_captions([
        "a sunset over the ocean",
        "a bustling city street",
        "a peaceful garden scene"
    ])

    # --- Single image ---------------------------------------------------
    single_image = "/content/wp3.jpg"
    try:
        ranked = system.get_image_caption(single_image)
        print(f"\nCaptions for {single_image}:")
        for entry in ranked:
            print(f"Caption: {entry['caption']}")
            print(f"Confidence: {entry['confidence']:.3f}")
    except Exception as exc:
        print(f"Error processing image: {str(exc)}")

    # --- Batch ----------------------------------------------------------
    batch_results = system.batch_process_images(
        ["image1.jpg", "image2.jpg", "image3.jpg"]
    )

    print("\nBatch processing results:")
    for img_path in batch_results:
        # Each value is either a list of caption dicts or an error string.
        print(f"\n{img_path}:")
        print(batch_results[img_path])

# Entry point: run the demo only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    main()


Captions for /content/wp3.jpg:
Caption: an urban scene
Confidence: 0.893
Caption: a bustling city street
Confidence: 0.063
Caption: an indoor scene
Confidence: 0.029

Batch processing results:

image1.jpg:
Error processing image: [Errno 2] No such file or directory: 'image1.jpg'

image2.jpg:
Error processing image: [Errno 2] No such file or directory: 'image2.jpg'

image3.jpg:
Error processing image: [Errno 2] No such file or directory: 'image3.jpg'
