In [9]:
!pip install transformers torch torchvision PIL requests gradio accelerate -q
!pip install git+https://github.com/openai/CLIP.git -q


[31mERROR: Could not find a version that satisfies the requirement PIL (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for PIL[0m[31m
[0m  Preparing metadata (setup.py) ... [?25l[?25hdone


In [12]:
import torch
import torch
import clip
import requests
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import gradio as gr
import io
import base64
from typing import List, Tuple
import numpy as np

# Pick the compute device once: CUDA when torch reports it usable, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cuda


In [14]:
# ====================================
# Define Main Caption Generator Class
# ====================================
class ImageCaptionGenerator:
    """Caption images with BLIP and ViT-GPT2, and classify the scene with CLIP.

    All models are loaded eagerly by ``__init__`` and moved to the module-level
    ``device``. The ``generate_*`` and ``analyze_with_clip`` methods never
    raise: on failure they return a message that starts with one of
    ``_ERROR_PREFIXES``, which ``create_enhanced_caption`` uses to tell a
    failure apart from a real caption.
    """

    BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
    VIT_GPT2_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
    CLIP_VARIANT = "ViT-B/32"

    # Text prompt fed to BLIP in "conditional" mode and stripped from its output.
    BLIP_PROMPT = "a photography of"

    # Minimum CLIP probability before the scene label is appended to a caption.
    SCENE_CONFIDENCE_THRESHOLD = 0.3

    # Caption used when no model produced a usable result.
    FALLBACK_CAPTION = "An image with various elements"

    # Leading text of every failure message produced by the methods below.
    _ERROR_PREFIXES = ("Error generating", "Error with CLIP analysis")

    # (keyword in the CLIP scene label, suffix appended to the caption);
    # checked in order, first match wins.
    _SCENE_SUFFIXES = (
        ("indoors", " (indoor setting)"),
        ("outdoors", " (outdoor setting)"),
        ("portrait", " (portrait style)"),
        ("landscape", " (landscape view)"),
    )

    def __init__(self):
        """Create the model/processor registries and load every model."""
        self.models = {}
        self.processors = {}
        self.load_models()

    def load_models(self):
        """Load multiple caption generation models"""
        print("Loading models... This may take a few minutes on first run.")

        # 1. BLIP Model (Salesforce) - State of the art
        print("Loading BLIP model...")
        self.processors["blip"] = BlipProcessor.from_pretrained(self.BLIP_CHECKPOINT)
        self.models["blip"] = BlipForConditionalGeneration.from_pretrained(self.BLIP_CHECKPOINT).to(device)

        # 2. ViT-GPT2 Model - Good for detailed descriptions
        print("Loading ViT-GPT2 model...")
        self.models["vit_gpt2"] = VisionEncoderDecoderModel.from_pretrained(self.VIT_GPT2_CHECKPOINT).to(device)
        self.processors["vit_gpt2_feature"] = ViTImageProcessor.from_pretrained(self.VIT_GPT2_CHECKPOINT)
        self.processors["vit_gpt2_tokenizer"] = AutoTokenizer.from_pretrained(self.VIT_GPT2_CHECKPOINT)

        # 3. CLIP Model - For similarity and context
        print("Loading CLIP model...")
        self.models["clip"], self.processors["clip"] = clip.load(self.CLIP_VARIANT, device=device)

        print("All models loaded successfully!")

    @classmethod
    def _is_error(cls, text: str) -> bool:
        """Return True when ``text`` is a failure message from this class.

        Matching on the known message prefixes (instead of searching for the
        substring "error") keeps genuine captions such as "a screen showing an
        error message" from being mistaken for failures.
        """
        return text.startswith(cls._ERROR_PREFIXES)

    # ====================================
    #  BLIP Caption Generation Methods
    # ====================================
    def generate_blip_caption(self, image: "Image.Image", mode: str = "normal") -> str:
        """Generate caption using BLIP model

        Args:
            image: PIL image to caption.
            mode: ``"conditional"`` prompts the model with ``BLIP_PROMPT``;
                any other value runs unconditional generation.

        Returns:
            The decoded caption, or an "Error generating BLIP caption: ..."
            message if anything raised.
        """
        try:
            conditional = mode == "conditional"
            if conditional:
                # Conditional generation with prompt
                inputs = self.processors["blip"](image, self.BLIP_PROMPT, return_tensors="pt").to(device)
            else:
                # Unconditional generation
                inputs = self.processors["blip"](image, return_tensors="pt").to(device)

            with torch.no_grad():
                out = self.models["blip"].generate(**inputs, max_length=50, num_beams=5)

            caption = self.processors["blip"].decode(out[0], skip_special_tokens=True)

            # Drop only the leading prompt; a later occurrence of the same
            # words inside the caption is real content and must be kept.
            if conditional and caption.startswith(self.BLIP_PROMPT):
                caption = caption[len(self.BLIP_PROMPT):].strip()

            return caption
        except Exception as e:
            return f"Error generating BLIP caption: {str(e)}"

    # ====================================
    # ViT-GPT2 Caption Generation Method
    # ====================================
    def generate_vit_gpt2_caption(self, image: "Image.Image") -> str:
        """Generate caption using ViT-GPT2 model

        Returns:
            The decoded, whitespace-stripped caption, or an
            "Error generating ViT-GPT2 caption: ..." message if anything raised.
        """
        try:
            # Preprocess image
            pixel_values = self.processors["vit_gpt2_feature"](
                images=image, return_tensors="pt"
            ).pixel_values.to(device)

            # Generate caption
            with torch.no_grad():
                output_ids = self.models["vit_gpt2"].generate(
                    pixel_values,
                    max_length=50,
                    num_beams=4,
                    early_stopping=True
                )

            # Decode caption
            caption = self.processors["vit_gpt2_tokenizer"].decode(
                output_ids[0], skip_special_tokens=True
            )

            return caption.strip()
        except Exception as e:
            return f"Error generating ViT-GPT2 caption: {str(e)}"

    # ====================================
    # CLIP Analysis Method
    # ====================================
    def analyze_with_clip(self, image: "Image.Image", text_options: List[str]) -> Tuple[str, List[float]]:
        """Use CLIP to find best matching description

        Args:
            image: PIL image to score.
            text_options: Candidate descriptions; must not be empty.

        Returns:
            ``(best_description, probabilities)`` where ``probabilities`` has
            one softmax score per entry of ``text_options``. On failure returns
            ``("Error with CLIP analysis: ...", [])``.
        """
        try:
            if not text_options:
                raise ValueError("text_options must contain at least one description")

            # Preprocess
            image_input = self.processors["clip"](image).unsqueeze(0).to(device)
            text_inputs = clip.tokenize(text_options).to(device)

            # Get features
            with torch.no_grad():
                image_features = self.models["clip"].encode_image(image_input)
                text_features = self.models["clip"].encode_text(text_inputs)

                # Scale both sides to unit length so the product below is a
                # cosine similarity. Without this the x100 logits are dominated
                # by feature magnitudes and the softmax collapses to ~one-hot,
                # making the reported confidence meaningless.
                image_features = image_features / image_features.norm(dim=-1, keepdim=True)
                text_features = text_features / text_features.norm(dim=-1, keepdim=True)

                # Calculate similarity (softmax in float32 for stable values)
                logits = 100.0 * image_features @ text_features.T
                similarities = logits.float().softmax(dim=-1).cpu().numpy()[0]

            # Find best match
            best_idx = int(np.argmax(similarities))
            best_description = text_options[best_idx]

            return best_description, similarities.tolist()
        except Exception as e:
            return f"Error with CLIP analysis: {str(e)}", []

    # ====================================
    #  Comprehensive Caption Generation
    # ====================================
    def generate_comprehensive_caption(self, image: "Image.Image") -> dict:
        """Generate multiple captions using different approaches

        Returns:
            A dict with keys ``blip_basic``, ``blip_conditional``, ``vit_gpt2``,
            ``clip_scene``, ``scene_confidence`` (0 when CLIP failed) and
            ``enhanced``.
        """
        results = {}

        # BLIP captions
        print("Generating BLIP captions...")
        results["blip_basic"] = self.generate_blip_caption(image, "normal")
        results["blip_conditional"] = self.generate_blip_caption(image, "conditional")

        # ViT-GPT2 caption
        print("Generating ViT-GPT2 caption...")
        results["vit_gpt2"] = self.generate_vit_gpt2_caption(image)

        # CLIP-based scene analysis
        print("Analyzing scene with CLIP...")
        scene_options = [
            "a photo taken indoors",
            "a photo taken outdoors",
            "a portrait of a person",
            "a landscape scene",
            "an urban environment",
            "a natural environment",
            "an animal",
            "food or cooking",
            "a vehicle",
            "architecture or buildings",
            "art or creative work",
            "sports or activity"
        ]

        clip_scene, scene_scores = self.analyze_with_clip(image, scene_options)
        results["clip_scene"] = clip_scene
        results["scene_confidence"] = max(scene_scores) if scene_scores else 0

        # Generate enhanced caption by combining insights
        print("Creating enhanced caption...")
        enhanced_caption = self.create_enhanced_caption(results)
        results["enhanced"] = enhanced_caption

        return results

    def create_enhanced_caption(self, results: dict) -> str:
        """Create an enhanced caption by combining different model outputs

        The longest usable caption among ViT-GPT2 and BLIP-basic is chosen
        (ViT-GPT2 wins ties). A caption is usable when it is non-empty and is
        not one of this class's failure messages, so a failure of one model
        never hides a valid caption from the other. A scene suffix is appended
        when CLIP's confidence exceeds ``SCENE_CONFIDENCE_THRESHOLD``.

        Args:
            results: Dict with optional keys ``blip_basic``, ``vit_gpt2``,
                ``clip_scene`` and ``scene_confidence``.

        Returns:
            The combined caption, ``FALLBACK_CAPTION`` (plus any scene suffix)
            when no model produced a usable caption, or an
            "Enhanced caption generation error: ..." message if anything raised.
        """
        try:
            blip_basic = results.get("blip_basic", "")
            vit_caption = results.get("vit_gpt2", "")
            scene_info = results.get("clip_scene", "")
            confidence = results.get("scene_confidence", 0)

            # ViT-GPT2 is listed first so that max() prefers it on equal length.
            usable = [
                candidate
                for candidate in (vit_caption, blip_basic)
                if candidate and not self._is_error(candidate)
            ]
            primary_caption = max(usable, key=len) if usable else self.FALLBACK_CAPTION

            # Add scene context if confident
            if confidence > self.SCENE_CONFIDENCE_THRESHOLD and not self._is_error(scene_info):
                for keyword, suffix in self._SCENE_SUFFIXES:
                    if keyword in scene_info:
                        primary_caption += suffix
                        break

            return primary_caption

        except Exception as e:
            return f"Enhanced caption generation error: {str(e)}"

In [15]:
# Build the shared generator that process_image() uses. The constructor calls
# load_models(), so running this cell loads BLIP, ViT-GPT2 and CLIP up front.
print("Initializing Caption Generator...")
caption_generator = ImageCaptionGenerator()
print("Caption Generator ready!")

Initializing Caption Generator...
Loading models... This may take a few minutes on first run.
Loading BLIP model...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Loading ViT-GPT2 model...


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Loading CLIP model...
All models loaded successfully!
Caption Generator ready!


In [16]:
# ====================================
# Helper Functions
# ====================================
def process_image(image):
    """Caption an uploaded image and return the six UI text fields.

    Returns a 6-tuple in the order: enhanced caption, BLIP basic, BLIP
    conditional, ViT-GPT2, CLIP scene label, formatted scene confidence.
    When ``image`` is None a prompt to upload is returned in the first slot;
    when anything raises, the same error message fills all six slots.
    """
    if image is None:
        return "Please upload an image first.", "", "", "", "", ""

    try:
        print("Processing image...")

        # Accept either a PIL image or an array-like, and normalise to RGB.
        pil_image = image if isinstance(image, Image.Image) else Image.fromarray(image)
        if pil_image.mode != 'RGB':
            pil_image = pil_image.convert('RGB')

        captions = caption_generator.generate_comprehensive_caption(pil_image)

        print("Caption generation complete!")

        # (result key, text shown when the key is missing), in output order.
        text_fields = [
            ("enhanced", "Error generating enhanced caption"),
            ("blip_basic", "Error with BLIP basic"),
            ("blip_conditional", "Error with BLIP conditional"),
            ("vit_gpt2", "Error with ViT-GPT2"),
            ("clip_scene", "Error with CLIP scene"),
        ]
        outputs = [captions.get(key, fallback) for key, fallback in text_fields]
        outputs.append(f"Scene Confidence: {captions.get('scene_confidence', 0):.2%}")
        return tuple(outputs)

    except Exception as e:
        failure = f"Error processing image: {str(e)}"
        return (failure,) * 6

def download_sample_image(url: str, timeout: float = 10.0) -> Image.Image:
    """Download a sample image from URL

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up, so a stalled
            host cannot hang the calling UI callback indefinitely.

    Returns:
        The opened PIL image, or ``None`` if the request failed, the server
        answered with an HTTP error status, or the payload could not be opened
        as an image (the error is printed).
    """
    try:
        print("Downloading sample image...")
        response = requests.get(url, timeout=timeout)
        # Fail here on 4xx/5xx instead of handing an error page to the decoder.
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    except Exception as e:
        print(f"Error downloading image: {e}")
        return None

In [26]:
# Define the Gradio interface
def create_interface():
    """Assemble and return the Gradio Blocks app (not yet launched).

    Layout: a header, then a two-column row (image upload plus sample buttons
    on the left, caption text boxes on the right), then a feature summary.
    Changing the image runs ``process_image``; each sample button loads its
    image through ``download_sample_image`` into the upload component.
    """
    header_html = """
        <div style="text-align: center; padding: 20px;">
            <h1 style="color: #2196F3;">AI Image Caption Generator</h1>
            <p style="font-size: 18px; color: #666;">
                Upload any image and get intelligent captions from multiple state-of-the-art AI models!
            </p>
        </div>
        """

    footer_html = """
        <div style="margin-top: 30px; padding: 20px; background-color: #f5f5f5; border-radius: 10px;">
            <h3>Features:</h3>
            <ul>
                <li><strong>BLIP</strong>: Salesforce's state-of-the-art vision-language model</li>
                <li><strong>ViT-GPT2</strong>: Vision Transformer + GPT-2 for detailed descriptions</li>
                <li><strong>CLIP</strong>: OpenAI's model for scene understanding and context</li>
                <li><strong>Enhanced Mode</strong>: Combines all models for best results</li>
            </ul>
            <p><em>Tip: Try different types of images to see how each model performs!</em></p>
        </div>
        """

    # Listed in the same order as the sample buttons below.
    sample_urls = [
        "https://dccwebsiteimages.s3.ap-south-1.amazonaws.com/Untitled_design_13_a25f88e21c.png",  # Dog
        "https://images.unsplash.com/photo-1506905925346-21bda4d32df4?w=400",  # Landscape
        "https://cookingwithayeh.com/wp-content/uploads/2021/08/Italian-Pizza-1.jpg",  # Food
    ]
    sample_labels = ("Dog", "Landscape", "Food")

    def _sample_loader(url):
        # Zero-argument callback bound to one URL, for use as a click handler.
        return lambda: download_sample_image(url)

    with gr.Blocks(title="AI Image Caption Generator", theme=gr.themes.Soft()) as demo:
        gr.HTML(header_html)

        with gr.Row():
            # Left column: upload widget and sample shortcuts.
            with gr.Column(scale=1):
                image_input = gr.Image(label="Upload Image", type="pil", height=400)

                gr.HTML("<h3>Or try these samples:</h3>")

                with gr.Row():
                    sample_buttons = [gr.Button(label, size="sm") for label in sample_labels]

            # Right column: combined caption first, per-model details collapsed.
            with gr.Column(scale=1):
                enhanced_output = gr.Textbox(
                    label="Enhanced Caption (Best Result)",
                    lines=3,
                    placeholder="Enhanced caption will appear here..."
                )

                with gr.Accordion("Individual Model Results", open=False):
                    detail_outputs = [
                        gr.Textbox(label="BLIP Basic", lines=2),
                        gr.Textbox(label="BLIP Conditional", lines=2),
                        gr.Textbox(label="ViT-GPT2", lines=2),
                        gr.Textbox(label="CLIP Scene Analysis", lines=2),
                        gr.Textbox(label="Confidence Score", lines=1),
                    ]

        # Output order must match the tuple returned by process_image.
        image_input.change(
            process_image,
            inputs=[image_input],
            outputs=[enhanced_output, *detail_outputs]
        )

        # One click handler per sample button, paired with its URL.
        for button, url in zip(sample_buttons, sample_urls):
            button.click(_sample_loader(url), outputs=image_input)

        gr.HTML(footer_html)

    return demo

# Create the interface
# Built once at module level; the launch cell calls demo.launch() on this object.
demo = create_interface()

In [25]:
# Launch the interface
# With debug=True, launch() keeps this cell running until the server is
# stopped, so the usage hint is printed first; placed after launch() it would
# only appear once the app had already shut down.
print("Upload an image to start generating captions!")

demo.launch(
    share=True,  # Creates public link
    debug=True,
    server_port=7860
)

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://2f9ae98f3325924bca.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Downloading sample image...
Processing image...
Generating BLIP captions...
Generating ViT-GPT2 caption...
Analyzing scene with CLIP...
Creating enhanced caption...
Caption generation complete!
Downloading sample image...
Processing image...
Generating BLIP captions...
Generating ViT-GPT2 caption...
Analyzing scene with CLIP...
Creating enhanced caption...
Caption generation complete!
Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://2f9ae98f3325924bca.gradio.live
Upload an image to start generating captions!
