In [None]:
!pip install gTTS playsound
!pip install gradio



# Import Libraries

In [None]:

import os
import io
import uuid
import requests
import numpy as np
import cv2
import inflect
import gradio as gr
from PIL import Image
from transformers import pipeline
from gtts import gTTS



# Helper Function: Load Image from URL



In [None]:
def load_image_from_url(url: str, timeout: float = 10.0) -> Image.Image | None:
    """
    Load an image from a URL.

    Args:
        url (str): The URL of the image.
        timeout (float): Seconds to wait for the server before giving up.
            Defaults to 10 seconds.

    Returns:
        PIL.Image.Image or None: The loaded image, or None if the download
        fails or the response body cannot be opened as an image.
    """
    try:
        # Without a timeout a stalled server would block the notebook forever.
        # The whole body is read right away, so streaming mode is not needed.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return Image.open(io.BytesIO(response.content))
    except (requests.exceptions.RequestException, OSError) as e:
        # OSError covers Pillow's failure to identify/open the payload, so a
        # non-image response yields None as documented instead of raising.
        print(f"Error loading image: {e}")
        return None


# Helper Function: Render Detection Results


In [None]:
def render_results_in_image(in_pil_img: Image.Image, in_results: list) -> Image.Image:
    """
    Draw bounding boxes and labels on an image based on detection results.

    The input image is not modified; drawing happens on a copy.

    Args:
        in_pil_img (PIL.Image.Image): The original image (any PIL mode).
        in_results (list): List of detection dictionaries with keys 'box'
            (a dict with 'xmin', 'ymin', 'xmax', 'ymax'), 'label', and 'score'.

    Returns:
        PIL.Image.Image: The annotated image, in RGB mode.
    """
    # Normalise to 3-channel RGB first so grayscale / palette / alpha inputs
    # do not break the RGB -> BGR conversion, then switch to OpenCV's BGR order.
    img = cv2.cvtColor(np.array(in_pil_img.convert("RGB")), cv2.COLOR_RGB2BGR)

    label_gap = 10      # pixels between the box's top edge and the label baseline
    label_inset = 15    # baseline offset used when the label is drawn inside the box

    # Loop through each prediction and draw its bounding box and label
    for prediction in in_results:
        box = prediction['box']
        xmin, ymin = int(box['xmin']), int(box['ymin'])
        xmax, ymax = int(box['xmax']), int(box['ymax'])
        label_text = f"{prediction['label']}: {round(prediction['score'] * 100, 1)}%"

        cv2.rectangle(img, (xmin, ymin), (xmax, ymax), (0, 255, 0), 2)

        # Place the label above the box; if the box touches the top of the
        # image the text would be clipped, so draw it just inside the box.
        label_y = ymin - label_gap
        if label_y < label_gap:
            label_y = ymin + label_inset
        cv2.putText(img, label_text, (max(xmin, 0), label_y),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255), 2)

    # Convert back to a PIL image (RGB)
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    return Image.fromarray(img_rgb)


# Helper Function: Summarize Predictions


In [None]:
def summarize_predictions_natural_language(predictions: list) -> str:
    """
    Create a natural language summary from detection results.

    Objects are grouped by label and listed in the order each label first
    appears, e.g. "In this image, there is one dog and two people."

    Args:
        predictions (list): List of detection dictionaries; each must have
            a 'label' key.

    Returns:
        str: A summary of the detected objects, or a fixed "no objects"
        sentence when the list is empty.
    """
    if not predictions:
        return "No objects were detected in the image."

    # Count each detected object type (dicts keep first-seen order)
    counts = {}
    for prediction in predictions:
        label = prediction['label']
        counts[label] = counts.get(label, 0) + 1

    p = inflect.engine()

    # plural_noun leaves the word alone when count is 1 and otherwise applies
    # real English pluralisation ("person" -> "people", "bus" -> "buses")
    # instead of blindly appending "s".
    descriptions = [
        f"{p.number_to_words(count)} {p.plural_noun(label, count)}"
        for label, count in counts.items()
    ]

    # join gives "a", "a and b", or "a, b, and c" depending on the item count,
    # which avoids the stray comma in the two-item case.
    description_text = p.join(descriptions)

    # The verb agrees with the first item listed: "there is one dog ..."
    # versus "there are two dogs ...".
    first_count = next(iter(counts.values()))
    verb = "is" if first_count == 1 else "are"

    return f"In this image, there {verb} {description_text}."



# Initialize the Object Detection Pipeline


In [None]:

# Build the object-detection pipeline once at module level from the
# facebook/detr-resnet-50 checkpoint; get_pipeline_prediction_with_audio
# calls this shared instance for every image.
od_pipe = pipeline(
    task="object-detection",
    model="facebook/detr-resnet-50",
)


Some weights of the model checkpoint at facebook/detr-resnet-50 were not used when initializing DetrForObjectDetection: ['model.backbone.conv_encoder.model.layer1.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer2.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer3.0.downsample.1.num_batches_tracked', 'model.backbone.conv_encoder.model.layer4.0.downsample.1.num_batches_tracked']
- This IS expected if you are initializing DetrForObjectDetection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DetrForObjectDetection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use cpu


# Helper Function: Generate Audio Description using gTTS


In [None]:
def generate_audio_description(predictions: list) -> str:
    """
    Narrate the detection results and save the narration as an audio file.

    The text comes from summarize_predictions_natural_language and is
    synthesised in English with gTTS. The file is written to a relative
    path, i.e. into the current working directory.

    Args:
        predictions (list): List of detection dictionaries.

    Returns:
        str: The file path of the generated audio.
    """
    # A random hex suffix gives every call its own file, so earlier
    # outputs are never overwritten.
    audio_path = "output_" + uuid.uuid4().hex + ".mp3"

    spoken_text = summarize_predictions_natural_language(predictions)
    gTTS(text=spoken_text, lang='en').save(audio_path)
    return audio_path



# Main Function: Process Image and Generate Outputs


In [None]:
def get_pipeline_prediction_with_audio(pil_image: Image.Image) -> tuple:
    """
    Process an input image to detect objects, annotate the image, and generate an audio summary.

    Args:
        pil_image (PIL.Image.Image): The input image.

    Returns:
        tuple: (Annotated image, Audio file path)

    Raises:
        gr.Error: If no image was supplied (pil_image is None).
    """
    # Gradio passes None when the form is submitted without an image; fail
    # with a message the UI can show rather than an opaque error from the
    # detection pipeline.
    if pil_image is None:
        raise gr.Error("Please upload an image before submitting.")

    # Run object detection
    pipeline_output = od_pipe(pil_image)
    # Render the detections on the image
    processed_image = render_results_in_image(pil_image, pipeline_output)
    # Generate audio description
    audio_file = generate_audio_description(pipeline_output)
    return processed_image, audio_file


# Build the Gradio Interface


In [None]:
# Gradio UI: a single PIL image goes in; the annotated image and the spoken
# summary produced by get_pipeline_prediction_with_audio come out.
demo = gr.Interface(
    title="Object Detection Assistant",
    description="Upload an image to see detected objects and hear an audio description.",
    fn=get_pipeline_prediction_with_audio,
    inputs=gr.Image(type="pil", label="Input Image"),
    outputs=[
        gr.Image(type="pil", label="Annotated Image"),
        gr.Audio(label="Audio Description"),
    ],
)


# Launch the Gradio App


In [None]:
if __name__ == "__main__":
    # server_port is left unset on purpose: Gradio then searches for a free
    # port itself (starting at 7860), so re-running this cell while an earlier
    # instance still holds 7860 works without a catch-and-retry. It also avoids
    # mistaking unrelated OSErrors for "port busy".
    # NOTE(review): server_name="0.0.0.0" together with share=True exposes the
    # app on all interfaces and via a public link; confirm this is intended.
    demo.launch(server_name="0.0.0.0", share=True)


Port 7860 is busy, launching on a different port.
Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://6b607ef49e57724a68.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)
