# Object Detection

In [None]:
# Libraries required

# !pip install transformers
# !pip install gradio
# !pip install timm
# !pip install inflect
# !pip install phonemizer

## Build the object-detection pipeline using 🤗 Transformers Library

In [None]:
from helper import load_image_from_url, render_results_in_image

In [None]:
from transformers import pipeline

In [None]:
# Restrict transformers' own logging to error-level messages.
from transformers.utils import logging
logging.set_verbosity_error()

# Project helper defined in helper.py (not shown here); presumably
# silences Python warnings in the notebook output — confirm there.
from helper import ignore_warnings
ignore_warnings()

In [None]:
# Object-detection pipeline loaded from the local copy of the
# facebook/detr-resnet-50 checkpoint under ./models.
od_pipe = pipeline("object-detection", "./models/facebook/detr-resnet-50")

## Using the Pipeline

In [None]:
from PIL import Image

In [None]:
raw_image = Image.open('huggingface_friends.jpg')
# resize() returns a new image and leaves raw_image unchanged; the
# resized copy is only this cell's displayed value.
raw_image.resize((569, 491))

In [None]:
pipeline_output = od_pipe(raw_image)
# This returns the coordinates of each object detected in the image,
# along with the name of the detected object.

Render the results from the pipeline onto the image using the helper function `render_results_in_image`.

In [None]:
# Hand the image and its detections to the project helper (defined in
# helper.py) and keep the image it returns.
processed_image = render_results_in_image(raw_image, pipeline_output)

In [None]:
processed_image

## Building Gradio Interface

In [None]:
import os
import gradio as gr

In [None]:
def get_pipeline_prediction(pil_image):
    """Run object detection on an image and return the rendered result.

    Args:
        pil_image: The image to analyse; the Gradio interface that uses
            this function is configured to pass a PIL image.

    Returns:
        Whatever ``render_results_in_image`` produces for the image and
        the detections returned by ``od_pipe``.
    """
    detections = od_pipe(pil_image)
    return render_results_in_image(pil_image, detections)

In [None]:
# Gradio UI for the detector: a PIL image goes in, and the image
# returned by get_pipeline_prediction comes out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(label="Input image", type="pil"),
    outputs=gr.Image(
        label="Output image with predicted instances",
        type="pil",
    ),
)

In [None]:
# Serve on the port named by the PORT1 environment variable when it is
# set. Outside that environment PORT1 is absent, so pass None and let
# Gradio choose its default port instead of failing with a KeyError.
demo.launch(
    share=True,
    server_port=int(os.environ['PORT1']) if os.environ.get('PORT1') else None,
)

In [None]:
demo.close()

## Adding text-to-speech to hear the results

In [None]:
pipeline_output

In [None]:
od_pipe

In [None]:
raw_image = Image.open('huggingface_friends.jpg')
# resize() returns a new image and leaves raw_image unchanged; the
# resized copy is only this cell's displayed value.
raw_image.resize((284, 245))

In [None]:
from helper import summarize_predictions_natural_language

In [None]:
text = summarize_predictions_natural_language(pipeline_output)

In [None]:
text

In [None]:
# Text-to-speech pipeline loaded from the local copy of the
# kakao-enterprise/vits-ljs checkpoint under ./models.
tts_pipe = pipeline(
    "text-to-speech",
    model="./models/kakao-enterprise/vits-ljs",
)

In [None]:
narrated_text = tts_pipe(text)

In [None]:
from IPython.display import Audio as IPythonAudio

In [None]:
# Play the synthesized speech inline: the first entry of the pipeline's
# audio output, at the sampling rate the pipeline reported.
IPythonAudio(
    narrated_text["audio"][0],
    rate=narrated_text["sampling_rate"],
)

## Combining in a function

In [None]:
from IPython.lib.display import Audio
from scipy.io.wavfile import write
import tempfile
import tempfile
import librosa

In [None]:
def image_to_audio_pipeline(pil_image):
    """Detect objects in an image and return the path to a spoken summary.

    The image is passed through ``od_pipe``, the detections are turned
    into text by ``summarize_predictions_natural_language``, and that
    text is synthesized by ``tts_pipe``. The first entry of the
    synthesized audio is then written to a temporary ``.wav`` file.

    Args:
        pil_image: The image to describe; the Gradio interface that uses
            this function is configured to pass a PIL image.

    Returns:
        The filesystem path of the temporary WAV file. The file is
        created with ``delete=False``, so it is not removed when closed.
    """
    detections = od_pipe(pil_image)
    summary = summarize_predictions_natural_language(detections)
    speech = tts_pipe(summary)

    waveform = speech["audio"][0]
    rate = speech["sampling_rate"]

    # delete=False keeps the file on disk after the handle is closed so
    # the caller can still read it from the returned path.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as wav_file:
        wav_path = wav_file.name
        write(wav_path, rate, waveform)
    return wav_path

In [None]:
# Gradio UI for the narrated version: a PIL image goes in, and the
# audio file path returned by image_to_audio_pipeline comes out.
demo = gr.Interface(
    fn=image_to_audio_pipeline,
    inputs=gr.Image(
        label="Input image",
        type="pil",
    ),
    outputs=gr.Audio(
        label="Generated Audio",
        type="filepath",
    ),
)

In [None]:
demo.launch(share=True)

In [None]:
demo.close()