### **SET C**

------

#### **Basic Object Detection with an Interface using Gradio**

In [None]:
!pip install transformers
!pip install gradio
!pip install timm
!pip install inflect
!pip install phonemizer

Note: py-espeak-ng is only available on Linux operating systems.

To run locally on a Linux machine, run these commands:

    sudo apt-get update
    sudo apt-get install espeak-ng
    pip install py-espeak-ng

In [None]:
from helper import load_image_from_url, render_results_in_image

In [None]:
from transformers import pipeline

In [None]:
from transformers.utils import logging
# Only emit transformers log messages at ERROR level.
logging.set_verbosity_error()

from helper import ignore_warnings
# Project-local helper; presumably silences Python warnings for cleaner cell
# output — TODO confirm in helper.py.
ignore_warnings()

In [None]:
# Object-detection pipeline loaded from the local ./models/facebook/detr-resnet-50 directory.
od_pipe = pipeline(
    task="object-detection",
    model="./models/facebook/detr-resnet-50",
)

Sample output for testing (update the image path before running)

In [None]:
from PIL import Image

In [None]:
raw_image = Image.open('huggingface_friends.jpg')
# resize() returns a new image, which is shown as this cell's output;
# raw_image itself is not modified and keeps its original size.
raw_image.resize((569, 491))

In [None]:
# Run the object-detection pipeline on the (unresized) raw_image.
pipeline_output = od_pipe(raw_image)

In [None]:
# Pass the image and the pipeline output to the local helper that renders
# the results in the image.
processed_image = render_results_in_image(raw_image, pipeline_output)

In [None]:
# Display the helper's rendered result as the cell output.
processed_image

Making a simple Interface, using Gradio

In [None]:
import os
import gradio as gr

In [None]:
def get_pipeline_prediction(pil_image):
    """Run object detection on an image and return the rendered result.

    Args:
        pil_image: Image supplied by the Gradio input component (configured
            with ``type="pil"``), or ``None`` when nothing was uploaded.

    Returns:
        The value returned by ``render_results_in_image`` for the image and
        its detections, or ``None`` when no image was provided.
    """
    # Gradio passes None when the form is submitted without an image;
    # return early instead of letting the pipeline fail on it.
    if pil_image is None:
        return None

    # Local name differs from the notebook-level `pipeline_output` to avoid
    # shadowing it inside the function.
    detections = od_pipe(pil_image)
    return render_results_in_image(pil_image, detections)

In [None]:
# Wire the prediction function into a Gradio UI: one PIL image in, one PIL image out.
demo = gr.Interface(
    fn=get_pipeline_prediction,
    inputs=gr.Image(type="pil", label="Input image"),
    outputs=gr.Image(type="pil", label="Output image with predicted instances"),
)

In [None]:
# Use the port from the PORT1 environment variable when it is set; otherwise
# pass None so Gradio picks its default port instead of this cell failing
# with a KeyError (e.g. when running outside the hosted environment).
# NOTE: share=True requests a publicly reachable link for the demo.
port_env = os.environ.get("PORT1")
demo.launch(share=True, server_port=int(port_env) if port_env else None)

In [None]:
# Shut down the demo server started by demo.launch().
demo.close()

---------

(contd.)

#### **AI Powered Audio Assistant**

In [None]:
# Inspect the raw output of the object-detection pipeline from the previous section.
pipeline_output

In [None]:
# Inspect the object-detection pipeline object itself.
od_pipe

In [None]:
# Reload the sample image so this section can run with its own copy of raw_image.
raw_image = Image.open('huggingface_friends.jpg')
# Show a smaller preview; resize() returns a new image and leaves raw_image unchanged.
raw_image.resize((284, 245))

In [None]:
from helper import summarize_predictions_natural_language

In [None]:
# Convert the detection output into text with the local helper (its name
# suggests a natural-language summary — confirm in helper.py).
text = summarize_predictions_natural_language(pipeline_output)

In [None]:
# Display the generated summary text.
text

Generating Audio Narration of an Image

In [None]:
# Text-to-speech pipeline loaded from the local ./models/kakao-enterprise/vits-ljs directory.
tts_pipe = pipeline(
    task="text-to-speech",
    model="./models/kakao-enterprise/vits-ljs",
)

In [None]:
# Synthesize speech for the summary; the result is indexed below by
# "audio" and "sampling_rate".
narrated_text = tts_pipe(text)

Playing the Generated Audio

In [None]:
from IPython.display import Audio as IPythonAudio

In [None]:
# Play the generated audio at the sampling rate reported by the pipeline.
# [0] selects the first entry of the "audio" output (presumably a
# (1, n_samples) array — TODO confirm).
IPythonAudio(narrated_text["audio"][0], rate=narrated_text["sampling_rate"])

--------