# 0. Setup

In [4]:
!pip install transformers gradio kokoro soundfile




# 1. Pipeline

Crea una función que encadene las tres pipelines para obtener voz como input y voz generada como output.

In [5]:
import io

import numpy as np
import soundfile as sf
from google import genai
from google.colab import userdata
from google.genai import types
from IPython.display import Audio, display
from kokoro import KPipeline
from transformers import pipeline

# Speech-to-text: Whisper (base) through the transformers ASR pipeline.
transcription_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="openai/whisper-base",
)

# Language model: Gemini client; the API key is read from the Colab secrets store.
llm_pipeline = genai.Client(api_key=userdata.get("GOOGLE_API_KEY"))

# Text-to-speech: Kokoro pipeline.
speech_generation_pipeline = KPipeline(lang_code="a")

def generate_transcription(audio):
  """Transcribe `audio` with the module-level ASR pipeline.

  Args:
    audio: Input forwarded unchanged to `transcription_pipeline`
      (the demos in this notebook pass a file path).

  Returns:
    The value stored under the ``'text'`` key of the pipeline result.
  """
  return transcription_pipeline(audio)['text']

def generate_response(text):
    """Send `text` to the Gemini model as a user message and return the reply.

    Args:
        text: The user's message; it is converted with ``str()`` before sending.

    Returns:
        The ``.text`` attribute of the model response.
    """
    # Single-turn conversation: one user message holding one text part.
    user_message = types.Content(
        role="user",
        parts=[types.Part.from_text(text=str(text))],
    )
    # The system prompt keeps answers short, which suits spoken output.
    generation_config = types.GenerateContentConfig(
        system_instruction="You are a super helpful assistant that can answer questions and help with tasks. Be concise and to the point."
    )
    reply = llm_pipeline.models.generate_content(
        model="gemini-2.0-flash",
        contents=user_message,
        config=generation_config,
    )
    return reply.text

def generate_speech(text, voice='af_heart', sample_rate=24000, output_path="temp_audio.wav"):
    """Synthesize `text` with the Kokoro pipeline and save it as an audio file.

    Args:
        text: Text to be spoken.
        voice: Voice identifier forwarded to `speech_generation_pipeline`.
        sample_rate: Sample rate passed to `sf.write` (default 24000).
        output_path: Destination file. The default is a fixed path, so each
            call overwrites the previous result; pass a unique path when
            several requests may run at the same time.

    Returns:
        `output_path`, the path of the file that was written.
    """
    chunks = []
    for _graphemes, _phonemes, audio in speech_generation_pipeline(text, voice):
        if audio is None:
            # Skip segments that carry no audio instead of crashing on them.
            continue
        # Tensor-like chunks are converted to NumPy once per chunk; extending a
        # Python list with the chunk would iterate it sample by sample.
        if hasattr(audio, "detach"):
            audio = audio.detach().cpu().numpy()
        chunks.append(np.asarray(audio))

    # np.concatenate rejects an empty list, so fall back to an empty waveform.
    waveform = np.concatenate(chunks) if chunks else np.zeros(0, dtype=np.float32)
    sf.write(output_path, waveform, sample_rate)

    return output_path

Device set to use cpu




# 3. Demo

1. Construye una demo para el asistente de voz.

- Como input usa un micrófono

  ```
  gr.Audio(sources=["microphone"], type="filepath")
  ```

- Como output usa un reproductor de audio

  ```
  gr.Audio(label="Generated Speech", autoplay=True)
  ```

- Añade el parámetro `live=True` para hacer que el input se envíe automáticamente


2. Prueba a generar respuestas a preguntas en inglés. Intenta que sean respuestas cortas.

3. Para tener mejor visibilidad, añade como outputs los pasos intermedios de transcripción y respuesta.


In [9]:
def procesar_input(audio):
  """Run the full voice-assistant chain: speech -> text -> LLM reply -> speech.

  Args:
    audio: Path of the recorded audio file, or None when there is no
      recording to process.

  Returns:
    A tuple ``(transcription, response, voice_response)`` where
    `voice_response` is the path of the generated audio file. Steps that
    could not run yield ``""`` for the texts and None for the audio.
  """
  # The UI can invoke this callback without a recording (e.g. when the
  # recording is cleared); return empty outputs instead of failing in the ASR step.
  if audio is None:
    return "", "", None

  transcription = generate_transcription(audio)
  # Nothing intelligible was said: skip the LLM call and the speech synthesis.
  if not transcription.strip():
    return transcription, "", None

  response = generate_response(transcription)
  # An empty reply cannot be synthesized; still surface the transcription.
  if not response:
    return transcription, "", None

  voice_response = generate_speech(response)
  return transcription, response, voice_response

import gradio as gr
# Voice-assistant demo: microphone recording in; transcription, LLM reply and
# synthesized speech out.
demo = gr.Interface(
    fn=procesar_input,
    title='Voice Assistant',
    inputs=[gr.Audio(sources=["microphone"], type="filepath")],
    outputs=[
        gr.Textbox(label="Transcription"),
        gr.Textbox(label="Response"),
        gr.Audio(label="Generated Speech", autoplay=True),
    ],
    live=True,  # submit the input automatically, without a Submit click
)

demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9ea16c99c696a60c47.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [10]:
import gradio as gr

def procesar_input(audio_input):
  """Answer a spoken question with speech only (no intermediate text outputs).

  NOTE(review): this redefines `procesar_input` from the earlier demo cell;
  after this cell runs, the name refers to this audio-only variant.

  Args:
    audio_input: Path of the recorded audio file, or None when the form is
      submitted without a recording.

  Returns:
    Path of the generated audio file, or None when there is nothing to play.
  """
  # Submitting without a recording would otherwise crash in the ASR step.
  if audio_input is None:
    return None

  transcription = generate_transcription(audio_input)
  response = generate_response(transcription)
  # An empty reply cannot be synthesized.
  if not response:
    return None

  audio_path = generate_speech(response)
  return audio_path

# Minimal Gradio UI: microphone recording in, generated speech out.
# The microphone is selected on the gr.Audio component through `sources=[...]`.
demo = gr.Interface(
    fn=procesar_input,
    inputs=gr.Audio(sources=["microphone"], type="filepath"),
    outputs=gr.Audio(label="Generated Speech"),
    # NOTE: `source="microphone"` is not a valid gr.Interface argument, so it is not passed here.
)

# Start the web app for this interface.
demo.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://dd1438c10419f8ee49.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


