# Project - Speech to Text Converter

Convert speech to text, translate it, and speak it back in the language of your choice.

In [None]:
#!pip install pydub simpleaudio speechrecognition pipwin pyaudio

In [1]:
import os
from openai import OpenAI
import gradio as gr
from dotenv import load_dotenv
import speech_recognition as sr
from pydub import AudioSegment
import simpleaudio as sa
from pydub import AudioSegment
from pydub.playback import play
from io import BytesIO

In [2]:
# Read settings from the local .env file (override=True is passed to load_dotenv).
load_dotenv(override=True)

# Report whether a key was found, echoing only its first eight characters.
openai_api_key = os.getenv('OPENAI_API_KEY')
print(
    f"OpenAI API Key exists and begins {openai_api_key[:8]}"
    if openai_api_key
    else "OpenAI API Key not set"
)

# Chat model name and the OpenAI client used by the cells below.
MODEL = "gpt-4o-mini"
openai = OpenAI()

OpenAI API Key exists and begins sk-proj-


In [3]:
def talker(message):
    """Synthesize `message` as speech and play it aloud.

    Requests audio from the OpenAI speech endpoint (model "tts-1",
    voice "onyx"), decodes the returned bytes as MP3 with pydub, and
    hands the decoded segment to `play`. Returns None.
    """
    tts_request = {"model": "tts-1", "voice": 'onyx', "input": message}
    speech = openai.audio.speech.create(**tts_request)

    # Decode straight from an in-memory buffer.
    mp3_buffer = BytesIO(speech.content)
    play(AudioSegment.from_file(mp3_buffer, format="mp3"))

In [4]:
def update_text(text, language):
    """Translate `text` into `language` using the chat model.

    Sends a system prompt naming the target language, followed by `text`
    as the user message, and returns the content of the first choice in
    the reply.
    """
    instructions = (
        f"You are a language translator. You have to convert the text into {language}. "
        "Do not add extra information. Translate accurately."
    )
    conversation = [
        {"role": "system", "content": instructions},
        {"role": "user", "content": text},
    ]
    completion = openai.chat.completions.create(model=MODEL, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [5]:
def listener(language):
    """Capture one utterance from the microphone, translate it, and speak it.

    Args:
        language: Target language name, passed on to `update_text`.

    Returns:
        A `(text, translated_text)` tuple. When recognition fails, the first
        element is an explanatory message and the second is an empty string.
    """
    speech_engine = sr.Recognizer()

    with sr.Microphone() as mic:
        print('Listening.. Speak now!')
        # Adjust for ambient noise before recording the utterance.
        speech_engine.adjust_for_ambient_noise(mic)
        recording = speech_engine.listen(mic)

    print('Processing Speech...')
    try:
        heard = speech_engine.recognize_google(recording)
        translation = update_text(heard, language)
        # Speak the translation before handing the text back to the caller.
        talker(translation)
    except sr.UnknownValueError:
        return 'Sorry I did not understand that.', ''
    except sr.RequestError:
        return 'Could not request results, please check your internet connection.', ''
    return heard, translation

In [9]:
# Manual check outside the UI. `listener` requires a target language and
# returns a (text, translated_text) pair, so pass one explicitly and unpack
# the result instead of printing the whole tuple.
text, translated_text = listener("English")
if text:
    print(f"You said: {text}")

Listening.. Speak now!
Processing Speech...
You said: hi how are you


In [9]:
# Languages offered in the output-language dropdown.
LANGUAGE_CHOICES = ["English", "Spanish", "German", "Italian"]


def _blank_outputs():
    """Return empty strings for both output panels."""
    return "", ""


with gr.Blocks() as ui:
    with gr.Tab("Speech to Text converter") as s2t:
        # Headings for the two output panels.
        with gr.Row():
            gr.Markdown("### Let's write down your thoughts...")
            gr.Markdown("### Translated text goes here...")
        # Panels that receive the recognized text and its translation.
        with gr.Row():
            text_output = gr.Markdown("", height=150)
            translated_output = gr.Markdown("", height=150)
        with gr.Row():
            speech_button = gr.Button("Convert Voice to Text")
        with gr.Row():
            language = gr.Dropdown(
                LANGUAGE_CHOICES,
                label="Select output language",
                value="English",
            )
        with gr.Row():
            clear = gr.Button("Clear")

    # listener returns (text, translated_text), matching the two outputs in order.
    speech_button.click(listener, inputs=[language], outputs=[text_output, translated_output])
    clear.click(_blank_outputs, outputs=[text_output, translated_output])

# NOTE(review): share=True also publishes a public gradio.live link (see the
# cell output); confirm that exposing this app publicly is intended.
ui.launch(inbrowser=True, share=True)


* Running on local URL:  http://127.0.0.1:7861
* Running on public URL: https://a764ab219db07952d9.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Listening.. Speak now!
Processing Speech...
Listening.. Speak now!
Processing Speech...


In [8]:
# Sanity check: print the versions of the FFmpeg command-line tools to confirm
# they can be found from the notebook's shell.
# NOTE(review): presumably these are what pydub relies on for MP3 decoding and
# playback in `talker` — confirm.
!ffmpeg -version
!ffprobe -version
!ffplay -version

ffmpeg version 7.1.1 Copyright (c) 2000-2025 the FFmpeg developers
built with Apple clang version 16.0.0 (clang-1600.0.26.6)
configuration: --prefix=/opt/homebrew/Cellar/ffmpeg/7.1.1_2 --enable-shared --enable-pthreads --enable-version3 --cc=clang --host-cflags= --host-ldflags='-Wl,-ld_classic' --enable-ffplay --enable-gnutls --enable-gpl --enable-libaom --enable-libaribb24 --enable-libbluray --enable-libdav1d --enable-libharfbuzz --enable-libjxl --enable-libmp3lame --enable-libopus --enable-librav1e --enable-librist --enable-librubberband --enable-libsnappy --enable-libsrt --enable-libssh --enable-libsvtav1 --enable-libtesseract --enable-libtheora --enable-libvidstab --enable-libvmaf --enable-libvorbis --enable-libvpx --enable-libwebp --enable-libx264 --enable-libx265 --enable-libxml2 --enable-libxvid --enable-lzma --enable-libfontconfig --enable-libfreetype --enable-frei0r --enable-libass --enable-libopencore-amrnb --enable-libopencore-amrwb --enable-libopenjpeg --enable-libspeex --e

In [7]:
import os

# Make Homebrew's bin directory visible to this kernel's PATH (the ffmpeg build
# printed in this notebook is installed under /opt/homebrew).
# NOTE(review): this cell sits after the cells that play audio; move it above
# them if the FFmpeg tools are not otherwise on PATH.
HOMEBREW_BIN = "/opt/homebrew/bin"

# Only append when missing, so re-running the cell does not keep growing PATH,
# and tolerate an environment where PATH is not set at all.
_current_path = os.environ.get("PATH", "")
if HOMEBREW_BIN not in _current_path.split(os.pathsep):
    os.environ["PATH"] = (
        f"{_current_path}{os.pathsep}{HOMEBREW_BIN}" if _current_path else HOMEBREW_BIN
    )