# Voice-To-Text


**Requirements:** 
* Gradio: 4.36.1
* Transformers: 4.41.2
* NumPy: 1.26.4
* PyWavelets: 1.6.0
* Pydub: 0.25.1
* openai: 1.34.0
* openai/whisper-small: 20231117
* ffmpeg: 7.0.1

In [None]:
import gradio as gr
from transformers import pipeline
import numpy as np
import pywt

# Build the speech-recognition pipeline once at module load; both the
# microphone and the file-upload handlers below reuse this single instance.
transcriber = pipeline("automatic-speech-recognition", model="openai/whisper-small")

# Audio accumulated so far for the microphone tab. It is None until the first
# chunk arrives, grows inside transcribe(), and is reset by clear_stream().
# NOTE(review): this is a module-level global, so it is presumably shared by
# every browser session connected to the app -- confirm whether that matters.
stream = None

def denoise_audio_wavelet(audio_data):
    """Reduce noise in a 1-D signal by soft-thresholding its wavelet details.

    The signal is decomposed with the ``sym3`` wavelet down to the deepest
    level PyWavelets allows, the detail coefficients are shrunk with the
    universal threshold ``sigma * sqrt(2 * ln(N))`` (``sigma`` estimated from
    the finest detail band as ``median(|d|) / 0.6745``), and the signal is
    reconstructed.

    Args:
        audio_data: 1-D array-like of audio samples.

    Returns:
        A NumPy array with the same number of samples as the input. The
        input is returned as-is when it is empty, too short to decompose, or
        when the estimated noise level is zero (nothing to remove).
    """
    audio_data = np.asarray(audio_data)
    n_samples = len(audio_data)
    if n_samples == 0:
        return audio_data

    # Perform wavelet transform
    wavelet = 'sym3'
    max_level = pywt.dwt_max_level(n_samples, wavelet)
    if max_level < 1:
        # Shorter than the wavelet filter: a level-0 "decomposition" is just
        # the raw samples, and thresholding those would distort the signal.
        return audio_data
    coeffs = pywt.wavedec(audio_data, wavelet, level=max_level)

    # Universal threshold from a robust noise estimate on the finest details.
    sigma = np.median(np.abs(coeffs[-1])) / 0.6745
    threshold = sigma * np.sqrt(2 * np.log(n_samples))
    if not threshold > 0:
        # A zero threshold (e.g. digital silence) makes soft-thresholding
        # evaluate 0/0 on zero-valued coefficients and emit NaNs. The check is
        # written as `not > 0` so a NaN threshold is rejected as well.
        return audio_data

    # Shrink only the detail bands; the approximation band (coeffs[0]) holds
    # the coarse signal shape rather than noise, so it is kept untouched.
    denoised_coeffs = [coeffs[0]] + [
        pywt.threshold(c, threshold, mode='soft') for c in coeffs[1:]
    ]

    # Reconstruct the signal. waverec yields one extra sample for odd-length
    # inputs, so trim back to the original length.
    denoised_audio = pywt.waverec(denoised_coeffs, wavelet)
    return denoised_audio[:n_samples]


def to_mono(audio_data):
    """Collapse multi-channel audio to a single 1-D channel.

    Args:
        audio_data: Array-like of samples, either 1-D (already mono) or 2-D
            laid out as ``(samples, channels)``.

    Returns:
        A NumPy array. 1-D input is returned unchanged; 2-D input is averaged
        across the channel axis. This includes the single-column ``(N, 1)``
        case, which previously slipped through still 2-D.
    """
    audio_data = np.asarray(audio_data)
    if audio_data.ndim > 1:
        # Averaging over axis 1 also squeezes a lone channel column.
        audio_data = np.mean(audio_data, axis=1)
    return audio_data

# real-time
def transcribe(new_chunk):
    """Append a microphone chunk to the running buffer and transcribe it all.

    Args:
        new_chunk: ``(sample_rate, samples)`` tuple for the newest chunk,
            where ``samples`` is a NumPy array.

    Returns:
        The transcription text of everything buffered so far, or ``""`` when
        nothing has been buffered yet.

    Side effects:
        Updates the module-level ``stream`` buffer.
    """
    global stream
    sr, y = new_chunk
    y = y.astype(np.float32)

    if y.size:
        # Peak-normalise, but never divide by zero: a silent chunk would
        # otherwise turn into NaNs that poison the whole buffer.
        peak = np.max(np.abs(y))
        if peak > 0:
            y /= peak

        y = to_mono(y)
        if y.any():
            # All-zero audio carries no noise to remove, so skip denoising.
            y = denoise_audio_wavelet(y)

        stream = y if stream is None else np.concatenate([stream, y])

    if stream is None:
        # Only empty chunks so far; there is nothing to transcribe.
        return ""
    return transcriber({"sampling_rate": sr, "raw": stream})["text"]

# from audio file
def transcribe_file(audio_file):
    """Transcribe a complete uploaded audio clip.

    Args:
        audio_file: ``(sample_rate, samples)`` tuple, or ``None`` when there
            is no audio (presumably what the ``change`` event delivers once
            the upload is cleared -- confirm against Gradio).

    Returns:
        The transcription text, or ``""`` when there is no audio to process.
    """
    if audio_file is None:
        return ""

    sr, audio = audio_file
    audio = audio.astype(np.float32)
    if audio.size == 0:
        return ""

    # Peak-normalise, guarding against an all-zero (silent) clip, which would
    # otherwise become all-NaN after the division.
    peak = np.max(np.abs(audio))
    if peak > 0:
        audio /= peak

    audio = to_mono(audio)
    if audio.any():
        # All-zero audio carries no noise to remove, so skip denoising.
        audio = denoise_audio_wavelet(audio)

    return transcriber({"sampling_rate": sr, "raw": audio})["text"]


def clear_stream():
    """Drop all buffered microphone audio.

    Resets the module-level ``stream`` to ``None`` and returns ``None``.
    """
    global stream
    stream = None
    return None

# Two-tab UI: live microphone transcription and one-shot file transcription.
with gr.Blocks() as demo:
    with gr.Tab("Microphone"):
        mic_input = gr.Audio(sources=["microphone"], streaming=True)
        mic_output = gr.Textbox()
        mic_clear = gr.Button("Clear")

        # Each streamed chunk goes through transcribe(); its returned text is
        # written to mic_output.
        mic_input.stream(transcribe, inputs=mic_input, outputs=mic_output)
        # clear_stream() resets the global buffer; its None return value is
        # what gets sent to mic_output.
        mic_clear.click(clear_stream, None, mic_output)
        
    with gr.Tab("Upload File"):
        file_input = gr.Audio(sources=["upload"])
        file_output = gr.Textbox()
        file_clear = gr.Button("Clear")
        
        # Re-run the file transcription whenever the uploaded audio changes.
        file_input.change(transcribe_file, inputs=file_input, outputs=file_output)
        # NOTE(review): this reuses clear_stream(), so clicking it also resets
        # the microphone buffer; its None return is sent to file_output.
        file_clear.click(clear_stream, None, file_output)

# Start the Gradio app as soon as this cell/module runs.
demo.launch()