<h1 align=center>A first end-to-end prototype using the computer's sounddevice</h1>

<h2> Description </h2>
This notebook is self-contained: you do not need the model server to run it.

The notebook contains a first end-to-end prototype of the speech-to-speech translation agent. It uses OpenAI's Whisper model for transcription and translation to English, OpenAI's GPT-3 for translation to the target language, and different TTS engines for the final speech output.

The first version uses the following TTS engines:
 - Tacotron2 for spectrogram generation
 - Vocoder/HifiGAN for audio generation

In [12]:
import asyncio
import sys

import numpy as np
import sounddevice as sd


import whisper
from speechbrain.pretrained import Tacotron2
from speechbrain.pretrained import HIFIGAN

In [13]:
# Speech-to-text stage: Whisper "small" checkpoint, stored under ../models.
transcription_model = whisper.load_model(
    "small",
    download_root="../models/whisper_small",
)

# Text-to-speech stage, part 1: Tacotron2 generates the spectrogram.
tacotron2 = Tacotron2.from_hparams(
    source="speechbrain/tts-tacotron2-ljspeech",
    savedir="../models/tts",
)

# Text-to-speech stage, part 2: the HiFi-GAN vocoder generates the audio.
hifi_gan = HIFIGAN.from_hparams(
    source="speechbrain/tts-hifigan-ljspeech",
    savedir="../models/vocoder",
)

In [14]:

async def record_buffer(buffer, **kwargs):
    """Record audio into ``buffer`` and return once it has been filled.

    Args:
        buffer: 2-D NumPy array written in place. ``len(buffer)`` is the
            number of frames to record, ``buffer.shape[1]`` is used as the
            channel count and ``buffer.dtype`` as the sample format of the
            input stream.
        **kwargs: Forwarded unchanged to ``sd.InputStream`` (for example
            ``samplerate`` or ``blocksize``).

    Returns:
        None. The recorded samples are available in ``buffer``.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # documented way to obtain it (get_event_loop() has policy-dependent
    # behaviour and is discouraged in coroutines).
    loop = asyncio.get_running_loop()
    event = asyncio.Event()
    idx = 0  # index of the next frame of `buffer` to be written

    def callback(indata, frame_count, time_info, status):
        nonlocal idx
        if status:
            print(status)
        remainder = len(buffer) - idx
        if remainder == 0:
            # asyncio.Event is not thread-safe, so the wake-up is scheduled
            # on the event loop instead of calling event.set() directly.
            loop.call_soon_threadsafe(event.set)
            raise sd.CallbackStop
        # The last block may be larger than the space left in the buffer.
        indata = indata[:remainder]
        buffer[idx:idx + len(indata)] = indata
        idx += len(indata)

    stream = sd.InputStream(callback=callback, dtype=buffer.dtype,
                            channels=buffer.shape[1], **kwargs)
    # The context manager starts the stream and closes it again on exit,
    # also when the wait is cancelled.
    with stream:
        await event.wait()


def transcribe_and_speak(buffer, **kwargs):
    """Translate recorded speech with Whisper and speak the result via TTS.

    The audio is transcribed/translated by ``transcription_model`` (Whisper,
    ``task="translate"``), the resulting text is converted to a spectrogram
    by ``tacotron2`` and the spectrogram to a waveform by ``hifi_gan``.

    Args:
        buffer: NumPy array with the recorded samples; it is flattened to
            1-D before being handed to Whisper. NOTE(review): Whisper is not
            told the sample rate of a raw array and assumes 16 kHz audio, so
            the caller should supply audio at that rate - confirm against
            the Whisper documentation.
        **kwargs: Accepted so that callers may pass extra options; currently
            unused.

    Returns:
        NumPy array with the synthesized waveform: the vocoder output with
        its leading dimension removed and the two remaining axes swapped
        (presumably ``(samples, channels)``, the layout ``play_buffer``
        expects - TODO confirm).
    """
    result = transcription_model.transcribe(
        buffer.flatten(),
        fp16=False,
        task="translate",
    )

    # Read the text once. `or ""` also covers a present-but-None value.
    text = result.get("text") or ""
    print(text)

    # Fall back to a fixed phrase when nothing was recognised (e.g. silence),
    # so the TTS model is never asked to synthesize an empty string.
    spoken_text = text if text.strip() else "Hello World"

    # Running the TTS model (text-to-spectrogram)
    mel_output, mel_length, alignment = tacotron2.encode_text(spoken_text)

    # Running Vocoder (spectrogram-to-waveform)
    waveform = hifi_gan.decode_batch(mel_output)

    return waveform.squeeze(0).permute((1, 0)).numpy()
    
async def play_buffer(buffer, **kwargs):
    """Play ``buffer`` through an output stream and return when it is consumed.

    Args:
        buffer: 2-D NumPy array with the samples to play. ``len(buffer)`` is
            the number of frames, ``buffer.shape[1]`` is used as the channel
            count and ``buffer.dtype`` as the sample format of the stream.
        **kwargs: Forwarded unchanged to ``sd.OutputStream`` (for example
            ``samplerate``).

    Returns:
        None.
    """
    # Inside a coroutine a loop is always running; get_running_loop() is the
    # documented way to obtain it (get_event_loop() has policy-dependent
    # behaviour and is discouraged in coroutines).
    loop = asyncio.get_running_loop()
    event = asyncio.Event()
    idx = 0  # index of the next frame of `buffer` to be played

    def callback(outdata, frame_count, time_info, status):
        nonlocal idx
        if status:
            print(status)
        remainder = len(buffer) - idx
        if remainder == 0:
            # asyncio.Event is not thread-safe, so the wake-up is scheduled
            # on the event loop instead of calling event.set() directly.
            loop.call_soon_threadsafe(event.set)
            raise sd.CallbackStop
        valid_frames = min(frame_count, remainder)
        outdata[:valid_frames] = buffer[idx:idx + valid_frames]
        # Pad the rest of the final block with silence.
        outdata[valid_frames:] = 0
        idx += valid_frames

    stream = sd.OutputStream(callback=callback, dtype=buffer.dtype,
                             channels=buffer.shape[1], **kwargs)
    # The context manager starts the stream and closes it again on exit,
    # also when the wait is cancelled.
    with stream:
        await event.wait()



In [15]:
# Recording configuration. The frame count is derived from named values
# instead of the bare product 22050 * 5.
SAMPLE_RATE = 22050  # samples per second used for the recording
DURATION_S = 5       # length of the recording in seconds

frames = SAMPLE_RATE * DURATION_S
channels = 1
dtype = 'float32'

In [16]:
# Pre-allocate the recording buffer: one row per frame, one column per channel.
buffer = np.zeros(shape=(frames, channels), dtype=dtype)
buffer.shape

(110250, 1)

In [17]:
print("Recording...")
await record_buffer(buffer, samplerate=22050, blocksize=1024)

print("Translating...")
waveform = transcribe_and_speak(buffer)

print("Re-playing...")
await play_buffer(waveform, samplerate=22050)

print('Done')

Recording...
Translating...
 And what is this? A very good thing?
Re-playing...
Done
