# Speech-to-text

The file `../instance/config.py` must define `AZURE_SPEECH_KEY`, `AZURE_SPEECH_REGION`, `LANGUAGE` and `SHORT_LANGUAGE`; the next cell loads it:

In [1]:
# Load the settings by executing the config file in this notebook's namespace.
# The `with` block closes the file handle as soon as its contents have been read.
# NOTE(review): exec runs whatever code the file contains -- only use a trusted, local file.
with open('../instance/config.py') as config_file:
    exec(config_file.read())

## Azure Speech

[Speech-to-text quickstart](https://learn.microsoft.com/en-us/azure/cognitive-services/speech-service/get-started-speech-to-text?pivots=programming-language-python)

In [2]:
# !pip install azure-cognitiveservices-speech

In [3]:
import azure.cognitiveservices.speech as speechsdk

In [4]:
def speech_to_text_azure():
    """Recognize speech from the default microphone with Azure Speech.

    Builds a recognizer from AZURE_SPEECH_KEY, AZURE_SPEECH_REGION and LANGUAGE,
    prints a prompt and waits for the result of ``recognize_once_async``.

    Returns:
        The recognized text when the result reason is ``RecognizedSpeech``.
        For any other reason a diagnostic is printed (for ``NoMatch`` and
        ``Canceled``) and ``None`` is returned.
    """
    config = speechsdk.SpeechConfig(subscription=AZURE_SPEECH_KEY, region=AZURE_SPEECH_REGION)
    config.speech_recognition_language = LANGUAGE

    microphone = speechsdk.audio.AudioConfig(use_default_microphone=True)
    recognizer = speechsdk.SpeechRecognizer(speech_config=config, audio_config=microphone)

    print('Speak into your microphone.')
    result = recognizer.recognize_once_async().get()

    # Success path first: hand the transcript straight back to the caller.
    if result.reason == speechsdk.ResultReason.RecognizedSpeech:
        return result.text

    # Audio was captured but nothing could be recognized in it.
    if result.reason == speechsdk.ResultReason.NoMatch:
        print('No speech could be recognized: {}'.format(result.no_match_details))
        return None

    # The request was canceled; report why, with extra detail for errors.
    if result.reason == speechsdk.ResultReason.Canceled:
        details = result.cancellation_details
        print('Speech Recognition canceled: {}'.format(details.reason))
        if details.reason == speechsdk.CancellationReason.Error:
            print('Error details: {}'.format(details.error_details))
            print('Did you set the speech resource key and region values?')

    return None

In [5]:
# Run the Azure recognizer; the bare `text` expression displays the returned value as the cell output.
text = speech_to_text_azure()
text

Speak into your microphone.


'4+6-2 égale 8.'

## Speech Recognition

In [6]:
# !pip install SpeechRecognition
# !pip install PyAudio

In [7]:
import speech_recognition as sr

In [8]:
def audio_from_microphone():
    """Record audio from a microphone opened with ``sr.Microphone()``.

    Prints a prompt, then calls ``Recognizer.listen`` on the opened source.

    Returns:
        The audio object returned by ``Recognizer.listen``.
    """
    recognizer = sr.Recognizer()

    with sr.Microphone() as microphone:
        print('Speak into your microphone.')
        captured = recognizer.listen(microphone)

    return captured

In [9]:
def speech_to_text_google():
    """Record from the microphone and transcribe with ``recognize_google``.

    The recording comes from ``audio_from_microphone`` and is recognized
    using the language code stored in LANGUAGE.

    Returns:
        The value returned by ``Recognizer.recognize_google``.
    """
    recording = audio_from_microphone()
    return sr.Recognizer().recognize_google(recording, language=LANGUAGE)

In [10]:
# Same spoken input, recognized through `recognize_google`; `text` is shown as the cell output.
text = speech_to_text_google()
text

Speak into your microphone.


'4 + 6 - 2 = 8'

## Whisper

[Setup](https://github.com/openai/whisper#setup)

In [11]:
# !pip install git+https://github.com/openai/whisper.git 

In [12]:
import tempfile

import numpy as np
import whisper

In [13]:
# Load the Whisper model once so the cells below can reuse it; the trailing comment lists the other size names.
model = whisper.load_model('large') # tiny, base, small, medium, large

In [14]:
def speech_to_text_whisper(model, translate=False):
    """Record speech from the microphone and run it through a Whisper model.

    Args:
        model: Loaded Whisper model (e.g. the object returned by
            ``whisper.load_model``); its ``transcribe`` method is called.
        translate: When True, request the ``'translate'`` task; otherwise
            request ``'transcribe'``.

    Returns:
        The ``'text'`` entry of the result returned by ``model.transcribe``.
    """
    import os

    audio = audio_from_microphone()

    # The model is given a file path, so the WAV data must be on disk. Write it
    # into a temporary directory and close it before transcribing: a
    # NamedTemporaryFile that is still open cannot be reopened by name on Windows.
    with tempfile.TemporaryDirectory() as tmp_dir:
        wav_path = os.path.join(tmp_dir, 'speech.wav')
        with open(wav_path, 'wb') as wav_file:
            wav_file.write(audio.get_wav_data())

        # Use the exact lowercase task names instead of 'Translate' / None,
        # which only worked through the library's fallback handling.
        result = model.transcribe(wav_path,
                                  language=SHORT_LANGUAGE,
                                  task='translate' if translate else 'transcribe',
                                  fp16=False)

    return result['text']

In [15]:
# Transcribe with the default `translate=False`; `text` is shown as the cell output.
text = speech_to_text_whisper(model)
text

Speak into your microphone.


' 4 plus 6 moins 2 égale 8.'

In [16]:
# Same call with `translate=True`; the output recorded below is in English.
translated_text = speech_to_text_whisper(model, translate=True)
translated_text

Speak into your microphone.


' 4 plus 6 minus 2 equals 8.'