#### 語音直接翻譯：Azure 直接整合 speech -> text -> translation

In [1]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment; the cells
# below read AZURE_SPEECH_KEY and AZURE_SERVICE_REGION through os.getenv.
load_dotenv()

True

In [1]:
import os
import azure.cognitiveservices.speech as speechsdk
import torchaudio

def convert_16k(wav_file):
    """Return the path of a 16 kHz version of ``wav_file``.

    The file is loaded with torchaudio. When it is already sampled at 16 kHz
    the original path is returned unchanged and nothing is written. Otherwise
    the audio is resampled and saved next to the original as
    ``<name>_16k.wav``.

    Args:
        wav_file: Path of the audio file to check (str or path-like).

    Returns:
        ``wav_file`` itself when no conversion was needed, otherwise the path
        (str) of the newly written 16 kHz file.
    """
    data, sr = torchaudio.load(wav_file)
    if sr == 16000:
        return wav_file

    data = torchaudio.functional.resample(data, sr, 16000)
    # Only touch the final extension. A plain str.replace(".wav", ...) would
    # also rewrite ".wav" inside directory names, and for names without a
    # lowercase ".wav" suffix it would leave the path unchanged so the save
    # below would overwrite the source recording.
    root, _ext = os.path.splitext(wav_file)
    new_wav_file = root + "_16k.wav"
    torchaudio.save(new_wav_file, data, 16000)
    return new_wav_file

# Input recording; convert_16k returns a 16 kHz copy when the file uses another sample rate.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
wav_file = convert_16k(wav_file)

# This example requires environment variables named "AZURE_SPEECH_KEY" and "AZURE_SERVICE_REGION".
# Fail early with a clear message instead of handing None to the SDK.
missing_vars = [name for name in ("AZURE_SPEECH_KEY", "AZURE_SERVICE_REGION") if os.getenv(name) is None]
if missing_vars:
    raise EnvironmentError("Missing environment variable(s): {}".format(", ".join(missing_vars)))

speech_translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=os.getenv('AZURE_SPEECH_KEY'), region=os.getenv('AZURE_SERVICE_REGION'))
# Language spoken in the recording.
speech_translation_config.speech_recognition_language="zh-TW"

# Language the recognized text is translated into.
target_language="ja"
speech_translation_config.add_target_language(target_language)

# Read audio from the file rather than from a microphone.
audio_config = speechsdk.audio.AudioConfig(filename=wav_file)
translation_recognizer = speechsdk.translation.TranslationRecognizer(translation_config=speech_translation_config, audio_config=audio_config)

# Single-shot recognition: one result for the recording.
translation_recognition_result = translation_recognizer.recognize_once()

# Report the outcome according to the result reason.
if translation_recognition_result.reason == speechsdk.ResultReason.TranslatedSpeech:
    print("Recognized: {}".format(translation_recognition_result.text))
    print("""Translated into '{}': {}""".format(
        target_language,
        translation_recognition_result.translations[target_language]))
elif translation_recognition_result.reason == speechsdk.ResultReason.NoMatch:
    print("No speech could be recognized: {}".format(translation_recognition_result.no_match_details))
elif translation_recognition_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = translation_recognition_result.cancellation_details
    print("Speech Recognition canceled: {}".format(cancellation_details.reason))
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))
        print("Did you set the speech resource key and region values?")

Recognized: 今天午餐要吃什麼？
Translated into 'ja': 今日のお昼ごはんは何を食べますか?


#### 語音 to 語音：Azure 直接整合 speech -> text -> translation -> speech synthesis

In [9]:
import os
import azure.cognitiveservices.speech as speechsdk

# Azure Speech credentials, read from the environment (None when a variable is unset).
speech_key = os.getenv('AZURE_SPEECH_KEY')
service_region = os.getenv('AZURE_SERVICE_REGION')

# Language spoken in the recording and the language to translate into.
from_language = 'zh-TW'
to_language = 'ja'

# Input recording, passed through convert_16k before recognition.
wav_file = convert_16k(
    "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
)

def translate_speech_to_text():
    """Translate the recording in ``wav_file`` and save the synthesized speech.

    Builds a translation recognizer for ``from_language`` -> ``to_language``
    with a synthesis voice configured, runs a single recognition on the audio
    file, writes any synthesized audio to ``translation.wav`` and prints the
    summary produced by ``get_result_text``.

    Reads the module-level ``speech_key``, ``service_region``,
    ``from_language``, ``to_language`` and ``wav_file``.
    """
    translation_config = speechsdk.translation.SpeechTranslationConfig(
            subscription=speech_key, region=service_region)

    translation_config.speech_recognition_language = from_language
    translation_config.add_target_language(to_language)

    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)

    # Setting a voice makes the recognizer emit synthesized audio events.
    # See: https://aka.ms/speech/sdkregion#standard-and-neural-voices
    translation_config.voice_name = "ja-JP-AoiNeural"

    translation_recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_config)

    def synthesis_callback(evt):
        """Log each synthesis event and persist non-empty audio payloads."""
        size = len(evt.result.audio)
        print(f'Audio synthesized: {size} byte(s) {"(COMPLETED)" if size == 0 else ""}')

        if size > 0:
            # Context manager closes the handle even if the write fails.
            with open('translation.wav', 'wb') as audio_file:
                audio_file.write(evt.result.audio)

    translation_recognizer.synthesizing.connect(synthesis_callback)

    print(f'Say something in "{from_language}" and we\'ll translate into "{to_language}".')

    result = translation_recognizer.recognize_once()
    print(get_result_text(reason=result.reason, result=result))

def get_result_text(reason, result):
    """Return a printable summary for a translation recognition result.

    Args:
        reason: The ``speechsdk.ResultReason`` of the result.
        result: The recognition result. Only the attributes relevant to
            ``reason`` are read.

    Returns:
        A message describing the outcome, or ``'Unable to recognize speech'``
        for any reason not handled here.
    """
    # Format only the message that matches the reason. Building every message
    # up front indexes result.translations[to_language] even for failed
    # recognitions, which fails when no translation was produced and hides the
    # actual failure reason.
    if reason == speechsdk.ResultReason.TranslatedSpeech:
        return (
            f'Recognized "{from_language}": {result.text}\n'
            f'Translated into "{to_language}": {result.translations[to_language]}'
        )
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        return f'Recognized: "{result.text}"'
    if reason == speechsdk.ResultReason.NoMatch:
        return f'No speech could be recognized: {result.no_match_details}'
    if reason == speechsdk.ResultReason.Canceled:
        return f'Speech Recognition canceled: {result.cancellation_details}'
    return 'Unable to recognize speech'

# Run one recognition on the file: prints the result summary and, through the
# synthesis callback, writes translation.wav.
translate_speech_to_text()

Say something in "zh-TW" and we'll translate into "ja".
Audio synthesized: 120444 byte(s) 
Audio synthesized: 0 byte(s) (COMPLETED)
Recognized "zh-TW": 今天午餐要吃什麼？
Translated into "ja"": 今日のお昼ごはんは何を食べますか?


#### 語音 to 語音：也可以先產生翻譯，再做 TTS。speech -> text -> translation；translation -> speech synthesis

In [10]:
import os
import azure.cognitiveservices.speech as speechsdk

# Azure Speech credentials, read from the environment (None when a variable is unset).
speech_key = os.getenv('AZURE_SPEECH_KEY')
service_region = os.getenv('AZURE_SERVICE_REGION')

# Language spoken in the recording and the language to translate into.
from_language = 'zh-TW'
to_language = 'ja'

# Input recording, passed through convert_16k before recognition.
wav_file = convert_16k(
    "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
)

def translate_speech_to_text():
    """Recognize and translate ``wav_file``, then synthesize the translations.

    Runs a single translation recognition (``from_language`` ->
    ``to_language``) on the audio file and passes the result to
    ``synthesize_translations``. When nothing was recognized or the request
    was canceled, the reason is printed and synthesis is skipped.

    Reads the module-level ``speech_key``, ``service_region``,
    ``from_language``, ``to_language`` and ``wav_file``.
    """
    translation_config = speechsdk.translation.SpeechTranslationConfig(
            subscription=speech_key, region=service_region)

    translation_config.speech_recognition_language = from_language
    translation_config.add_target_language(to_language)

    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)
    translation_recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_config)

    result = translation_recognizer.recognize_once()

    # Surface failures instead of silently printing an empty recognition.
    if result.reason == speechsdk.ResultReason.NoMatch:
        print(f'No speech could be recognized: {result.no_match_details}')
        return
    if result.reason == speechsdk.ResultReason.Canceled:
        print(f'Speech Recognition canceled: {result.cancellation_details}')
        return

    synthesize_translations(result=result)

def synthesize_translations(result):
    """Print each translation in ``result`` and synthesize it to a WAV file.

    For every language in ``result.translations`` that has a voice configured,
    the translated text is spoken into ``<language>-translation.wav``.
    Languages without a configured voice are reported and skipped, and a
    canceled synthesis is reported instead of being ignored.

    Args:
        result: Translation recognition result exposing ``text`` and a
            ``translations`` mapping of language code to translated text.

    Reads the module-level ``speech_key`` and ``service_region``.
    """
    language_to_voice_map = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-AriaNeural",
        "it": "it-IT-ElsaNeural",
        "pt": "pt-BR-FranciscaNeural",
        "zh-Hans": "zh-CN-XiaoxiaoNeural",
        'ja': "ja-JP-AoiNeural"
    }
    print(f'Recognized: "{result.text}"')

    for language in result.translations:
        translation = result.translations[language]
        print(f'Translated into "{language}": {translation}')

        voice_name = language_to_voice_map.get(language)
        if voice_name is None:
            # No voice is mapped for this language; skip it rather than
            # assigning None as the synthesis voice name.
            print(f'No synthesis voice configured for "{language}"; skipping speech output.')
            continue

        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        speech_config.speech_synthesis_voice_name = voice_name

        audio_config = speechsdk.audio.AudioOutputConfig(filename=f'{language}-translation.wav')
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        synthesis_result = speech_synthesizer.speak_text_async(translation).get()

        # Report a canceled synthesis instead of discarding the outcome.
        if synthesis_result.reason == speechsdk.ResultReason.Canceled:
            cancellation_details = synthesis_result.cancellation_details
            print(f'Speech synthesis canceled for "{language}": {cancellation_details.reason}')
            if cancellation_details.reason == speechsdk.CancellationReason.Error:
                print(f'Error details: {cancellation_details.error_details}')

# Recognize and translate the file, then synthesize one
# <language>-translation.wav per translation.
translate_speech_to_text()

Recognized: "今天午餐要吃什麼？"
Translated into "ja": 今日のお昼ごはんは何を食べますか?
