#### 語音直接翻譯，AZURE直接整合 speech -> text -> translation

In [1]:
from dotenv import load_dotenv
import os
load_dotenv()

True

In [3]:
import os
import azure.cognitiveservices.speech as speechsdk
import torchaudio

def convert_16k(wav_file):
    """Return the path of a 16 kHz version of ``wav_file``.

    The audio is loaded with ``torchaudio.load``. If its sample rate is already
    16000 Hz the input path is returned unchanged and nothing is written.
    Otherwise the audio is resampled to 16000 Hz, saved next to the original
    as ``<name>_16k.wav`` and the new path is returned.

    Args:
        wav_file: Path of the audio file to check.

    Returns:
        ``wav_file`` itself, or the path of the newly written resampled file.
    """
    data, sr = torchaudio.load(wav_file)
    if sr == 16000:
        return wav_file

    data = torchaudio.functional.resample(data, sr, 16000)
    # Derive the output name from the extension only. A plain
    # str.replace(".wav", "_16k.wav") would also rewrite ".wav" inside
    # directory names, and would leave the path unchanged (so the save below
    # would overwrite the source file) for inputs such as "clip.WAV".
    root, _ext = os.path.splitext(wav_file)
    new_wav_file = root + "_16k.wav"
    torchaudio.save(new_wav_file, data, 16000)
    return new_wav_file

# Source recording. convert_16k() returns this path unchanged when the file is
# already 16 kHz, otherwise the path of a resampled "_16k.wav" copy.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
wav_file = convert_16k(wav_file)

# This example requires environment variables named "AZURE_SPEECH_KEY" and
# "AZURE_SERVICE_REGION"; os.getenv returns None for a variable that is not set.
speech_translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=os.getenv('AZURE_SPEECH_KEY'), region=os.getenv('AZURE_SERVICE_REGION'))
# Language spoken in the input audio.
speech_translation_config.speech_recognition_language="zh-TW"

# Language the recognized text is translated into; also the key used below to
# look the translation up in the result.
target_language="ja"
speech_translation_config.add_target_language(target_language)

# Read the audio from the file instead of a microphone.
audio_config = speechsdk.audio.AudioConfig(filename=wav_file)
translation_recognizer = speechsdk.translation.TranslationRecognizer(translation_config=speech_translation_config, audio_config=audio_config)

# Run a single recognition call and keep its result for the checks below.
translation_recognition_result = translation_recognizer.recognize_once()

# Report the outcome according to the result's reason.
if translation_recognition_result.reason == speechsdk.ResultReason.TranslatedSpeech:
    print("Recognized: {}".format(translation_recognition_result.text))
    print("""Translated into '{}': {}""".format(
        target_language, 
        translation_recognition_result.translations[target_language]))
elif translation_recognition_result.reason == speechsdk.ResultReason.NoMatch:
    print("No speech could be recognized: {}".format(translation_recognition_result.no_match_details))
elif translation_recognition_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = translation_recognition_result.cancellation_details
    print("Speech Recognition canceled: {}".format(cancellation_details.reason))
    # Error details are only printed when the cancellation was caused by an error.
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))
        print("Did you set the speech resource key and region values?")

Recognized: 晚上繼續寫成是。
Translated into 'ja': イブニングは「はい」と書き続けます。


#### 語音 to 語音 ， azure直接整合 speech -> text -> translation -> synthesis speech

In [3]:
import os
import azure.cognitiveservices.speech as speechsdk

# Credentials come from the environment; os.getenv returns None for an unset variable.
speech_key, service_region = os.getenv('AZURE_SPEECH_KEY'), os.getenv('AZURE_SERVICE_REGION')
# Source speech language and translation target; the functions in this cell
# read both as module-level globals.
from_language, to_language = 'zh-TW', 'ja'

# convert_16k() is defined in an earlier cell, so that cell must be run first.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
wav_file = convert_16k(wav_file)

def translate_speech_to_text():
    """Translate the speech in ``wav_file`` and save the synthesized translation.

    Builds a translation recognizer from the module-level ``speech_key``,
    ``service_region``, ``from_language``, ``to_language`` and ``wav_file``,
    sets the ``ja-JP-AoiNeural`` voice, runs a single ``recognize_once()``
    call and prints the text produced by ``get_result_text``.

    Side effects:
        Every synthesizing event that carries audio overwrites
        ``translation.wav`` in the current working directory.
    """
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)

    translation_config.speech_recognition_language = from_language
    translation_config.add_target_language(to_language)

    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)

    # Voice used for the synthesized translation audio.
    # See: https://aka.ms/speech/sdkregion#standard-and-neural-voices
    translation_config.voice_name = "ja-JP-AoiNeural"

    translation_recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    def synthesis_callback(evt):
        """Report the size of a synthesized audio chunk and save non-empty ones."""
        size = len(evt.result.audio)
        print(f'Audio synthesized: {size} byte(s) {"(COMPLETED)" if size == 0 else ""}')

        if size > 0:
            # The context manager closes the file even if the write fails.
            with open('translation.wav', 'wb+') as out_file:
                out_file.write(evt.result.audio)

    translation_recognizer.synthesizing.connect(synthesis_callback)

    print(f'Say something in "{from_language}" and we\'ll translate into "{to_language}".')

    result = translation_recognizer.recognize_once()
    print(get_result_text(reason=result.reason, result=result))

def get_result_text(reason, result):
    """Return a human-readable summary of a translation recognition result.

    Args:
        reason: The ``speechsdk.ResultReason`` of ``result``.
        result: The recognition result to describe.

    Returns:
        A message for TranslatedSpeech, RecognizedSpeech, NoMatch or Canceled
        results, or ``'Unable to recognize speech'`` for any other reason.
    """
    # Build only the message for the matching reason. Formatting every message
    # up front evaluates result.translations[to_language] for NoMatch and
    # Canceled results as well, which raises KeyError when that translation is
    # absent and hides the real outcome.
    if reason == speechsdk.ResultReason.TranslatedSpeech:
        return (
            f'Recognized "{from_language}": {result.text}\n'
            f'Translated into "{to_language}": {result.translations[to_language]}'
        )
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        return f'Recognized: "{result.text}"'
    if reason == speechsdk.ResultReason.NoMatch:
        return f'No speech could be recognized: {result.no_match_details}'
    if reason == speechsdk.ResultReason.Canceled:
        return f'Speech Recognition canceled: {result.cancellation_details}'
    return 'Unable to recognize speech'

# Run the speech-to-speech demo defined in this cell.
translate_speech_to_text()

Say something in "zh-TW" and we'll translate into "ja".
Audio synthesized: 118444 byte(s) 
Audio synthesized: 0 byte(s) (COMPLETED)
Recognized "zh-TW": 晚上繼續寫成是。
Translated into "ja"": イブニングは「はい」と書き続けます。


#### 語音 to 語音 ， 也可以先產生翻譯，再TTS。 speech -> text -> translation； translation -> synthesis speech

In [4]:
import os
import azure.cognitiveservices.speech as speechsdk

# Credentials come from the environment; os.getenv returns None for an unset variable.
speech_key, service_region = os.getenv('AZURE_SPEECH_KEY'), os.getenv('AZURE_SERVICE_REGION')
# Source speech language and translation target; translate_speech_to_text()
# in this cell reads both as module-level globals.
from_language, to_language = 'zh-TW', 'ja'

# convert_16k() is defined in an earlier cell, so that cell must be run first.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
wav_file = convert_16k(wav_file)

def translate_speech_to_text():
    """Recognize and translate the speech in ``wav_file``, then synthesize it.

    Uses the module-level ``speech_key``, ``service_region``,
    ``from_language``, ``to_language`` and ``wav_file``. The outcome of one
    ``recognize_once()`` call is handed to ``synthesize_translations``.
    """
    config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key,
        region=service_region,
    )
    config.speech_recognition_language = from_language
    config.add_target_language(to_language)

    # The input audio comes from the file rather than a microphone.
    source_audio = speechsdk.audio.AudioConfig(filename=wav_file)
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=config,
        audio_config=source_audio,
    )

    synthesize_translations(result=recognizer.recognize_once())

def synthesize_translations(result):
    """Print every translation in ``result`` and speak each one into a WAV file.

    Args:
        result: Recognition result exposing ``text`` and a ``translations``
            mapping of language code to translated text.

    For each language the translated text is synthesized to
    ``<language>-translation.wav`` with the voice looked up in the table
    below. A language missing from the table yields ``None`` as the voice name.
    """
    voice_for_language = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-AriaNeural",
        "it": "it-IT-ElsaNeural",
        "pt": "pt-BR-FranciscaNeural",
        "zh-Hans": "zh-CN-XiaoxiaoNeural",
        "ja": "ja-JP-AoiNeural",
    }
    print(f'Recognized: "{result.text}"')

    for language, translation in result.translations.items():
        print(f'Translated into "{language}": {translation}')

        # A fresh synthesis configuration per language, carrying that language's voice.
        synth_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        synth_config.speech_synthesis_voice_name = voice_for_language.get(language)

        wav_output = speechsdk.audio.AudioOutputConfig(filename=f'{language}-translation.wav')
        synthesizer = speechsdk.SpeechSynthesizer(speech_config=synth_config, audio_config=wav_output)
        # .get() blocks until the asynchronous synthesis call has finished.
        synthesizer.speak_text_async(translation).get()

# Run the translate-then-synthesize demo defined in this cell.
translate_speech_to_text()

Recognized: "晚上繼續寫成是。"
Translated into "ja": イブニングは「はい」と書き続けます。


#### Continuous translation

In [10]:
import time
# Switch to the custom speech resource. This rebinds the module-level speech_key
# and service_region that earlier cells set from AZURE_SPEECH_KEY /
# AZURE_SERVICE_REGION, so re-run those cells before reusing their functions.
# `os` and `speechsdk` are imported in earlier cells.
speech_key, service_region, endpoint = os.getenv('AZURE_CUSTOM_SPEECH_KEY'), os.getenv('AZURE_CUSTOM_SERVICE_REGION'), os.getenv('AZURE_CUSTOM_ENDPOINT')
def translation_continuous(wav_file="/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240530/20240530173836.wav"):
    """Perform continuous speech translation (zh-TW -> en) from an audio file.

    Uses the module-level ``speech_key``, ``service_region`` and ``endpoint``.
    Intermediate and final results, synthesis events, cancellations and
    session events are printed as they arrive. The call blocks until the
    session stops or is canceled.

    Args:
        wav_file: Path of the audio file to translate. Defaults to the
            recording this notebook was written against.
    """
    # <TranslationContinuous>
    # set up translation parameters: source language and target languages
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.endpoint_id = endpoint
    translation_config.speech_recognition_language = 'zh-TW'
    translation_config.add_target_language('en')
    translation_config.voice_name = "en-US-AvaNeural"
    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)

    # Creates a translation recognizer using an audio file as input.
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    def result_callback(event_type: str, evt: speechsdk.translation.TranslationRecognitionEventArgs):
        """callback to display a translation result"""
        # The last field is evt.result.offset, so it is labelled as an offset
        # (it was previously mislabelled "Result Json").
        print("{}:\n {}\n\tTranslations: {}\n\tOffset (ticks): {}\n".format(
            event_type, evt, evt.result.translations.items(), evt.result.offset))

    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def canceled_cb(evt: speechsdk.translation.TranslationRecognitionCanceledEventArgs):
        """callback to display why recognition was canceled"""
        print('CANCELED:\n\tReason:{}\n'.format(evt.result.reason))
        print('\tDetails: {} ({})'.format(evt, evt.result.cancellation_details.error_details))

    # connect callback functions to the events fired by the recognizer
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    # event for intermediate results
    recognizer.recognizing.connect(lambda evt: result_callback('RECOGNIZING', evt))
    # event for final result
    recognizer.recognized.connect(lambda evt: result_callback('RECOGNIZED', evt))
    # cancellation event
    recognizer.canceled.connect(canceled_cb)

    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    def synthesis_callback(evt: speechsdk.translation.TranslationSynthesisEventArgs):
        """
        callback for the synthesis event
        """
        print('SYNTHESIZING {}\n\treceived {} bytes of audio. Reason: {}'.format(
            evt, len(evt.result.audio), evt.result.reason))

    # connect callback to the synthesis event
    recognizer.synthesizing.connect(synthesis_callback)

    # start translation
    recognizer.start_continuous_recognition()

    try:
        # Poll the flag that stop_cb sets from the SDK's callback.
        while not done:
            time.sleep(.5)
    finally:
        # Always stop the recognizer, including when the wait is interrupted
        # (e.g. a kernel interrupt); otherwise it keeps running in the background.
        recognizer.stop_continuous_recognition()
    # </TranslationContinuous>

# Run the continuous translation demo defined in this cell.
translation_continuous()

SESSION STARTED: SessionEventArgs(session_id=8152e3a5a0a142349f5c06aca1fe547c)
RECOGNIZING:
 TranslationRecognitionEventArgs(session_id=8152e3a5a0a142349f5c06aca1fe547c, result=TranslationRecognitionResult(result_id=084a54e322b944ea9457f476869596b7, translations={'en': 'Today colleagues'}, reason=ResultReason.TranslatingSpeech))
	Translations: dict_items([('en', 'Today colleagues')])
	Result Json: 7500000

RECOGNIZING:
 TranslationRecognitionEventArgs(session_id=8152e3a5a0a142349f5c06aca1fe547c, result=TranslationRecognitionResult(result_id=dec40b893e5147f1a9aac45f1d86b11a, translations={'en': 'Today a colleague went to see us in the hospital'}, reason=ResultReason.TranslatingSpeech))
	Translations: dict_items([('en', 'Today a colleague went to see us in the hospital')])
	Result Json: 7500000

RECOGNIZING:
 TranslationRecognitionEventArgs(session_id=8152e3a5a0a142349f5c06aca1fe547c, result=TranslationRecognitionResult(result_id=e9fd629331c2444890cb254f5b4a61bf, translations={'en': 'Tod

In [25]:
import torchaudio
# Recording to cut an excerpt from.
wav_file = '/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240523/20240523135717.wav'
# Length and start position of the excerpt, both in ticks. The later cells
# divide by 10,000,000 to convert ticks to seconds.
Duration = 20200000
Offset = 4700000

## use Duration(ticks) and Offset(ticks) to cut the wav file
# `sr` is the sample rate reported by torchaudio.load for this file.
data, sr = torchaudio.load(wav_file)

In [26]:
# Show the excerpt length and start in seconds (10,000,000 ticks per second).
Duration / 10000000, Offset / 10000000

(2.02, 0.47)

In [31]:
# Convert the tick-based excerpt bounds into sample indices and save the excerpt.
# Duration and Offset are in ticks, 10,000,000 ticks per second.
TICKS_PER_SECOND = 10_000_000

# Use the sample rate returned by torchaudio.load rather than assuming 16 kHz,
# so the indices and the saved file's rate stay correct for any input rate.
# Integer arithmetic avoids float rounding error before truncation.
start = Offset * sr // TICKS_PER_SECOND
end = start + Duration * sr // TICKS_PER_SECOND
torchaudio.save('/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/test.wav', data[:, start:end], sr)