#### 語音直接翻譯，AZURE直接整合 speech -> text -> translation

In [2]:
from dotenv import load_dotenv
import os
# Load variables from a local .env file into the process environment; later cells
# read AZURE_SPEECH_KEY and AZURE_SERVICE_REGION through os.getenv.
load_dotenv()

True

In [3]:
import os
import azure.cognitiveservices.speech as speechsdk
import torchaudio

def convert_16k(wav_file):
    """Return the path of a 16 kHz version of ``wav_file``.

    The audio is loaded with ``torchaudio``. When its sample rate is already
    16000 Hz the original path is returned untouched. Otherwise the audio is
    resampled to 16000 Hz, written next to the source as ``<name>_16k<ext>``,
    and that new path is returned.

    Args:
        wav_file: Path of the audio file to inspect.

    Returns:
        ``wav_file`` itself when no resampling was needed, otherwise the path
        of the newly written 16 kHz file.
    """
    target_rate = 16000
    data, sr = torchaudio.load(wav_file)
    if sr == target_rate:
        return wav_file

    data = torchaudio.functional.resample(data, sr, target_rate)
    # Split off only the real extension. A plain ``str.replace(".wav", ...)``
    # also rewrites ".wav" inside directory names and leaves paths such as
    # "clip.WAV" unchanged, which would overwrite the source recording.
    root, ext = os.path.splitext(wav_file)
    new_wav_file = f"{root}_16k{ext}"
    torchaudio.save(new_wav_file, data, target_rate)
    return new_wav_file

# Absolute path on the author's machine; point it at your own recording when re-running.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
# convert_16k returns the same path when the file is already 16 kHz, else the path of a resampled copy.
wav_file = convert_16k(wav_file)

# This example reads its credentials from the environment variables
# "AZURE_SPEECH_KEY" and "AZURE_SERVICE_REGION".
speech_translation_config = speechsdk.translation.SpeechTranslationConfig(subscription=os.getenv('AZURE_SPEECH_KEY'), region=os.getenv('AZURE_SERVICE_REGION'))
# Language spoken in the input audio.
speech_translation_config.speech_recognition_language="zh-TW"

# Language the recognized text is translated into; also the key used to read the translation below.
target_language="ja"
speech_translation_config.add_target_language(target_language)

# Feed the recognizer from the WAV file instead of a microphone.
audio_config = speechsdk.audio.AudioConfig(filename=wav_file)
translation_recognizer = speechsdk.translation.TranslationRecognizer(translation_config=speech_translation_config, audio_config=audio_config)

# Single-shot recognition: one result is returned for this call.
translation_recognition_result = translation_recognizer.recognize_once()

# Report the outcome according to the reason attached to the result.
if translation_recognition_result.reason == speechsdk.ResultReason.TranslatedSpeech:
    print("Recognized: {}".format(translation_recognition_result.text))
    print("""Translated into '{}': {}""".format(
        target_language, 
        translation_recognition_result.translations[target_language]))
elif translation_recognition_result.reason == speechsdk.ResultReason.NoMatch:
    print("No speech could be recognized: {}".format(translation_recognition_result.no_match_details))
elif translation_recognition_result.reason == speechsdk.ResultReason.Canceled:
    cancellation_details = translation_recognition_result.cancellation_details
    print("Speech Recognition canceled: {}".format(cancellation_details.reason))
    # Error details are only printed when the cancellation reason is an error.
    if cancellation_details.reason == speechsdk.CancellationReason.Error:
        print("Error details: {}".format(cancellation_details.error_details))
        print("Did you set the speech resource key and region values?")

Recognized: 晚上繼續寫成是。
Translated into 'ja': イブニングは「はい」と書き続けます。


#### 語音 to 語音 ， azure直接整合 speech -> text -> translation -> synthesis speech

In [4]:
import os
import azure.cognitiveservices.speech as speechsdk

# Credentials are read from the environment; os.getenv yields None for a variable that is not set.
speech_key, service_region = os.getenv('AZURE_SPEECH_KEY'), os.getenv('AZURE_SERVICE_REGION')
# Source speech locale and target translation language, read as globals by the functions in this cell.
from_language, to_language = 'zh-TW', 'ja'

# Absolute path on the author's machine; point it at your own recording when re-running.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
# convert_16k returns the same path when the file is already 16 kHz, else the path of a resampled copy.
wav_file = convert_16k(wav_file)

def translate_speech_to_text():
    """Translate the speech in ``wav_file`` once and save the synthesized audio.

    Builds a ``TranslationRecognizer`` from the module-level ``speech_key``,
    ``service_region``, ``from_language``, ``to_language`` and ``wav_file``,
    requests synthesis of the translation with the "ja-JP-AoiNeural" voice,
    performs a single ``recognize_once()`` call and prints the message built
    by ``get_result_text``.

    Side effects:
        Every non-empty synthesized audio payload is written to
        ``translation.wav`` in the current working directory, replacing any
        previous content of that file.
    """
    translation_config = speechsdk.translation.SpeechTranslationConfig(
            subscription=speech_key, region=service_region)

    translation_config.speech_recognition_language = from_language
    translation_config.add_target_language(to_language)

    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)

    # See: https://aka.ms/speech/sdkregion#standard-and-neural-voices
    translation_config.voice_name = "ja-JP-AoiNeural"

    translation_recognizer = speechsdk.translation.TranslationRecognizer(translation_config=translation_config, audio_config=audio_config)

    def synthesis_callback(evt):
        """Report the size of a synthesized payload and persist it when non-empty."""
        audio = evt.result.audio
        size = len(audio)
        print(f'Audio synthesized: {size} byte(s) {"(COMPLETED)" if size == 0 else ""}')

        if size > 0:
            # The context manager closes the handle even when the write fails.
            with open('translation.wav', 'wb') as file:
                file.write(audio)

    translation_recognizer.synthesizing.connect(synthesis_callback)

    print(f'Say something in "{from_language}" and we\'ll translate into "{to_language}".')

    result = translation_recognizer.recognize_once()
    print(get_result_text(reason=result.reason, result=result))

def get_result_text(reason, result):
    """Build the message describing a translation recognition outcome.

    Args:
        reason: ``speechsdk.ResultReason`` of ``result``.
        result: Recognition result; only the attributes relevant to ``reason``
            are read.

    Returns:
        A message for translated speech, recognized speech, no match or
        cancellation; ``'Unable to recognize speech'`` for any other reason.
    """
    # Branch instead of building every message up front: a dict of f-strings
    # evaluates ``result.translations[to_language]`` for every reason and raises
    # KeyError when the result carries no translation (e.g. a canceled result).
    if reason == speechsdk.ResultReason.TranslatedSpeech:
        return (
            f'Recognized "{from_language}": {result.text}\n'
            f'Translated into "{to_language}": {result.translations[to_language]}'
        )
    if reason == speechsdk.ResultReason.RecognizedSpeech:
        return f'Recognized: "{result.text}"'
    if reason == speechsdk.ResultReason.NoMatch:
        return f'No speech could be recognized: {result.no_match_details}'
    if reason == speechsdk.ResultReason.Canceled:
        return f'Speech Recognition canceled: {result.cancellation_details}'
    return 'Unable to recognize speech'

# Run the file-based speech-to-speech translation defined in this cell.
translate_speech_to_text()

Say something in "zh-TW" and we'll translate into "ja".
Audio synthesized: 118444 byte(s) 
Audio synthesized: 0 byte(s) (COMPLETED)
Recognized "zh-TW": 晚上繼續寫成是。
Translated into "ja"": イブニングは「はい」と書き続けます。


#### 語音 to 語音 ， 也可以先產生翻譯，再TTS。 speech -> text -> translation； translation -> synthesis speech

In [9]:
import os
import azure.cognitiveservices.speech as speechsdk

# Credentials are read from the environment; os.getenv yields None for a variable that is not set.
speech_key, service_region = os.getenv('AZURE_SPEECH_KEY'), os.getenv('AZURE_SERVICE_REGION')
# Source speech locale and target translation language, read as globals by the functions in this cell.
from_language, to_language = 'zh-TW', 'ja'

# Absolute path on the author's machine; point it at your own recording when re-running.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/test.wav"
# convert_16k returns the same path when the file is already 16 kHz, else the path of a resampled copy.
wav_file = convert_16k(wav_file)

def translate_speech_to_text():
    """Translate the audio in ``wav_file`` once and pass the result on for synthesis.

    Uses the module-level ``speech_key``, ``service_region``,
    ``from_language``, ``to_language`` and ``wav_file``. The single
    ``recognize_once()`` result is handed to ``synthesize_translations``.
    """
    config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key,
        region=service_region,
    )
    config.speech_recognition_language = from_language
    config.add_target_language(to_language)

    # Read the speech from the WAV file rather than from a microphone.
    file_input = speechsdk.audio.AudioConfig(filename=wav_file)
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=config,
        audio_config=file_input,
    )

    synthesize_translations(result=recognizer.recognize_once())

def synthesize_translations(result):
    """Print a translation result and synthesize each translation to a WAV file.

    Args:
        result: Translation recognition result; ``result.text`` and the
            ``result.translations`` mapping (language -> translated text)
            are read.

    Side effects:
        Writes ``<language>-translation.wav`` in the current working directory
        for every translated language that has a voice in the map below.
        Languages without a configured voice are reported and skipped.
    """
    language_to_voice_map = {
        "de": "de-DE-KatjaNeural",
        "en": "en-US-AriaNeural",
        "it": "it-IT-ElsaNeural",
        "pt": "pt-BR-FranciscaNeural",
        "zh-Hans": "zh-CN-XiaoxiaoNeural",
        'ja': "ja-JP-AoiNeural"
    }
    print(f'Recognized: "{result.text}"')

    for language, translation in result.translations.items():
        print(f'Translated into "{language}": {translation}')

        voice_name = language_to_voice_map.get(language)
        if voice_name is None:
            # No voice is known for this language: skip it instead of handing
            # None to the SDK as the voice name.
            print(f'No synthesis voice configured for "{language}"; skipping synthesis.')
            continue

        speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)
        speech_config.speech_synthesis_voice_name = voice_name

        # One output file per language, named after the language code.
        audio_config = speechsdk.audio.AudioOutputConfig(filename=f'{language}-translation.wav')
        speech_synthesizer = speechsdk.SpeechSynthesizer(speech_config=speech_config, audio_config=audio_config)
        # .get() blocks until the asynchronous synthesis call has finished.
        speech_synthesizer.speak_text_async(translation).get()

# Run the translate-then-synthesize pipeline defined in this cell.
translate_speech_to_text()

Recognized: "晚上繼續寫成是。"
Translated into "ja": イブニングは「はい」と書き続けます。


#### continuous translation

In [7]:
import time

def translation_continuous():
    """Perform continuous speech translation from an audio file.

    Translates zh-TW speech from a hard-coded WAV file into English, requests
    synthesis of the translation with the "en-US-AvaNeural" voice and prints
    every recognizer event. Blocks until the session stops or is canceled.

    Relies on the notebook-level ``speechsdk``, ``speech_key`` and
    ``service_region`` defined in earlier cells.
    """
    # <TranslationContinuous>
    # set up translation parameters: source language and target languages
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, region=service_region)
    translation_config.speech_recognition_language = 'zh-TW'
    translation_config.add_target_language('en')
    translation_config.voice_name = "en-US-AvaNeural"
    wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240523/20240523135717.wav"
    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)

    # Creates a translation recognizer using an audio file as input.
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config, audio_config=audio_config)

    def result_callback(event_type: str, evt: speechsdk.translation.TranslationRecognitionEventArgs):
        """callback to display a translation result"""
        # NOTE(review): the "Result Json" label is filled with evt.result.offset, not JSON.
        print("{}:\n {}\n\tTranslations: {}\n\tResult Json: {}\n".format(
            event_type, evt, evt.result.translations.items(), evt.result.offset))

    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def canceled_cb(evt: speechsdk.translation.TranslationRecognitionCanceledEventArgs):
        """callback to display the reason and error details of a cancellation"""
        print('CANCELED:\n\tReason:{}\n'.format(evt.result.reason))
        print('\tDetails: {} ({})'.format(evt, evt.result.cancellation_details.error_details))

    # connect callback functions to the events fired by the recognizer
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    # event for intermediate results
    recognizer.recognizing.connect(lambda evt: result_callback('RECOGNIZING', evt))
    # event for final result
    recognizer.recognized.connect(lambda evt: result_callback('RECOGNIZED', evt))
    # cancellation event
    recognizer.canceled.connect(canceled_cb)

    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    def synthesis_callback(evt: speechsdk.translation.TranslationSynthesisEventArgs):
        """
        callback for the synthesis event
        """
        print('SYNTHESIZING {}\n\treceived {} bytes of audio. Reason: {}'.format(
            evt, len(evt.result.audio), evt.result.reason))

    # connect callback to the synthesis event
    recognizer.synthesizing.connect(synthesis_callback)

    # start translation
    recognizer.start_continuous_recognition()

    try:
        # Poll the flag that stop_cb sets from the recognizer's callbacks.
        while not done:
            time.sleep(.5)
    finally:
        # Also runs when the wait is interrupted (e.g. a kernel interrupt), so
        # the recognizer does not keep firing callbacks after the cell ends.
        recognizer.stop_continuous_recognition()
    # </TranslationContinuous>

# Run the continuous translation demo; blocks until the session stops or is canceled.
translation_continuous()

SESSION STARTED: SessionEventArgs(session_id=164716a5fbd14f99b6dfa45f60646fd8)
RECOGNIZING:
 TranslationRecognitionEventArgs(session_id=164716a5fbd14f99b6dfa45f60646fd8, result=TranslationRecognitionResult(result_id=07e4e14b4e084fcda9deb894a0b7e2ce, translations={'en': 'Chris, alone'}, reason=ResultReason.TranslatingSpeech))
	Translations: dict_items([('en', 'Chris, alone')])
	Result Json: 4700000

RECOGNIZED:
 TranslationRecognitionEventArgs(session_id=164716a5fbd14f99b6dfa45f60646fd8, result=TranslationRecognitionResult(result_id=4668942ad4f240a2847a81ff2c87318a, translations={'en': 'Chris, alone.'}, reason=ResultReason.TranslatedSpeech))
	Translations: dict_items([('en', 'Chris, alone.')])
	Result Json: 4700000

SYNTHESIZING TranslationSynthesisEventArgs(session_id=164716a5fbd14f99b6dfa45f60646fd8, result=TranslationSynthesisResult(audio=<53244 bytes of audio>, reason=ResultReason.SynthesizingAudio))
	received 53244 bytes of audio. Reason: ResultReason.SynthesizingAudio
SYNTHESIZING

#### continuous speech translation with language detection


In [32]:
import time
import json

def translation_continuous_with_lid_from_multilingual_file():
    """Perform continuous speech translation with continuous language identification.

    Translates speech from a hard-coded WAV file into English through the v2
    universal endpoint, letting the service pick the source language among
    en-US, de-DE, zh-TW and ja-JP, and prints every recognizer event. Blocks
    until the session stops or is canceled.

    Relies on the notebook-level ``speechsdk``, ``speech_key`` and
    ``service_region`` defined in earlier cells.
    """
    # <TranslationContinuous>
    # set up translation parameters: source language and target languages
    endpoint_string = "wss://{}.stt.speech.microsoft.com/speech/universal/v2".format(service_region)
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key, endpoint=endpoint_string)
    # translation_config.speech_recognition_language = 'de-DE'
    translation_config.add_target_language('en')
    translation_config.voice_name = "en-US-AvaNeural"
    translation_config.set_property(
        property_id=speechsdk.PropertyId.SpeechServiceConnection_LanguageIdMode, value='Continuous')
    wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240530/20240530173445.wav"
    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)
    auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
        languages=["en-US", "de-DE", "zh-TW", "ja-JP"])

    # Creates a translation recognizer using an audio file as input.
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config,
        audio_config=audio_config,
        auto_detect_source_language_config=auto_detect_source_language_config)

    def result_callback(event_type: str, evt: speechsdk.translation.TranslationRecognitionEventArgs):
        """callback to display a translation result"""
        print(event_type)
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            src_lang = evt.result.properties[speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult]
            # Print the whole translations mapping: only 'en' is configured as a
            # target above, so indexing 'de' / 'fr' here would raise KeyError.
            print("""Recognized:
            Detected language: {}
            Recognition result: {}
            translation: {}""".format(
                src_lang,
                evt.result.text,
                evt.result.translations))
        elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized:\n {}".format(evt.result.text))
        elif evt.result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(evt.result.no_match_details))
        elif evt.result.reason == speechsdk.ResultReason.Canceled:
            print("Translation canceled: {}".format(evt.result.cancellation_details.reason))
            if evt.result.cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(evt.result.cancellation_details.error_details))

    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def canceled_cb(evt: speechsdk.translation.TranslationRecognitionCanceledEventArgs):
        """callback to display the reason and error details of a cancellation"""
        print('CANCELED:\n\tReason:{}\n'.format(evt.result.reason))
        print('\tDetails: {} ({})'.format(evt, evt.result.cancellation_details.error_details))

    # connect callback functions to the events fired by the recognizer
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    # event for intermediate results
    recognizer.recognizing.connect(lambda evt: result_callback('RECOGNIZING', evt))
    # event for final result
    recognizer.recognized.connect(lambda evt: result_callback('RECOGNIZED', evt))
    # cancellation event
    recognizer.canceled.connect(canceled_cb)

    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    def synthesis_callback(evt: speechsdk.translation.TranslationSynthesisEventArgs):
        """
        callback for the synthesis event
        """
        print('SYNTHESIZING {}\n\treceived {} bytes of audio. Reason: {}'.format(
            evt, len(evt.result.audio), evt.result.reason))

    # connect callback to the synthesis event
    recognizer.synthesizing.connect(synthesis_callback)

    # start translation
    recognizer.start_continuous_recognition()

    try:
        # Poll the flag that stop_cb sets from the recognizer's callbacks.
        while not done:
            time.sleep(.5)
    finally:
        # Also runs when the wait is interrupted (e.g. a kernel interrupt), so
        # the recognizer does not keep firing callbacks after the cell ends.
        recognizer.stop_continuous_recognition()
    # </TranslationContinuous>

# Run the language-identification demo; blocks until the session stops or is canceled.
translation_continuous_with_lid_from_multilingual_file()

SESSION STARTED: SessionEventArgs(session_id=71508546ef5b44d6bb49046c9afe7971)
RECOGNIZING
RECOGNIZED
SYNTHESIZING TranslationSynthesisEventArgs(session_id=71508546ef5b44d6bb49046c9afe7971, result=TranslationSynthesisResult(audio=<44 bytes of audio>, reason=ResultReason.SynthesizingAudio))
	received 44 bytes of audio. Reason: ResultReason.SynthesizingAudio
SYNTHESIZING TranslationSynthesisEventArgs(session_id=71508546ef5b44d6bb49046c9afe7971, result=TranslationSynthesisResult(audio=<0 bytes of audio>, reason=ResultReason.SynthesizingAudioCompleted))
	received 0 bytes of audio. Reason: ResultReason.SynthesizingAudioCompleted
CANCELED:
	Reason:ResultReason.Canceled

	Details: TranslationRecognitionCanceledEventArgs(session_id=71508546ef5b44d6bb49046c9afe7971, result=TranslationRecognitionResult(result_id=f41c99fe516b478c8348c1baec00914a, translations={}, reason=ResultReason.Canceled)) ()
CLOSING on TranslationRecognitionCanceledEventArgs(session_id=71508546ef5b44d6bb49046c9afe7971, resul

In [54]:
# Absolute path on the author's machine where the synthesis callback below stores audio.
temp_synthesis_file = '/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/jupyter_test/temp_synthesis_file.wav'
# Remove the file left by a previous run so stale audio is not mistaken for a new result.
if os.path.exists(temp_synthesis_file):
    os.remove(temp_synthesis_file)

def translation_continuous_with_lid_from_multilingual_file():
    """performs continuous speech translation from a multi-lingual audio file, with continuous language identification

    Translates speech from a hard-coded WAV file into English, prints final
    results, and stores synthesized audio in the notebook-level
    ``temp_synthesis_file``. Blocks until the session stops or is canceled.

    Relies on the notebook-level ``speechsdk``, ``speech_key``,
    ``service_region`` and ``temp_synthesis_file``.

    Side effects:
        Every non-empty synthesized audio payload overwrites
        ``temp_synthesis_file``, so the file holds only the most recent one.
    """
    # <TranslationContinuousWithLID>

    # When you use Language ID with speech translation, you must set a v2 endpoint.
    # This will be fixed in a future version of Speech SDK.

    # Set up translation parameters, including the list of target (translated) languages.
    endpoint_string = "wss://{}.stt.speech.microsoft.com/speech/universal/v2".format(service_region)
    translation_config = speechsdk.translation.SpeechTranslationConfig(
        subscription=speech_key,
        endpoint=endpoint_string,)
    translation_config.add_target_language('en')
    translation_config.voice_name = "en-US-AvaNeural"
    wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240530/20240530173445.wav"
    audio_config = speechsdk.audio.AudioConfig(filename=wav_file)

    # Since the spoken language in the input audio changes, you need to set the language identification to "Continuous" mode.
    # (override the default value of "AtStart").
    translation_config.set_property(
        property_id=speechsdk.PropertyId.SpeechServiceConnection_LanguageIdMode, value='Continuous')

    # Specify the AutoDetectSourceLanguageConfig, which defines the number of possible languages
    auto_detect_source_language_config = speechsdk.languageconfig.AutoDetectSourceLanguageConfig(
        languages=["en-US", "de-DE", "zh-CN", "ja-JP"])

    # Creates a translation recognizer using an audio file as input.
    recognizer = speechsdk.translation.TranslationRecognizer(
        translation_config=translation_config,
        audio_config=audio_config,
        auto_detect_source_language_config=auto_detect_source_language_config)

    def result_callback(evt):
        """callback to display a translation result"""
        if evt.result.reason == speechsdk.ResultReason.TranslatedSpeech:
            src_lang = evt.result.properties[speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult]
            print("""Recognized:
            Detected language: {}
            Recognition result: {}
            translation: {}""".format(
                src_lang,
                evt.result.text,
                evt.result.translations))
        elif evt.result.reason == speechsdk.ResultReason.RecognizedSpeech:
            print("Recognized:\n {}".format(evt.result.text))
        elif evt.result.reason == speechsdk.ResultReason.NoMatch:
            print("No speech could be recognized: {}".format(evt.result.no_match_details))
        elif evt.result.reason == speechsdk.ResultReason.Canceled:
            print("Translation canceled: {}".format(evt.result.cancellation_details.reason))
            if evt.result.cancellation_details.reason == speechsdk.CancellationReason.Error:
                print("Error details: {}".format(evt.result.cancellation_details.error_details))

    done = False

    def stop_cb(evt):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    # connect callback functions to the events fired by the recognizer
    recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))

    # event for final result
    recognizer.recognized.connect(result_callback)

    # cancellation event
    recognizer.canceled.connect(lambda evt: print('CANCELED: {} ({})'.format(evt, evt.reason)))

    # stop continuous recognition on either session stopped or canceled events
    recognizer.session_stopped.connect(stop_cb)
    recognizer.canceled.connect(stop_cb)

    def synthesis_callback(evt: speechsdk.translation.TranslationSynthesisEventArgs):
        """
        callback for the synthesis event
        """
        print('SYNTHESIZING {}\n\treceived {} bytes of audio. Reason: {}'.format(
            evt, len(evt.result.audio), evt.result.reason))

        synthesis_bytes = evt.result.audio
        # Empty payloads are not written, so they never truncate the saved audio.
        if len(synthesis_bytes) > 0:
            with open(temp_synthesis_file, 'wb+') as f:
                f.write(synthesis_bytes)

    # connect callback to the synthesis event
    recognizer.synthesizing.connect(synthesis_callback)

    # start translation
    recognizer.start_continuous_recognition()

    try:
        # Poll the flag that stop_cb sets from the recognizer's callbacks.
        while not done:
            time.sleep(.5)
    finally:
        # Also runs when the wait is interrupted (e.g. a kernel interrupt), so
        # the recognizer does not keep firing callbacks after the cell ends.
        recognizer.stop_continuous_recognition()
    # </TranslationContinuousWithLID>

# Run the language-identification demo that also saves the synthesized audio.
translation_continuous_with_lid_from_multilingual_file()

SESSION STARTED: SessionEventArgs(session_id=ac7b1897dd4e49d7883d13a89b4996cc)
Recognized:
            Detected language: ja-JP
            Recognition result: おはようございます。
            translation: {'en': 'Good morning.'}
SYNTHESIZING TranslationSynthesisEventArgs(session_id=ac7b1897dd4e49d7883d13a89b4996cc, result=TranslationSynthesisResult(audio=<36844 bytes of audio>, reason=ResultReason.SynthesizingAudio))
	received 36844 bytes of audio. Reason: ResultReason.SynthesizingAudio
SYNTHESIZING TranslationSynthesisEventArgs(session_id=ac7b1897dd4e49d7883d13a89b4996cc, result=TranslationSynthesisResult(audio=<0 bytes of audio>, reason=ResultReason.SynthesizingAudioCompleted))
	received 0 bytes of audio. Reason: ResultReason.SynthesizingAudioCompleted
CANCELED: TranslationRecognitionCanceledEventArgs(session_id=ac7b1897dd4e49d7883d13a89b4996cc, result=TranslationRecognitionResult(result_id=c464299543314f1db198062523973033, translations={}, reason=ResultReason.Canceled)) (CancellationReason.E

In [25]:
import torchaudio
# Same recording that the continuous-translation cell reads.
wav_file = '/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240523/20240523135717.wav'
# Segment length in ticks. NOTE(review): presumably copied from a recognition result; its source is not shown here.
Duration = 20200000
# Segment start in ticks; equals the offset printed by the continuous-translation run for this file.
Offset = 4700000

## use Duration(ticks) and Offset(ticks) to cut the wav file
# torchaudio.load returns the audio data together with its sample rate.
data, sr = torchaudio.load(wav_file)

In [26]:
# Divide the tick values by 10,000,000 to sanity-check them as seconds
# (assumes the 100-ns ticks used by the Azure Speech SDK).
Duration / 10000000, Offset / 10000000

(2.02, 0.47)

In [31]:
# Offsets and durations are in 100-ns ticks, i.e. 10,000,000 ticks per second.
TICKS_PER_SECOND = 10_000_000
# Use the sample rate returned by torchaudio.load instead of assuming 16 kHz, so
# the cut lands on the right samples and the saved file keeps the correct rate.
# Integer arithmetic avoids float rounding before truncation.
start = Offset * sr // TICKS_PER_SECOND
end = start + Duration * sr // TICKS_PER_SECOND
# Slice along the last axis of `data` and write the segment out.
torchaudio.save('/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/test.wav', data[:, start:end], sr)