#### Source language detection


In [1]:
from dotenv import load_dotenv
import os
# Let python-dotenv populate the environment before the credentials are read.
load_dotenv()

# Azure Speech credentials; each is None when its variable is not set.
speech_key = os.getenv('AZURE_SPEECH_KEY')
service_region = os.getenv('AZURE_SERVICE_REGION')

In [14]:
import os
import azure.cognitiveservices.speech as speechsdk
import torchaudio

def convert_16k(wav_file):
    """Return the path of a 16 kHz version of ``wav_file``.

    The file is loaded with torchaudio. If its sample rate is already
    16000 Hz, the original path is returned and nothing is written.
    Otherwise the waveform is resampled to 16000 Hz, saved next to the
    original as ``<name-without-extension>_16k.wav``, and that new path is
    returned.

    Args:
        wav_file: Path (string) of the audio file to check.

    Returns:
        ``wav_file`` itself when no resampling was needed, otherwise the
        path of the newly written 16 kHz file.
    """
    target_sr = 16000
    data, sr = torchaudio.load(wav_file)
    if sr == target_sr:
        return wav_file

    data = torchaudio.functional.resample(data, sr, target_sr)
    # Derive the output name from the extension only. A plain
    # str.replace(".wav", ...) would also rewrite ".wav" inside directory
    # names, and would leave the path unchanged (so the input file gets
    # overwritten) for names without a lowercase ".wav", e.g. "clip.WAV".
    stem, _ = os.path.splitext(wav_file)
    new_wav_file = stem + "_16k.wav"
    torchaudio.save(new_wav_file, data, target_sr)
    return new_wav_file

# Recording to analyse. The name is rebound to whatever convert_16k returns:
# the same path if the file is already 16 kHz, otherwise the path of the
# resampled copy it wrote.
wav_file = "/mnt/disk1/chris/uaicraft_workspace/translate-everywhere/uploaded_audio/20240530/20240530173836.wav"
wav_file = convert_16k(wav_file)

In [19]:
import time
import json

def speech_language_detection_once_from_continuous(
        audio_file=None, languages=("zh-TW", "en-US")):
    """Run continuous spoken-language detection on an audio file and print the results.

    A ``SourceLanguageRecognizer`` is created with language-id mode set to
    ``'Continuous'`` and started on the file. For every recognized segment
    the detected language and the segment's offset/duration (taken from the
    service's JSON result) are printed. The call blocks, polling every half
    second, until a session-stopped or canceled event arrives, and then
    stops the recognizer.

    Side effect: when a language is detected, the module-level name
    ``language_detected`` is set to ``True``.

    Args:
        audio_file: Path of the audio file to analyse. Defaults to the
            module-level ``wav_file``.
        languages: Candidate languages handed to
            ``AutoDetectSourceLanguageConfig``.

    Returns:
        None. Results are reported on stdout only.
    """
    if audio_file is None:
        # Backward-compatible default: use the file prepared at module level.
        audio_file = wav_file

    # <SpeechContinuousLanguageDetectionWithFile>
    # Creates an AutoDetectSourceLanguageConfig, which defines a number of possible spoken languages
    auto_detect_source_language_config = \
        speechsdk.languageconfig.AutoDetectSourceLanguageConfig(languages=list(languages))

    # Creates a SpeechConfig from your speech key and region
    speech_config = speechsdk.SpeechConfig(subscription=speech_key, region=service_region)

    # Set continuous language detection (override the default of "AtStart")
    speech_config.set_property(
        property_id=speechsdk.PropertyId.SpeechServiceConnection_LanguageIdMode, value='Continuous')

    audio_config = speechsdk.audio.AudioConfig(filename=audio_file)

    source_language_recognizer = speechsdk.SourceLanguageRecognizer(
        speech_config=speech_config,
        auto_detect_source_language_config=auto_detect_source_language_config,
        audio_config=audio_config)

    # Flipped by stop_cb; the wait loop below polls it.
    done = False

    def stop_cb(evt: speechsdk.SessionEventArgs):
        """callback that signals to stop continuous recognition upon receiving an event `evt`"""
        print('CLOSING on {}'.format(evt))
        nonlocal done
        done = True

    def audio_recognized(evt: speechsdk.SpeechRecognitionEventArgs):
        """
        callback that prints the detected language and timing of a recognized segment.
        :param evt: event carrying the recognition result.
        :return:
        """
        # Results with any other reason are ignored.
        if evt.result.reason != speechsdk.ResultReason.RecognizedSpeech:
            return

        # Single lookup: a missing property means no language was identified.
        detected_src_lang = evt.result.properties.get(
            speechsdk.PropertyId.SpeechServiceConnection_AutoDetectSourceLanguageResult)
        if detected_src_lang is None:
            print("Unable to detect any language")
            return

        json_result = evt.result.properties[speechsdk.PropertyId.SpeechServiceResponse_JsonResult]
        detail_result = json.loads(json_result)
        start_offset = detail_result['Offset']
        duration = detail_result['Duration']
        # A negative duration is reported with an end offset of 0.
        end_offset = duration + start_offset if duration >= 0 else 0
        print("Detected language = " + detected_src_lang)
        print(f"Start offset = {start_offset}, End offset = {end_offset}, "
              f"Duration = {duration} (in units of hundreds of nanoseconds (HNS))")
        # Kept for compatibility: code outside this function may read this flag.
        global language_detected
        language_detected = True

    # Connect callbacks to the events fired by the speech recognizer
    source_language_recognizer.recognized.connect(audio_recognized)
    source_language_recognizer.session_started.connect(lambda evt: print('SESSION STARTED: {}'.format(evt)))
    source_language_recognizer.session_stopped.connect(lambda evt: print('SESSION STOPPED {}'.format(evt)))
    source_language_recognizer.canceled.connect(lambda evt: print('CANCELED {}'.format(evt)))
    # stop continuous recognition on either session stopped or canceled events
    source_language_recognizer.session_stopped.connect(stop_cb)
    source_language_recognizer.canceled.connect(stop_cb)

    # Start continuous speech recognition
    source_language_recognizer.start_continuous_recognition()
    try:
        while not done:
            time.sleep(.5)
    finally:
        # Always stop the recognizer, even if the wait is interrupted
        # (e.g. KeyboardInterrupt), so it does not keep running afterwards.
        source_language_recognizer.stop_continuous_recognition()
    # </SpeechContinuousLanguageDetectionWithFile>

# Run the detection; the text that follows is the captured stdout of this call.
speech_language_detection_once_from_continuous()

SESSION STARTED: SessionEventArgs(session_id=c19f523f7fa74fa4ab77fb086bd4502f)
Detected language = zh-tw
Start offset = 0, End offset = 15000000, Duration = 15000000 (in units of hundreds of nanoseconds (HNS))
Detected language = zh-tw
Start offset = 14990000, End offset = 24990000, Duration = 10000000 (in units of hundreds of nanoseconds (HNS))
Detected language = zh-tw
Start offset = 24990000, End offset = 34990000, Duration = 10000000 (in units of hundreds of nanoseconds (HNS))
Detected language = zh-tw
Start offset = 34990000, End offset = 44990000, Duration = 10000000 (in units of hundreds of nanoseconds (HNS))
Detected language = zh-tw
Start offset = 44990000, End offset = 54990000, Duration = 10000000 (in units of hundreds of nanoseconds (HNS))
Detected language = zh-tw
Start offset = 54990000, End offset = 64990000, Duration = 10000000 (in units of hundreds of nanoseconds (HNS))
Detected language = zh-tw
Start offset = 64990000, End offset = 80210000, Duration = 15220000 (in un