From 2c7380861efc307ba61413fa370cc16d7aec5fe9 Mon Sep 17 00:00:00 2001 From: Bu Sun Kim <8822365+busunkim96@users.noreply.github.com> Date: Thu, 24 Sep 2020 09:23:51 -0600 Subject: [PATCH] feat!: migrate to microgenerator (#61) --- .../transcribe_streaming_infinite.py | 81 ++++--- speech/microphone/transcribe_streaming_mic.py | 46 ++-- .../transcribe_streaming_mic_test.py | 19 +- speech/snippets/beta_snippets.py | 215 ++++++++++-------- speech/snippets/beta_snippets_test.py | 17 +- speech/snippets/quickstart.py | 25 +- speech/snippets/quickstart_test.py | 2 +- speech/snippets/speech_adaptation_beta.py | 8 +- speech/snippets/speech_quickstart_beta.py | 7 +- speech/snippets/transcribe.py | 49 ++-- speech/snippets/transcribe_async.py | 61 ++--- speech/snippets/transcribe_async_test.py | 12 +- .../snippets/transcribe_auto_punctuation.py | 30 +-- .../transcribe_auto_punctuation_test.py | 7 +- speech/snippets/transcribe_context_classes.py | 22 +- .../transcribe_context_classes_test.py | 5 +- speech/snippets/transcribe_enhanced_model.py | 29 +-- .../transcribe_enhanced_model_test.py | 7 +- speech/snippets/transcribe_model_selection.py | 69 +++--- .../transcribe_model_selection_test.py | 12 +- speech/snippets/transcribe_multichannel.py | 57 ++--- .../snippets/transcribe_multichannel_test.py | 15 +- .../transcribe_onprem/transcribe_onprem.py | 19 +- speech/snippets/transcribe_streaming.py | 39 ++-- speech/snippets/transcribe_streaming_test.py | 7 +- speech/snippets/transcribe_test.py | 11 +- .../snippets/transcribe_word_time_offsets.py | 71 +++--- .../transcribe_word_time_offsets_test.py | 12 +- 28 files changed, 510 insertions(+), 444 deletions(-) diff --git a/speech/microphone/transcribe_streaming_infinite.py b/speech/microphone/transcribe_streaming_infinite.py index 759842b476d0..d6aafde2783d 100644 --- a/speech/microphone/transcribe_streaming_infinite.py +++ b/speech/microphone/transcribe_streaming_infinite.py @@ -41,9 +41,9 @@ SAMPLE_RATE = 16000 CHUNK_SIZE = int(SAMPLE_RATE / 10) # 100ms -RED = '\033[0;31m' -GREEN = '\033[0;32m' -YELLOW = '\033[0;33m' +RED = "\033[0;31m" +GREEN = "\033[0;32m" +YELLOW = "\033[0;33m" def get_current_time(): @@ -123,12 +123,14 @@ def generator(self): if self.bridging_offset > self.final_request_end_time: self.bridging_offset = self.final_request_end_time - chunks_from_ms = round((self.final_request_end_time - - self.bridging_offset) / chunk_time) + chunks_from_ms = round( + (self.final_request_end_time - self.bridging_offset) + / chunk_time + ) - self.bridging_offset = (round(( - len(self.last_audio_input) - chunks_from_ms) - * chunk_time)) + self.bridging_offset = round( + (len(self.last_audio_input) - chunks_from_ms) * chunk_time + ) for i in range(chunks_from_ms, len(self.last_audio_input)): data.append(self.last_audio_input[i]) @@ -157,7 +159,7 @@ def generator(self): except queue.Empty: break - yield b''.join(data) + yield b"".join(data) def listen_print_loop(responses, stream): @@ -201,35 +203,37 @@ def listen_print_loop(responses, stream): if result.result_end_time.nanos: result_nanos = result.result_end_time.nanos - stream.result_end_time = int((result_seconds * 1000) - + (result_nanos / 1000000)) + stream.result_end_time = int((result_seconds * 1000) + (result_nanos / 1000000)) - corrected_time = (stream.result_end_time - stream.bridging_offset - + (STREAMING_LIMIT * stream.restart_counter)) + corrected_time = ( + stream.result_end_time + - stream.bridging_offset + + (STREAMING_LIMIT * stream.restart_counter) + ) # Display interim results, but with a 
carriage return at the end of the # line, so subsequent lines will overwrite them. if result.is_final: sys.stdout.write(GREEN) - sys.stdout.write('\033[K') - sys.stdout.write(str(corrected_time) + ': ' + transcript + '\n') + sys.stdout.write("\033[K") + sys.stdout.write(str(corrected_time) + ": " + transcript + "\n") stream.is_final_end_time = stream.result_end_time stream.last_transcript_was_final = True # Exit recognition if any of the transcribed phrases could be # one of our keywords. - if re.search(r'\b(exit|quit)\b', transcript, re.I): + if re.search(r"\b(exit|quit)\b", transcript, re.I): sys.stdout.write(YELLOW) - sys.stdout.write('Exiting...\n') + sys.stdout.write("Exiting...\n") stream.closed = True break else: sys.stdout.write(RED) - sys.stdout.write('\033[K') - sys.stdout.write(str(corrected_time) + ': ' + transcript + '\r') + sys.stdout.write("\033[K") + sys.stdout.write(str(corrected_time) + ": " + transcript + "\r") stream.last_transcript_was_final = False @@ -238,37 +242,42 @@ def main(): """start bidirectional streaming from microphone input to speech API""" client = speech.SpeechClient() - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=SAMPLE_RATE, - language_code='en-US', - max_alternatives=1) - streaming_config = speech.types.StreamingRecognitionConfig( - config=config, - interim_results=True) + language_code="en-US", + max_alternatives=1, + ) + streaming_config = speech.StreamingRecognitionConfig( + config=config, interim_results=True + ) mic_manager = ResumableMicrophoneStream(SAMPLE_RATE, CHUNK_SIZE) print(mic_manager.chunk_size) sys.stdout.write(YELLOW) sys.stdout.write('\nListening, say "Quit" or "Exit" to stop.\n\n') - sys.stdout.write('End (ms) Transcript Results/Status\n') - sys.stdout.write('=====================================================\n') + sys.stdout.write("End (ms) Transcript Results/Status\n") + sys.stdout.write("=====================================================\n") with mic_manager as stream: while not stream.closed: sys.stdout.write(YELLOW) - sys.stdout.write('\n' + str( - STREAMING_LIMIT * stream.restart_counter) + ': NEW REQUEST\n') + sys.stdout.write( + "\n" + str(STREAMING_LIMIT * stream.restart_counter) + ": NEW REQUEST\n" + ) stream.audio_input = [] audio_generator = stream.generator() - requests = (speech.types.StreamingRecognizeRequest( - audio_content=content)for content in audio_generator) + requests = ( + speech.StreamingRecognizeRequest(audio_content=content) + for content in audio_generator + ) - responses = client.streaming_recognize(streaming_config, - requests) + responses = client.streaming_recognize( + requests=requests, config=streaming_config + ) # Now, put the transcription responses to use. 
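# listen_print_loop prints interim and final results and exits when the user says "exit" or "quit".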
listen_print_loop(responses, stream) @@ -282,11 +291,11 @@ def main(): stream.restart_counter = stream.restart_counter + 1 if not stream.last_transcript_was_final: - sys.stdout.write('\n') + sys.stdout.write("\n") stream.new_stream = True -if __name__ == '__main__': +if __name__ == "__main__": main() diff --git a/speech/microphone/transcribe_streaming_mic.py b/speech/microphone/transcribe_streaming_mic.py index 3ca7b7094124..b484a10e23e3 100644 --- a/speech/microphone/transcribe_streaming_mic.py +++ b/speech/microphone/transcribe_streaming_mic.py @@ -32,8 +32,6 @@ import sys from google.cloud import speech -from google.cloud.speech import enums -from google.cloud.speech import types import pyaudio from six.moves import queue @@ -44,6 +42,7 @@ class MicrophoneStream(object): """Opens a recording stream as a generator yielding the audio chunks.""" + def __init__(self, rate, chunk): self._rate = rate self._chunk = chunk @@ -58,8 +57,10 @@ def __enter__(self): format=pyaudio.paInt16, # The API currently only supports 1-channel (mono) audio # https://goo.gl/z757pE - channels=1, rate=self._rate, - input=True, frames_per_buffer=self._chunk, + channels=1, + rate=self._rate, + input=True, + frames_per_buffer=self._chunk, # Run the audio stream asynchronously to fill the buffer object. # This is necessary so that the input device's buffer doesn't # overflow while the calling thread makes network requests, etc. @@ -104,7 +105,7 @@ def generator(self): except queue.Empty: break - yield b''.join(data) + yield b"".join(data) def listen_print_loop(responses): @@ -142,10 +143,10 @@ def listen_print_loop(responses): # # If the previous result was longer than this one, we need to print # some extra spaces to overwrite the previous result - overwrite_chars = ' ' * (num_chars_printed - len(transcript)) + overwrite_chars = " " * (num_chars_printed - len(transcript)) if not result.is_final: - sys.stdout.write(transcript + overwrite_chars + '\r') + sys.stdout.write(transcript + overwrite_chars + "\r") sys.stdout.flush() num_chars_printed = len(transcript) @@ -155,8 +156,8 @@ def listen_print_loop(responses): # Exit recognition if any of the transcribed phrases could be # one of our keywords. - if re.search(r'\b(exit|quit)\b', transcript, re.I): - print('Exiting..') + if re.search(r"\b(exit|quit)\b", transcript, re.I): + print("Exiting..") break num_chars_printed = 0 @@ -165,28 +166,33 @@ def listen_print_loop(responses): def main(): # See http://g.co/cloud/speech/docs/languages # for a list of supported languages. 
- language_code = 'en-US' # a BCP-47 language tag + language_code = "en-US" # a BCP-47 language tag client = speech.SpeechClient() - config = types.RecognitionConfig( - encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=RATE, - language_code=language_code) - streaming_config = types.StreamingRecognitionConfig( - config=config, - interim_results=True) + language_code=language_code, + ) + streaming_config = speech.StreamingRecognitionConfig( + config=config, interim_results=True + ) with MicrophoneStream(RATE, CHUNK) as stream: audio_generator = stream.generator() - requests = (types.StreamingRecognizeRequest(audio_content=content) - for content in audio_generator) + requests = ( + speech.StreamingRecognizeRequest(audio_content=content) + for content in audio_generator + ) - responses = client.streaming_recognize(streaming_config, requests) + responses = client.streaming_recognize( + requests=requests, config=streaming_config + ) # Now, put the transcription responses to use. listen_print_loop(responses) -if __name__ == '__main__': +if __name__ == "__main__": main() # [END speech_transcribe_streaming_mic] diff --git a/speech/microphone/transcribe_streaming_mic_test.py b/speech/microphone/transcribe_streaming_mic_test.py index dd5e7ea6f5e6..f5e08f5d30b2 100644 --- a/speech/microphone/transcribe_streaming_mic_test.py +++ b/speech/microphone/transcribe_streaming_mic_test.py @@ -18,7 +18,7 @@ import mock -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") class MockPyAudio(object): @@ -32,8 +32,9 @@ def open(self, stream_callback, rate, *args, **kwargs): self.rate = rate self.closed = threading.Event() self.stream_thread = threading.Thread( - target=self.stream_audio, args=( - self.audio_filename, stream_callback, self.closed)) + target=self.stream_audio, + args=(self.audio_filename, stream_callback, self.closed), + ) self.stream_thread.start() return self @@ -47,23 +48,25 @@ def terminate(self): pass def stream_audio(self, audio_filename, callback, closed, num_frames=512): - with open(audio_filename, 'rb') as audio_file: + with open(audio_filename, "rb") as audio_file: while not closed.is_set(): # Approximate realtime by sleeping for the appropriate time for # the requested number of frames time.sleep(num_frames / float(self.rate)) # audio is 16-bit samples, whereas python byte is 8-bit num_bytes = 2 * num_frames - chunk = audio_file.read(num_bytes) or b'\0' * num_bytes + chunk = audio_file.read(num_bytes) or b"\0" * num_bytes callback(chunk, None, None, None) -@mock.patch.dict('sys.modules', pyaudio=mock.MagicMock( - PyAudio=MockPyAudio(os.path.join(RESOURCES, 'quit.raw')))) +@mock.patch.dict( + "sys.modules", + pyaudio=mock.MagicMock(PyAudio=MockPyAudio(os.path.join(RESOURCES, "quit.raw"))), +) def test_main(capsys): import transcribe_streaming_mic transcribe_streaming_mic.main() out, err = capsys.readouterr() - assert re.search(r'quit', out, re.DOTALL | re.I) + assert re.search(r"quit", out, re.DOTALL | re.I) diff --git a/speech/snippets/beta_snippets.py b/speech/snippets/beta_snippets.py index 79d9c3d587eb..eaafe3ca978e 100644 --- a/speech/snippets/beta_snippets.py +++ b/speech/snippets/beta_snippets.py @@ -35,29 +35,31 @@ def transcribe_file_with_enhanced_model(): """Transcribe the given audio file using an enhanced model.""" # [START speech_transcribe_enhanced_model_beta] from google.cloud 
import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/commercial_mono.wav' + speech_file = "resources/commercial_mono.wav" - with io.open(speech_file, 'rb') as audio_file: + with io.open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', + language_code="en-US", use_enhanced=True, # A model must be specified to use enhanced model. - model='phone_call') + model="phone_call", + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print(u'First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print(f"First alternative of result {i}") + print(f"Transcript: {alternative.transcript}") # [END speech_transcribe_enhanced_model_beta] @@ -65,44 +67,47 @@ def transcribe_file_with_metadata(): """Send a request that includes recognition metadata.""" # [START speech_transcribe_recognition_metadata_beta] from google.cloud import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/commercial_mono.wav' + speech_file = "resources/commercial_mono.wav" - with io.open(speech_file, 'rb') as audio_file: + with io.open(speech_file, "rb") as audio_file: content = audio_file.read() # Here we construct a recognition metadata object. # Most metadata fields are specified as enums that can be found - # in speech.enums.RecognitionMetadata - metadata = speech.types.RecognitionMetadata() - metadata.interaction_type = ( - speech.enums.RecognitionMetadata.InteractionType.DISCUSSION) + # in speech.RecognitionMetadata + metadata = speech.RecognitionMetadata() + metadata.interaction_type = speech.RecognitionMetadata.InteractionType.DISCUSSION metadata.microphone_distance = ( - speech.enums.RecognitionMetadata.MicrophoneDistance.NEARFIELD) + speech.RecognitionMetadata.MicrophoneDistance.NEARFIELD + ) metadata.recording_device_type = ( - speech.enums.RecognitionMetadata.RecordingDeviceType.SMARTPHONE) + speech.RecognitionMetadata.RecordingDeviceType.SMARTPHONE + ) # Some metadata fields are free form strings metadata.recording_device_name = "Pixel 2 XL" # And some are integers, for instance the 6 digit NAICS code # https://www.naics.com/search/ metadata.industry_naics_code_of_audio = 519190 - audio = speech.types.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', + language_code="en-US", # Add this in the request to send metadata. 
- metadata=metadata) + metadata=metadata, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print(u'First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) # [END speech_transcribe_recognition_metadata_beta] @@ -110,28 +115,30 @@ def transcribe_file_with_auto_punctuation(): """Transcribe the given audio file with auto punctuation enabled.""" # [START speech_transcribe_auto_punctuation_beta] from google.cloud import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/commercial_mono.wav' + speech_file = "resources/commercial_mono.wav" - with io.open(speech_file, 'rb') as audio_file: + with io.open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', + language_code="en-US", # Enable automatic punctuation - enable_automatic_punctuation=True) + enable_automatic_punctuation=True, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print(u'First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) # [END speech_transcribe_auto_punctuation_beta] @@ -139,24 +146,26 @@ def transcribe_file_with_diarization(): """Transcribe the given audio file synchronously with diarization.""" # [START speech_transcribe_diarization_beta] from google.cloud import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/commercial_mono.wav' + speech_file = "resources/commercial_mono.wav" - with open(speech_file, 'rb') as audio_file: + with open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', + language_code="en-US", enable_speaker_diarization=True, - diarization_speaker_count=2) + diarization_speaker_count=2, + ) - print('Waiting for operation to complete...') - response = client.recognize(config, audio) + print("Waiting for operation to complete...") + response = client.recognize(request={"config": config, "audio": audio}) # The transcript within each result is separate and sequential per result. 
# However, the words list within an alternative includes all the words @@ -168,8 +177,7 @@ def transcribe_file_with_diarization(): # Printing out the output: for word_info in words_info: - print(u"word: '{}', speaker_tag: {}".format( - word_info.word, word_info.speaker_tag)) + print(f"word: '{word_info.word}', speaker_tag: {word_info.speaker_tag}") # [END speech_transcribe_diarization_beta] @@ -178,30 +186,32 @@ def transcribe_file_with_multichannel(): multi channel.""" # [START speech_transcribe_multichannel_beta] from google.cloud import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/Google_Gnome.wav' + speech_file = "resources/Google_Gnome.wav" - with open(speech_file, 'rb') as audio_file: + with open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US', + language_code="en-US", audio_channel_count=1, - enable_separate_recognition_per_channel=True) + enable_separate_recognition_per_channel=True, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) - print(u'Channel Tag: {}'.format(result.channel_tag)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + print("Channel Tag: {}".format(result.channel_tag)) # [END speech_transcribe_multichannel_beta] @@ -210,32 +220,34 @@ def transcribe_file_with_multilanguage(): multi language.""" # [START speech_transcribe_multilanguage_beta] from google.cloud import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/multi.wav' - first_lang = 'en-US' - second_lang = 'es' + speech_file = "resources/multi.wav" + first_lang = "en-US" + second_lang = "es" - with open(speech_file, 'rb') as audio_file: + with open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=44100, audio_channel_count=2, language_code=first_lang, - alternative_language_codes=[second_lang]) + alternative_language_codes=[second_lang], + ) - print('Waiting for operation to complete...') - response = client.recognize(config, audio) + print("Waiting for operation to complete...") + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print(u'First alternative of result {}: {}'.format(i, alternative)) - print(u'Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}: {}".format(i, alternative)) + print("Transcript: {}".format(alternative.transcript)) # [END speech_transcribe_multilanguage_beta] @@ 
-244,52 +256,57 @@ def transcribe_file_with_word_level_confidence(): word level confidence.""" # [START speech_transcribe_word_level_confidence_beta] from google.cloud import speech_v1p1beta1 as speech + client = speech.SpeechClient() - speech_file = 'resources/Google_Gnome.wav' + speech_file = "resources/Google_Gnome.wav" - with open(speech_file, 'rb') as audio_file: + with open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US', - enable_word_confidence=True) + language_code="en-US", + enable_word_confidence=True, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) - print(u'First Word and Confidence: ({}, {})'.format( - alternative.words[0].word, alternative.words[0].confidence)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) + print( + "First Word and Confidence: ({}, {})".format( + alternative.words[0].word, alternative.words[0].confidence + ) + ) # [END speech_transcribe_word_level_confidence_beta] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('command') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("command") args = parser.parse_args() - if args.command == 'enhanced-model': + if args.command == "enhanced-model": transcribe_file_with_enhanced_model() - elif args.command == 'metadata': + elif args.command == "metadata": transcribe_file_with_metadata() - elif args.command == 'punctuation': + elif args.command == "punctuation": transcribe_file_with_auto_punctuation() - elif args.command == 'diarization': + elif args.command == "diarization": transcribe_file_with_diarization() - elif args.command == 'multi-channel': + elif args.command == "multi-channel": transcribe_file_with_multichannel() - elif args.command == 'multi-language': + elif args.command == "multi-language": transcribe_file_with_multilanguage() - elif args.command == 'word-level-conf': + elif args.command == "word-level-conf": transcribe_file_with_word_level_confidence() diff --git a/speech/snippets/beta_snippets_test.py b/speech/snippets/beta_snippets_test.py index 367d2ccc4b1b..d1242df50bd7 100644 --- a/speech/snippets/beta_snippets_test.py +++ b/speech/snippets/beta_snippets_test.py @@ -20,30 +20,31 @@ transcribe_file_with_metadata, transcribe_file_with_multichannel, transcribe_file_with_multilanguage, - transcribe_file_with_word_level_confidence) + transcribe_file_with_word_level_confidence, +) -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") def test_transcribe_file_with_enhanced_model(capsys): transcribe_file_with_enhanced_model() out, _ = capsys.readouterr() - assert 'Chrome' in out + assert "Chrome" in out def 
test_transcribe_file_with_metadata(capsys): transcribe_file_with_metadata() out, _ = capsys.readouterr() - assert 'Chrome' in out + assert "Chrome" in out def test_transcribe_file_with_auto_punctuation(capsys): transcribe_file_with_auto_punctuation() out, _ = capsys.readouterr() - assert 'First alternative of result ' in out + assert "First alternative of result " in out def test_transcribe_diarization(capsys): @@ -58,18 +59,18 @@ def test_transcribe_multichannel_file(capsys): transcribe_file_with_multichannel() out, err = capsys.readouterr() - assert 'OK Google stream stranger things from Netflix to my TV' in out + assert "OK Google stream stranger things from Netflix to my TV" in out def test_transcribe_multilanguage_file(capsys): transcribe_file_with_multilanguage() out, err = capsys.readouterr() - assert 'how are you doing estoy bien e tu' in out + assert "how are you doing estoy bien e tu" in out def test_transcribe_word_level_confidence(capsys): transcribe_file_with_word_level_confidence() out, err = capsys.readouterr() - assert 'OK Google stream stranger things from Netflix to my TV' in out + assert "OK Google stream stranger things from Netflix to my TV" in out diff --git a/speech/snippets/quickstart.py b/speech/snippets/quickstart.py index f90f52fb04b3..ad0ab3275838 100644 --- a/speech/snippets/quickstart.py +++ b/speech/snippets/quickstart.py @@ -23,8 +23,7 @@ def run_quickstart(): # Imports the Google Cloud client library # [START speech_python_migration_imports] from google.cloud import speech - from google.cloud.speech import enums - from google.cloud.speech import types + # [END speech_python_migration_imports] # Instantiates a client @@ -33,28 +32,26 @@ def run_quickstart(): # [END speech_python_migration_client] # The name of the audio file to transcribe - file_name = os.path.join( - os.path.dirname(__file__), - 'resources', - 'audio.raw') + file_name = os.path.join(os.path.dirname(__file__), "resources", "audio.raw") # Loads the audio into memory - with io.open(file_name, 'rb') as audio_file: + with io.open(file_name, "rb") as audio_file: content = audio_file.read() - audio = types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = types.RecognitionConfig( - encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US') + language_code="en-US", + ) # Detects speech in the audio file - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for result in response.results: - print('Transcript: {}'.format(result.alternatives[0].transcript)) + print("Transcript: {}".format(result.alternatives[0].transcript)) # [END speech_quickstart] -if __name__ == '__main__': +if __name__ == "__main__": run_quickstart() diff --git a/speech/snippets/quickstart_test.py b/speech/snippets/quickstart_test.py index 0675ad195d3a..7fcca1856a79 100644 --- a/speech/snippets/quickstart_test.py +++ b/speech/snippets/quickstart_test.py @@ -19,4 +19,4 @@ def test_quickstart(capsys): quickstart.run_quickstart() out, _ = capsys.readouterr() - assert 'Transcript: how old is the Brooklyn Bridge' in out + assert "Transcript: how old is the Brooklyn Bridge" in out diff --git a/speech/snippets/speech_adaptation_beta.py b/speech/snippets/speech_adaptation_beta.py index 35e9527fdea8..890bb8ed7284 100644 --- a/speech/snippets/speech_adaptation_beta.py +++ 
b/speech/snippets/speech_adaptation_beta.py @@ -26,7 +26,6 @@ # [START speech_adaptation_beta] from google.cloud import speech_v1p1beta1 -from google.cloud.speech_v1p1beta1 import enums def sample_recognize(storage_uri, phrase): @@ -62,7 +61,7 @@ def sample_recognize(storage_uri, phrase): # Encoding of audio data sent. This sample sets this explicitly. # This field is optional for FLAC and WAV audio formats. - encoding = enums.RecognitionConfig.AudioEncoding.MP3 + encoding = speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3 config = { "speech_contexts": speech_contexts, "sample_rate_hertz": sample_rate_hertz, @@ -71,14 +70,13 @@ def sample_recognize(storage_uri, phrase): } audio = {"uri": storage_uri} - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for result in response.results: # First alternative is the most probable result alternative = result.alternatives[0] print(u"Transcript: {}".format(alternative.transcript)) - -# [END speech_adaptation_beta] + # [END speech_adaptation_beta] return response diff --git a/speech/snippets/speech_quickstart_beta.py b/speech/snippets/speech_quickstart_beta.py index 431f6d5490c2..ba1efab1a847 100644 --- a/speech/snippets/speech_quickstart_beta.py +++ b/speech/snippets/speech_quickstart_beta.py @@ -26,7 +26,6 @@ # [START speech_quickstart_beta] from google.cloud import speech_v1p1beta1 -from google.cloud.speech_v1p1beta1 import enums def sample_recognize(storage_uri): @@ -49,7 +48,7 @@ def sample_recognize(storage_uri): # Encoding of audio data sent. This sample sets this explicitly. # This field is optional for FLAC and WAV audio formats. - encoding = enums.RecognitionConfig.AudioEncoding.MP3 + encoding = speech_v1p1beta1.RecognitionConfig.AudioEncoding.MP3 config = { "language_code": language_code, "sample_rate_hertz": sample_rate_hertz, @@ -57,13 +56,13 @@ def sample_recognize(storage_uri): } audio = {"uri": storage_uri} - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for result in response.results: # First alternative is the most probable result alternative = result.alternatives[0] print(u"Transcript: {}".format(alternative.transcript)) -# [END speech_quickstart_beta] + # [END speech_quickstart_beta] return response diff --git a/speech/snippets/transcribe.py b/speech/snippets/transcribe.py index 1ff446d43652..2cd21ddc3194 100644 --- a/speech/snippets/transcribe.py +++ b/speech/snippets/transcribe.py @@ -29,32 +29,34 @@ def transcribe_file(speech_file): """Transcribe the given audio file.""" from google.cloud import speech - from google.cloud.speech import enums - from google.cloud.speech import types import io + client = speech.SpeechClient() # [START speech_python_migration_sync_request] # [START speech_python_migration_config] - with io.open(speech_file, 'rb') as audio_file: + with io.open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = types.RecognitionAudio(content=content) - config = types.RecognitionConfig( - encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US') + language_code="en-US", + ) # [END speech_python_migration_config] # [START speech_python_migration_sync_response] - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, 
"audio": audio}) # [END speech_python_migration_sync_request] # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. - print(u'Transcript: {}'.format(result.alternatives[0].transcript)) + print(u"Transcript: {}".format(result.alternatives[0].transcript)) # [END speech_python_migration_sync_response] + + # [END speech_transcribe_sync] @@ -62,35 +64,36 @@ def transcribe_file(speech_file): def transcribe_gcs(gcs_uri): """Transcribes the audio file specified by the gcs_uri.""" from google.cloud import speech - from google.cloud.speech import enums - from google.cloud.speech import types + client = speech.SpeechClient() # [START speech_python_migration_config_gcs] - audio = types.RecognitionAudio(uri=gcs_uri) - config = types.RecognitionConfig( - encoding=enums.RecognitionConfig.AudioEncoding.FLAC, + audio = speech.RecognitionAudio(uri=gcs_uri) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16000, - language_code='en-US') + language_code="en-US", + ) # [END speech_python_migration_config_gcs] - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. - print(u'Transcript: {}'.format(result.alternatives[0].transcript)) + print(u"Transcript: {}".format(result.alternatives[0].transcript)) + + # [END speech_transcribe_sync_gcs] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument( - 'path', help='File or GCS path for audio file to be recognized') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") args = parser.parse_args() - if args.path.startswith('gs://'): + if args.path.startswith("gs://"): transcribe_gcs(args.path) else: transcribe_file(args.path) diff --git a/speech/snippets/transcribe_async.py b/speech/snippets/transcribe_async.py index 0f9f5b2dc606..789f2f36edc1 100644 --- a/speech/snippets/transcribe_async.py +++ b/speech/snippets/transcribe_async.py @@ -30,34 +30,38 @@ def transcribe_file(speech_file): """Transcribe the given audio file asynchronously.""" from google.cloud import speech - from google.cloud.speech import enums - from google.cloud.speech import types + client = speech.SpeechClient() # [START speech_python_migration_async_request] - with io.open(speech_file, 'rb') as audio_file: + with io.open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = types.RecognitionAudio(content=content) - config = types.RecognitionConfig( - encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US') + language_code="en-US", + ) # [START speech_python_migration_async_response] - operation = client.long_running_recognize(config, audio) + operation = client.long_running_recognize( + request={"config": config, "audio": audio} 
+ ) # [END speech_python_migration_async_request] - print('Waiting for operation to complete...') + print("Waiting for operation to complete...") response = operation.result(timeout=90) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. - print(u'Transcript: {}'.format(result.alternatives[0].transcript)) - print('Confidence: {}'.format(result.alternatives[0].confidence)) + print(u"Transcript: {}".format(result.alternatives[0].transcript)) + print("Confidence: {}".format(result.alternatives[0].confidence)) # [END speech_python_migration_async_response] + + # [END speech_transcribe_async] @@ -65,38 +69,41 @@ def transcribe_file(speech_file): def transcribe_gcs(gcs_uri): """Asynchronously transcribes the audio file specified by the gcs_uri.""" from google.cloud import speech - from google.cloud.speech import enums - from google.cloud.speech import types + client = speech.SpeechClient() - audio = types.RecognitionAudio(uri=gcs_uri) - config = types.RecognitionConfig( - encoding=enums.RecognitionConfig.AudioEncoding.FLAC, + audio = speech.RecognitionAudio(uri=gcs_uri) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.FLAC, sample_rate_hertz=16000, - language_code='en-US') + language_code="en-US", + ) - operation = client.long_running_recognize(config, audio) + operation = client.long_running_recognize( + request={"config": config, "audio": audio} + ) - print('Waiting for operation to complete...') + print("Waiting for operation to complete...") response = operation.result(timeout=90) # Each result is for a consecutive portion of the audio. Iterate through # them to get the transcripts for the entire audio file. for result in response.results: # The first alternative is the most likely one for this portion. 
- print(u'Transcript: {}'.format(result.alternatives[0].transcript)) - print('Confidence: {}'.format(result.alternatives[0].confidence)) + print(u"Transcript: {}".format(result.alternatives[0].transcript)) + print("Confidence: {}".format(result.alternatives[0].confidence)) + + # [END speech_transcribe_async_gcs] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument( - 'path', help='File or GCS path for audio file to be recognized') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") args = parser.parse_args() - if args.path.startswith('gs://'): + if args.path.startswith("gs://"): transcribe_gcs(args.path) else: transcribe_file(args.path) diff --git a/speech/snippets/transcribe_async_test.py b/speech/snippets/transcribe_async_test.py index 7d66747eb446..47d5f8385a78 100644 --- a/speech/snippets/transcribe_async_test.py +++ b/speech/snippets/transcribe_async_test.py @@ -16,20 +16,18 @@ import transcribe_async -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") def test_transcribe(capsys): - transcribe_async.transcribe_file( - os.path.join(RESOURCES, 'audio.raw')) + transcribe_async.transcribe_file(os.path.join(RESOURCES, "audio.raw")) out, err = capsys.readouterr() - assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I) + assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I) def test_transcribe_gcs(capsys): - transcribe_async.transcribe_gcs( - 'gs://python-docs-samples-tests/speech/audio.flac') + transcribe_async.transcribe_gcs("gs://python-docs-samples-tests/speech/audio.flac") out, err = capsys.readouterr() - assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I) + assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I) diff --git a/speech/snippets/transcribe_auto_punctuation.py b/speech/snippets/transcribe_auto_punctuation.py index 4e65afafaf43..106de0f772a3 100644 --- a/speech/snippets/transcribe_auto_punctuation.py +++ b/speech/snippets/transcribe_auto_punctuation.py @@ -29,35 +29,37 @@ def transcribe_file_with_auto_punctuation(path): """Transcribe the given audio file with auto punctuation enabled.""" # [START speech_transcribe_auto_punctuation] from google.cloud import speech + client = speech.SpeechClient() # path = 'resources/commercial_mono.wav' - with io.open(path, 'rb') as audio_file: + with io.open(path, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', + language_code="en-US", # Enable automatic punctuation - enable_automatic_punctuation=True) + enable_automatic_punctuation=True, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print('Transcript: {}'.format(alternative.transcript)) + print("-" 
* 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) # [END speech_transcribe_auto_punctuation] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('path', help='File to stream to the API') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File to stream to the API") args = parser.parse_args() diff --git a/speech/snippets/transcribe_auto_punctuation_test.py b/speech/snippets/transcribe_auto_punctuation_test.py index e42018d47a6e..8e95eac68e34 100644 --- a/speech/snippets/transcribe_auto_punctuation_test.py +++ b/speech/snippets/transcribe_auto_punctuation_test.py @@ -15,12 +15,13 @@ import transcribe_auto_punctuation -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") def test_transcribe_file_with_auto_punctuation(capsys): transcribe_auto_punctuation.transcribe_file_with_auto_punctuation( - 'resources/commercial_mono.wav') + "resources/commercial_mono.wav" + ) out, _ = capsys.readouterr() - assert 'First alternative of result ' in out + assert "First alternative of result " in out diff --git a/speech/snippets/transcribe_context_classes.py b/speech/snippets/transcribe_context_classes.py index af483928ce92..69f40fd9c4ce 100644 --- a/speech/snippets/transcribe_context_classes.py +++ b/speech/snippets/transcribe_context_classes.py @@ -18,30 +18,32 @@ def transcribe_context_classes(storage_uri): favor specific classes of words in the results.""" # [START speech_context_classes] from google.cloud import speech + client = speech.SpeechClient() # storage_uri = 'gs://YOUR_BUCKET_ID/path/to/your/file.wav' - audio = speech.types.RecognitionAudio(uri=storage_uri) + audio = speech.RecognitionAudio(uri=storage_uri) # SpeechContext: to configure your speech_context see: # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#speechcontext # Full list of supported phrases (class tokens) here: # https://cloud.google.com/speech-to-text/docs/class-tokens - speech_context = speech.types.SpeechContext(phrases=['$TIME']) + speech_context = speech.SpeechContext(phrases=["$TIME"]) # RecognitionConfig: to configure your encoding and sample_rate_hertz, see: # https://cloud.google.com/speech-to-text/docs/reference/rpc/google.cloud.speech.v1#recognitionconfig - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', - speech_contexts=[speech_context]) + language_code="en-US", + speech_contexts=[speech_context], + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print('Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) # [END speech_context_classes] diff --git a/speech/snippets/transcribe_context_classes_test.py b/speech/snippets/transcribe_context_classes_test.py index e0d10d6f5410..61642fb2a5ce 100644 --- 
a/speech/snippets/transcribe_context_classes_test.py +++ b/speech/snippets/transcribe_context_classes_test.py @@ -16,7 +16,8 @@ def test_transcribe_context_classes(capsys): transcribe_context_classes.transcribe_context_classes( - 'gs://cloud-samples-data/speech/commercial_mono.wav') + "gs://cloud-samples-data/speech/commercial_mono.wav" + ) out, _ = capsys.readouterr() - assert 'First alternative of result ' in out + assert "First alternative of result " in out diff --git a/speech/snippets/transcribe_enhanced_model.py b/speech/snippets/transcribe_enhanced_model.py index 1b233c52696c..6b2862c7c55c 100644 --- a/speech/snippets/transcribe_enhanced_model.py +++ b/speech/snippets/transcribe_enhanced_model.py @@ -34,35 +34,36 @@ def transcribe_file_with_enhanced_model(path): client = speech.SpeechClient() # path = 'resources/commercial_mono.wav' - with io.open(path, 'rb') as audio_file: + with io.open(path, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + audio = speech.RecognitionAudio(content=content) + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=8000, - language_code='en-US', + language_code="en-US", # Enhanced models are only available to projects that # opt in for audio data collection. use_enhanced=True, # A model must be specified to use enhanced model. - model='phone_call') + model="phone_call", + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print('Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print("Transcript: {}".format(alternative.transcript)) # [END speech_transcribe_enhanced_model] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument('path', help='File to stream to the API') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File to stream to the API") args = parser.parse_args() diff --git a/speech/snippets/transcribe_enhanced_model_test.py b/speech/snippets/transcribe_enhanced_model_test.py index 6e5676cfb8ff..cf673111604a 100644 --- a/speech/snippets/transcribe_enhanced_model_test.py +++ b/speech/snippets/transcribe_enhanced_model_test.py @@ -15,12 +15,13 @@ import transcribe_enhanced_model -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") def test_transcribe_file_with_enhanced_model(capsys): transcribe_enhanced_model.transcribe_file_with_enhanced_model( - 'resources/commercial_mono.wav') + "resources/commercial_mono.wav" + ) out, _ = capsys.readouterr() - assert 'Chrome' in out + assert "Chrome" in out diff --git a/speech/snippets/transcribe_model_selection.py b/speech/snippets/transcribe_model_selection.py index f81b9e72dd16..a25fc1d51472 100644 --- a/speech/snippets/transcribe_model_selection.py +++ b/speech/snippets/transcribe_model_selection.py @@ -32,26 +32,30 @@ def transcribe_model_selection(speech_file, model): """Transcribe the given audio file synchronously 
with the selected model.""" from google.cloud import speech + client = speech.SpeechClient() - with open(speech_file, 'rb') as audio_file: + with open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US', - model=model) + language_code="en-US", + model=model, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print(u"Transcript: {}".format(alternative.transcript)) + + # [END speech_transcribe_model_selection] @@ -60,43 +64,50 @@ def transcribe_model_selection_gcs(gcs_uri, model): """Transcribe the given audio file asynchronously with the selected model.""" from google.cloud import speech + client = speech.SpeechClient() - audio = speech.types.RecognitionAudio(uri=gcs_uri) + audio = speech.RecognitionAudio(uri=gcs_uri) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=16000, - language_code='en-US', - model=model) + language_code="en-US", + model=model, + ) - operation = client.long_running_recognize(config, audio) + operation = client.long_running_recognize( + request={"config": config, "audio": audio} + ) - print('Waiting for operation to complete...') + print("Waiting for operation to complete...") response = operation.result(timeout=90) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print(u"Transcript: {}".format(alternative.transcript)) + + # [END speech_transcribe_model_selection_gcs] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument( - 'path', help='File or GCS path for audio file to be recognized') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") parser.add_argument( - '--model', help='The speech recognition model to use', - choices=['command_and_search', 'phone_call', 'video', 'default'], - default='default') + "--model", + help="The speech recognition model to use", + choices=["command_and_search", "phone_call", "video", "default"], + default="default", + ) args = parser.parse_args() - if args.path.startswith('gs://'): + if args.path.startswith("gs://"): transcribe_model_selection_gcs(args.path, args.model) else: transcribe_model_selection(args.path, args.model) diff --git a/speech/snippets/transcribe_model_selection_test.py b/speech/snippets/transcribe_model_selection_test.py index 07bd91a4a0ae..59d04fe0b99a 
100644 --- a/speech/snippets/transcribe_model_selection_test.py +++ b/speech/snippets/transcribe_model_selection_test.py @@ -16,20 +16,22 @@ import transcribe_model_selection -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") def test_transcribe_model_selection_file(capsys): transcribe_model_selection.transcribe_model_selection( - os.path.join(RESOURCES, 'Google_Gnome.wav'), 'video') + os.path.join(RESOURCES, "Google_Gnome.wav"), "video" + ) out, err = capsys.readouterr() - assert re.search(r'the weather outside is sunny', out, re.DOTALL | re.I) + assert re.search(r"the weather outside is sunny", out, re.DOTALL | re.I) def test_transcribe_model_selection_gcs(capsys): transcribe_model_selection.transcribe_model_selection_gcs( - 'gs://cloud-samples-tests/speech/Google_Gnome.wav', 'video') + "gs://cloud-samples-tests/speech/Google_Gnome.wav", "video" + ) out, err = capsys.readouterr() - assert re.search(r'the weather outside is sunny', out, re.DOTALL | re.I) + assert re.search(r"the weather outside is sunny", out, re.DOTALL | re.I) diff --git a/speech/snippets/transcribe_multichannel.py b/speech/snippets/transcribe_multichannel.py index e84da59ad7b3..c5b4d5de95c5 100644 --- a/speech/snippets/transcribe_multichannel.py +++ b/speech/snippets/transcribe_multichannel.py @@ -30,28 +30,30 @@ def transcribe_file_with_multichannel(speech_file): multi channel.""" # [START speech_transcribe_multichannel] from google.cloud import speech + client = speech.SpeechClient() - with open(speech_file, 'rb') as audio_file: + with open(speech_file, "rb") as audio_file: content = audio_file.read() - audio = speech.types.RecognitionAudio(content=content) + audio = speech.RecognitionAudio(content=content) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=44100, - language_code='en-US', + language_code="en-US", audio_channel_count=2, - enable_separate_recognition_per_channel=True) + enable_separate_recognition_per_channel=True, + ) - response = client.recognize(config, audio) + response = client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) - print(u'Channel Tag: {}'.format(result.channel_tag)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print(u"Transcript: {}".format(alternative.transcript)) + print(u"Channel Tag: {}".format(result.channel_tag)) # [END speech_transcribe_multichannel] @@ -60,36 +62,37 @@ def transcribe_gcs_with_multichannel(gcs_uri): multi channel.""" # [START speech_transcribe_multichannel_gcs] from google.cloud import speech + client = speech.SpeechClient() - audio = speech.types.RecognitionAudio(uri=gcs_uri) + audio = speech.RecognitionAudio(uri=gcs_uri) - config = speech.types.RecognitionConfig( - encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16, + config = speech.RecognitionConfig( + encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, sample_rate_hertz=44100, - language_code='en-US', + language_code="en-US", audio_channel_count=2, - enable_separate_recognition_per_channel=True) + enable_separate_recognition_per_channel=True, + ) - response = client.recognize(config, audio) + response 
= client.recognize(request={"config": config, "audio": audio}) for i, result in enumerate(response.results): alternative = result.alternatives[0] - print('-' * 20) - print('First alternative of result {}'.format(i)) - print(u'Transcript: {}'.format(alternative.transcript)) - print(u'Channel Tag: {}'.format(result.channel_tag)) + print("-" * 20) + print("First alternative of result {}".format(i)) + print(u"Transcript: {}".format(alternative.transcript)) + print(u"Channel Tag: {}".format(result.channel_tag)) # [END speech_transcribe_multichannel_gcs] -if __name__ == '__main__': +if __name__ == "__main__": parser = argparse.ArgumentParser( - description=__doc__, - formatter_class=argparse.RawDescriptionHelpFormatter) - parser.add_argument( - 'path', help='File or GCS path for audio file to be recognized') + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument("path", help="File or GCS path for audio file to be recognized") args = parser.parse_args() - if args.path.startswith('gs://'): + if args.path.startswith("gs://"): transcribe_gcs_with_multichannel(args.path) else: transcribe_file_with_multichannel(args.path) diff --git a/speech/snippets/transcribe_multichannel_test.py b/speech/snippets/transcribe_multichannel_test.py index de9558629994..54808d169e66 100644 --- a/speech/snippets/transcribe_multichannel_test.py +++ b/speech/snippets/transcribe_multichannel_test.py @@ -15,22 +15,21 @@ from transcribe_multichannel import ( transcribe_file_with_multichannel, - transcribe_gcs_with_multichannel) + transcribe_gcs_with_multichannel, +) -RESOURCES = os.path.join(os.path.dirname(__file__), 'resources') +RESOURCES = os.path.join(os.path.dirname(__file__), "resources") def test_transcribe_multichannel_file(capsys): - transcribe_file_with_multichannel( - os.path.join(RESOURCES, 'multi.wav')) + transcribe_file_with_multichannel(os.path.join(RESOURCES, "multi.wav")) out, err = capsys.readouterr() - assert 'how are you doing' in out + assert "how are you doing" in out def test_transcribe_multichannel_gcs(capsys): - transcribe_gcs_with_multichannel( - 'gs://cloud-samples-data/speech/multi.wav') + transcribe_gcs_with_multichannel("gs://cloud-samples-data/speech/multi.wav") out, err = capsys.readouterr() - assert 'how are you doing' in out + assert "how are you doing" in out diff --git a/speech/snippets/transcribe_onprem/transcribe_onprem.py b/speech/snippets/transcribe_onprem/transcribe_onprem.py index 2c050a153f37..844ef1a0ae16 100644 --- a/speech/snippets/transcribe_onprem/transcribe_onprem.py +++ b/speech/snippets/transcribe_onprem/transcribe_onprem.py @@ -26,7 +26,6 @@ def transcribe_onprem(local_file_path, api_endpoint): api_endpoint: Endpoint to call for speech recognition, e.g. 0.0.0.0:10000 """ from google.cloud import speech_v1p1beta1 - from google.cloud.speech_v1p1beta1 import enums import grpc import io @@ -35,8 +34,11 @@ def transcribe_onprem(local_file_path, api_endpoint): # Create a gRPC channel to your server channel = grpc.insecure_channel(target=api_endpoint) + transport = speech_v1p1beta1.services.speech.transports.SpeechGrpcTransport( + channel=channel + ) - client = speech_v1p1beta1.SpeechClient(channel=channel) + client = speech_v1p1beta1.SpeechClient(transport=transport) # The language of the supplied audio language_code = "en-US" @@ -46,7 +48,7 @@ def transcribe_onprem(local_file_path, api_endpoint): # Encoding of audio data sent. This sample sets this explicitly. # This field is optional for FLAC and WAV audio formats. 
diff --git a/speech/snippets/transcribe_streaming.py b/speech/snippets/transcribe_streaming.py
index de727c221592..d3dc96e5db98 100644
--- a/speech/snippets/transcribe_streaming.py
+++ b/speech/snippets/transcribe_streaming.py
@@ -28,28 +28,29 @@ def transcribe_streaming(stream_file):
     """Streams transcription of the given audio file."""
     import io
     from google.cloud import speech
-    from google.cloud.speech import enums
-    from google.cloud.speech import types
+
     client = speech.SpeechClient()
 
     # [START speech_python_migration_streaming_request]
-    with io.open(stream_file, 'rb') as audio_file:
+    with io.open(stream_file, "rb") as audio_file:
         content = audio_file.read()
 
     # In practice, stream should be a generator yielding chunks of audio data.
     stream = [content]
-    requests = (types.StreamingRecognizeRequest(audio_content=chunk)
-                for chunk in stream)
+    requests = (
+        speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in stream
+    )
 
-    config = types.RecognitionConfig(
-        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+    config = speech.RecognitionConfig(
+        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=16000,
-        language_code='en-US')
-    streaming_config = types.StreamingRecognitionConfig(config=config)
+        language_code="en-US",
+    )
+    streaming_config = speech.StreamingRecognitionConfig(config=config)
 
     # streaming_recognize returns a generator.
     # [START speech_python_migration_streaming_response]
-    responses = client.streaming_recognize(streaming_config, requests)
+    responses = client.streaming_recognize(config=streaming_config, requests=requests)
     # [END speech_python_migration_streaming_request]
 
     for response in responses:
@@ -57,21 +58,23 @@ def transcribe_streaming(stream_file):
         # is_final result. The other results will be for subsequent portions of
         # the audio.
         for result in response.results:
-            print('Finished: {}'.format(result.is_final))
-            print('Stability: {}'.format(result.stability))
+            print("Finished: {}".format(result.is_final))
+            print("Stability: {}".format(result.stability))
             alternatives = result.alternatives
             # The alternatives are ordered from most likely to least.
             for alternative in alternatives:
-                print('Confidence: {}'.format(alternative.confidence))
-                print(u'Transcript: {}'.format(alternative.transcript))
+                print("Confidence: {}".format(alternative.confidence))
+                print(u"Transcript: {}".format(alternative.transcript))
     # [END speech_python_migration_streaming_response]
+
+
 # [END speech_transcribe_streaming]
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument('stream', help='File to stream to the API')
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("stream", help="File to stream to the API")
 
     args = parser.parse_args()
     transcribe_streaming(args.stream)
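The streaming change mirrors the unary one: streaming_recognize() now takes its configuration and its request generator by keyword rather than by position. A minimal sketch of the migrated call shape, assuming google-cloud-speech >= 2.0; audio_chunks is a placeholder standing in for a real stream of audio bytes:

    from google.cloud import speech

    client = speech.SpeechClient()

    streaming_config = speech.StreamingRecognitionConfig(
        config=speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
        )
    )
    # Placeholder; in practice, yield successive chunks of audio bytes.
    audio_chunks = [b"\x00\x00"]
    requests = (
        speech.StreamingRecognizeRequest(audio_content=chunk) for chunk in audio_chunks
    )

    # Both arguments are now passed by keyword.
    responses = client.streaming_recognize(config=streaming_config, requests=requests)
    for response in responses:
        for result in response.results:
            print(result.alternatives[0].transcript)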
diff --git a/speech/snippets/transcribe_streaming_test.py b/speech/snippets/transcribe_streaming_test.py
index 2b3ca8ee5c0b..6eadbe1ecf44 100644
--- a/speech/snippets/transcribe_streaming_test.py
+++ b/speech/snippets/transcribe_streaming_test.py
@@ -16,12 +16,11 @@
 import transcribe_streaming
 
-RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')
+RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
 
 
 def test_transcribe_streaming(capsys):
-    transcribe_streaming.transcribe_streaming(
-        os.path.join(RESOURCES, 'audio.raw'))
+    transcribe_streaming.transcribe_streaming(os.path.join(RESOURCES, "audio.raw"))
 
     out, err = capsys.readouterr()
-    assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I)
+    assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I)
diff --git a/speech/snippets/transcribe_test.py b/speech/snippets/transcribe_test.py
index d1e9f6338ea6..7aac4f865875 100644
--- a/speech/snippets/transcribe_test.py
+++ b/speech/snippets/transcribe_test.py
@@ -16,19 +16,18 @@
 import transcribe
 
-RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')
+RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
 
 
 def test_transcribe_file(capsys):
-    transcribe.transcribe_file(os.path.join(RESOURCES, 'audio.raw'))
+    transcribe.transcribe_file(os.path.join(RESOURCES, "audio.raw"))
 
     out, err = capsys.readouterr()
-    assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I)
+    assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I)
 
 
 def test_transcribe_gcs(capsys):
-    transcribe.transcribe_gcs(
-        'gs://python-docs-samples-tests/speech/audio.flac')
+    transcribe.transcribe_gcs("gs://python-docs-samples-tests/speech/audio.flac")
 
     out, err = capsys.readouterr()
-    assert re.search(r'how old is the Brooklyn Bridge', out, re.DOTALL | re.I)
+    assert re.search(r"how old is the Brooklyn Bridge", out, re.DOTALL | re.I)
diff --git a/speech/snippets/transcribe_word_time_offsets.py b/speech/snippets/transcribe_word_time_offsets.py
index 43ddf38c9aae..b49f2ecbe8f7 100644
--- a/speech/snippets/transcribe_word_time_offsets.py
+++ b/speech/snippets/transcribe_word_time_offsets.py
@@ -30,34 +30,33 @@ def transcribe_file_with_word_time_offsets(speech_file):
     """Transcribe the given audio file synchronously and output the word time
     offsets."""
     from google.cloud import speech
-    from google.cloud.speech import enums
-    from google.cloud.speech import types
+
     client = speech.SpeechClient()
 
-    with io.open(speech_file, 'rb') as audio_file:
+    with io.open(speech_file, "rb") as audio_file:
         content = audio_file.read()
 
-    audio = types.RecognitionAudio(content=content)
-    config = types.RecognitionConfig(
-        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
+    audio = speech.RecognitionAudio(content=content)
+    config = speech.RecognitionConfig(
+        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
         sample_rate_hertz=16000,
-        language_code='en-US',
-        enable_word_time_offsets=True)
+        language_code="en-US",
+        enable_word_time_offsets=True,
+    )
 
-    response = client.recognize(config, audio)
+    response = client.recognize(request={"config": config, "audio": audio})
 
     for result in response.results:
         alternative = result.alternatives[0]
-        print(u'Transcript: {}'.format(alternative.transcript))
+        print("Transcript: {}".format(alternative.transcript))
 
         for word_info in alternative.words:
             word = word_info.word
             start_time = word_info.start_time
             end_time = word_info.end_time
 
-            print('Word: {}, start_time: {}, end_time: {}'.format(
-                word,
-                start_time.seconds + start_time.nanos * 1e-9,
-                end_time.seconds + end_time.nanos * 1e-9))
+            print(
+                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
+            )
 
 
 # [START speech_transcribe_async_word_time_offsets_gcs]
@@ -65,46 +64,48 @@ def transcribe_gcs_with_word_time_offsets(gcs_uri):
     """Transcribe the given audio file asynchronously and output the word time
     offsets."""
     from google.cloud import speech
-    from google.cloud.speech import enums
-    from google.cloud.speech import types
+
    client = speech.SpeechClient()
 
-    audio = types.RecognitionAudio(uri=gcs_uri)
-    config = types.RecognitionConfig(
-        encoding=enums.RecognitionConfig.AudioEncoding.FLAC,
+    audio = speech.RecognitionAudio(uri=gcs_uri)
+    config = speech.RecognitionConfig(
+        encoding=speech.RecognitionConfig.AudioEncoding.FLAC,
         sample_rate_hertz=16000,
-        language_code='en-US',
-        enable_word_time_offsets=True)
+        language_code="en-US",
+        enable_word_time_offsets=True,
+    )
 
-    operation = client.long_running_recognize(config, audio)
+    operation = client.long_running_recognize(
+        request={"config": config, "audio": audio}
+    )
 
-    print('Waiting for operation to complete...')
+    print("Waiting for operation to complete...")
     result = operation.result(timeout=90)
 
     for result in result.results:
         alternative = result.alternatives[0]
-        print(u'Transcript: {}'.format(alternative.transcript))
-        print('Confidence: {}'.format(alternative.confidence))
+        print("Transcript: {}".format(alternative.transcript))
+        print("Confidence: {}".format(alternative.confidence))
 
         for word_info in alternative.words:
             word = word_info.word
             start_time = word_info.start_time
             end_time = word_info.end_time
 
-            print('Word: {}, start_time: {}, end_time: {}'.format(
-                word,
-                start_time.seconds + start_time.nanos * 1e-9,
-                end_time.seconds + end_time.nanos * 1e-9))
+            print(
+                f"Word: {word}, start_time: {start_time.total_seconds()}, end_time: {end_time.total_seconds()}"
+            )
+
+
 # [END speech_transcribe_async_word_time_offsets_gcs]
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description=__doc__,
-        formatter_class=argparse.RawDescriptionHelpFormatter)
-    parser.add_argument(
-        'path', help='File or GCS path for audio file to be recognized')
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument("path", help="File or GCS path for audio file to be recognized")
 
     args = parser.parse_args()
-    if args.path.startswith('gs://'):
+    if args.path.startswith("gs://"):
        transcribe_gcs_with_word_time_offsets(args.path)
     else:
         transcribe_file_with_word_time_offsets(args.path)
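The word-timing arithmetic above is the other change worth calling out: in 2.x the word start_time/end_time values behave like datetime.timedelta, so the manual seconds + nanos * 1e-9 computation collapses into a single total_seconds() call. A small self-contained illustration (the duration value is made up):

    from datetime import timedelta

    # Stand-in for word_info.start_time as surfaced by the 2.x library.
    start_time = timedelta(seconds=1, microseconds=300000)

    # 1.x style required manual Duration arithmetic:
    #     start_time.seconds + start_time.nanos * 1e-9
    # 2.x style is one method call:
    print(start_time.total_seconds())  # 1.3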
diff --git a/speech/snippets/transcribe_word_time_offsets_test.py b/speech/snippets/transcribe_word_time_offsets_test.py
index e894385f1e62..185209494529 100644
--- a/speech/snippets/transcribe_word_time_offsets_test.py
+++ b/speech/snippets/transcribe_word_time_offsets_test.py
@@ -16,16 +16,17 @@
 import transcribe_word_time_offsets
 
-RESOURCES = os.path.join(os.path.dirname(__file__), 'resources')
+RESOURCES = os.path.join(os.path.dirname(__file__), "resources")
 
 
 def test_transcribe_file_with_word_time_offsets(capsys):
     transcribe_word_time_offsets.transcribe_file_with_word_time_offsets(
-        os.path.join(RESOURCES, 'audio.raw'))
+        os.path.join(RESOURCES, "audio.raw")
+    )
 
     out, _ = capsys.readouterr()
     print(out)
-    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
+    match = re.search(r"Bridge, start_time: ([0-9.]+)", out, re.DOTALL | re.I)
     time = float(match.group(1))
 
     assert time > 0
@@ -33,11 +34,12 @@ def test_transcribe_file_with_word_time_offsets(capsys):
 
 def test_transcribe_gcs_with_word_time_offsets(capsys):
     transcribe_word_time_offsets.transcribe_gcs_with_word_time_offsets(
-        'gs://python-docs-samples-tests/speech/audio.flac')
+        "gs://python-docs-samples-tests/speech/audio.flac"
+    )
 
     out, _ = capsys.readouterr()
     print(out)
-    match = re.search(r'Bridge, start_time: ([0-9.]+)', out, re.DOTALL | re.I)
+    match = re.search(r"Bridge, start_time: ([0-9.]+)", out, re.DOTALL | re.I)
     time = float(match.group(1))
 
     assert time > 0