From 2d6510de9474636b64d0bc1119a947532fefce7b Mon Sep 17 00:00:00 2001 From: Shubham Singh Date: Tue, 23 Jan 2024 19:43:37 +0530 Subject: [PATCH] added support for v2 apis, removed old code --- speech/livecaption_from_file_v2/README.md | 32 ------ speech/livecaption_v2/README.md | 87 --------------- .../transcribe_streaming_decoder_test.go | 38 +++++++ speech/snippets/transcribe_streaming_test.go | 36 ++++++ .../transcribe_streaming_v2.go} | 103 ++++++++---------- .../transcribe_streaming_v2_decoder.go} | 61 +++++------ 6 files changed, 145 insertions(+), 212 deletions(-) delete mode 100644 speech/livecaption_from_file_v2/README.md delete mode 100644 speech/livecaption_v2/README.md create mode 100644 speech/snippets/transcribe_streaming_decoder_test.go create mode 100644 speech/snippets/transcribe_streaming_test.go rename speech/{livecaption_v2/livecaption.go => snippets/transcribe_streaming_v2.go} (52%) rename speech/{livecaption_from_file_v2/livecaption_from_file.go => snippets/transcribe_streaming_v2_decoder.go} (73%) diff --git a/speech/livecaption_from_file_v2/README.md b/speech/livecaption_from_file_v2/README.md deleted file mode 100644 index 71a2d09b57..0000000000 --- a/speech/livecaption_from_file_v2/README.md +++ /dev/null @@ -1,32 +0,0 @@ -# Google Cloud Speech API Go example - -## Authentication - -* Create a project with the [Google Cloud Console][cloud-console], and enable - the [Speech API][speech-api]. 
-* From the Cloud Console, create a service account, - download its json credentials file, then set the - `GOOGLE_APPLICATION_CREDENTIALS` environment variable: - - ```bash - export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json - ``` - -[cloud-console]: https://console.cloud.google.com -[speech-api]: https://console.cloud.google.com/apis/api/speech.googleapis.com/overview?project=_ -[adc]: https://cloud.google.com/docs/authentication#developer_workflow - -## Run the sample - -Before running any example you must first install the Speech API client: - -```bash -go get -u cloud.google.com/go/speech/apiv1 -``` - -To run the example with one of a sample audio file: - -```bash -go build -livecaption_from_file ../testdata/audio.raw -``` diff --git a/speech/livecaption_v2/README.md b/speech/livecaption_v2/README.md deleted file mode 100644 index a0da18cd57..0000000000 --- a/speech/livecaption_v2/README.md +++ /dev/null @@ -1,87 +0,0 @@ -# Google Cloud Speech API Go example - -## Authentication - -* Create a project with the [Google Cloud Console][cloud-console], and enable - the [Speech API][speech-api]. -* From the Cloud Console, create a service account, - download its json credentials file, then set the - `GOOGLE_APPLICATION_CREDENTIALS` environment variable: - - ```bash - export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json - ``` - -[cloud-console]: https://console.cloud.google.com -[speech-api]: https://console.cloud.google.com/apis/api/speech.googleapis.com/overview?project=_ -[adc]: https://cloud.google.com/docs/authentication#developer_workflow - -## Run the sample - -Before running any example you must first install the Speech API client: - -```bash -go get -u cloud.google.com/go/speech/apiv1 -``` - -To run the example with a local file: - -```bash -go build -cat ../testdata/audio.raw | livecaption -``` - -## Capturing audio from the mic - -Alternatively, `gst-launch` can be used to capture audio from the mic. 
For example: - -```bash -gst-launch-1.0 -v pulsesrc ! audioconvert ! audioresample ! audio/x-raw,channels=1,rate=16000 ! filesink location=/dev/stdout | livecaption -``` - -In order to discover your recording device you may use the `gst-device-monitor-1.0` command line tool. For example: - -```bash -$ gst-device-monitor-1.0 -Probing devices... - - -Device found: - - name : Built-in Output - class : Audio/Sink - caps : audio/x-raw, format=(string)F32LE, layout=(string)interleaved, rate=(int)44100, channels=(int)2, channel-mask=(bitmask)0x0000000000000003; - audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)[ 1, 2147483647 ], channels=(int)2, channel-mask=(bitmask)0x0000000000000003; - audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)[ 1, 2147483647 ], channels=(int)1; - gst-launch-1.0 ... ! 
osxaudiosink device=46 - - -Device found: - - name : Built-in Microph - class : Audio/Source - caps : audio/x-raw, format=(string)F32LE, layout=(string)interleaved, rate=(int)44100, channels=(int)2, channel-mask=(bitmask)0x0000000000000003; - audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)44100, channels=(int)2, channel-mask=(bitmask)0x0000000000000003; - audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)44100, channels=(int)1; - gst-launch-1.0 osxaudiosrc device=39 ! ... -``` - -In the above example the recording device (`Built-In Microphone`) is `osxaudiosrc device=39`, so in order to run the example you would need to adapt the command-line accordingly: - -```bash -gst-launch-1.0 -v osxaudiosrc device=39 ! audioconvert ! audioresample ! audio/x-raw,channels=1,rate=16000 ! filesink location=/dev/stdout | livecaption -``` - -## Content Limits - -The Speech API contains the following limits on the size of content (and are subject to change): - -| Content Limit | Audio Length | -| ------------- | ------------ | -| Synchronous Requests | ~1 Minute | -| Asynchronous Requests | ~180 Minutes | -| Streaming Requests | ~1 Minute | - -Please note that each `StreamingRecognize` session is considered a single request even though it includes multiple frames of `StreamingRecognizeRequest` audio within the stream. - -For more information, please refer to https://cloud.google.com/speech/limits#content. 
\ No newline at end of file diff --git a/speech/snippets/transcribe_streaming_decoder_test.go b/speech/snippets/transcribe_streaming_decoder_test.go new file mode 100644 index 0000000000..eb8357fb83 --- /dev/null +++ b/speech/snippets/transcribe_streaming_decoder_test.go @@ -0,0 +1,38 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package snippets + +import ( + "bytes" + "os" + "strings" + "testing" + + "github.com/GoogleCloudPlatform/golang-samples/internal/testutil" +) + +var recognitionAudioFileRawLINEAR16 = "../testdata/audio.raw" + +func TestTrascribeStreamingV2SpecificDecoding(t *testing.T) { + testutil.SystemTest(t) + projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID") + var buf bytes.Buffer + if err := transcribe_streaming_specific_decoding_v2(&buf, recognitionAudioFileRawLINEAR16, projectID); err != nil { + t.Fatalf("transcribe_streaming_specific_decoding_v2: %v", err) + } + if got := buf.String(); !strings.Contains(got, "Brooklyn Bridge") { + t.Errorf("transcribe_streaming_specific_decoding_v2 got %q, expected %q", got, "Brooklyn Bridge") + } +} diff --git a/speech/snippets/transcribe_streaming_test.go b/speech/snippets/transcribe_streaming_test.go new file mode 100644 index 0000000000..7fed42863d --- /dev/null +++ b/speech/snippets/transcribe_streaming_test.go @@ -0,0 +1,36 @@ +// Copyright 2023 Google LLC +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with 
the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package snippets + +import ( + "bytes" + "os" + "strings" + "testing" + + "github.com/GoogleCloudPlatform/golang-samples/internal/testutil" +) + +func TestTrascribeStreamingV2(t *testing.T) { + testutil.SystemTest(t) + projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID") + var buf bytes.Buffer + if err := transcribe_streaming_v2(&buf, recognitionAudioFile, projectID); err != nil { + t.Fatalf("transcribe_streaming_v2: %v", err) + } + if got := buf.String(); !strings.Contains(got, "Chromecast") { + t.Errorf("transcribe_streaming_v2 got %q, expected %q", got, "Chromecast") + } +} diff --git a/speech/livecaption_v2/livecaption.go b/speech/snippets/transcribe_streaming_v2.go similarity index 52% rename from speech/livecaption_v2/livecaption.go rename to speech/snippets/transcribe_streaming_v2.go index 4581b22e50..c65a530e31 100644 --- a/speech/livecaption_v2/livecaption.go +++ b/speech/snippets/transcribe_streaming_v2.go @@ -12,23 +12,20 @@ // See the License for the specific language governing permissions and // limitations under the License. -// Command livecaption pipes the stdin audio data to +// Command livecaption_from_file streams a local audio file to // Google Speech API and outputs the transcript. -// -// As an example, gst-launch can be used to capture the mic input: -// -// $ gst-launch-1.0 -v pulsesrc ! audioconvert ! audioresample ! audio/x-raw,channels=1,rate=16000 ! 
filesink location=/dev/stdout | livecaption -package main +package snippets -// [START speech_transcribe_streaming_mic] +// [START speech_transcribe_streaming] import ( "context" - "flag" "fmt" "io" "log" "os" + "path/filepath" + "strings" speech "cloud.google.com/go/speech/apiv2" "cloud.google.com/go/speech/apiv2/speechpb" @@ -38,49 +35,36 @@ var projectID string const location = "global" -func main() { - ctx := context.Background() +func transcribe_streaming_v2(w io.Writer, path string, projectID string) error { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: %s \n", os.Args[0]) - fmt.Fprintf(os.Stderr, " must be a project_id to a valid gcp projectID with speech api enabled.\n") - - } - flag.Parse() - if len(flag.Args()) != 1 { - log.Fatal("Please pass the project_id as a command line argument. Should be a valid project_id with stt api enabled.") + audioFile, err := filepath.Abs(path) + if err != nil { + log.Println("Failed to load file: ", path) + return err } - projectID = flag.Arg(0) - if projectID == "" { - log.Fatalf("Project is is required parameter: %s", projectID) - } + ctx := context.Background() client, err := speech.NewClient(ctx) if err != nil { - log.Fatal(err) + log.Println(err) + return err } stream, err := client.StreamingRecognize(ctx) if err != nil { - log.Fatal(err) + log.Println(err) + return err } - + // Send the initial configuration message. if err := stream.Send(&speechpb.StreamingRecognizeRequest{ Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location), StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{ StreamingConfig: &speechpb.StreamingRecognitionConfig{ Config: &speechpb.RecognitionConfig{ // In case of specific file encoding , so specify the decoding config. 
- //DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{}, - DecodingConfig: &speechpb.RecognitionConfig_ExplicitDecodingConfig{ - ExplicitDecodingConfig: &speechpb.ExplicitDecodingConfig{ - Encoding: speechpb.ExplicitDecodingConfig_LINEAR16, - SampleRateHertz: 16000, - AudioChannelCount: 1, - }, - }, - Model: "long", - LanguageCodes: []string{"en-US"}, + DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{}, + Model: "long", + LanguageCodes: []string{"en-US"}, Features: &speechpb.RecognitionFeatures{ MaxAlternatives: 2, }, @@ -89,17 +73,21 @@ func main() { }, }, }); err != nil { - log.Fatal(err) + log.Println(err) + return err } - go func() { - // Pipe stdin to the API. - buf := make([]byte, 1024) + f, err := os.Open(audioFile) + if err != nil { + log.Println(err) + return err + } + defer f.Close() + go func() error { + buf := make([]byte, 1024) for { - - n, err := os.Stdin.Read(buf) - + n, err := f.Read(buf) if n > 0 { if err := stream.Send(&speechpb.StreamingRecognizeRequest{ Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location), @@ -107,18 +95,18 @@ func main() { Audio: buf[:n], }, }); err != nil { - log.Printf("Could not send audio: %v", err) + return fmt.Errorf("could not send audio: %v", err) } } if err == io.EOF { // Nothing else to pipe, close the stream. 
if err := stream.CloseSend(); err != nil { - log.Fatalf("Could not close stream: %v", err) + return fmt.Errorf("could not close stream: %w", err) } - return + return nil } if err != nil { - log.Printf("Could not read from stdin: %v", err) + log.Printf("Could not read from %s: %v", audioFile, err) continue } } @@ -127,25 +115,20 @@ func main() { for { resp, err := stream.Recv() if err == io.EOF { - log.Printf("EOF break") break } if err != nil { - log.Fatalf("Could not recognize: %v", err) - } else { - // It seems like the new response api does not have a field called Error - for _, result := range resp.Results { - //fmt.Printf("Result: %+v\n", result) - if len(result.Alternatives) > 0 { - if result.IsFinal == true { - log.Println("result", result.Alternatives[0].Transcript, result.IsFinal) - } - - } + return fmt.Errorf("cannot stream results: %v", err) + } + for i, result := range resp.Results { + fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20)) + fmt.Fprintf(w, "Result %d\n", i+1) + for j, alternative := range result.Alternatives { + fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript) } } - } + return nil } -// [END speech_transcribe_streaming_mic] +// [END speech_transcribe_streaming] diff --git a/speech/livecaption_from_file_v2/livecaption_from_file.go b/speech/snippets/transcribe_streaming_v2_decoder.go similarity index 73% rename from speech/livecaption_from_file_v2/livecaption_from_file.go rename to speech/snippets/transcribe_streaming_v2_decoder.go index 27961d61db..ff82215dbe 100644 --- a/speech/livecaption_from_file_v2/livecaption_from_file.go +++ b/speech/snippets/transcribe_streaming_v2_decoder.go @@ -15,48 +15,40 @@ // Command livecaption_from_file streams a local audio file to // Google Speech API and outputs the transcript. 
-package main +package snippets // [START speech_transcribe_streaming] import ( "context" - "flag" "fmt" "io" "log" "os" "path/filepath" + "strings" speech "cloud.google.com/go/speech/apiv2" "cloud.google.com/go/speech/apiv2/speechpb" ) -var projectID string - -const location = "global" - -func main() { - flag.Usage = func() { - fmt.Fprintf(os.Stderr, "Usage: %s %s \n", os.Args[0], filepath.Base(os.Args[1])) - fmt.Fprintf(os.Stderr, " must be a path to a local audio file. Audio file must be a 16-bit signed little-endian encoded with a sample rate of 16000.\n") - - } - flag.Parse() - if len(flag.Args()) != 2 { - log.Fatal("Please pass path to your project_id and local audio file as a command line argument") +func transcribe_streaming_specific_decoding_v2(w io.Writer, path string, projectID string) error { + audioFile, err := filepath.Abs(path) + if err != nil { + log.Println("Failed to load file: ", path) + return err } - audioFile := flag.Arg(1) - projectID = flag.Arg(0) ctx := context.Background() client, err := speech.NewClient(ctx) if err != nil { - log.Fatal(err) + log.Println(err) + return err } stream, err := client.StreamingRecognize(ctx) if err != nil { - log.Fatal(err) + log.Println(err) + return err } // Send the initial configuration message. if err := stream.Send(&speechpb.StreamingRecognizeRequest{ @@ -83,16 +75,18 @@ func main() { }, }, }); err != nil { - log.Fatal(err) + log.Println(err) + return err } f, err := os.Open(audioFile) if err != nil { - log.Fatal(err) + log.Println(err) + return err } defer f.Close() - go func() { + go func() error { buf := make([]byte, 1024) for { n, err := f.Read(buf) @@ -103,15 +97,15 @@ func main() { Audio: buf[:n], }, }); err != nil { - log.Printf("Could not send audio: %v", err) + return fmt.Errorf("could not send audio: %v", err) } } if err == io.EOF { // Nothing else to pipe, close the stream. 
if err := stream.CloseSend(); err != nil { - log.Fatalf("Could not close stream: %v", err) + return fmt.Errorf("could not close stream: %w", err) } - return + return nil } if err != nil { log.Printf("Could not read from %s: %v", audioFile, err) @@ -123,21 +117,22 @@ func main() { for { resp, err := stream.Recv() if err == io.EOF { - print("Recv break") break } if err != nil { - log.Fatalf("Cannot stream results: %v", err) + return fmt.Errorf("cannot stream results: %v", err) } - for _, result := range resp.Results { - if len(result.Alternatives) > 0 { - if result.IsFinal == true { - log.Println("result alternatives", result.Alternatives[0].Transcript, result.IsFinal) - } - + for i, result := range resp.Results { + fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20)) + fmt.Fprintf(w, "Result %d\n", i+1) + for j, alternative := range result.Alternatives { + log.Printf("Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript) + fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript) } + } } + return nil } // [END speech_transcribe_streaming]