From 2d6510de9474636b64d0bc1119a947532fefce7b Mon Sep 17 00:00:00 2001
From: Shubham Singh <skshubham@google.com>
Date: Tue, 23 Jan 2024 19:43:37 +0530
Subject: [PATCH] added support for v2 apis, removed old code

---
 speech/livecaption_from_file_v2/README.md     |  32 ------
 speech/livecaption_v2/README.md               |  87 ---------------
 .../transcribe_streaming_decoder_test.go      |  38 +++++++
 speech/snippets/transcribe_streaming_test.go  |  36 ++++++
 .../transcribe_streaming_v2.go}               | 103 ++++++++----------
 .../transcribe_streaming_v2_decoder.go}       |  61 +++++------
 6 files changed, 145 insertions(+), 212 deletions(-)
 delete mode 100644 speech/livecaption_from_file_v2/README.md
 delete mode 100644 speech/livecaption_v2/README.md
 create mode 100644 speech/snippets/transcribe_streaming_decoder_test.go
 create mode 100644 speech/snippets/transcribe_streaming_test.go
 rename speech/{livecaption_v2/livecaption.go => snippets/transcribe_streaming_v2.go} (52%)
 rename speech/{livecaption_from_file_v2/livecaption_from_file.go => snippets/transcribe_streaming_v2_decoder.go} (73%)

diff --git a/speech/livecaption_from_file_v2/README.md b/speech/livecaption_from_file_v2/README.md
deleted file mode 100644
index 71a2d09b57..0000000000
--- a/speech/livecaption_from_file_v2/README.md
+++ /dev/null
@@ -1,32 +0,0 @@
-# Google Cloud Speech API Go example
-
-## Authentication
-
-* Create a project with the [Google Cloud Console][cloud-console], and enable
-  the [Speech API][speech-api].
-* From the Cloud Console, create a service account,
-  download its json credentials file, then set the 
-  `GOOGLE_APPLICATION_CREDENTIALS` environment variable:
-
-  ```bash
-  export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json
-  ```
-
-[cloud-console]: https://console.cloud.google.com
-[speech-api]: https://console.cloud.google.com/apis/api/speech.googleapis.com/overview?project=_
-[adc]: https://cloud.google.com/docs/authentication#developer_workflow
-
-## Run the sample
-
-Before running any example you must first install the Speech API client:
-
-```bash
-go get -u cloud.google.com/go/speech/apiv1
-```
-
-To run the example with one of a sample audio file:
-
-```bash
-go build
-livecaption_from_file <project_id> ../testdata/audio.raw
-```
diff --git a/speech/livecaption_v2/README.md b/speech/livecaption_v2/README.md
deleted file mode 100644
index a0da18cd57..0000000000
--- a/speech/livecaption_v2/README.md
+++ /dev/null
@@ -1,87 +0,0 @@
-# Google Cloud Speech API Go example
-
-## Authentication
-
-* Create a project with the [Google Cloud Console][cloud-console], and enable
-  the [Speech API][speech-api].
-* From the Cloud Console, create a service account,
-  download its json credentials file, then set the 
-  `GOOGLE_APPLICATION_CREDENTIALS` environment variable:
-
-  ```bash
-  export GOOGLE_APPLICATION_CREDENTIALS=/path/to/your-project-credentials.json
-  ```
-
-[cloud-console]: https://console.cloud.google.com
-[speech-api]: https://console.cloud.google.com/apis/api/speech.googleapis.com/overview?project=_
-[adc]: https://cloud.google.com/docs/authentication#developer_workflow
-
-## Run the sample
-
-Before running any example you must first install the Speech API client:
-
-```bash
-go get -u cloud.google.com/go/speech/apiv1
-```
-
-To run the example with a local file:
-
-```bash
-go build
-cat ../testdata/audio.raw | livecaption <project_id>
-```
-
-## Capturing audio from the mic
-
-Alternatively, `gst-launch` can be used to capture audio from the mic. For example:
-
-```bash
-gst-launch-1.0 -v pulsesrc ! audioconvert ! audioresample ! audio/x-raw,channels=1,rate=16000 ! filesink location=/dev/stdout | livecaption <project_id>
-```
-
-In order to discover your recording device you may use the `gst-device-monitor-1.0` command line tool. For example:
-
-```bash
-$ gst-device-monitor-1.0
-Probing devices...
-
-
-Device found:
-
-	name  : Built-in Output
-	class : Audio/Sink
-	caps  : audio/x-raw, format=(string)F32LE, layout=(string)interleaved, rate=(int)44100, channels=(int)2, channel-mask=(bitmask)0x0000000000000003;
-	        audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)[ 1, 2147483647 ], channels=(int)2, channel-mask=(bitmask)0x0000000000000003;
-	        audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)[ 1, 2147483647 ], channels=(int)1;
-	gst-launch-1.0 ... ! osxaudiosink device=46
-
-
-Device found:
-
-	name  : Built-in Microph
-	class : Audio/Source
-	caps  : audio/x-raw, format=(string)F32LE, layout=(string)interleaved, rate=(int)44100, channels=(int)2, channel-mask=(bitmask)0x0000000000000003;
-	        audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)44100, channels=(int)2, channel-mask=(bitmask)0x0000000000000003;
-	        audio/x-raw, format=(string){ S8, U8, S16LE, S16BE, U16LE, U16BE, S24_32LE, S24_32BE, U24_32LE, U24_32BE, S32LE, S32BE, U32LE, U32BE, S24LE, S24BE, U24LE, U24BE, S20LE, S20BE, U20LE, U20BE, S18LE, S18BE, U18LE, U18BE, F32LE, F32BE, F64LE, F64BE }, layout=(string)interleaved, rate=(int)44100, channels=(int)1;
-	gst-launch-1.0 osxaudiosrc device=39 ! ...
-```
-
-In the above example the recording device (`Built-In Microphone`) is `osxaudiosrc device=39`, so in order to run the example you would need to adapt the command-line accordingly:
-
-```bash
-gst-launch-1.0 -v osxaudiosrc device=39 ! audioconvert ! audioresample ! audio/x-raw,channels=1,rate=16000 ! filesink location=/dev/stdout | livecaption <project_id>
-```
-
-## Content Limits
-
-The Speech API contains the following limits on the size of content (and are subject to change):
-
-| Content Limit	| Audio Length |
-| ------------- | ------------ |
-| Synchronous Requests | ~1 Minute |
-| Asynchronous Requests	| ~180 Minutes |
-| Streaming Requests | ~1 Minute |
-
-Please note that each `StreamingRecognize` session is considered a single request even though it includes multiple frames of `StreamingRecognizeRequest` audio within the stream.
-
-For more information, please refer to https://cloud.google.com/speech/limits#content.
\ No newline at end of file
diff --git a/speech/snippets/transcribe_streaming_decoder_test.go b/speech/snippets/transcribe_streaming_decoder_test.go
new file mode 100644
index 0000000000..eb8357fb83
--- /dev/null
+++ b/speech/snippets/transcribe_streaming_decoder_test.go
@@ -0,0 +1,38 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package snippets
+
+import (
+	"bytes"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/GoogleCloudPlatform/golang-samples/internal/testutil"
+)
+
+var recognitionAudioFileRawLINEAR16 = "../testdata/audio.raw"
+
+// TestTrascribeStreamingV2SpecificDecoding runs transcribe_streaming_specific_decoding_v2 on the raw audio fixture and checks the output for the expected phrase.
+func TestTrascribeStreamingV2SpecificDecoding(t *testing.T) {
+	testutil.SystemTest(t)
+	var buf bytes.Buffer
+	if err := transcribe_streaming_specific_decoding_v2(&buf, recognitionAudioFileRawLINEAR16, os.Getenv("GOLANG_SAMPLES_PROJECT_ID")); err != nil {
+		t.Fatalf("transcribe_streaming_specific_decoding_v2: %v", err)
+	}
+	if got, want := buf.String(), "Brooklyn Bridge"; !strings.Contains(got, want) {
+		t.Errorf("transcribe_streaming_specific_decoding_v2 got %q, want it to contain %q", got, want)
+	}
+}
diff --git a/speech/snippets/transcribe_streaming_test.go b/speech/snippets/transcribe_streaming_test.go
new file mode 100644
index 0000000000..7fed42863d
--- /dev/null
+++ b/speech/snippets/transcribe_streaming_test.go
@@ -0,0 +1,36 @@
+// Copyright 2023 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package snippets
+
+import (
+	"bytes"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/GoogleCloudPlatform/golang-samples/internal/testutil"
+)
+
+// TestTrascribeStreamingV2 runs transcribe_streaming_v2 on the shared audio fixture and checks the output for the expected phrase.
+func TestTrascribeStreamingV2(t *testing.T) {
+	testutil.SystemTest(t)
+	var buf bytes.Buffer
+	if err := transcribe_streaming_v2(&buf, recognitionAudioFile, os.Getenv("GOLANG_SAMPLES_PROJECT_ID")); err != nil {
+		t.Fatalf("transcribe_streaming_v2: %v", err)
+	}
+	if got, want := buf.String(), "Chromecast"; !strings.Contains(got, want) {
+		t.Errorf("transcribe_streaming_v2 got %q, want it to contain %q", got, want)
+	}
+}
diff --git a/speech/livecaption_v2/livecaption.go b/speech/snippets/transcribe_streaming_v2.go
similarity index 52%
rename from speech/livecaption_v2/livecaption.go
rename to speech/snippets/transcribe_streaming_v2.go
index 4581b22e50..c65a530e31 100644
--- a/speech/livecaption_v2/livecaption.go
+++ b/speech/snippets/transcribe_streaming_v2.go
@@ -12,23 +12,20 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-// Command livecaption pipes the stdin audio data to
+// This sample streams a local audio file to the
 // Google Speech API and outputs the transcript.
-//
-// As an example, gst-launch can be used to capture the mic input:
-//
-//	$ gst-launch-1.0 -v pulsesrc ! audioconvert ! audioresample ! audio/x-raw,channels=1,rate=16000 ! filesink location=/dev/stdout | livecaption <project_id>
 
-package main
+package snippets
 
-// [START speech_transcribe_streaming_mic]
+// [START speech_transcribe_streaming]
 import (
 	"context"
-	"flag"
 	"fmt"
 	"io"
 	"log"
 	"os"
+	"path/filepath"
+	"strings"
 
 	speech "cloud.google.com/go/speech/apiv2"
 	"cloud.google.com/go/speech/apiv2/speechpb"
@@ -38,49 +35,36 @@ var projectID string
 
 const location = "global"
 
-func main() {
-	ctx := context.Background()
+func transcribe_streaming_v2(w io.Writer, path string, projectID string) error {
 
-	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage: %s <Project_id>\n", os.Args[0])
-		fmt.Fprintf(os.Stderr, "<projectID> must be a project_id to a valid gcp projectID with speech api enabled.\n")
-
-	}
-	flag.Parse()
-	if len(flag.Args()) != 1 {
-		log.Fatal("Please pass the project_id as a command line argument. Should be a valid project_id with stt api enabled.")
+	audioFile, err := filepath.Abs(path)
+	if err != nil {
+		log.Println("Failed to load file: ", path)
+		return err
 	}
-	projectID = flag.Arg(0)
 
-	if projectID == "" {
-		log.Fatalf("Project is is required parameter: %s", projectID)
-	}
+	ctx := context.Background()
 
 	client, err := speech.NewClient(ctx)
 	if err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
 	stream, err := client.StreamingRecognize(ctx)
 	if err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
-
+	// Send the initial configuration message.
 	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
 		Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location),
 		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
 			StreamingConfig: &speechpb.StreamingRecognitionConfig{
 				Config: &speechpb.RecognitionConfig{
 					// In case of specific file encoding , so specify the decoding config.
-					//DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{},
-					DecodingConfig: &speechpb.RecognitionConfig_ExplicitDecodingConfig{
-						ExplicitDecodingConfig: &speechpb.ExplicitDecodingConfig{
-							Encoding:          speechpb.ExplicitDecodingConfig_LINEAR16,
-							SampleRateHertz:   16000,
-							AudioChannelCount: 1,
-						},
-					},
-					Model:         "long",
-					LanguageCodes: []string{"en-US"},
+					DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{},
+					Model:          "long",
+					LanguageCodes:  []string{"en-US"},
 					Features: &speechpb.RecognitionFeatures{
 						MaxAlternatives: 2,
 					},
@@ -89,17 +73,21 @@ func main() {
 			},
 		},
 	}); err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
 
-	go func() {
-		// Pipe stdin to the API.
-		buf := make([]byte, 1024)
+	f, err := os.Open(audioFile)
+	if err != nil {
+		log.Println(err)
+		return err
+	}
+	defer f.Close()
 
+	go func() error {
+		buf := make([]byte, 1024)
 		for {
-
-			n, err := os.Stdin.Read(buf)
-
+			n, err := f.Read(buf)
 			if n > 0 {
 				if err := stream.Send(&speechpb.StreamingRecognizeRequest{
 					Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location),
@@ -107,18 +95,18 @@ func main() {
 						Audio: buf[:n],
 					},
 				}); err != nil {
-					log.Printf("Could not send audio: %v", err)
+					return fmt.Errorf("could not send audio: %v", err)
 				}
 			}
 			if err == io.EOF {
 				// Nothing else to pipe, close the stream.
 				if err := stream.CloseSend(); err != nil {
-					log.Fatalf("Could not close stream: %v", err)
+					return fmt.Errorf("could not close stream: %w", err)
 				}
-				return
+				return nil
 			}
 			if err != nil {
-				log.Printf("Could not read from stdin: %v", err)
+				log.Printf("Could not read from %s: %v", audioFile, err)
 				continue
 			}
 		}
@@ -127,25 +115,20 @@ func main() {
 	for {
 		resp, err := stream.Recv()
 		if err == io.EOF {
-			log.Printf("EOF break")
 			break
 		}
 		if err != nil {
-			log.Fatalf("Could not recognize: %v", err)
-		} else {
-			// It seems like the new response api does not have a field called Error
-			for _, result := range resp.Results {
-				//fmt.Printf("Result: %+v\n", result)
-				if len(result.Alternatives) > 0 {
-					if result.IsFinal == true {
-						log.Println("result", result.Alternatives[0].Transcript, result.IsFinal)
-					}
-
-				}
+			return fmt.Errorf("cannot stream results: %v", err)
+		}
+		for i, result := range resp.Results {
+			fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
+			fmt.Fprintf(w, "Result %d\n", i+1)
+			for j, alternative := range result.Alternatives {
+				fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
 			}
 		}
-
 	}
+	return nil
 }
 
-// [END speech_transcribe_streaming_mic]
+// [END speech_transcribe_streaming]
diff --git a/speech/livecaption_from_file_v2/livecaption_from_file.go b/speech/snippets/transcribe_streaming_v2_decoder.go
similarity index 73%
rename from speech/livecaption_from_file_v2/livecaption_from_file.go
rename to speech/snippets/transcribe_streaming_v2_decoder.go
index 27961d61db..ff82215dbe 100644
--- a/speech/livecaption_from_file_v2/livecaption_from_file.go
+++ b/speech/snippets/transcribe_streaming_v2_decoder.go
@@ -15,48 +15,40 @@
 // Command livecaption_from_file streams a local audio file to
 // Google Speech API and outputs the transcript.
 
-package main
+package snippets
 
 // [START speech_transcribe_streaming]
 import (
 	"context"
-	"flag"
 	"fmt"
 	"io"
 	"log"
 	"os"
 	"path/filepath"
+	"strings"
 
 	speech "cloud.google.com/go/speech/apiv2"
 	"cloud.google.com/go/speech/apiv2/speechpb"
 )
 
-var projectID string
-
-const location = "global"
-
-func main() {
-	flag.Usage = func() {
-		fmt.Fprintf(os.Stderr, "Usage: %s %s <AUDIOFILE>\n", os.Args[0], filepath.Base(os.Args[1]))
-		fmt.Fprintf(os.Stderr, "<AUDIOFILE> must be a path to a local audio file. Audio file must be a 16-bit signed little-endian encoded with a sample rate of 16000.\n")
-
-	}
-	flag.Parse()
-	if len(flag.Args()) != 2 {
-		log.Fatal("Please pass path to your project_id and local audio file as a command line argument")
+func transcribe_streaming_specific_decoding_v2(w io.Writer, path string, projectID string) error {
+	audioFile, err := filepath.Abs(path)
+	if err != nil {
+		log.Println("Failed to load file: ", path)
+		return err
 	}
-	audioFile := flag.Arg(1)
-	projectID = flag.Arg(0)
 
 	ctx := context.Background()
 
 	client, err := speech.NewClient(ctx)
 	if err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
 	stream, err := client.StreamingRecognize(ctx)
 	if err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
 	// Send the initial configuration message.
 	if err := stream.Send(&speechpb.StreamingRecognizeRequest{
@@ -83,16 +75,18 @@ func main() {
 			},
 		},
 	}); err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
 
 	f, err := os.Open(audioFile)
 	if err != nil {
-		log.Fatal(err)
+		log.Println(err)
+		return err
 	}
 	defer f.Close()
 
-	go func() {
+	go func() error {
 		buf := make([]byte, 1024)
 		for {
 			n, err := f.Read(buf)
@@ -103,15 +97,15 @@ func main() {
 						Audio: buf[:n],
 					},
 				}); err != nil {
-					log.Printf("Could not send audio: %v", err)
+					return fmt.Errorf("could not send audio: %v", err)
 				}
 			}
 			if err == io.EOF {
 				// Nothing else to pipe, close the stream.
 				if err := stream.CloseSend(); err != nil {
-					log.Fatalf("Could not close stream: %v", err)
+					return fmt.Errorf("could not close stream: %w", err)
 				}
-				return
+				return nil
 			}
 			if err != nil {
 				log.Printf("Could not read from %s: %v", audioFile, err)
@@ -123,21 +117,22 @@ func main() {
 	for {
 		resp, err := stream.Recv()
 		if err == io.EOF {
-			print("Recv break")
 			break
 		}
 		if err != nil {
-			log.Fatalf("Cannot stream results: %v", err)
+			return fmt.Errorf("cannot stream results: %v", err)
 		}
-		for _, result := range resp.Results {
-			if len(result.Alternatives) > 0 {
-				if result.IsFinal == true {
-					log.Println("result alternatives", result.Alternatives[0].Transcript, result.IsFinal)
-				}
-
+		for i, result := range resp.Results {
+			fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
+			fmt.Fprintf(w, "Result %d\n", i+1)
+			for j, alternative := range result.Alternatives {
+				log.Printf("Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
+				fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
 			}
+
 		}
 	}
+	return nil
 }
 
 // [END speech_transcribe_streaming]