feat(speech): Add speech StreamingRecognize samples (#3753)

* feature: add support to v2 stt apis
* fix readme
* refactor variables
* added support for v2 apis, removed old code
* updated license for new files
* applied suggested fixes
* fix tags for region issue
* applied the suggested changes
* applied the suggested changes
* add support for location

---------

Co-authored-by: Kodanda Rama <kodrama@google.com>
Co-authored-by: Marc Dougherty <muncus@users.noreply.github.com>
GoogleCloudPlatform · Jun 4, 2024 · 743ff06 · 743ff06
1 parent 450027d
commit 743ff06
Show file tree

Hide file tree

Showing 4 changed files with 339 additions and 0 deletions.
diff --git a/speech/snippets/transcribe_streaming_v2.go b/speech/snippets/transcribe_streaming_v2.go
@@ -0,0 +1,128 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This sample streams a local audio file to the Cloud Speech-to-Text v2 API
+// and writes the transcript to the provided io.Writer.
+
+package snippets
+
+// [START speech_transcribe_streaming_v2]
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+
+	speech "cloud.google.com/go/speech/apiv2"
+	"cloud.google.com/go/speech/apiv2/speechpb"
+)
+
+// transcribeStreamingV2 streams the audio file at path to the Speech-to-Text
+// v2 StreamingRecognize API and writes every alternative of every result it
+// receives (interim and final) to w. It uses the implicit recognizer ("_") of
+// projectID in the "global" location with an auto decoding config.
+//
+// It returns an error if the file cannot be opened or read, if the client or
+// the stream cannot be created, or if sending audio or receiving results
+// fails.
+func transcribeStreamingV2(w io.Writer, projectID string, path string) error {
+	const location = "global"
+
+	audioFile, err := filepath.Abs(path)
+	if err != nil {
+		return fmt.Errorf("resolving path %q: %w", path, err)
+	}
+	f, err := os.Open(audioFile)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// Cancelling the context on return tears the stream down on every exit
+	// path, including early returns.
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	client, err := speech.NewClient(ctx)
+	if err != nil {
+		return fmt.Errorf("speech.NewClient: %w", err)
+	}
+	defer func() {
+		// The close error cannot be returned from here, so it is only logged.
+		if err := client.Close(); err != nil {
+			log.Printf("Could not close speech client: %v", err)
+		}
+	}()
+
+	stream, err := client.StreamingRecognize(ctx)
+	if err != nil {
+		return fmt.Errorf("StreamingRecognize: %w", err)
+	}
+
+	recognizer := fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location)
+
+	// Send the initial configuration message.
+	err = stream.Send(&speechpb.StreamingRecognizeRequest{
+		Recognizer: recognizer,
+		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
+			StreamingConfig: &speechpb.StreamingRecognitionConfig{
+				Config: &speechpb.RecognitionConfig{
+					// Auto decoding: no encoding, sample rate or channel
+					// count is supplied by this sample.
+					DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{},
+					Model:          "long",
+					LanguageCodes:  []string{"en-US"},
+					Features: &speechpb.RecognitionFeatures{
+						MaxAlternatives: 2,
+					},
+				},
+				StreamingFeatures: &speechpb.StreamingRecognitionFeatures{InterimResults: true},
+			},
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("could not send streaming config: %w", err)
+	}
+
+	// sendAudio pipes the file to the stream in 1 KiB chunks and half-closes
+	// the stream once the whole file has been sent.
+	sendAudio := func() error {
+		buf := make([]byte, 1024)
+		for {
+			n, readErr := f.Read(buf)
+			if n > 0 {
+				req := &speechpb.StreamingRecognizeRequest{
+					Recognizer: recognizer,
+					StreamingRequest: &speechpb.StreamingRecognizeRequest_Audio{
+						Audio: buf[:n],
+					},
+				}
+				if err := stream.Send(req); err != nil {
+					return fmt.Errorf("could not send audio: %w", err)
+				}
+			}
+			if readErr == io.EOF {
+				// Nothing else to pipe, close the stream.
+				if err := stream.CloseSend(); err != nil {
+					return fmt.Errorf("could not close stream: %w", err)
+				}
+				return nil
+			}
+			if readErr != nil {
+				// Retrying a failed read could spin forever, so give up.
+				return fmt.Errorf("could not read from %s: %w", audioFile, readErr)
+			}
+		}
+	}
+
+	// Buffered so the sender can always deliver its result and exit.
+	sendDone := make(chan error, 1)
+	go func() {
+		err := sendAudio()
+		if err != nil {
+			// Abort the stream so the receive loop below does not wait for
+			// results that will never come.
+			cancel()
+		}
+		sendDone <- err
+	}()
+
+	for {
+		resp, err := stream.Recv()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			// Stop the sender and wait for it, so the file is not closed
+			// while it is still being read.
+			cancel()
+			if sendErr := <-sendDone; sendErr != nil {
+				return fmt.Errorf("cannot stream results: %w (sender: %v)", err, sendErr)
+			}
+			return fmt.Errorf("cannot stream results: %w", err)
+		}
+		for i, result := range resp.Results {
+			fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
+			fmt.Fprintf(w, "Result %d\n", i+1)
+			for j, alternative := range result.Alternatives {
+				fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
+			}
+		}
+	}
+
+	// Surface any error the sender hit instead of dropping it.
+	return <-sendDone
+}
+
+// [END speech_transcribe_streaming_v2]
diff --git a/speech/snippets/transcribe_streaming_v2_explicit_decoding.go b/speech/snippets/transcribe_streaming_v2_explicit_decoding.go
@@ -0,0 +1,137 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// This sample streams a local audio file to the Cloud Speech-to-Text v2 API
+// using an explicit decoding configuration and writes the transcript to the
+// provided io.Writer.
+
+package snippets
+
+// [START speech_transcribe_streaming_v2_explicit_decoding]
+import (
+	"context"
+	"fmt"
+	"io"
+	"log"
+	"os"
+	"path/filepath"
+	"strings"
+
+	speech "cloud.google.com/go/speech/apiv2"
+	"cloud.google.com/go/speech/apiv2/speechpb"
+)
+
+// transcribeStreamingSpecificDecodingV2 streams the audio file at path to the
+// Speech-to-Text v2 StreamingRecognize API and writes every alternative of
+// every result it receives (interim and final) to w. It uses the implicit
+// recognizer ("_") of projectID in the "global" location and declares the
+// audio explicitly as LINEAR16, 16000 Hz, one channel.
+//
+// It returns an error if the file cannot be opened or read, if the client or
+// the stream cannot be created, or if sending audio or receiving results
+// fails.
+func transcribeStreamingSpecificDecodingV2(w io.Writer, projectID string, path string) error {
+	const location = "global"
+
+	audioFile, err := filepath.Abs(path)
+	if err != nil {
+		return fmt.Errorf("resolving path %q: %w", path, err)
+	}
+	f, err := os.Open(audioFile)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// Cancelling the context on return tears the stream down on every exit
+	// path, including early returns.
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	client, err := speech.NewClient(ctx)
+	if err != nil {
+		return fmt.Errorf("speech.NewClient: %w", err)
+	}
+	defer func() {
+		// The close error cannot be returned from here, so it is only logged.
+		if err := client.Close(); err != nil {
+			log.Printf("Could not close speech client: %v", err)
+		}
+	}()
+
+	stream, err := client.StreamingRecognize(ctx)
+	if err != nil {
+		return fmt.Errorf("StreamingRecognize: %w", err)
+	}
+
+	recognizer := fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location)
+
+	// Send the initial configuration message.
+	err = stream.Send(&speechpb.StreamingRecognizeRequest{
+		Recognizer: recognizer,
+		StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{
+			StreamingConfig: &speechpb.StreamingRecognitionConfig{
+				Config: &speechpb.RecognitionConfig{
+					// Explicit decoding: the encoding, sample rate and
+					// channel count are stated here by the sample.
+					DecodingConfig: &speechpb.RecognitionConfig_ExplicitDecodingConfig{
+						ExplicitDecodingConfig: &speechpb.ExplicitDecodingConfig{
+							Encoding:          speechpb.ExplicitDecodingConfig_LINEAR16,
+							SampleRateHertz:   16000,
+							AudioChannelCount: 1,
+						},
+					},
+					Model:         "long",
+					LanguageCodes: []string{"en-US"},
+					Features: &speechpb.RecognitionFeatures{
+						MaxAlternatives: 2,
+					},
+				},
+				StreamingFeatures: &speechpb.StreamingRecognitionFeatures{InterimResults: true},
+			},
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("could not send streaming config: %w", err)
+	}
+
+	// sendAudio pipes the file to the stream in 1 KiB chunks and half-closes
+	// the stream once the whole file has been sent.
+	sendAudio := func() error {
+		buf := make([]byte, 1024)
+		for {
+			n, readErr := f.Read(buf)
+			if n > 0 {
+				req := &speechpb.StreamingRecognizeRequest{
+					Recognizer: recognizer,
+					StreamingRequest: &speechpb.StreamingRecognizeRequest_Audio{
+						Audio: buf[:n],
+					},
+				}
+				if err := stream.Send(req); err != nil {
+					return fmt.Errorf("could not send audio: %w", err)
+				}
+			}
+			if readErr == io.EOF {
+				// Nothing else to pipe, close the stream.
+				if err := stream.CloseSend(); err != nil {
+					return fmt.Errorf("could not close stream: %w", err)
+				}
+				return nil
+			}
+			if readErr != nil {
+				// Retrying a failed read could spin forever, so give up.
+				return fmt.Errorf("could not read from %s: %w", audioFile, readErr)
+			}
+		}
+	}
+
+	// Buffered so the sender can always deliver its result and exit.
+	sendDone := make(chan error, 1)
+	go func() {
+		err := sendAudio()
+		if err != nil {
+			// Abort the stream so the receive loop below does not wait for
+			// results that will never come.
+			cancel()
+		}
+		sendDone <- err
+	}()
+
+	for {
+		resp, err := stream.Recv()
+		if err == io.EOF {
+			break
+		}
+		if err != nil {
+			// Stop the sender and wait for it, so the file is not closed
+			// while it is still being read.
+			cancel()
+			if sendErr := <-sendDone; sendErr != nil {
+				return fmt.Errorf("cannot stream results: %w (sender: %v)", err, sendErr)
+			}
+			return fmt.Errorf("cannot stream results: %w", err)
+		}
+		for i, result := range resp.Results {
+			fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20))
+			fmt.Fprintf(w, "Result %d\n", i+1)
+			for j, alternative := range result.Alternatives {
+				// The transcript goes to w only; it is not also logged.
+				fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript)
+			}
+		}
+	}
+
+	// Surface any error the sender hit instead of dropping it.
+	return <-sendDone
+}
+
+// [END speech_transcribe_streaming_v2_explicit_decoding]
diff --git a/speech/snippets/transcribe_streaming_v2_explicit_decoding_test.go b/speech/snippets/transcribe_streaming_v2_explicit_decoding_test.go
@@ -0,0 +1,38 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package snippets
+
+import (
+	"bytes"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/GoogleCloudPlatform/golang-samples/internal/testutil"
+)
+
+// recognitionAudioFileRawLINEAR16 is the path of the raw audio fixture that
+// is passed to the explicit-decoding streaming sample.
+var recognitionAudioFileRawLINEAR16 = "../testdata/audio.raw"
+
+// TestTranscribeStreamingV2SpecificDecoding runs the explicit-decoding
+// streaming sample against the raw fixture and checks that the transcript
+// written to the buffer contains the expected phrase.
+func TestTranscribeStreamingV2SpecificDecoding(t *testing.T) {
+	testutil.SystemTest(t)
+	projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID")
+
+	// A single constant keeps the assertion and its failure message in sync.
+	const want = "Brooklyn Bridge"
+
+	var buf bytes.Buffer
+	if err := transcribeStreamingSpecificDecodingV2(&buf, projectID, recognitionAudioFileRawLINEAR16); err != nil {
+		t.Fatalf("transcribeStreamingSpecificDecodingV2: %v", err)
+	}
+	if got := buf.String(); !strings.Contains(got, want) {
+		t.Errorf("transcribeStreamingSpecificDecodingV2 got %q, want it to contain %q", got, want)
+	}
+}
diff --git a/speech/snippets/transcribe_streaming_v2_test.go b/speech/snippets/transcribe_streaming_v2_test.go
@@ -0,0 +1,36 @@
+// Copyright 2024 Google LLC
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     https://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package snippets
+
+import (
+	"bytes"
+	"os"
+	"strings"
+	"testing"
+
+	"github.com/GoogleCloudPlatform/golang-samples/internal/testutil"
+)
+
+// TestTranscribeStreamingV2 runs the auto-decoding streaming sample against
+// recognitionAudioFile and checks that the transcript written to the buffer
+// contains the expected phrase.
+func TestTranscribeStreamingV2(t *testing.T) {
+	testutil.SystemTest(t)
+	projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID")
+
+	// A single constant keeps the assertion and its failure message in sync.
+	const want = "Chromecast"
+
+	var buf bytes.Buffer
+	if err := transcribeStreamingV2(&buf, projectID, recognitionAudioFile); err != nil {
+		t.Fatalf("transcribeStreamingV2: %v", err)
+	}
+	if got := buf.String(); !strings.Contains(got, want) {
+		t.Errorf("transcribeStreamingV2 got %q, want it to contain %q", got, want)
+	}
+}