-
Notifications
You must be signed in to change notification settings - Fork 1.7k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(speech): Add speech StreamingRecognize samples (#3753)
* feature: add support to v2 stt apis * fix readme * refactor variables * added support for v2 apis, removed old code * updated license for new files * applied suggested fixed * fix tags for region issue * applied the suggested changes * applied the suggested changes * add support for location --------- Co-authored-by: Kodanda Rama <kodrama@google.com> Co-authored-by: Marc Dougherty <muncus@users.noreply.github.com>
- Loading branch information
1 parent
450027d
commit 743ff06
Showing
4 changed files
with
339 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
// Copyright 2024 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
// This sample streams a local audio file to the Google Cloud | ||
// Speech-to-Text v2 API and writes out the transcript. | ||
|
||
package snippets | ||
|
||
// [START speech_transcribe_streaming_v2] | ||
import ( | ||
"context" | ||
"fmt" | ||
"io" | ||
"log" | ||
"os" | ||
"path/filepath" | ||
"strings" | ||
|
||
speech "cloud.google.com/go/speech/apiv2" | ||
"cloud.google.com/go/speech/apiv2/speechpb" | ||
) | ||
|
||
func transcribeStreamingV2(w io.Writer, projectID string, path string) error { | ||
const location = "global" | ||
audioFile, err := filepath.Abs(path) | ||
if err != nil { | ||
log.Println("Failed to load file: ", path) | ||
return err | ||
} | ||
f, err := os.Open(audioFile) | ||
if err != nil { | ||
return err | ||
} | ||
defer f.Close() | ||
|
||
ctx := context.Background() | ||
|
||
client, err := speech.NewClient(ctx) | ||
if err != nil { | ||
log.Println(err) | ||
return err | ||
} | ||
stream, err := client.StreamingRecognize(ctx) | ||
if err != nil { | ||
log.Println(err) | ||
return err | ||
} | ||
// Send the initial configuration message. | ||
err = stream.Send(&speechpb.StreamingRecognizeRequest{ | ||
Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location), | ||
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{ | ||
StreamingConfig: &speechpb.StreamingRecognitionConfig{ | ||
Config: &speechpb.RecognitionConfig{ | ||
// In case of specific file encoding , so specify the decoding config. | ||
DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{}, | ||
Model: "long", | ||
LanguageCodes: []string{"en-US"}, | ||
Features: &speechpb.RecognitionFeatures{ | ||
MaxAlternatives: 2, | ||
}, | ||
}, | ||
StreamingFeatures: &speechpb.StreamingRecognitionFeatures{InterimResults: true}, | ||
}, | ||
}, | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
go func() error { | ||
buf := make([]byte, 1024) | ||
for { | ||
n, err := f.Read(buf) | ||
if n > 0 { | ||
if err := stream.Send(&speechpb.StreamingRecognizeRequest{ | ||
Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location), | ||
StreamingRequest: &speechpb.StreamingRecognizeRequest_Audio{ | ||
Audio: buf[:n], | ||
}, | ||
}); err != nil { | ||
return fmt.Errorf("could not send audio: %v", err) | ||
} | ||
} | ||
if err == io.EOF { | ||
// Nothing else to pipe, close the stream. | ||
if err := stream.CloseSend(); err != nil { | ||
return fmt.Errorf("could not close stream: %w", err) | ||
} | ||
return nil | ||
} | ||
if err != nil { | ||
log.Printf("Could not read from %s: %v", audioFile, err) | ||
continue | ||
} | ||
} | ||
}() | ||
|
||
for { | ||
resp, err := stream.Recv() | ||
if err == io.EOF { | ||
break | ||
} | ||
if err != nil { | ||
return fmt.Errorf("cannot stream results: %v", err) | ||
} | ||
for i, result := range resp.Results { | ||
fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20)) | ||
fmt.Fprintf(w, "Result %d\n", i+1) | ||
for j, alternative := range result.Alternatives { | ||
fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript) | ||
} | ||
} | ||
} | ||
return nil | ||
} | ||
|
||
// [END speech_transcribe_streaming_v2] |
137 changes: 137 additions & 0 deletions
137
speech/snippets/transcribe_streaming_v2_explicit_decoding.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,137 @@ | ||
// Copyright 2024 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
// This sample streams a local raw audio file to the Google Cloud | ||
// Speech-to-Text v2 API with an explicit decoding config and writes out the transcript. | ||
|
||
package snippets | ||
|
||
// [START speech_transcribe_streaming_v2_explicit_decoding] | ||
import ( | ||
"context" | ||
"fmt" | ||
"io" | ||
"log" | ||
"os" | ||
"path/filepath" | ||
"strings" | ||
|
||
speech "cloud.google.com/go/speech/apiv2" | ||
"cloud.google.com/go/speech/apiv2/speechpb" | ||
) | ||
|
||
func transcribeStreamingSpecificDecodingV2(w io.Writer, projectID string, path string) error { | ||
const location = "global" | ||
audioFile, err := filepath.Abs(path) | ||
if err != nil { | ||
log.Println("Failed to load file: ", path) | ||
return err | ||
} | ||
f, err := os.Open(audioFile) | ||
if err != nil { | ||
return err | ||
} | ||
defer f.Close() | ||
|
||
ctx := context.Background() | ||
|
||
client, err := speech.NewClient(ctx) | ||
if err != nil { | ||
log.Println(err) | ||
return err | ||
} | ||
stream, err := client.StreamingRecognize(ctx) | ||
if err != nil { | ||
log.Println(err) | ||
return err | ||
} | ||
// Send the initial configuration message. | ||
err = stream.Send(&speechpb.StreamingRecognizeRequest{ | ||
Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location), | ||
StreamingRequest: &speechpb.StreamingRecognizeRequest_StreamingConfig{ | ||
StreamingConfig: &speechpb.StreamingRecognitionConfig{ | ||
Config: &speechpb.RecognitionConfig{ | ||
// In case of specific file encoding , so specify the decoding config. | ||
//DecodingConfig: &speechpb.RecognitionConfig_AutoDecodingConfig{}, | ||
DecodingConfig: &speechpb.RecognitionConfig_ExplicitDecodingConfig{ | ||
ExplicitDecodingConfig: &speechpb.ExplicitDecodingConfig{ | ||
Encoding: speechpb.ExplicitDecodingConfig_LINEAR16, | ||
SampleRateHertz: 16000, | ||
AudioChannelCount: 1, | ||
}, | ||
}, | ||
Model: "long", | ||
LanguageCodes: []string{"en-US"}, | ||
Features: &speechpb.RecognitionFeatures{ | ||
MaxAlternatives: 2, | ||
}, | ||
}, | ||
StreamingFeatures: &speechpb.StreamingRecognitionFeatures{InterimResults: true}, | ||
}, | ||
}, | ||
}) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
go func() error { | ||
buf := make([]byte, 1024) | ||
for { | ||
n, err := f.Read(buf) | ||
if n > 0 { | ||
if err := stream.Send(&speechpb.StreamingRecognizeRequest{ | ||
Recognizer: fmt.Sprintf("projects/%s/locations/%s/recognizers/_", projectID, location), | ||
StreamingRequest: &speechpb.StreamingRecognizeRequest_Audio{ | ||
Audio: buf[:n], | ||
}, | ||
}); err != nil { | ||
return fmt.Errorf("could not send audio: %v", err) | ||
} | ||
} | ||
if err == io.EOF { | ||
// Nothing else to pipe, close the stream. | ||
if err := stream.CloseSend(); err != nil { | ||
return fmt.Errorf("could not close stream: %w", err) | ||
} | ||
return nil | ||
} | ||
if err != nil { | ||
log.Printf("Could not read from %s: %v", audioFile, err) | ||
continue | ||
} | ||
} | ||
}() | ||
|
||
for { | ||
resp, err := stream.Recv() | ||
if err == io.EOF { | ||
break | ||
} | ||
if err != nil { | ||
return fmt.Errorf("cannot stream results: %v", err) | ||
} | ||
for i, result := range resp.Results { | ||
fmt.Fprintf(w, "%s\n", strings.Repeat("-", 20)) | ||
fmt.Fprintf(w, "Result %d\n", i+1) | ||
for j, alternative := range result.Alternatives { | ||
log.Printf("Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript) | ||
fmt.Fprintf(w, "Alternative %d is_final: %t : %s\n", j+1, result.IsFinal, alternative.Transcript) | ||
} | ||
|
||
} | ||
} | ||
return nil | ||
} | ||
|
||
// [END speech_transcribe_streaming_v2_explicit_decoding] |
38 changes: 38 additions & 0 deletions
38
speech/snippets/transcribe_streaming_v2_explicit_decoding_test.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,38 @@ | ||
// Copyright 2024 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package snippets | ||
|
||
import ( | ||
"bytes" | ||
"os" | ||
"strings" | ||
"testing" | ||
|
||
"github.com/GoogleCloudPlatform/golang-samples/internal/testutil" | ||
) | ||
|
||
var recognitionAudioFileRawLINEAR16 = "../testdata/audio.raw" | ||
|
||
func TestTranscribeStreamingV2SpecificDecoding(t *testing.T) { | ||
testutil.SystemTest(t) | ||
projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID") | ||
var buf bytes.Buffer | ||
if err := transcribeStreamingSpecificDecodingV2(&buf, projectID, recognitionAudioFileRawLINEAR16); err != nil { | ||
t.Fatalf("error in transcribe rawfile %v", err) | ||
} | ||
if got := buf.String(); !strings.Contains(got, "Brooklyn Bridge") { | ||
t.Errorf("transcribe_streaming_v2_explicit_decoding got %q, expected %q", got, "Speaker") | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
// Copyright 2024 Google LLC | ||
// | ||
// Licensed under the Apache License, Version 2.0 (the "License"); | ||
// you may not use this file except in compliance with the License. | ||
// You may obtain a copy of the License at | ||
// | ||
// https://www.apache.org/licenses/LICENSE-2.0 | ||
// | ||
// Unless required by applicable law or agreed to in writing, software | ||
// distributed under the License is distributed on an "AS IS" BASIS, | ||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
// See the License for the specific language governing permissions and | ||
// limitations under the License. | ||
|
||
package snippets | ||
|
||
import ( | ||
"bytes" | ||
"os" | ||
"strings" | ||
"testing" | ||
|
||
"github.com/GoogleCloudPlatform/golang-samples/internal/testutil" | ||
) | ||
|
||
func TestTranscribeStreamingV2(t *testing.T) { | ||
testutil.SystemTest(t) | ||
projectID := os.Getenv("GOLANG_SAMPLES_PROJECT_ID") | ||
var buf bytes.Buffer | ||
if err := transcribeStreamingV2(&buf, projectID, recognitionAudioFile); err != nil { | ||
t.Fatalf("error in transcribe %v", err) | ||
} | ||
if got := buf.String(); !strings.Contains(got, "Chromecast") { | ||
t.Errorf("transcribe_streaming_v2 got %q, expected %q", got, "Speaker") | ||
} | ||
} |