In [None]:
# Copyright 2025 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Get started with Chirp 3 Transcription

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Faudio%2Fspeech%2Fgetting-started%2Fget_started_with_chirp_3_transcription.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb">
      <img width="32px" src="https://www.svgrepo.com/download/217753/github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/5a/X_icon_2.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/audio/speech/getting-started/get_started_with_chirp_3_transcription.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>            

| Author |
| --- |
| [Katie Nguyen](https://github.com/katiemn) |

## Overview

### Chirp 3

This notebook introduces [Chirp 3](https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model), Google's model for converting speech to text in multiple languages.

In this tutorial, you'll learn how to use the Speech-to-Text API V2 to:

- Transcribe an audio file with batch speech recognition
- Perform a language-agnostic transcription
- Use Chirp 3 for speaker diarization
- Perform streaming speech recognition

## Get started

### Install the Speech SDK and other required packages


In [None]:
# Install/upgrade the Speech client library and ipywebrtc (which provides the
# AudioRecorder and CameraStream widgets imported below).
%pip install --upgrade --quiet google-cloud-speech ipywebrtc

### Authenticate your notebook environment (Colab only)

If you're running this notebook on Google Colab, run the cell below to authenticate your environment.

In [None]:
import sys

# Run the Colab authentication flow only when the google.colab module is
# loaded; in any other environment this cell does nothing.
if "google.colab" in sys.modules:
    from google.colab import auth

    auth.authenticate_user()

### Import libraries

In [None]:
import json
import re
from typing import Iterator

from google.api_core.client_options import ClientOptions
from google.cloud.speech_v2 import SpeechClient
from google.cloud.speech_v2.types import cloud_speech
from IPython.display import Audio, HTML, display
from ipywebrtc import AudioRecorder, CameraStream

### Set Google Cloud project information

To get started using the Speech-to-Text API, you must have an existing Google Cloud project and [enable the API](https://console.cloud.google.com/flows/enableapi?apiid=speech.googleapis.com).

Learn more about [setting up a project and a development environment](https://cloud.google.com/vertex-ai/docs/start/cloud-environment).

Chirp 3 is only available in certain regions. See the [documentation](https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model#regional_availability) for the list of **available regions**.

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]", isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    # Fall back to "" instead of str(None): the literal string "None" would
    # otherwise be passed on as if it were a real project ID.
    PROJECT_ID = os.environ.get("GOOGLE_CLOUD_PROJECT", "")

if not PROJECT_ID:
    # Make the misconfiguration visible here rather than in a later API error.
    print(
        "WARNING: PROJECT_ID is not set. Enter your project ID above or set "
        "the GOOGLE_CLOUD_PROJECT environment variable, then re-run this cell."
    )

# Location used for both the API endpoint and the recognizer resource path.
STT_LOCATION = "us"  # @param {type: "string"}

In [None]:
# Point the gcloud CLI at the project chosen above.
! gcloud config set project {PROJECT_ID}
# Log in to create Application Default Credentials (-q is gcloud's quiet flag).
! gcloud auth application-default login -q
# Use the same project as the quota project for those credentials.
! gcloud auth application-default set-quota-project {PROJECT_ID}


### Create client

Set the regional API endpoint and create the Speech-to-Text client.


In [None]:
# Create the Speech-to-Text client against the endpoint for STT_LOCATION,
# i.e. the host "<STT_LOCATION>-speech.googleapis.com".
client = SpeechClient(
    client_options=ClientOptions(api_endpoint=f"{STT_LOCATION}-speech.googleapis.com"))

### Define helper functions

In [None]:
def generate_audio_chunks(audio_content: bytes, chunk_size: int):
    """Yield consecutive slices of ``audio_content``, each at most ``chunk_size`` bytes.

    The final slice is shorter when the content length is not a multiple of
    ``chunk_size``; empty content yields nothing.
    """
    total_bytes = len(audio_content)
    yield from (
        audio_content[offset : offset + chunk_size]
        for offset in range(0, total_bytes, chunk_size)
    )


def _find_first_words_list(node):
    """Return the first list stored under a "words" key, searching depth-first in document order; None if absent."""
    if isinstance(node, dict):
        for key, value in node.items():
            if key == "words" and isinstance(value, list):
                return value
            found = _find_first_words_list(value)
            if found is not None:
                return found
    elif isinstance(node, list):
        for element in node:
            found = _find_first_words_list(element)
            if found is not None:
                return found
    return None


def group_utterances_by_speaker_from_file(json_file_path: str):
    """Group the words of a transcript JSON file into per-speaker utterances.

    The file is parsed as JSON (rather than scanned with a regular expression,
    which breaks as soon as a word contains "]"). The first ``"words"`` array
    found in document order is used, and consecutive words sharing the same
    ``speakerLabel`` are joined with single spaces into one utterance.

    NOTE(review): only the first ``"words"`` array is read, as before. If a
    transcript holds several results, each with its own array, the later ones
    are ignored — confirm whether they should be merged.

    Args:
        json_file_path: Path to a UTF-8 encoded JSON transcript file.

    Returns:
        ``{"dialogue": [{"speaker": <label>, "text": <utterance>}, ...]}``.
        The list is empty when the file has no ``"words"`` array or the
        array is empty.

    Raises:
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If a word entry lacks ``"word"`` or ``"speakerLabel"``.
    """
    # Function-scope import so this helper needs nothing beyond the notebook's existing imports.
    from itertools import groupby

    with open(json_file_path, "r", encoding="utf-8") as f:
        transcript_json = json.load(f)

    # No "words" array at all is treated the same as an empty one.
    words_list = _find_first_words_list(transcript_json) or []

    # groupby only merges *adjacent* items, so a speaker who talks again later
    # gets a separate utterance, preserving the order of the conversation.
    dialogue = [
        {"speaker": speaker, "text": " ".join(item["word"] for item in items)}
        for speaker, items in groupby(words_list, key=lambda item: item["speakerLabel"])
    ]

    return {"dialogue": dialogue}

## Transcribe using Chirp 3


### Batch speech recognition

For this first request, you'll use the `batch_recognize` method to transcribe an audio file in Cloud Storage. Run the following cell to play the audio you'll be transcribing. If you'd like to use a different audio clip, modify the `audio_url` and `audio_gcs_uri` variables below.

In [None]:
# HTTPS URL of the sample clip, used by the in-notebook audio player below.
audio_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/audio/audio_summary_clean_energy.mp3"

# Cloud Storage URI of the clip, used as the input file of the batch request.
audio_gcs_uri = "gs://cloud-samples-data/generative-ai/audio/audio_summary_clean_energy.mp3"

display(Audio(url=audio_url))

Now, you'll send the `batch_recognize` request. The transcription will be returned as part of the response and displayed in HTML for better visualization in this notebook.

In [None]:
# Basis for the batch operation timeout below (8 hours, expressed in seconds).
MAX_AUDIO_LENGTH_SECS = 8 * 60 * 60

config = cloud_speech.RecognitionConfig(
    auto_decoding_config={},
    model="chirp_3",
    language_codes=["en-US"],
)

files = [cloud_speech.BatchRecognizeFileMetadata(uri=audio_gcs_uri)]

request = cloud_speech.BatchRecognizeRequest(
    recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_",
    config=config,
    files=files,
    # Return the transcript inline in the response instead of writing it to Cloud Storage.
    recognition_output_config=cloud_speech.RecognitionOutputConfig(
        inline_response_config=cloud_speech.InlineOutputConfig(),
    ),
)

operation = client.batch_recognize(request=request)
# Block until the long-running operation finishes (or the timeout elapses).
response = operation.result(timeout=3 * MAX_AUDIO_LENGTH_SECS)

# Build one <div> per result so the whole transcript is shown, not just the
# last segment. Results without alternatives have no transcript to show and
# are skipped instead of raising IndexError.
transcript_divs = [
    f'<div style="word-break: break-all;">{result.alternatives[0].transcript}</div>'
    for result in response.results[audio_gcs_uri].transcript.results
    if result.alternatives
]
styled_html = "\n".join(transcript_divs)
HTML(styled_html)

### Perform a language-agnostic transcription

In this next request, you'll perform a language-agnostic transcription. This means that Chirp 3 will automatically identify and transcribe the dominant language spoken in the audio, which is essential for multilingual applications.

In this next example, you'll use a Spanish audio clip saved in Cloud Storage. To see a full list of the languages available for transcription, check the [documentation](https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model#language_availability_for_transcription).

In [None]:
# HTTPS URL of the Spanish sample clip, used by the in-notebook audio player below.
audio_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/audio/spanish.wav"

# Cloud Storage URI of the clip, used as the input file of the batch request.
audio_gcs_uri = "gs://cloud-samples-data/generative-ai/audio/spanish.wav"

display(Audio(url=audio_url))

This request is the same as the previous one, except this time, you'll set `language_codes=["auto"]`.

In [None]:
# Basis for the batch operation timeout below (8 hours, expressed in seconds).
MAX_AUDIO_LENGTH_SECS = 8 * 60 * 60

config = cloud_speech.RecognitionConfig(
    auto_decoding_config={},
    model="chirp_3",
    # "auto" requests language-agnostic transcription instead of a fixed language.
    language_codes=["auto"],
)

files = [cloud_speech.BatchRecognizeFileMetadata(uri=audio_gcs_uri)]

request = cloud_speech.BatchRecognizeRequest(
    recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_",
    config=config,
    files=files,
    # Return the transcript inline in the response instead of writing it to Cloud Storage.
    recognition_output_config=cloud_speech.RecognitionOutputConfig(
        inline_response_config=cloud_speech.InlineOutputConfig(),
    ),
)

operation = client.batch_recognize(request=request)
# Block until the long-running operation finishes (or the timeout elapses).
response = operation.result(timeout=3 * MAX_AUDIO_LENGTH_SECS)

# Build one <div> per result so the whole transcript is shown, not just the
# last segment. Results without alternatives have no transcript to show and
# are skipped instead of raising IndexError.
transcript_divs = [
    f'<div style="word-break: break-all;">{result.alternatives[0].transcript}</div>'
    for result in response.results[audio_gcs_uri].transcript.results
    if result.alternatives
]
styled_html = "\n".join(transcript_divs)
HTML(styled_html)

### Speaker Diarization

Chirp 3 also supports speaker diarization, which means it can automatically identify the different speakers in a single-channel audio sample. See the [documentation](https://cloud.google.com/speech-to-text/v2/docs/chirp_3-model#language_availability_for_diarization) for a list of the languages supported for diarization.

In this example, you'll also save the transcription to Cloud Storage. Make sure to add the bucket where you'd like it to be saved in `gcs_output_folder` below.

In [None]:
# HTTPS URL of the sample clip, used by the in-notebook audio player below.
audio_url = "https://storage.googleapis.com/cloud-samples-data/generative-ai/audio/Chirp-3-Docs-Dive.mp3"

# Cloud Storage URI of the clip, used as the input file of the batch request.
audio_gcs_uri = "gs://cloud-samples-data/generative-ai/audio/Chirp-3-Docs-Dive.mp3"

display(Audio(url=audio_url))

# The output path of the transcription result
# Replace the "[your-bucket-path]" placeholder before running the next cell.
gcs_output_folder = "gs://[your-bucket-path]" # @param {type: "string"}

In order to enable speaker diarization, set the `diarization_config` in the `features` parameter of the `RecognitionConfig`.

You'll also set your `gcs_output_folder` in a `RecognitionOutputConfig` so the transcription will be saved in Cloud Storage. To display the transcription, you'll copy the output JSON file and use a helper function to format it.

In [None]:
# Basis for the batch operation timeout below (8 hours, expressed in seconds).
MAX_AUDIO_LENGTH_SECS = 8 * 60 * 60

config = cloud_speech.RecognitionConfig(
    auto_decoding_config={},
    # Supplying a diarization config (left at its defaults here) is what
    # enables speaker diarization for this request.
    features=cloud_speech.RecognitionFeatures(
        diarization_config=cloud_speech.SpeakerDiarizationConfig(
        ),
      ),
    model="chirp_3",
    language_codes=["en-US"],
)

# Write the transcription to Cloud Storage (gcs_output_folder) instead of
# returning it inline in the response.
output_config = cloud_speech.RecognitionOutputConfig(
    gcs_output_config=cloud_speech.GcsOutputConfig(uri=gcs_output_folder),
)

files = [cloud_speech.BatchRecognizeFileMetadata(uri=audio_gcs_uri)]

request = cloud_speech.BatchRecognizeRequest(
    recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_",
    config=config,
    files=files,
    recognition_output_config=output_config,
)
operation = client.batch_recognize(request=request)

# Block until the long-running operation finishes (or the timeout elapses).
response = operation.result(timeout=3 * MAX_AUDIO_LENGTH_SECS)
# URI of the result file written for this input file.
transcript = response.results[audio_gcs_uri].uri

# Copy the result file next to the notebook so the helper can read it.
!gsutil cp {transcript} output.json

# Print the transcript grouped into per-speaker utterances, as indented JSON.
print(json.dumps(group_utterances_by_speaker_from_file("output.json"), indent=4))

### Streaming speech recognition

In the following cells, you'll simulate transcribing speech from an audio stream. To start, run the following cell and record an audio clip with your microphone.

In [None]:
# The custom widget manager is a Colab feature. Guard on the google.colab
# module (the same check the authentication cell uses) so this cell does not
# fail with an ImportError in other environments.
if "google.colab" in sys.modules:
    from google.colab import output

    output.enable_custom_widget_manager()

# Request a microphone-only stream (no video) and attach a recorder widget to it.
camera = CameraStream(constraints={'audio': True, 'video': False})
recorder = AudioRecorder(stream=camera)
# Last expression: renders the recorder widget as the cell output.
recorder

Once the audio is captured and you've stopped recording, you'll use FFmpeg to convert the clip and save it as an MP3 file for processing.

In [None]:
with open('recording.webm', 'wb') as f:
    f.write(recorder.audio.value)

!ffmpeg -i recording.webm -vn -ar 44100 -ac 2 -f mp3 recording.mp3
audio_file = "recording.mp3"

Now, you'll read the audio file and use a helper function to split it into chunks that simulate an audio stream. You'll then call the `streaming_recognize` method with a generator function that sends the recognition config first and the audio chunks after it, and print the transcription from each response.

In [None]:
# Create a generator to send the recognition config and then the audio stream
def requests() -> Iterator[cloud_speech.StreamingRecognizeRequest]:
    """Yield the config request first, followed by the audio requests.

    The notebook-level ``request`` and ``audio_requests`` variables are read
    when the generator is iterated, not when this function is defined, so
    both must be assigned before the stream is consumed.
    """
    yield request
    yield from audio_requests


with open(audio_file, "rb") as f:
    audio_content = f.read()

# Size in bytes of each simulated streaming chunk.
CHUNK_SIZE = 3200
stream = list(generate_audio_chunks(audio_content, CHUNK_SIZE))

recognition_config = cloud_speech.RecognitionConfig(
    auto_decoding_config={},
    language_codes=["auto"],
    model="chirp_3",
)

streaming_config = cloud_speech.StreamingRecognitionConfig(
    config=recognition_config,
)

# First message of the stream: carries the recognizer and config, no audio.
request = cloud_speech.StreamingRecognizeRequest(
    recognizer=f"projects/{PROJECT_ID}/locations/{STT_LOCATION}/recognizers/_",
    streaming_config=streaming_config,
)

# Subsequent messages: one request per audio chunk, built lazily.
audio_requests = (
    cloud_speech.StreamingRecognizeRequest(audio=chunk) for chunk in stream
)

responses_iterator = client.streaming_recognize(
    requests=requests()
)
responses = []
for response in responses_iterator:
    responses.append(response)
    for result in response.results:
        # Guard against a result with no alternatives rather than raising IndexError.
        if result.alternatives:
            print(result.alternatives[0].transcript)