In [1]:
%pip install google-cloud-speech
# For a local (non-cloud) setup, first follow this codelab: https://codelabs.developers.google.com/codelabs/cloud-speech-text-python3#0
# Then set up Application Default Credentials: https://cloud.google.com/docs/authentication/provide-credentials-adc#how-to

Note: you may need to restart the kernel to use updated packages.


In [2]:
from google.cloud import speech


def speech_to_text(
    config: speech.RecognitionConfig,
    audio: speech.RecognitionAudio,
) -> speech.RecognizeResponse:
    """Send one synchronous recognition request and hand back the response.

    Args:
        config: Recognition settings, forwarded to the client unchanged.
        audio: The audio to transcribe, forwarded to the client unchanged.

    Returns:
        The value returned by ``SpeechClient.recognize`` for this request.
    """
    # A fresh client is created on every call; the request itself is blocking.
    return speech.SpeechClient().recognize(config=config, audio=audio)


def print_response(response: speech.RecognizeResponse):
    """Print every entry of ``response.results``, in order.

    Each entry is formatted and written out by ``print_result``.
    """
    for recognition_result in response.results:
        print_result(recognition_result)


def print_result(result: speech.SpeechRecognitionResult):
    """Print the first alternative of one recognition result with word timings.

    Output is a separator line, the result's language code, the transcript and
    confidence of the first alternative, a second separator, and then one
    ``start | end | word`` row for each entry in that alternative's ``words``.

    Args:
        result: A recognition result exposing ``language_code`` and
            ``alternatives``. The first alternative is treated as the best one.

    If ``result.alternatives`` is empty, the language code is still printed,
    followed by a placeholder transcript line, instead of raising
    ``IndexError``.
    """
    separator = "-" * 80
    print(separator)
    print(f"language_code: {result.language_code}")

    # Guard the [0] lookup below: an empty alternatives list would raise.
    if not result.alternatives:
        print("transcript:    <no alternatives returned>")
        print(separator)
        return

    best_alternative = result.alternatives[0]
    print(f"transcript:    {best_alternative.transcript}")
    print(f"confidence:    {best_alternative.confidence:.0%}")
    print(separator)
    for word in best_alternative.words:
        # start_time / end_time must expose total_seconds() (timedelta-like).
        start_s = word.start_time.total_seconds()
        end_s = word.end_time.total_seconds()
        print(f"{start_s:>7.3f} | {end_s:>7.3f} | {word.word}")

# The sample clip is referenced by its gs:// URI instead of being sent inline.
audio = speech.RecognitionAudio(uri="gs://cloud-samples-data/speech/brooklyn_bridge.flac")

# Word time offsets are requested so print_result has per-word timings to list.
config = speech.RecognitionConfig(
    enable_word_time_offsets=True,
    enable_automatic_punctuation=True,
    language_code="en",
)

# Transcribe the clip, then print every result in the response.
response = speech_to_text(config=config, audio=audio)
print_response(response)

In [3]:
def transcribe_model_selection(
    speech_file: str,
    model: str,
    encoding: speech.RecognitionConfig.AudioEncoding = speech.RecognitionConfig.AudioEncoding.AMR,
    sample_rate_hertz: int = 8000,
) -> speech.RecognizeResponse:
    """Transcribe the given audio file synchronously with the selected model.

    The file is read fully into memory and sent inline with the request. The
    first alternative of each result is printed, and the response is returned.

    Args:
        speech_file: Path of the local audio file to read (opened in binary mode).
        model: Name of the recognition model, passed through to
            ``RecognitionConfig(model=...)``.
        encoding: Audio encoding of ``speech_file``. Defaults to ``AMR``, the
            encoding this function has always used.
        sample_rate_hertz: Sample rate of ``speech_file``. Defaults to 8000,
            because the Speech-to-Text reference states that AMR audio must
            use 8000 Hz (the previous hard-coded 44100 was invalid with AMR).
            Pass the real rate together with a matching ``encoding`` for other
            formats.

    Returns:
        The response returned by ``SpeechClient.recognize``.

    Raises:
        OSError: If ``speech_file`` cannot be opened or read.
    """
    client = speech.SpeechClient()

    with open(speech_file, "rb") as audio_file:
        content = audio_file.read()

    audio = speech.RecognitionAudio(content=content)

    config = speech.RecognitionConfig(
        encoding=encoding,
        sample_rate_hertz=sample_rate_hertz,
        language_code="en-US",
        model=model,
    )

    response = client.recognize(config=config, audio=audio)

    for i, result in enumerate(response.results):
        print("-" * 20)
        # Guard the [0] lookup: an empty alternatives list would raise IndexError.
        if not result.alternatives:
            print(f"Result {i} has no alternatives")
            continue
        alternative = result.alternatives[0]
        print(f"First alternative of result {i}")
        print(f"Transcript: {alternative.transcript}")

    return response


In [10]:
# Needs the openai-whisper package: %pip install -U openai-whisper
import whisper

# Load the "base" checkpoint, then transcribe the recording.
# NOTE(review): the path below is absolute and machine-specific.
model = whisper.load_model("base")
result = model.transcribe("/Users/sanjivjha/Desktop/IMG_9993.MOV")

# One line per segment: its "start" value followed by its "text".
for segment in result["segments"]:
    print(f"start: {segment['start']} text: {segment['text']}")



start: 0.0 text:  I need to tell you where that should be.
start: 2.0 text:  Go on, call the laptop.
start: 14.0 text:  Now can you read the book?
start: 18.0 text:  I need to go.
start: 24.0 text:  Can you drink from the bottle?
start: 30.0 text:  What is?
start: 32.0 text:  Can you go to that chair?
start: 54.0 text:  Hold this.
start: 56.0 text:  Hold both hands.
start: 58.0 text:  I can't see you.
start: 60.0 text:  Can you take the other computer now?
start: 62.0 text:  Because now we only have one computer.
