## Load libs

In [1]:
import speech_recognition as sr
from os import path
import re
import pydub
from pydub import AudioSegment
from pydub.silence import split_on_silence, detect_nonsilent, detect_silence
from pydub.playback import play

# Relative path of the FLAC recording that the notebook loads and transcribes.
PATH_AUDIO = "./October4thGoogleEvent_short.flac"

## Functions

In [2]:
def open_audio_pydub(path_file, audio_format="flac"):
    """
    Open a given audio/video file using the pydub lib.

    Parameters
    ----------
    path_file : str
        Path of the file to read.
    audio_format : str, optional
        Value passed to pydub as the `format` argument. Defaults to "flac",
        so existing single-argument calls behave as before.

    Returns
    -------
    The object returned by `AudioSegment.from_file` for this file.
    """
    audio = AudioSegment.from_file(path_file, format=audio_format)
    print('Reading audio file... done')
    return audio


def speech_to_text(path_file):
    """
    Transcribe the speech in an audio file to text.

    The whole file is read with `speech_recognition` and sent to Google's
    speech-to-text service.

    Parameters
    ----------
    path_file : str
        Path of the audio file to transcribe.

    Returns
    -------
    The recognised text, or None when the audio could not be understood
    or the service request failed (a message is printed in both cases).
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(path_file) as source:
        recording = recognizer.record(source)

    try:
        # for testing purposes, we're just using the default API key
        # to use another API key, use
        # `recognizer.recognize_google(recording, key="GOOGLE_SPEECH_RECOGNITION_API_KEY")`
        return recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        print("Google Speech Recognition could not understand audio")
    except sr.RequestError as err:
        print("Could not request results from Google Speech Recognition service; {0}".format(err))

    # Reached only after one of the failures above was reported.
    return None


def count_phrase(phrase, text):
    """
    Count occurrences of a certain phrase in a string.

    `phrase` is used as a regular-expression pattern, so regex
    metacharacters in it keep their special meaning.

    Returns the number of non-overlapping matches of `phrase` in `text`.
    """
    # Tally the matches one by one instead of materialising them in a list.
    return sum(1 for _ in re.finditer(phrase, text))

## Read audio file 

In [3]:
# Load the recording once; later cells slice this object into chunks.
audio = open_audio_pydub(PATH_AUDIO)
# Bare expression so the notebook shows the loaded object as the cell output.
audio

Reading audio file... done


## Split audio file into digestible chunks, transcribe each chunk

In [10]:
# Chunk length and the amount by which each chunk is started early, both in ms.
length_segment = 15 * 1000
overlap = 250  # presumably so a word cut at a chunk boundary is still captured
length_total = audio.duration_seconds
total_ms = int(length_total * 1000)

# One (start, end) pair in milliseconds per chunk. Stepping through the actual
# duration avoids scheduling an extra chunk that would contain only the overlap
# when the duration is an exact multiple of the chunk length.
segments = [(start, start + length_segment)
            for start in range(0, total_ms, length_segment)]

# For each chunk: convert to .wav and transcribe text
transcripts = []
for start, end in segments:
    print('Transcribing chunk {:.0f}s-{:.0f}s of total {:.0f} seconds...'.format(start/1000, end/1000, length_total))
    chunk = audio[max(0, start - overlap):end]
    chunk.export('tmp.wav', format="wav")

    text = speech_to_text('tmp.wav')
    if text is None:
        # speech_to_text returns None (after printing why) when a chunk cannot
        # be transcribed; skip it instead of failing on `None + ' '`.
        continue
    transcripts.append(text)

complete_text = ' '.join(transcripts)
complete_text

Transcribing chunk 0-15s of total 136 seconds...
Transcribing chunk 15-30s of total 136 seconds...
Transcribing chunk 30-45s of total 136 seconds...
Transcribing chunk 45-60s of total 136 seconds...
Transcribing chunk 60-75s of total 136 seconds...
Transcribing chunk 75-90s of total 136 seconds...
Transcribing chunk 90-105s of total 136 seconds...
Transcribing chunk 105-120s of total 136 seconds...
Transcribing chunk 120-135s of total 136 seconds...
Transcribing chunk 135-150s of total 136 seconds...


"we are going to use machine learning we understand location data we try to understand patio cleaner and the colour shows the density of parking and we can analyse it throughout the day and predict parking difficulty and then Google Google Maps give you options a simple example but it's the kind of everyday use case which we are using machine learning to make a difference the best example I can think of what it before is Google translation I literally remember many years ago Adam translation in Chrome and making it automatic so if you land in a page different from your language we do that for you fast forward to today with the power of machine learning and neural machine cancellation visa or 2 million translations in many many languages every single day commit shows the power of staying at a problem constantly using computer science to make it better and seeing users respond to descale this is why we are excited about the shift from a mobile first to AI first well it's not just about a

## Count occurrences of phrases

In [11]:
# Patterns to look for in the transcript (passed to count_phrase as given).
phrases = [
    r'machine learning',
    r' ai first',
    r'mobile first',
]

# The patterns are lower-case, so lower-case the transcript before matching.
complete_text = complete_text.lower()

report_line = 'Found {} occurences of phrase: "{}"'
for pattern in phrases:
    print(report_line.format(count_phrase(pattern, complete_text), pattern))

Found 4 occurences of phrase: "machine learning"
Found 1 occurences of phrase: " ai first"
Found 1 occurences of phrase: "mobile first"


## Some testing with pydub's detect_silence and pause functionality

In [12]:
# Only the first 20000 ms of the recording are examined.
excerpt = audio[0:20000]
chunks_on_pause = detect_silence(excerpt, min_silence_len=100, silence_thresh=-25)

# Each detected range is played back. The excerpt starts at offset 0, so the
# range bounds address the same audio as they would in the full recording.
for pause_start, pause_end in chunks_on_pause:
    print('play new chunk')
    play(excerpt[pause_start:pause_end])

play new chunk
play new chunk
