## Audio files preview 
1. All audio files are loaded from Google Cloud Storage
2. Please forgive me if my Cantonese does not sound native.

In [109]:
import IPython
import librosa
import soundfile as sf
import io
from google.cloud import storage

BUCKET = 'cantonese_speech_voice'

# Create a Cloud Storage client.
gcs = storage.Client()

# Get the bucket that the audio file is downloaded from.
bucket = gcs.get_bucket(BUCKET)

# Name of the object to preview.
file_name = '1.wav'

# Download the object's content into memory.
blob = bucket.blob(file_name)
file_as_string = blob.download_as_string()

# Wrap the downloaded bytes in a file-like object and decode them into
# audio samples plus the file's sample rate.
data, sample_rate = sf.read(io.BytesIO(file_as_string))

# sf.read returns a 1-D array for single-channel files and a 2-D
# (frames x channels) array otherwise, so only index the channel axis
# when it exists. Column zero is assumed to be the left channel.
left_channel = data if data.ndim == 1 else data[:, 0]

# Enable the play button in the notebook.
print("Display an audio example:")
IPython.display.Audio(left_channel, rate=sample_rate)

Display an audio example:


# Speech to Text

In [88]:
from google.cloud import speech_v1p1beta1 as speech
import os

# Tell the Google client libraries which .json credential file to use.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'rich-suprstate-311107-b521ee51d373.json'

# Speech client used by the transcription code in this notebook.
client = speech.SpeechClient()

# Short alias for the audio-encoding enum.
encoding = speech.enums.RecognitionConfig.AudioEncoding

In [89]:
def transcribe2_gcs(gcs_uri, sample_rate_hertz=44100, audio_channel_count=2,
                    language_code='yue-Hant-HK', timeout=36000):
    """Transcribe a LINEAR16 audio file stored in Cloud Storage.

    Submits a long-running recognition request through the notebook-level
    ``client`` and blocks until the operation finishes.

    Args:
        gcs_uri: ``gs://`` URI of the audio file to transcribe.
        sample_rate_hertz: Sample rate passed to the recognition config.
        audio_channel_count: Number of channels in the audio file.
        language_code: Language code passed to the recognition config.
        timeout: Maximum number of seconds to wait for the operation result.

    Returns:
        The value returned by the operation's ``result()`` call; callers
        read the per-segment transcriptions from its ``results`` attribute.
    """
    audio = speech.types.RecognitionAudio(uri=gcs_uri)
    config = speech.types.RecognitionConfig(
        encoding=speech.enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=sample_rate_hertz,
        audio_channel_count=audio_channel_count,  # must match the audio file
        # Only one speaker, so there is no need to separate speech per channel.
        enable_separate_recognition_per_channel=False,
        language_code=language_code)

    # Keep the operation handle and its result under distinct names.
    operation = client.long_running_recognize(config, audio)
    print('Waiting for operation to complete...')
    return operation.result(timeout=timeout)

## Transcribe in batch

In [94]:
# One entry per audio file, in file order. Both lists always grow together so
# they can be placed side by side in a table afterwards.
Text_list = []
Confidence_list = []
for i in range(1, 21):
    gcs_uri = "gs://cantonese_speech_voice/" + str(i) + ".wav"
    res = transcribe2_gcs(gcs_uri)

    # Collect every recognised segment of this file instead of relying on the
    # loop variable after the loop, which would be undefined (or left over
    # from the previous file) when the response contains no results.
    transcripts = []
    confidences = []
    for result in res.results:
        best = result.alternatives[0]
        print('Transcript: {}'.format(best.transcript))
        print('confidence: {:.3f}'.format(best.confidence))
        print()
        transcripts.append(best.transcript)
        confidences.append(best.confidence)

    if transcripts:
        # Single-segment files yield exactly the segment's own values;
        # multi-segment files get the joined text and the mean confidence.
        Text_list.append(''.join(transcripts))
        Confidence_list.append('{:.3f}'.format(sum(confidences) / len(confidences)))
    else:
        # Nothing was recognised: record blanks so the row for this file
        # still exists and stays aligned with its audio number.
        Text_list.append('')
        Confidence_list.append('')

Waiting for operation to complete...
Transcript: 係呀但我得唔得閒呀可唔可以遲啲再打畀我吖
confidence: 0.939

Waiting for operation to complete...
Transcript: 我唔需要藉錢啦唔好再打畀我啦
confidence: 0.939

Waiting for operation to complete...
Transcript: 我唔係呀你打錯呀
confidence: 0.950

Waiting for operation to complete...
Transcript: 我唔鍾意
confidence: 0.951

Waiting for operation to complete...
Transcript: 冇興趣拜拜
confidence: 0.929

Waiting for operation to complete...
Transcript: 唔好再打嚟
confidence: 0.951

Waiting for operation to complete...
Transcript: 我有興趣但我宜家已經買咗好多保險呀但是唔需要再買
confidence: 0.938

Waiting for operation to complete...
Transcript: 我鍾意你哋嘅party但點解你哋畀人哋貴咁多呀
confidence: 0.891

Waiting for operation to complete...
Transcript: 你呢隻prada係平但我覺得cover仲唔夠多囉有冇好啲嘅plan呀
confidence: 0.935

Waiting for operation to complete...
Transcript: 不如你send啲資料畀我睇下啦
confidence: 0.932

Waiting for operation to complete...
Transcript: 我唔係跟你哋買我有自己嘅agent
confidence: 0.951

Waiting for operation to complete...
Transcript: 我唔相信你哋
confidence: 0.93

In [111]:
import pandas as pd
# Number the rows 1..N from the collected results rather than a hard-coded
# range, so the column lengths always agree with the transcription lists.
Text_df = pd.DataFrame({'Audio_Number': list(range(1, len(Text_list) + 1)),
                        'Text_GoogleAPI': Text_list,
                        'Confidence': Confidence_list,
                        })
Text_df.to_excel("Speech_to_Text_Result.xlsx")

# Only the last expression of a cell is displayed, so print the shape
# explicitly before showing the preview.
print(Text_df.shape)
Text_df.head()

Unnamed: 0,Audio_Number,Confidence,Text_GoogleAPI
0,1,0.939,係呀但我得唔得閒呀可唔可以遲啲再打畀我吖
1,2,0.939,我唔需要藉錢啦唔好再打畀我啦
2,3,0.95,我唔係呀你打錯呀
3,4,0.951,我唔鍾意
4,5,0.929,冇興趣拜拜


# Appendix

In [74]:
# Load file from local drive (instead of Cloud Storage) and then use Google API
from google.cloud import speech_v1p1beta1 as speech
import os
# Tell the Google client libraries which .json credential file to use.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'rich-suprstate-311107-b521ee51d373.json'

client = speech.SpeechClient()
encoding = speech.enums.RecognitionConfig.AudioEncoding

# Path of the local audio file. Named `audio_path` rather than `input` so the
# builtin input() is not shadowed for the rest of the kernel session.
audio_path = "stt_success/audio.raw"

config = {
    "language_code": 'en-US',
    "encoding": encoding.LINEAR16,
    "sample_rate_hertz": 22050,
    "speech_contexts": []
}

# Send the file's bytes inline with the request instead of a gs:// URI.
with open(audio_path, "rb") as f:
    audio = {"content": f.read()}

response = client.recognize(config, audio)

# Show the first alternative of every recognised segment.
for result in response.results:
    print('Transcript: {}'.format(result.alternatives[0].transcript))
    print('confidence: {:.3f}'.format(result.alternatives[0].confidence))

Transcript: how old is the Brooklyn Bridge
confidence: 0.959
