In [7]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading SpeechRecognition-3.8.1-py2.py3-none-any.whl (32.8 MB)
[K     |████████████████████████████████| 32.8 MB 1.4 MB/s 
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.8.1


In [8]:
# Import Speech Recognition Package
import speech_recognition as spr


In [9]:
# Validate the installation
spr.__version__

'3.8.1'

In [10]:
recog = spr.Recognizer()

In [11]:
recog.recognize_google()

TypeError: ignored

# **Convert Speech to Text**

In [13]:
speech = spr.AudioFile('/content/drive/MyDrive/NLP/OSR_us_000_0010_8k.wav')
with speech as filesource:
    audio = recog.record(filesource)


In [14]:
recog.recognize_google(audio)

"Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of the well these days a chicken leg of a variegated price is often served in Randall's the juice of lemons makes find the boxes on the side the pump truck the ha grimstead top corn and garbage for hours of City Works in a large size and stockings and hard to sell"

# **Convert Speech to Text - Capture only particular segments of audio suing offset and duration**

In [15]:
with speech as filesource:
    audio = recog.record(filesource, duration = 5)
recog.recognize_google(audio)

'the Birch canoe slid on the smooth planks'

In [18]:
# Capture multiple portions of speech one after another
with speech as filesource:
    audio_1 = recog.record(filesource, duration=5)
    audio_2 = recog.record(filesource, duration=5)
print(recog.recognize_google(audio_1))
print(recog.recognize_google(audio_2))

the Birch canoe slid on the smooth planks
dog food background it is easy to tell the depth of a well


In [20]:
# Capturing second portion of the speech using an offset argument
with speech as filesource:
    audio = recog.record(filesource, offset=5, duration=7)
recog.recognize_google(audio)

'62 dog food background it is easy to tell the depth of the well'

# **Convert Speech to Text - Effect of Noise**

In [21]:
noisyspeech = spr.AudioFile('/content/drive/MyDrive/NLP/OSR_us_000_0010_8k.wav')

with noisyspeech as noisesource:
    audio = recog.record(noisesource)

recog.recognize_google(audio)

"Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of the well these days a chicken leg of a variegated price is often served in Randall's the juice of lemons makes find the boxes on the side the pump truck the ha grimstead top corn and garbage for hours of City Works in a large size and stockings and hard to sell"

In [24]:
with noisyspeech as noisesource:
    recog.adjust_for_ambient_noise(noisesource)
recog.recognize_google(audio)

"Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of the well these days a chicken leg of a variegated price is often served in Randall's the juice of lemons makes find the boxes on the side the pump truck the ha grimstead top corn and garbage for hours of City Works in a large size and stockings and hard to sell"

In [25]:
recog.recognize_google(audio, show_all=True)

{'alternative': [{'confidence': 0.7867201,
   'transcript': "Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of the well these days a chicken leg of a variegated price is often served in Randall's the juice of lemons makes find the boxes on the side the pump truck the ha grimstead top corn and garbage for hours of City Works in a large size and stockings and hard to sell"},
  {'transcript': "Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of the well these day the chicken leg of a variegated price is often served in Randall's the juice of lemons makes find the boxes down beside the pump truck the ha grimstead top corn and garbage for hours of City Works in a large size and stockings and hard to sell"},
  {'transcript': 'Birch canoe slid on the smooth plank glue the sea to a dark blue background it is easy to tell the depth of the well these days a chicken leg of a variegate

# **Convert Speech to Text in Real Time using Microphone**

In [36]:
!apt install libasound2-dev portaudio19-dev libportaudio2 libportaudiocpp0 ffmpeg

Reading package lists... Done
Building dependency tree       
Reading state information... Done
libasound2-dev is already the newest version (1.1.3-5ubuntu0.6).
ffmpeg is already the newest version (7:3.4.8-0ubuntu0.2).
Suggested packages:
  portaudio19-doc
The following NEW packages will be installed:
  libportaudio2 libportaudiocpp0 portaudio19-dev
0 upgraded, 3 newly installed, 0 to remove and 37 not upgraded.
Need to get 184 kB of archives.
After this operation, 891 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudio2 amd64 19.6.0-1 [64.6 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libportaudiocpp0 amd64 19.6.0-1 [15.1 kB]
Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 portaudio19-dev amd64 19.6.0-1 [104 kB]
Fetched 184 kB in 1s (247 kB/s)
Selecting previously unselected package libportaudio2:amd64.
(Reading database ... 155222 files and directories currently installed.)
Preparing to 

In [37]:
pip install PyAudio

Collecting PyAudio
  Using cached PyAudio-0.2.11.tar.gz (37 kB)
Building wheels for collected packages: PyAudio
  Building wheel for PyAudio (setup.py) ... [?25l[?25hdone
  Created wheel for PyAudio: filename=PyAudio-0.2.11-cp37-cp37m-linux_x86_64.whl size=52602 sha256=bd7191f5138a4879c16945f613a3116c4c69f2bd5cd8a47cd1bd92da7cd31734
  Stored in directory: /root/.cache/pip/wheels/40/2e/4c/b71e7e96c861a46e6213bc6bb482b94dcf293a92c5e736c1ec
Successfully built PyAudio
Installing collected packages: PyAudio
Successfully installed PyAudio-0.2.11


In [39]:
# all imports
from io import BytesIO
from base64 import b64decode
from google.colab import output
from IPython.display import Javascript

RECORD = """
const sleep  = time => new Promise(resolve => setTimeout(resolve, time))
const b2text = blob => new Promise(resolve => {
  const reader = new FileReader()
  reader.onloadend = e => resolve(e.srcElement.result)
  reader.readAsDataURL(blob)
})
var record = time => new Promise(async resolve => {
  stream = await navigator.mediaDevices.getUserMedia({ audio: true })
  recorder = new MediaRecorder(stream)
  chunks = []
  recorder.ondataavailable = e => chunks.push(e.data)
  recorder.start()
  await sleep(time)
  recorder.onstop = async ()=>{
    blob = new Blob(chunks)
    text = await b2text(blob)
    resolve(text)
  }
  recorder.stop()
})
"""

def record(sec=3):
  print("Speak Now...")
  display(Javascript(RECORD))
  sec += 1
  s = output.eval_js('record(%d)' % (sec*1000))
  print("Done Recording !")
  b = b64decode(s.split(',')[1])
  return b #byte stream

In [40]:
audio = record(5)

Speak Now...


<IPython.core.display.Javascript object>

Done Recording !


In [41]:
type(audio)

bytes

In [42]:
import IPython.display as ipd
ipd.display(ipd.Audio(audio))

In [43]:
mc = spr.Microphone()

OSError: ignored