In [1]:
import ipywidgets as widgets
from IPython.display import display
from threading import Thread
from queue import Queue

#Two queues coordinate the button callbacks with the worker threads
messages = Queue()    #non-empty while recording/transcribing should keep running
recordings = Queue()  #recorded microphone audio waiting to be transcribed

#Button that starts a recording session
record_button = widgets.Button(
    description="Record",
    tooltip='Record',
    icon="microphone",
    button_style="success",
    disabled=False,
)

#Button that ends the recording session
stop_button = widgets.Button(
    description="Stop",
    tooltip='Stop',
    icon="stop",
    button_style="warning",
    disabled=False,
)

#Area where status messages and the transcribed speech are shown
output = widgets.Output()

#Start-recording callback for the Record button
def start_recording(data):
    """Begin a recording session and launch the worker threads.

    A token is put on the `messages` queue; the workers keep running for as
    long as that queue is non-empty. One thread runs `record_microphone` and
    another runs `speech_recog` with the shared `output` widget.

    If a session is already active (the `messages` queue is non-empty) the
    click is ignored. Without this guard a second click would enqueue a second
    token and start duplicate workers.

    Args:
        data: Argument supplied by the button's click callback; not used.
    """
    with output:
        if not messages.empty():
            display("Already recording")
            return

        #The token must be queued before the workers start: both loop on "messages is not empty"
        messages.put(True)
        display("Starting...")

        record = Thread(target=record_microphone)
        record.start()

        transcribe = Thread(target=speech_recog, args=(output,))
        transcribe.start()

#Stop-recording callback for the Stop button
def stop_recording(data):
    """End the current recording session, if there is one.

    Removes every token from the `messages` queue so the worker loops, which
    run while that queue is non-empty, finish. The queue is drained without
    blocking: pressing Stop when nothing is recording returns immediately
    instead of waiting forever on an empty queue.

    Args:
        data: Argument supplied by the button's click callback; not used.
    """
    from queue import Empty

    with output:
        stopped = False
        while True:
            try:
                messages.get_nowait()
            except Empty:
                break
            stopped = True

        #Only report a stop when a session was actually running
        if stopped:
            display("Stopped recording")

#Register the click handlers: Record -> start_recording, Stop -> stop_recording
record_button.on_click(start_recording)
stop_button.on_click(stop_recording)

#Render both buttons followed by the output area that the handlers write into
display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle(), tooltip='Record')



Output()

In [2]:
#install pyaudio if you haven't
#%pip install pyaudio

In [3]:
# Install pyaudio from http://people.csail.mit.edu/hubert/pyaudio/
# List every audio device so the microphone's device index can be looked up
# (the 'index' field of the printed entry for the microphone).
import pyaudio
p = pyaudio.PyAudio()
try:
    for i in range(p.get_device_count()):
        print(p.get_device_info_by_index(i))
finally:
    # Release the PyAudio instance even if querying a device fails
    p.terminate()

{'index': 0, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Input', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 1, 'structVersion': 2, 'name': 'Microphone (Realtek(R) Audio)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 0, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'Microsoft Sound Mapper - Output', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.09, 'defaultLowOutputLatency': 0.09, 'defaultHighInputLatency': 0.18, 'defaultHighOutputLatency': 0.18, 'defaultSampleRate': 44100.0}
{'index': 3, 'structVersion': 2, 'name': 'Speakers (Realtek(R) Audio)', 'hostApi': 0, 'maxInputChannels': 

In [4]:
CHANNELS = 1 #Channel count passed to the input stream (1 = mono)
FRAME_RATE = 16000 #Sample rate in Hz; passed to both the microphone stream and the Vosk recognizer
RECORD_SECONDS = 20 #Seconds of audio to accumulate before handing a segment off for transcription
AUDIO_FORMAT = pyaudio.paInt16 #Sample format passed to the input stream (16-bit integers, per the constant's name)
SAMPLE_SIZE = 2 #Not referenced in the visible cells; presumably bytes per sample for paInt16 - TODO confirm

#Chunk = number of frames read from the microphone per stream.read() call
def record_microphone(chunk=1024, input_device_index=1):
    """Capture microphone audio and queue it for transcription.

    Reads `chunk` frames at a time for as long as the `messages` queue is
    non-empty. Each time about RECORD_SECONDS of audio has accumulated, the
    list of raw chunks is put on the `recordings` queue. When recording stops,
    any shorter remainder is queued as well, so audio captured since the last
    full segment is not thrown away.

    The stream and the PyAudio instance are released even if opening or
    reading the stream raises.

    Args:
        chunk: Frames per read; also used as the stream's frames_per_buffer.
        input_device_index: PyAudio input device index. Defaults to 1, the
            value this notebook was written with. Indices are machine-specific,
            so check the device listing for the right one.
    """
    #Number of chunks that make up one RECORD_SECONDS-long segment
    chunks_per_segment = (FRAME_RATE * RECORD_SECONDS) / chunk

    p = pyaudio.PyAudio()
    try:
        #Create a stream that connects to the microphone
        stream = p.open(format=AUDIO_FORMAT,
                        channels=CHANNELS,
                        rate=FRAME_RATE,
                        input=True,
                        input_device_index=input_device_index,
                        frames_per_buffer=chunk)
        try:
            frames = [] #Chunks recorded since the last segment was queued
            while not messages.empty():
                frames.append(stream.read(chunk))

                if len(frames) >= chunks_per_segment:
                    recordings.put(frames) #Hand the full segment to the transcriber
                    frames = []

            #Recording was stopped mid-segment: queue the tail instead of dropping it
            if frames:
                recordings.put(frames)
        finally:
            #Close the connection from pyaudio to the microphone
            stream.stop_stream()
            stream.close()
    finally:
        p.terminate()

In [5]:
#Transcribe function - Turn audio into text
#Install vosk and the other packages below if you haven't installed them yet

#%pip install vosk
#%pip install transformers
#%pip install torch

In [6]:
import subprocess #Used to call the punctuation model
import json
from vosk import Model, KaldiRecognizer

#Load the small US-English Vosk model by name
#NOTE(review): vosk is expected to fetch the model if no local copy exists - confirm
model = Model(model_name="vosk-model-small-en-us-0.15")
#Module-level recognizer used by speech_recog; built with the same FRAME_RATE the microphone stream is opened with
rec = KaldiRecognizer(model, FRAME_RATE)
#Ask for per-word detail in results (NOTE(review): presumably per-word timings/confidence - confirm against vosk docs)
rec.SetWords(True)

def speech_recog(output, poll_interval=0.5):
    """Transcribe queued recordings and append the text to `output`.

    Takes segments (lists of raw audio chunks) from the `recordings` queue,
    feeds each one to the shared Vosk recognizer `rec`, and appends the
    recognised text to the output widget, one segment per line. The raw
    recogniser text is written as-is; no punctuation is added here.

    The queue is polled with a timeout rather than a blocking get, so the loop
    ends once recording has stopped (the `messages` queue is empty) and no
    segment is left to process, instead of waiting forever.

    Args:
        output: Object with an `append_stdout(str)` method (the Output widget).
        poll_interval: Seconds to wait for a new segment before re-checking
            whether recording has been stopped.
    """
    from queue import Empty

    while True:
        try:
            frames = recordings.get(timeout=poll_interval)
        except Empty:
            #Nothing pending: finish if Stop was pressed, otherwise keep waiting
            if messages.empty():
                break
            continue

        #frames is a list of chunks; join them into one byte string for the recognizer
        rec.AcceptWaveform(b''.join(frames))
        #Result() returns a JSON string; the transcript is under the "text" key
        text = json.loads(rec.Result())["text"]

        #Skip segments with no recognised speech; newline keeps segments from running together
        if text:
            output.append_stdout(text + "\n")
        
        