In [34]:
import ipywidgets as widgets
from IPython.display import display
from threading import Thread
from queue import Queue # well let us pass messages between threads

messages = Queue() # will tell thread when to stop recording and transcribing
recordings = Queue() # will store audio from mic and pass it to transcription
TRANSCRIPT_FILENAME = "transcript_files/fulltranscript.txt"
# delete contents of transcription file
f = open(TRANSCRIPT_FILENAME, "w") 
f.write('')
f.close()

record_button = widgets.Button(
    description="Record",
    disabled=False,
    button_style="success",
    icon="microphone"
)

stop_button = widgets.Button(
    description="Stop",
    disabled=False,
    button_style="warning",
    icon="stop"
)

# output widget will show transcript as it's generated
output = widgets.Output()

# data is automatically passed by jupyter notebook
def start_recording(data):
    print('recording')
    messages.put(True) # will tell thread to keep running and recording

    with output: 
        display("Starting...")

        # thread that will record microphone
        record = Thread(target=record_microphone) 
        record.start()

        # thread that will start transcribing
        transcribe = Thread(target=speech_recognition, args=(output,))
        transcribe.start()

        # thread that will answer questions
        # chatbot_thread = Thread(target=chatbot)
        # chatbot_thread.start()

def stop_recording(data):
    # with output: 
        messages.get() # will take the 'True' message off the queue
        display("Stopped.")

record_button.on_click(start_recording)
stop_button.on_click(stop_recording)
    

display(record_button, stop_button, output)

Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle())



Output()

In [28]:
from transformers import pipeline

qa_pipeline = pipeline("question-answering")

def chatbot():
    while True: 
        transcription = open(TRANSCRIPT_FILENAME, "r").read()
        question = input('You: ')
        result = qa_pipeline(question=question, context=transcription)
        print(result['answer'])
        
    
    

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [2]:
# Figure out which microphone we want to use 
import pyaudio

p = pyaudio.PyAudio()
for i in range(p.get_device_count()):
    print(p.get_device_info_by_index(i))

p.terminate()

{'index': 0, 'structVersion': 2, 'name': 'HDA Intel PCH: ALC289 Analog (hw:0,0)', 'hostApi': 0, 'maxInputChannels': 2, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.005804988662131519, 'defaultLowOutputLatency': 0.005333333333333333, 'defaultHighInputLatency': 0.034829931972789115, 'defaultHighOutputLatency': 0.032, 'defaultSampleRate': 48000.0}
{'index': 1, 'structVersion': 2, 'name': 'HDA Intel PCH: HDMI 0 (hw:0,3)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': -1.0, 'defaultLowOutputLatency': 0.005804988662131519, 'defaultHighInputLatency': -1.0, 'defaultHighOutputLatency': 0.034829931972789115, 'defaultSampleRate': 44100.0}
{'index': 2, 'structVersion': 2, 'name': 'HDA Intel PCH: HDMI 1 (hw:0,7)', 'hostApi': 0, 'maxInputChannels': 0, 'maxOutputChannels': 8, 'defaultLowInputLatency': -1.0, 'defaultLowOutputLatency': 0.005804988662131519, 'defaultHighInputLatency': -1.0, 'defaultHighOutputLatency': 0.034829931972789115, 'defaultSampleRa

ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.rear
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.center_lfe
ALSA lib pcm.c:2664:(snd_pcm_open_noupdate) Unknown PCM cards.pcm.side
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_route.c:877:(find_matching_chmap) Found no matching channel map
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib pcm_oss.c:397:(_snd_pcm_oss_open) Cannot open device /dev/dsp
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'
ALSA lib confmisc.c:160:(snd_config_get_card) Invalid field card
ALSA lib pcm_usb_stream.c:482:(_snd_pcm_usb_stream_open) Invalid card 'card'


##### The microphone I want is: 
``` {'index': 5, 'structVersion': 2, 'name': 'Dell Headset WH3022: USB Audio (hw:1,0)', 'hostApi': 0, 'maxInputChannels': 1, 'maxOutputChannels': 2, 'defaultLowInputLatency': 0.0239375, 'defaultLowOutputLatency': 0.0239375, 'defaultHighInputLatency': 0.096, 'defaultHighOutputLatency': 0.096, 'defaultSampleRate': 16000.0} ```



In [32]:
# constants that will allow for optimal conditions for speech recognition
CHANNELS = 1
FRAME_RATE = 16000 # Determines how high quality the frame rate is. how quickly the audio signal is sampled. unit is kHz = cycles/sec?
RECORD_SECONDS = 3 # How many seconds we want to record audio for before we send it off for transcription. every 20s we'll generate a transcript
AUDIO_FORMAT = pyaudio.paInt16
SAMPLE_SIZE = 2


def record_microphone(chunk=1024): # chunk is how often we are going to read audio from the microphone (how many audio frames).
    p = pyaudio.PyAudio()

    stream = p.open(format=AUDIO_FORMAT,
                    channels=CHANNELS,
                    rate=FRAME_RATE,
                    input=True,
                    # input_device_index=5,
                    frames_per_buffer=chunk)
                    
    frames = [] # will store all the audio recorded from the microphone

    while not messages.empty():
        data = stream.read(chunk)
        frames.append(data)

        # if we recorded more than 20 seconds of audio, then add audio data to recordings queue
        if len(frames) >= (FRAME_RATE * RECORD_SECONDS) / chunk: 
            recordings.put(frames.copy())
            frames = []

    stream.stop_stream()
    stream.close()
    p.terminate()


In [None]:
%pip install vosk

In [None]:
%pip install transformers # needed for recase punc (to add punctuation back to transcript)

In [None]:
%pip install torch # requirement for recase punc

In [23]:
import subprocess # will be used to call the punctuation model
import json
from vosk import Model, KaldiRecognizer

model = Model(model_name="vosk-model-en-us-0.22")
rec = KaldiRecognizer(model, FRAME_RATE) # responsible for managing the audio transcription 
rec.SetWords(True) # will give us confidence levels for each individual word

# takes in the output widget so that it can display the transcript live.
def speech_recognition(output):
    global text_so_far
    
    while not messages.empty():
        frames = recordings.get()
        raw_audio_data = b''.join(frames)

        rec.AcceptWaveform(b''.join(frames)) # join all chunks together into one binary string
        result = rec.Result()
        text = json.loads(result)["text"]

        # write to file
        f = open(TRANSCRIPT_FILENAME,"a")
        f.write(text)
        f.close()
        # now we have to add punctuation to our transcript
        # this method is inefficient, because we're reloading the model and re-initializing everything every time 
        # instead,if you don't use the command line, it would be a lot faster and you can reduce the RECORD_SECONDS to 2-3 seconds which will make the transcription a lot more live. 
        # cased = subprocess.check_output("python recasepunc/recasepunc.py predict recasepunc/checkpoint/path", shell=True,text=True,input=text)
        output.append_stdout(text) # add transcript to output widget

   


LOG (VoskAPI:ReadDataFiles():model.cc:213) Decoding params beam=13 max-active=7000 lattice-beam=6
LOG (VoskAPI:ReadDataFiles():model.cc:216) Silence phones 1:2:3:4:5:11:12:13:14:15
LOG (VoskAPI:RemoveOrphanNodes():nnet-nnet.cc:948) Removed 0 orphan nodes.
LOG (VoskAPI:RemoveOrphanComponents():nnet-nnet.cc:847) Removing 0 orphan components.
LOG (VoskAPI:ReadDataFiles():model.cc:248) Loading i-vector extractor from /home/mariam/.cache/vosk/vosk-model-en-us-0.22/ivector/final.ie
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:183) Computing derived variables for iVector extractor
LOG (VoskAPI:ComputeDerivedVars():ivector-extractor.cc:204) Done.
LOG (VoskAPI:ReadDataFiles():model.cc:279) Loading HCLG from /home/mariam/.cache/vosk/vosk-model-en-us-0.22/graph/HCLG.fst
LOG (VoskAPI:ReadDataFiles():model.cc:297) Loading words from /home/mariam/.cache/vosk/vosk-model-en-us-0.22/graph/words.txt
LOG (VoskAPI:ReadDataFiles():model.cc:308) Loading winfo /home/mariam/.cache/vosk/vosk-model-en

In [8]:
# from pydub import AudioSegment
# import whisper
# import json
# import io

# model = whisper.load_model("base")

# # takes in the output widget so that it can display the transcript live.
# def speech_recognition_whisper(output):
#     while not messages.empty():
#         frames = recordings.get()
#         raw_audio_data = b''.join(frames)

#         sound = AudioSegment(
#             data=raw_audio_data,
#             sample_width=SAMPLE_SIZE, # 2 byte (16 bit) samples
#             frame_rate=FRAME_RATE,
#             channels=2 # stereo
#         )

#         sound_file = sound.export('file.mp3',format='mp3')
#         transcript = model.transcribe(sound_file)

#         output.append_stdout(transcript['text']) # add transcript to output widget

#         # # OR
#         # buffer = io.BytesIO(raw_audio_data)
#         # buffer.name = 'temp-file.mp3'

#         # transcript = model.transcribe(buffer)

#         # output.append_stdout(transcript['text'])

    
# # from faster_whisper import WhisperModel
# # print(whisper.__file__)
# # model = WhisperModel(model_size, device="cpu", compute_type="int8")

# # def speech_recognition(output):
# #     while not messages.empty():
# #         frames = recordings.get()

# #         result = model.transcribe(b''.join(frames))
# #         text = result["text"]
# #         print(text)
# #         output.append_stdout(text)
       