In [1]:
print('test')

test


In [1]:
#! python3.7

import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform


def start_recording(model='medium', non_english='false', energy_threshold=1000, record_timeout=2, phrase_timeout=3, default_microphone='pulse'):
    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Thread safe Queue for passing data from the threaded recording callback.
    data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = energy_threshold
    # Definitely do this, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for linux users.
    # Prevents permanent application hang and crash by using the wrong Microphone
    if 'linux' in platform:
        mic_name = default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model
    model = model
    if model != "large" and not non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    record_timeout = record_timeout
    phrase_timeout = phrase_timeout

    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio:sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push it into the thread safe queue.
        data = audio.get_raw_data()
        data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now
                
                # Combine audio data from queue
                audio_data = b''.join(data_queue.queue)
                data_queue.queue.clear()
                
                # Convert in-ram buffer to something the model can use directly without needing a temp file.
                # Convert data from 16 bit wide integers to floating point with a width of 32 bits.
                # Clamp the audio stream frequency to a PCM wavelength compatible default of 32768hz max.
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                # os.system('cls' if os.name=='nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)

                # Infinite loops are bad for processors, must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)

In [2]:
# word level 
start_recording(model='base', record_timeout=0.5, phrase_timeout=0.4)

Model loaded.

Hello?
Hello?
What is up?
Hello?
What is up?
Quick run.
Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
the click.
Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
the click.
box.
Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
the click.
box.

Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
the click.
box.

this is not gonna be – you
Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
the click.
box.

this is not gonna be – you
has 말씀ed it a little earlier when I went through fear ofHHHHHHHHMUSIC HORROW Shoa Sine No LAUGH테
Hello?
What is up?
Quick run.
Fors a finish of you with the 4-part
the click.
box.

this is not gonna be – you
has 말씀ed it a little earlier when I went through fear ofHHHHHHHHMUSIC HORROW Shoa Sine No LAUGH테
You can't see me among the way.


In [None]:
# sentence level
start_recording(model='base')

# Multithreading vs Multiprocessing Test

In [None]:
print('crash?')

In [3]:
import threading

def run_recording_in_thread(model, non_english, energy_threshold, record_timeout, phrase_timeout, default_microphone):
    # This function will be executed in a separate thread
    start_recording(model=model, non_english=non_english, energy_threshold=energy_threshold,
                    record_timeout=record_timeout, phrase_timeout=phrase_timeout, default_microphone=default_microphone)

# start_recording(model='base', record_timeout=0.5, phrase_timeout=0.4)
# Parameters for each instance
instances = [
    {'model': 'medium', 'non_english': 'false', 'energy_threshold': 1000, 'record_timeout': 0.5, 'phrase_timeout': 0.4, 'default_microphone': 'pulse'},
    {'model': 'medium', 'non_english': 'false', 'energy_threshold': 1000, 'record_timeout': 2, 'phrase_timeout': 3, 'default_microphone': 'pulse'},
]

threads = []

for instance in instances:
    # Create a Thread for each set of parameters
    t = threading.Thread(target=run_recording_in_thread, kwargs=instance)
    t.start()
    threads.append(t)

for t in threads:
    t.join()  # Wait for all threads to complete

Model loaded.

Model loaded.

Ah
Ugh.
Ugh.
Hello hello. I don't know. The quick brown fox. This is model loaded this time.
Ah
Hello, hello. I don't know. The click brown fox. It says model loaded this time.
Ugh.
Hello hello. I don't know. The quick brown fox. This is model loaded this time.
Wait, it just detects your voice and stuff. Yeah, it's actually so slow. Yeah
Ugh.
Hello hello. I don't know. The quick brown fox. This is model loaded this time.
Wait, it just detects your voice and stuff. Yeah, it's actually so slow. Yeah
Or maybe instead of... Oh, it just was very slow, like very delayed. I think it's just wrong implementation. Like I think it's...
Ah
Hello, hello. I don't know. The click brown fox. It says model loaded this time.
Wait, did this detect your deploy system? What is this? Yeah, it's actually so slow. It's actually happening nothing. Yeah. So... Or maybe instead of... Oh, it's...
Ugh.
Hello hello. I don't know. The quick brown fox. This is model loaded this time.
Wai

In [3]:
from multiprocessing import Process

def run_recording_in_process(model, non_english, energy_threshold, record_timeout, phrase_timeout, default_microphone):
    start_recording(model=model, non_english=non_english, energy_threshold=energy_threshold,
                    record_timeout=record_timeout, phrase_timeout=phrase_timeout, default_microphone=default_microphone)

if __name__ == '__main__':
    instances = [
        {'model': 'medium', 'non_english': 'false', 'energy_threshold': 1000, 'record_timeout': 0.5, 'phrase_timeout': 0.4, 'default_microphone': 'pulse'},
        {'model': 'medium', 'non_english': 'false', 'energy_threshold': 1000, 'record_timeout': 2, 'phrase_timeout': 3, 'default_microphone': 'pulse'},
    ]

    processes = []

    for instance in instances:
        p = Process(target=run_recording_in_process, kwargs=instance)
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

# Multi-queue test

In [13]:
import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform


class AudioStreamHandler:
    def __init__(self, model='medium', non_english='false', energy_threshold=1000, default_microphone='pulse'):
        self.model_name = model
        self.non_english = non_english
        self.energy_threshold = energy_threshold
        self.default_microphone = default_microphone
        self.audio_model = self.load_model()
        self.source = self.initialize_microphone()
        self.queues = []
        self.transcriptions = []

    def load_model(self):
        model = self.model_name
        if model != "large" and not self.non_english:
            model = model + ".en"
        return whisper.load_model(model)

    def initialize_microphone(self):
        if 'linux' in platform:
            mic_name = self.default_microphone
            if not mic_name or mic_name == 'list':
                print("Available microphone devices are:")
                for index, name in enumerate(sr.Microphone.list_microphone_names()):
                    print(f"Microphone with name \"{name}\" found")
                return
            else:
                for index, name in enumerate(sr.Microphone.list_microphone_names()):
                    if mic_name in name:
                        return sr.Microphone(sample_rate=16000, device_index=index)
        else:
            return sr.Microphone(sample_rate=16000)

    def add_queue(self, record_timeout, phrase_timeout):
        self.queues.append({
            'queue': Queue(),
            'record_timeout': record_timeout,
            'phrase_timeout': phrase_timeout,
            'last_phrase_time': None
        })
        self.transcriptions.append('')

    def record_callback(self, _, audio: sr.AudioData, queue_index):
        data = audio.get_raw_data()
        self.queues[queue_index]['queue'].put(data)

    def start_recording(self):
        recorder = sr.Recognizer()
        recorder.energy_threshold = self.energy_threshold
        recorder.dynamic_energy_threshold = False

        with self.source as source:
            recorder.adjust_for_ambient_noise(source)
            for i, q in enumerate(self.queues):
                recorder.listen_in_background(source, lambda recognizer, audio: self.record_callback(recognizer, audio, i), phrase_time_limit=q['record_timeout'])

        print("Model loaded.\n")
        try:
            while True:
                for i, q in enumerate(self.queues):
                    self.process_queue(i)
                sleep(0.25)
        except KeyboardInterrupt:
            self.print_transcriptions()

    def process_queue(self, queue_index):
        now = datetime.utcnow()
        q = self.queues[queue_index]
        if not q['queue'].empty():
            phrase_complete = False
            if q['last_phrase_time'] and now - q['last_phrase_time'] > timedelta(seconds=q['phrase_timeout']):
                phrase_complete = True
            q['last_phrase_time'] = now

            audio_data = b''.join(q['queue'].queue)
            q['queue'].queue.clear()

            audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
            result = self.audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
            text = result['text'].strip()

            if phrase_complete:
                self.transcriptions[queue_index] += "\n" + text
            else:
                self.transcriptions[queue_index] += " " + text

            print(self.transcriptions[queue_index])

    def print_transcriptions(self):
        print("\n\nTranscriptions:")
        for line in self.transcriptions:
            print(line)


if __name__ == "__main__":
    handler = AudioStreamHandler()
    handler.add_queue(record_timeout=2, phrase_timeout=3)
    handler.add_queue(record_timeout=5, phrase_timeout=7)
    handler.start_recording()


Exception in thread Thread-7 (threaded_listen):
Traceback (most recent call last):
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
Exception in thread Thread-8 (threaded_listen):
Traceback (most recent call last):
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\threading.py", line 1045, in _bootstrap_inner
    _threading_Thread_run(self)
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\threading.py", line 982, in run
    self.run()
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\site-packages\speech_recognition\__init__.py", line 561, in threaded_listen
    _threading_Thread_run(self)
  File "c:\Users\And

Model loaded.



In [1]:
import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform


class AudioStreamHandler:
    def __init__(self, model='medium', non_english='false', energy_threshold=1000, default_microphone='pulse'):
        self.model_name = model
        self.non_english = non_english
        self.energy_threshold = energy_threshold
        self.default_microphone = default_microphone
        self.audio_model = self.load_model()
        self.source = self.initialize_microphone()
        self.queues = []  # Stores queues along with their settings

    def load_model(self):
        model = self.model_name
        if model != "large" and not self.non_english:
            model = model + ".en"
        return whisper.load_model(model)

    def initialize_microphone(self):
        if 'linux' in platform:
            mic_name = self.default_microphone
            if not mic_name or mic_name == 'list':
                print("Available microphone devices are:")
                for index, name in enumerate(sr.Microphone.list_microphone_names()):
                    print(f"Microphone with name \"{name}\" found")
                return
            else:
                for index, name in enumerate(sr.Microphone.list_microphone_names()):
                    if mic_name in name:
                        return sr.Microphone(sample_rate=16000, device_index=index)
        else:
            return sr.Microphone(sample_rate=16000)

    def add_queue(self, record_timeout, phrase_timeout):
        self.queues.append({
            'queue': Queue(),
            'record_timeout': record_timeout,
            'phrase_timeout': phrase_timeout,
            'last_phrase_time': None,
            'transcription': ''
        })

    def record_callback(self, recognizer, audio):
        for q in self.queues:
            q['queue'].put(audio.get_raw_data())

    def start_recording(self):
        recorder = sr.Recognizer()
        recorder.energy_threshold = self.energy_threshold
        recorder.dynamic_energy_threshold = False

        with self.source as source:
            recorder.adjust_for_ambient_noise(source)
            stop_listening = recorder.listen_in_background(source, self.record_callback)

        print("Model loaded. Recording...")
        try:
            while True:
                now = datetime.utcnow()
                for q in self.queues:
                    if not q['queue'].empty():
                        self.process_queue(q, now)
                sleep(0.25)
        except KeyboardInterrupt:
            stop_listening()  # Stop listening to the microphone
            self.print_transcriptions()

    def process_queue(self, q, now):
        if q['last_phrase_time'] and now - q['last_phrase_time'] > timedelta(seconds=q['phrase_timeout']):
            q['transcription'] += "\n"  # Add a newline to separate phrases
        q['last_phrase_time'] = now

        audio_data = b''.join(list(q['queue'].queue))
        q['queue'].queue.clear()

        audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0
        result = self.audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
        text = result['text'].strip()

        q['transcription'] += text + " "  # Append new text to the existing transcription

    def print_transcriptions(self):
        print("\n\nTranscriptions:")
        for i, q in enumerate(self.queues):
            print(f"Queue {i+1} Transcription:\n{q['transcription']}")

if __name__ == "__main__":
    handler = AudioStreamHandler()
    handler.add_queue(record_timeout=2, phrase_timeout=3)  # Short phrases
    handler.add_queue(record_timeout=5, phrase_timeout=7)  # Longer phrases
    handler.start_recording()


Exception in thread Thread-5 (threaded_listen):
Traceback (most recent call last):
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\threading.py", line 1045, in _bootstrap_inner
    self.run()
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\site-packages\ipykernel\ipkernel.py", line 761, in run_closure
    _threading_Thread_run(self)
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\threading.py", line 982, in run
    self._target(*self._args, **self._kwargs)
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\site-packages\speech_recognition\__init__.py", line 561, in threaded_listen
    with source as s:
  File "c:\Users\Andre\miniforge3\envs\hackviolet2024\Lib\site-packages\speech_recognition\__init__.py", line 174, in __enter__
    assert self.stream is None, "This audio source is already inside a context manager"
           ^^^^^^^^^^^^^^^^^^^
AssertionError: This audio source is already inside a context manager


Model loaded. Recording...


Transcriptions:
Queue 1 Transcription:

Queue 2 Transcription:



In [1]:
#! python3.7

import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform


def start_recording(model='medium', non_english='false', energy_threshold=1000, record_timeouts=(0.5, 2), phrase_timeouts=(0.5, 3), default_microphone='pulse'):
    # The last time a recording was retrieved from the queue.
    phrase_time = None
    # Thread safe Queue for passing data from the threaded recording callback.
    fast_data_queue = Queue()
    slow_data_queue = Queue()
    # We use SpeechRecognizer to record our audio because it has a nice feature where it can detect when speech ends.
    recorder = sr.Recognizer()
    recorder.energy_threshold = energy_threshold
    # Definitely do this, dynamic energy compensation lowers the energy threshold dramatically to a point where the SpeechRecognizer never stops recording.
    recorder.dynamic_energy_threshold = False

    # Important for linux users.
    # Prevents permanent application hang and crash by using the wrong Microphone
    if 'linux' in platform:
        mic_name = default_microphone
        if not mic_name or mic_name == 'list':
            print("Available microphone devices are: ")
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                print(f"Microphone with name \"{name}\" found")
            return
        else:
            for index, name in enumerate(sr.Microphone.list_microphone_names()):
                if mic_name in name:
                    source = sr.Microphone(sample_rate=16000, device_index=index)
                    break
    else:
        source = sr.Microphone(sample_rate=16000)

    # Load / Download model
    model = model
    if model != "large" and not non_english:
        model = model + ".en"
    audio_model = whisper.load_model(model)

    transcription = ['']

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio:sr.AudioData) -> None:
        """
        Threaded callback function to receive audio data when recordings finish.
        audio: An AudioData containing the recorded bytes.
        """
        # Grab the raw bytes and push it into the thread safe queue.
        data = audio.get_raw_data()
        fast_data_queue.put(data)
        slow_data_queue.put(data)

    # Create a background thread that will pass us raw audio bytes.
    # We could do this manually but SpeechRecognizer provides a nice helper.
    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeouts[0])

    # Cue the user that we're ready to go.
    print("Model loaded.\n")

    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not fast_data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeouts[0]):
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now
                
                # Combine audio data from queue
                audio_data = b''.join(fast_data_queue.queue)
                fast_data_queue.queue.clear()
                
                # Convert in-ram buffer to something the model can use directly without needing a temp file.
                # Convert data from 16 bit wide integers to floating point with a width of 32 bits.
                # Clamp the audio stream frequency to a PCM wavelength compatible default of 32768hz max.
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                # os.system('cls' if os.name=='nt' else 'clear')
                for line in transcription:
                    print('fast queue: ', line)
                # Flush stdout.
                print('', end='', flush=True)

                # Infinite loops are bad for processors, must sleep.
            
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not slow_data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeouts[1]):
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now
                
                # Combine audio data from queue
                audio_data = b''.join(slow_data_queue.queue)
                slow_data_queue.queue.clear()
                
                # Convert in-ram buffer to something the model can use directly without needing a temp file.
                # Convert data from 16 bit wide integers to floating point with a width of 32 bits.
                # Clamp the audio stream frequency to a PCM wavelength compatible default of 32768hz max.
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                # os.system('cls' if os.name=='nt' else 'clear')
                for line in transcription:
                    print('slow queue', line)
                # Flush stdout.
                print('', end='', flush=True)

                # Infinite loops are bad for processors, must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)

In [2]:
start_recording()

Model loaded.

fast queue:  I'm at the end
slow queue I'm at the end actually
fast queue:  I'm at the end actually
fast queue:  and actually should not have sleep at both.
slow queue I'm at the end actually
slow queue at both.
fast queue:  I'm at the end actually
fast queue:  at both.
fast queue:  for the infinite loop.
slow queue I'm at the end actually
slow queue at both.
slow queue Definitely.
fast queue:  I'm at the end actually
fast queue:  at both.
fast queue:  Definitely.
fast queue:  Now get this
slow queue I'm at the end actually
slow queue at both.
slow queue Definitely.
slow queue I get this weird thing when I have two keys
fast queue:  I'm at the end actually
fast queue:  at both.
fast queue:  Definitely.
fast queue:  I get this weird thing when I have two keys
fast queue:  The weird thing when I have two cues is like...
slow queue I'm at the end actually
slow queue at both.
slow queue Definitely.
slow queue I get this weird thing when I have two keys
slow queue use this li

# Record Audio and playbcak from single queue

In [1]:
import argparse
import os
import numpy as np
import speech_recognition as sr
import whisper
import torch
import wave
import pyaudio

from datetime import datetime, timedelta
from queue import Queue
from time import sleep
from sys import platform

def start_recording(model='medium', non_english='false', energy_threshold=1000, record_timeout=2, phrase_timeout=3, default_microphone='pulse', output_filename='output_audio.wav'):
    phrase_time = None
    data_queue = Queue()
    recorder = sr.Recognizer()
    recorder.energy_threshold = energy_threshold
    recorder.dynamic_energy_threshold = False

    if 'linux' in platform:
        mic_name = default_microphone
        for index, name in enumerate(sr.Microphone.list_microphone_names()):
            if mic_name in name:
                source = sr.Microphone(sample_rate=16000, device_index=index)
                break
    else:
        source = sr.Microphone(sample_rate=16000)

    model = model if model == "large" or non_english == 'true' else model + ".en"
    audio_model = whisper.load_model(model)

    transcription = ['']

    # PyAudio setup for playback
    p = pyaudio.PyAudio()
    playback_stream = p.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)

    # Open a wave file for writing
    wave_file = wave.open(output_filename, 'wb')
    wave_file.setnchannels(1)  # Mono audio
    wave_file.setsampwidth(2)  # Sample width in bytes
    wave_file.setframerate(16000)  # Frame rate

    with source:
        recorder.adjust_for_ambient_noise(source)

    def record_callback(_, audio: sr.AudioData):
        data = audio.get_raw_data()
        data_queue.put(data)
        wave_file.writeframes(data)  # Write audio data to wave file
        playback_stream.write(data)  # Play audio data back to the user

    recorder.listen_in_background(source, record_callback, phrase_time_limit=record_timeout)

    print("Model loaded and recording to", output_filename)


    while True:
        try:
            now = datetime.utcnow()
            # Pull raw recorded audio from the queue.
            if not data_queue.empty():
                phrase_complete = False
                # If enough time has passed between recordings, consider the phrase complete.
                # Clear the current working audio buffer to start over with the new data.
                if phrase_time and now - phrase_time > timedelta(seconds=phrase_timeout):
                    phrase_complete = True
                # This is the last time we received new audio data from the queue.
                phrase_time = now
                
                # Combine audio data from queue
                audio_data = b''.join(data_queue.queue)
                data_queue.queue.clear()
                
                # Convert in-ram buffer to something the model can use directly without needing a temp file.
                # Convert data from 16 bit wide integers to floating point with a width of 32 bits.
                # Clamp the audio stream frequency to a PCM wavelength compatible default of 32768hz max.
                audio_np = np.frombuffer(audio_data, dtype=np.int16).astype(np.float32) / 32768.0

                # Read the transcription.
                result = audio_model.transcribe(audio_np, fp16=torch.cuda.is_available())
                text = result['text'].strip()

                # If we detected a pause between recordings, add a new item to our transcription.
                # Otherwise edit the existing one.
                if phrase_complete:
                    transcription.append(text)
                else:
                    transcription[-1] = text

                # Clear the console to reprint the updated transcription.
                # os.system('cls' if os.name=='nt' else 'clear')
                for line in transcription:
                    print(line)
                # Flush stdout.
                print('', end='', flush=True)

                # Infinite loops are bad for processors, must sleep.
                sleep(0.25)
        except KeyboardInterrupt:
            break

    print("\n\nTranscription:")
    for line in transcription:
        print(line)

    # Clean up
    wave_file.close()
    playback_stream.stop_stream()
    playback_stream.close()
    p.terminate()
    print("Recording saved to", output_filename)

In [2]:
start_recording()

Model loaded and recording to output_audio.wav
The front end was like...
The front end was like...
Really? Yeah. Wait, do you sound like automated?
The front end was like...
Really? Yeah. Wait, do you sound like automated?
Thank you for joining. This is a good prompt, right? Yeah.
The front end was like...
Really? Yeah. Wait, do you sound like automated?
Thank you for joining. This is a good prompt, right? Yeah.
I'm going to parse them into just a list of words.
The front end was like...
Really? Yeah. Wait, do you sound like automated?
Thank you for joining. This is a good prompt, right? Yeah.
I'm going to parse them into just a list of words.
Okay.
The front end was like...
Really? Yeah. Wait, do you sound like automated?
Thank you for joining. This is a good prompt, right? Yeah.
I'm going to parse them into just a list of words.
Okay.
Oh, what else is on the spreadsheet?
The front end was like...
Really? Yeah. Wait, do you sound like automated?
Thank you for joining. This is a good p

KeyboardInterrupt: 