In [1]:
import ipywidgets as widgets
from IPython.display import display, clear_output
import time
import threading
import os
import speech_recognition as sr

# Paths of the recordings saved so far (appended to by the recording callback)
recorded_files = []

# Folder that receives the .wav files; created up front if it is missing
output_directory = "recordings"
os.makedirs(output_directory, exist_ok=True)

# Single SpeechRecognition recognizer shared by the recording callback
recognizer = sr.Recognizer()

def start_recording(data):
    """Button callback: capture one utterance from the microphone in the background.

    Switches the buttons into their "recording" state, shows a status message
    in ``output`` and starts a worker thread that listens on the default
    microphone via ``recognizer.listen``. When that call returns, the audio is
    written to a timestamped ``.wav`` file under ``output_directory`` and the
    path is appended to ``recorded_files``.

    Args:
        data: The widget instance passed in by ``Button.on_click``; unused.
    """
    record_button.disabled = True
    stop_button.disabled = False

    with output:
        clear_output()
        display("Recording...")

    # One-second timestamp resolution: two recordings started within the same
    # second would share a file name.
    audio_filename = f"{output_directory}/recording_{int(time.time())}.wav"

    def record_audio():
        """Worker body: listen, save the clip, and surface any failure in the UI."""
        try:
            with sr.Microphone() as source:
                audio = recognizer.listen(source)
            # The microphone is released before touching the disk so the
            # device is not held open during file I/O.
            with open(audio_filename, "wb") as f:
                f.write(audio.get_wav_data())
            recorded_files.append(audio_filename)
        except Exception as exc:
            # Thread boundary: without this the error never reaches the
            # notebook UI. `with output:` is not reliable from a background
            # thread, so use the explicit append API instead.
            output.append_stdout(f"Recording failed: {exc}\n")

    # Daemon thread so a listener that is still waiting for speech cannot keep
    # the kernel process alive on shutdown.
    recording_thread = threading.Thread(target=record_audio, daemon=True)
    recording_thread.start()

def stop_recording(data):
    """Button callback: put the UI back into its idle state.

    Re-enables the Record button, disables the Stop button and shows a status
    message in ``output``. This function only updates the widgets; it does not
    signal or interrupt any thread.

    Args:
        data: The widget instance passed in by ``Button.on_click``; unused.
    """
    # Apply the idle-state flags in the same order: Record first, then Stop
    for button, is_disabled in ((record_button, False), (stop_button, True)):
        button.disabled = is_disabled

    with output:
        display("Stopped recording.")

# Status area that the button callbacks write their messages into
output = widgets.Output()

# Record starts enabled; Stop starts disabled until a recording is started
record_button = widgets.Button(
    description='Record',
    tooltip='Record',
    icon='microphone',
    button_style='success',
    disabled=False,
)
stop_button = widgets.Button(
    description='Stop',
    tooltip='Stop',
    icon='stop',
    button_style='warning',
    disabled=True,
)

# Wire each button to its callback
record_button.on_click(start_recording)
stop_button.on_click(stop_recording)

# Buttons side by side, status output underneath
button_row = widgets.HBox([record_button, stop_button])
display(button_row, output)


HBox(children=(Button(button_style='success', description='Record', icon='microphone', style=ButtonStyle(), to…

Output()

In [2]:
import pyaudio

def list_audio_devices():
    """Print and return the audio devices reported by PyAudio.

    For every device index the name, index, input/output channel counts and
    default sample rate are printed, followed by a separator line.

    Returns:
        list: The device-info mappings returned by
        ``PyAudio.get_device_info_by_index``, in index order.

    Raises:
        Whatever PyAudio raises while querying a device; the PyAudio instance
        is terminated before the exception propagates.
    """
    p = pyaudio.PyAudio()
    devices = []

    # try/finally so the PortAudio session is released even when a device
    # query fails part-way through the enumeration.
    try:
        print("Available Audio Devices:")
        for i in range(p.get_device_count()):
            device_info = p.get_device_info_by_index(i)
            devices.append(device_info)
            device_name = device_info['name']
            device_index = device_info['index']
            max_input_channels = device_info['maxInputChannels']
            max_output_channels = device_info['maxOutputChannels']
            default_sample_rate = device_info['defaultSampleRate']

            print(f"Device {i}:")
            print(f"Name: {device_name}")
            print(f"Index: {device_index}")
            print(f"Max Input Channels: {max_input_channels}")
            print(f"Max Output Channels: {max_output_channels}")
            print(f"Default Sample Rate: {default_sample_rate} Hz")
            print("-" * 30)
    finally:
        p.terminate()

    return devices

# Enumerate the devices once (this prints the listing) and keep the returned info records
device_info_list = list_audio_devices()

Available Audio Devices:
Device 0:
Name: Microsoft Sound Mapper - Input
Index: 0
Max Input Channels: 2
Max Output Channels: 0
Default Sample Rate: 44100.0 Hz
------------------------------
Device 1:
Name: PC (Realtek(R) Audio)
Index: 1
Max Input Channels: 2
Max Output Channels: 0
Default Sample Rate: 44100.0 Hz
------------------------------
Device 2:
Name: Microsoft Sound Mapper - Output
Index: 2
Max Input Channels: 0
Max Output Channels: 2
Default Sample Rate: 44100.0 Hz
------------------------------
Device 3:
Name: Speakers (Realtek(R) Audio)
Index: 3
Max Input Channels: 0
Max Output Channels: 2
Default Sample Rate: 44100.0 Hz
------------------------------
Device 4:
Name: Primary Sound Capture Driver
Index: 4
Max Input Channels: 2
Max Output Channels: 0
Default Sample Rate: 44100.0 Hz
------------------------------
Device 5:
Name: PC (Realtek(R) Audio)
Index: 5
Max Input Channels: 2
Max Output Channels: 0
Default Sample Rate: 44100.0 Hz
------------------------------
Device 6:
Nam

In [3]:
import pyaudio

class AudioRecorder:
    """Fixed-duration microphone recorder built on PyAudio.

    Attributes:
        channels: Number of input channels requested from the device.
        frame_rate: Sample rate passed to the stream, in Hz.
        record_seconds: Target recording length in seconds.
        audio_format: PyAudio sample format constant.
        input_device_index: Index of the PyAudio input device to open.
        sample_size: Value of ``pyaudio.get_sample_size(audio_format)``.
        chunk: Frames per buffer, also the number of frames read per call.
    """

    def __init__(self, channels=1, frame_rate=16000, record_seconds=20,
                 audio_format=pyaudio.paInt16, input_device_index=1):
        """Store the recording settings.

        Args:
            channels: Number of input channels.
            frame_rate: Sample rate in Hz.
            record_seconds: How many seconds of audio ``record`` should capture.
            audio_format: PyAudio sample format constant.
            input_device_index: Input device to record from. Defaults to 1,
                the value that used to be hard-coded in ``record``; pick the
                right index for your machine from the device listing.
        """
        self.channels = channels
        self.frame_rate = frame_rate
        self.record_seconds = record_seconds
        self.audio_format = audio_format
        self.input_device_index = input_device_index
        self.sample_size = pyaudio.get_sample_size(audio_format)
        self.chunk = 1024

    def record(self):
        """Capture audio and return it as a list of raw byte chunks.

        Errors while opening or reading the stream are printed rather than
        raised (best effort); any chunks captured before the error are still
        returned.

        Returns:
            list: One ``bytes`` object per ``stream.read`` call; empty if the
            stream could not be opened.
        """
        audio_data = []
        p = pyaudio.PyAudio()
        # Stays None when p.open() fails, so cleanup below knows there is
        # nothing to stop or close.
        stream = None

        try:
            stream = p.open(
                format=self.audio_format,
                channels=self.channels,
                rate=self.frame_rate,
                input=True,
                input_device_index=self.input_device_index,
                frames_per_buffer=self.chunk
            )

            # Whole number of chunks that fits into the requested duration
            chunk_count = int(self.frame_rate / self.chunk * self.record_seconds)
            for _ in range(chunk_count):
                audio_data.append(stream.read(self.chunk))

        except Exception as e:
            print(f"An error occurred during recording: {e}")

        finally:
            # Nested finally: terminate PyAudio even if closing the stream fails
            try:
                if stream is not None:
                    stream.stop_stream()
                    stream.close()
            finally:
                p.terminate()

        return audio_data

# Usage: build a recorder with the default settings and capture right away.
# record() reads int(frame_rate / chunk * record_seconds) chunks before it
# returns, i.e. about 20 seconds of audio with the defaults.
recorder = AudioRecorder()
recorded_audio = recorder.record()

In [4]:
import subprocess
import json
from vosk import Model, KaldiRecognizer
import time

def speech_recognition(output, FRAME_RATE):
    """Transcribe queued audio with Vosk and print each result into ``output``.

    Loads the small English Vosk model, then keeps pulling items from the
    module-level ``recordings`` object for as long as the module-level
    ``messages`` object reports that it is not empty. Each item is joined into
    one bytes buffer, fed to the recognizer, and the ``"text"`` field of the
    recognizer's JSON result is printed.

    Both ``messages`` and ``recordings`` are looked up as globals; they are
    presumably queue objects created elsewhere in the notebook (TODO confirm).

    Args:
        output: Context manager (e.g. an ipywidgets Output) that receives the
            printed text.
        FRAME_RATE: Sample rate handed to ``KaldiRecognizer``.

    Any exception raised along the way is caught and printed into ``output``
    instead of propagating.
    """
    try:
        # Model and recognizer are built once per call; word-level details on
        vosk_model = Model(model_name="vosk-model-small-en-us-0.15")
        kaldi = KaldiRecognizer(vosk_model, FRAME_RATE)
        kaldi.SetWords(True)

        while True:
            if messages.empty():
                break

            # One queued item is an iterable of byte chunks; merge and decode it
            audio_bytes = b''.join(recordings.get())
            kaldi.AcceptWaveform(audio_bytes)

            # The recognizer answers with a JSON document; keep only its text
            transcript = json.loads(kaldi.Result())["text"]

            with output:
                print("Recognized: " + transcript)

    except Exception as err:
        # Report the failure in the same output area instead of raising
        with output:
            print("Recognition error: " + str(err))

# Usage:
# You can call speech_recognition(output, FRAME_RATE) in a separate thread or function.
