In [30]:
import os
import hashlib
import numpy as np
import soundfile as sf
from numpy.random import randint


class SampleSaver:
    def __init__(self, sample_rate, folder_name="datasets/HeyPiSamples"):
        self.sample_rate = sample_rate
        self.folder = folder_name
        self.files_count = 0

        if not os.path.exists(self.folder):
            os.makedirs(self.folder)
            print(f"Created folder: {self.folder}")
        else:
             self.files_count = sum(1 for entry in os.scandir(folder_name) if entry.is_file())

    def save(self, audio: np.ndarray):
        if len(audio) == 0:
            print("Audio buffer is empty, nothing to save.")
            return None
        audio = audio.astype(np.float32)
        hash_str = hashlib.md5(audio.tobytes()).hexdigest()
        filepath = os.path.join(self.folder, f"hey_pi_{hash_str}.wav")
        sf.write(filepath, audio, self.sample_rate)

        print(f"Saved: {filepath}\t total files count: {self.files_count}")
        self.files_count += 1
        return filepath

sample_saver = SampleSaver(16000)

In [34]:
import sounddevice as sd
import numpy as np
import queue
import librosa
import time
import threading
from random import randint

SAMPLE_RATE = 16000
WINDOW_SIZE = 1.0

WINDOW_SAMPLES = int(SAMPLE_RATE * WINDOW_SIZE)

audio_queue = queue.Queue()
audio_buffer = np.array([], dtype=np.float32)


def callback(indata, frames, time, status):
    if status:
        print(status)
    audio_queue.put(indata.copy().flatten())

word_detected_flag = False

with sd.InputStream(channels=1, callback=callback) as stream:
    sr = stream.samplerate
    print("Actual mic sample rate:", sr)
    try:
        while True: 
            new_audio = librosa.resample(audio_queue.get(), orig_sr=int(sr), target_sr=SAMPLE_RATE)
            if new_audio is None:
                continue
            if not word_detected_flag and len(audio_buffer) >= WINDOW_SAMPLES:
                avg_loughtness = np.mean(np.abs(audio_buffer))
                new_audio_loughtness = np.mean(np.abs(new_audio))
                if (avg_loughtness * 20 < new_audio_loughtness):
                        print("Word Detected")
                        word_detected_flag = True
                        audio_buffer = audio_buffer[-min(len(audio_buffer),randint(0,WINDOW_SAMPLES//4)):] #Randomly cut previous buffer before word 
            else:
                if (len(audio_buffer) >= WINDOW_SAMPLES):
                    word_audio = audio_buffer[:WINDOW_SAMPLES]
                    sd.play(word_audio, SAMPLE_RATE)
                    sample_saver.save(word_audio)
                    word_detected_flag = False
            audio_buffer = np.concatenate((audio_buffer, new_audio))
            if len(audio_buffer) >= WINDOW_SAMPLES * 2:
                audio_buffer = audio_buffer[WINDOW_SAMPLES:]
    except KeyboardInterrupt:
        print("Exiting from keyboard interrupt")


Actual mic sample rate: 44100.0
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_4796312d121bda989f9313724bdb3a99.wav	 total files count: 97
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_13983b078d56724c76eef51237d1e0d6.wav	 total files count: 98
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_7ba6ca7dbe99e95d72a9c840d01e2a76.wav	 total files count: 99
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_17b3a6bc98870460f9e0f41c8243f73e.wav	 total files count: 100
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_9e77de9941550b0c636128db33f64cda.wav	 total files count: 101
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_8e2794cd2efcd9ed2e97ed7683f64c05.wav	 total files count: 102
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_042f6c45d7703f3589e5225788df78bd.wav	 total files count: 103
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_5ab2aafbfa32b81304149e8b26ca7423.wav	 total files count: 104
Word Detected
Saved: datasets/HeyPiSamples\hey_pi_159a0a737ebc56cf2d3c2eb8b8039deb.