In [None]:
import queue
import sys
import threading
import time

import numpy as np
import sounddevice as sd
import torch
from silero_vad import load_silero_vad, get_speech_timestamps
from speechbrain.pretrained import EncoderClassifier

# Constants
# Sample rate (Hz) used for recording, for the VAD call, and for the live input stream.
AUDIO_FS = 16000
# Number of channels requested from the audio device.
CHANNELS = 1
# Length, in seconds, of each window cut from the live buffer for verification.
SLIDING_WINDOW_SECONDS = 1.5
# Hop, in seconds, between window starts; also the amount dropped from the
# buffer after each verification pass.
STEP_SIZE_SECONDS = 0.75
# Capacity, in seconds, of the worker's rolling sample buffer.
BUFFER_SIZE_SECONDS = 20
# Minimum duration (seconds) of the longest speech segment accepted for enrollment.
MIN_SPEAK_TIME_ENROLL = 3.0
# Length (seconds) of the enrollment recording.
MAX_SPEAK_TIME_ENROLL = 5.0
# Minimum duration (seconds) of the longest speech segment in a window for the
# window to be verified.
# NOTE(review): this equals SLIDING_WINDOW_SECONDS, so a window appears to
# qualify only when the detected segment spans the whole window — confirm
# this is intended.
MIN_SPEAK_TIME_VERIFY = 1.5
# Cosine similarity at or above which a window counts as a match.
SIMILARITY_THRESHOLD = 0.6
# Number of matching windows needed before the greeting is printed.
CONSECUTIVE_MATCHES_REQUIRED = 3

# Voice-activity-detection model handed to get_speech_timestamps.
vad_model = load_silero_vad()
# Speaker encoder whose encode_batch output is used as the voice embedding.
# Loaded at import time from the given source, with savedir as its local directory.
spkrec_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb"
)

# Audio blocks pushed by audio_callback and consumed by verification_worker.
audio_queue = queue.Queue()
# Set to end both the worker loop and main's wait loop.
stop_event = threading.Event()
# While set, the worker evaluates buffered windows; cleared on shutdown.
mic_active = threading.Event()


def extract_embedding(audio):
    """Return a unit-length speaker embedding for a waveform.

    The samples are scaled by their peak absolute value, passed to
    ``spkrec_model.encode_batch`` as a batch of one, and the squeezed output
    is divided by its L2 norm.

    Args:
        audio: NumPy array of samples; it is reshaped to ``(1, -1)`` before
            encoding.

    Returns:
        The L2-normalized embedding as a NumPy array.
    """
    peak = np.max(np.abs(audio))
    # The small epsilon keeps an all-zero signal from dividing by zero.
    scaled = audio / (peak + 1e-10)
    batch = torch.from_numpy(scaled.astype(np.float32).reshape(1, -1))
    with torch.no_grad():
        encoded = spkrec_model.encode_batch(batch)
    vector = encoded.squeeze()
    vector = vector / vector.norm(p=2)
    return vector.cpu().numpy()


def cosine_similarity(a, b):
    """Return the cosine similarity of two 1-D vectors as a Python float.

    Args:
        a: First vector (NumPy array or any 1-D sequence of numbers).
        b: Second vector, same length as ``a``.

    Returns:
        ``dot(a, b) / (|a| * |b|)``. If either vector has zero norm the
        similarity is undefined; ``0.0`` is returned instead of NaN so that
        threshold comparisons by callers behave predictably and no
        divide-by-zero warning is emitted.
    """
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return float(np.dot(a, b) / (norm_a * norm_b))


def detect_fluent_speech(audio_np):
    """Locate the longest speech segment reported by the VAD.

    Args:
        audio_np: NumPy array of samples; it is converted to float32 and
            analysed at ``AUDIO_FS``.

    Returns:
        ``(duration_seconds, samples)`` for the longest segment returned by
        ``get_speech_timestamps``, where ``samples`` is the matching slice of
        ``audio_np``; ``(0, None)`` when no segment is returned.
    """
    def span(seg):
        # Segment length in samples.
        return seg['end'] - seg['start']

    waveform = torch.from_numpy(audio_np.astype(np.float32))
    segments = get_speech_timestamps(waveform, vad_model, sampling_rate=AUDIO_FS)
    if not segments:
        return 0, None

    best = max(segments, key=span)
    seconds = span(best) / AUDIO_FS
    return seconds, audio_np[best['start']:best['end']]


def record_fixed_duration(duration, prompt=""):
    """Record ``duration`` seconds from the input device and return the samples.

    Args:
        duration: Recording length in seconds; converted to a frame count at
            ``AUDIO_FS``.
        prompt: Optional text printed before recording starts; nothing is
            printed when it is empty.

    Returns:
        The recorded float32 samples flattened to one dimension.
    """
    if prompt:
        print(prompt)
    frame_count = int(duration * AUDIO_FS)
    recording = sd.rec(
        frame_count,
        samplerate=AUDIO_FS,
        channels=CHANNELS,
        dtype='float32',
    )
    # Block until the recording has finished before reading the data.
    sd.wait()
    return recording.flatten()


def audio_callback(indata, frames, time_info, status):
    """Input-stream callback: queue the first channel of each audio block.

    Args:
        indata: Block of input samples indexed as ``[frame, channel]``.
        frames: Number of frames in the block (unused).
        time_info: Timing information supplied by the stream (unused).
        status: Stream status flags; truthy when the stream reports a
            problem for this block.
    """
    if status:
        # Surface stream problems (for example dropped input) instead of
        # silently ignoring them. stderr keeps the notice out of the
        # program's normal prompts on stdout.
        print(status, file=sys.stderr)
    # Copy so the queued data does not alias the stream's own buffer.
    audio_queue.put(indata[:, 0].copy())


def verification_worker(enrolled_embedding):
    """Continuously verify live audio against an enrolled voice embedding.

    Audio blocks are taken from ``audio_queue`` and appended to a rolling
    buffer. While ``mic_active`` is set and at least one full window is
    buffered, each window is passed to ``detect_fluent_speech``; windows with
    enough speech are embedded and compared with ``enrolled_embedding``.

    * A window at or above ``SIMILARITY_THRESHOLD`` increments the match
      counter; once ``CONSECUTIVE_MATCHES_REQUIRED`` is reached the greeting
      is printed once.
    * A window below the threshold prints a rejection, clears ``mic_active``,
      sets ``stop_event`` and ends the worker.

    ``stop_event`` is always set when this function exits, including when an
    exception escapes, so the thread waiting on it in ``main`` cannot hang on
    a dead worker.

    Args:
        enrolled_embedding: Reference embedding, compared with each live
            embedding via ``cosine_similarity``.
    """
    buffer = np.zeros(int(BUFFER_SIZE_SECONDS * AUDIO_FS), dtype=np.float32)
    buffer_len = 0  # number of valid samples at the front of ``buffer``
    window_size = int(SLIDING_WINDOW_SECONDS * AUDIO_FS)
    step_size = int(STEP_SIZE_SECONDS * AUDIO_FS)
    consecutive_matches = 0
    greeted = False
    live_verification_started_printed = False

    try:
        while not stop_event.is_set():
            try:
                # Short timeout so stop_event is re-checked regularly.
                chunk = audio_queue.get(timeout=0.1)
            except queue.Empty:
                continue

            if not live_verification_started_printed:
                print("=== LIVE VERIFICATION STARTED ===")
                print("Microphone is now always-on for speaker verification.")
                live_verification_started_printed = True

            chunk_len = len(chunk)
            if buffer_len + chunk_len > len(buffer):
                # Buffer full: discard the oldest samples to make room.
                overflow = buffer_len + chunk_len - len(buffer)
                buffer[:len(buffer) - chunk_len] = buffer[overflow:buffer_len]
                buffer_len -= overflow

            buffer[buffer_len:buffer_len + chunk_len] = chunk
            buffer_len += chunk_len

            if buffer_len < window_size or not mic_active.is_set():
                continue

            window_starts = range(0, buffer_len - window_size + 1, step_size)
            for start_idx in window_starts:
                segment = buffer[start_idx:start_idx + window_size]
                dur, fluent_audio = detect_fluent_speech(segment)
                if dur < MIN_SPEAK_TIME_VERIFY:
                    continue

                start_t = time.perf_counter()
                live_emb = extract_embedding(fluent_audio)
                similarity = cosine_similarity(enrolled_embedding, live_emb)
                elapsed_ms = (time.perf_counter() - start_t) * 1000

                # internal efficiency warning (optional)
                if elapsed_ms > 100:
                    print("(Warning) Verification took longer than 100 ms.")

                if similarity >= SIMILARITY_THRESHOLD:
                    consecutive_matches += 1
                    if not greeted and consecutive_matches >= CONSECUTIVE_MATCHES_REQUIRED:
                        print("Hi, how can I help you?")
                        greeted = True
                else:
                    print("Sorry, I can't reply to you.")
                    mic_active.clear()
                    stop_event.set()
                    return

            # Advance past every window evaluated above. Dropping only one
            # step would leave already-verified windows in the buffer and
            # count them again on the next pass whenever several windows had
            # accumulated. With a single window this drops exactly one step.
            consumed = min(len(window_starts) * step_size, buffer_len)
            buffer[:buffer_len - consumed] = buffer[consumed:buffer_len]
            buffer_len -= consumed
    finally:
        # Guarantee the waiting thread is released even if this worker
        # terminates because of an unexpected exception.
        stop_event.set()


def main():
    """Run enrollment, then live speaker verification until stopped.

    Steps:
      1. Warm up the speaker encoder with one random batch.
      2. Repeatedly record an enrollment clip until ``detect_fluent_speech``
         reports a segment of at least ``MIN_SPEAK_TIME_ENROLL`` seconds,
         then compute the enrolled embedding from it.
      3. Open the input stream, start ``verification_worker`` in a daemon
         thread, and wait until ``stop_event`` is set or the user interrupts.
    """
    print("Warming up model...")
    warmup_samples = int(SLIDING_WINDOW_SECONDS * AUDIO_FS)
    warmup_batch = torch.randn(1, warmup_samples)
    with torch.no_grad():
        spkrec_model.encode_batch(warmup_batch)

    # Enrollment: retry until a long enough speech segment is captured.
    while True:
        input("=== ENROLLMENT ===\nPress Enter, then speak your enrollment phrase for 5 seconds (quiet background recommended).")
        recording = record_fixed_duration(MAX_SPEAK_TIME_ENROLL)
        speech_seconds, speech = detect_fluent_speech(recording)
        if speech is not None and speech_seconds >= MIN_SPEAK_TIME_ENROLL:
            break
        print("Fluent speech not detected or too short, please try again.")

    enrolled_embedding = extract_embedding(speech)

    print("Enrollment complete.")
    print("Voiceprint registered.")

    stop_event.clear()
    mic_active.set()

    stream = sd.InputStream(
        samplerate=AUDIO_FS,
        channels=CHANNELS,
        callback=audio_callback,
        blocksize=1024,
    )
    with stream:
        worker = threading.Thread(
            target=verification_worker,
            args=(enrolled_embedding,),
            daemon=True,
        )
        worker.start()
        try:
            # Poll so Ctrl+C is handled promptly in the main thread.
            while not stop_event.is_set():
                time.sleep(0.1)
        except KeyboardInterrupt:
            print("\nExiting on user interrupt.")
            stop_event.set()
            mic_active.clear()

    print("Microphone stopped. Program ended.")


# Run the enrollment and verification flow only when executed as a script.
if __name__ == "__main__":
    main()


Warming up model...


=== ENROLLMENT ===
Press Enter, then speak your enrollment phrase for 5 seconds (quiet background recommended). 


Fluent speech not detected or too short, please try again.


=== ENROLLMENT ===
Press Enter, then speak your enrollment phrase for 5 seconds (quiet background recommended). 


Enrollment complete.
Voiceprint registered.
=== LIVE VERIFICATION STARTED ===
Microphone is now always-on for speaker verification.
Hi, how can I help you?
