In [4]:
# CONTINUOUSLY CHECK WHETHER THE SPEAKER IS IDENTIFIED OR NOT

import os
import time
import wave
import numpy as np
import sounddevice as sd
import torch
import torchaudio
import torch.nn.functional as F
from silero_vad import load_silero_vad, get_speech_timestamps
from speechbrain.pretrained import EncoderClassifier

# Local directory passed as `savedir` when the speaker encoder is loaded.
MODEL_PATH = "pretrained_models/spkrec-ecapa-voxceleb"
# Files written during enrollment: the trimmed speech clip and its embedding.
ENROLL_AUDIO_FILE = "alexa.wav"
ENROLL_EMBEDDING_FILE = "alexa.npy"
# Sample rate (Hz) and channel count used for recording and for WAV output.
AUDIO_FS = 16000
CHANNELS = 1
ENROLL_DURATION = 5  # seconds to record enrollment audio
MIN_FLUENT_ENROLL = 3.0  # minimum fluent speech seconds required for enrollment
# Minimum cosine similarity for a live segment to count as the enrolled speaker.
THRESHOLD = 0.6

# Load Silero VAD model globally once
vad_model = load_silero_vad()

def save_wav(path, audio_np, sample_rate=AUDIO_FS):
    """Write a float waveform to `path` as a 16-bit PCM WAV file.

    Args:
        path: Destination file path.
        audio_np: NumPy array of samples where full scale is [-1.0, 1.0].
        sample_rate: Frame rate stored in the WAV header.
    """
    # Saturate before scaling: floats outside [-1, 1] would exceed the int16
    # range after multiplication and do not convert reliably (they typically
    # wrap around, producing loud clicks instead of clean clipping).
    clipped = np.clip(audio_np, -1.0, 1.0)
    audio_int16 = (clipped * 32767).astype(np.int16)
    with wave.open(path, "wb") as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
        wf.setframerate(sample_rate)
        wf.writeframes(audio_int16.tobytes())
    print(f"[{time.strftime('%X')}] Saved WAV file '{path}'.")

def record_audio(duration):
    """Record `duration` seconds of audio and return it as a flat array.

    Recording uses AUDIO_FS, CHANNELS and float32 samples; the call blocks
    until the capture has finished.
    """
    print(f"Recording for {duration} seconds...")
    frame_count = int(duration * AUDIO_FS)
    captured = sd.rec(
        frame_count,
        samplerate=AUDIO_FS,
        channels=CHANNELS,
        dtype="float32",
    )
    # sd.rec returns immediately; wait for the capture to complete.
    sd.wait()
    return captured.flatten()

def detect_longest_fluent_segment(audio_np):
    """Find the longest speech segment reported by the VAD.

    Args:
        audio_np: NumPy array of audio samples at AUDIO_FS.

    Returns:
        A `(duration_seconds, segment)` tuple where `segment` is the slice of
        `audio_np` covering the longest detected segment, or `(0, None)` when
        the VAD reports no speech.
    """
    segments = get_speech_timestamps(
        torch.from_numpy(audio_np), vad_model, sampling_rate=AUDIO_FS
    )
    if not segments:
        return 0, None

    def _span(seg):
        # Segment length in samples.
        return seg['end'] - seg['start']

    best = max(segments, key=_span)
    start, end = best['start'], best['end']
    return (end - start) / AUDIO_FS, audio_np[start:end]

def wav_to_embedding(model, wav_path):
    """Compute an embedding for the audio file at `wav_path`.

    The audio is resampled to AUDIO_FS when its rate differs and averaged
    down to one channel when it has several, then passed to
    `model.encode_batch` with gradients disabled.

    Returns:
        The squeezed encoder output as a NumPy array.
    """
    waveform, file_rate = torchaudio.load(wav_path)
    if file_rate != AUDIO_FS:
        resampler = torchaudio.transforms.Resample(file_rate, AUDIO_FS)
        waveform = resampler(waveform)
    channel_count = waveform.size(0)
    if channel_count > 1:
        # Downmix to mono, keeping the leading channel dimension.
        waveform = torch.mean(waveform, dim=0, keepdim=True)
    with torch.no_grad():
        encoded = model.encode_batch(waveform)
    embedding = encoded.squeeze().cpu().numpy()
    print(f"[{time.strftime('%X')}] Extracted embedding from '{wav_path}'.")
    return embedding

def save_embedding(embedding, path):
    """Persist `embedding` to `path` with `np.save` and log the write."""
    np.save(path, embedding)
    stamp = time.strftime('%X')
    print(f"[{stamp}] Saved embedding '{path}'.")

def load_embedding(path):
    """Load an embedding saved with `np.save`.

    Returns:
        The loaded array, or `None` (after logging) when `path` does not exist.
    """
    if os.path.exists(path):
        loaded = np.load(path)
        print(f"[{time.strftime('%X')}] Loaded embedding '{path}'.")
        return loaded
    print(f"[{time.strftime('%X')}] Embedding file '{path}' not found.")
    return None

def cosine_similarity(a, b):
    """Return the cosine similarity between vectors `a` and `b`.

    Args:
        a: First vector (NumPy array).
        b: Second vector, same length as `a`.

    Returns:
        `dot(a, b) / (|a| * |b|)`, or 0.0 when either vector has zero norm
        (the ratio is undefined there and would otherwise evaluate to NaN).
    """
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        # A silent/all-zero embedding carries no direction; treat as no match.
        return 0.0
    return np.dot(a, b) / denom

def enroll_owner(model):
    """Interactively enroll the owner's voice.

    Repeats prompt -> record -> VAD until the longest speech segment lasts at
    least MIN_FLUENT_ENROLL seconds, then writes that segment to
    ENROLL_AUDIO_FILE and its embedding to ENROLL_EMBEDDING_FILE.

    Returns:
        True once enrollment has been saved.
    """
    print("=== ENROLLMENT (as 'alexa') ===")
    prompt = f"Press Enter and speak for {ENROLL_DURATION} seconds for enrollment..."
    while True:
        input(prompt)
        recording = record_audio(ENROLL_DURATION)
        speech_secs, speech_audio = detect_longest_fluent_segment(recording)

        enough_speech = speech_audio is not None and speech_secs >= MIN_FLUENT_ENROLL
        if not enough_speech:
            print(f"Fluent speech duration {speech_secs:.2f}s too short. Please try again.")
            continue

        save_wav(ENROLL_AUDIO_FILE, speech_audio)
        owner_emb = wav_to_embedding(model, ENROLL_AUDIO_FILE)
        save_embedding(owner_emb, ENROLL_EMBEDDING_FILE)
        print("Enrollment completed.")
        return True

def live_verification_loop(model, enrolled_emb):
    """Verify the live speaker against `enrolled_emb` until rejection or Ctrl+C.

    Microphone audio is read in half-second chunks into a 3-second buffer.
    Each time the buffer fills, the longest VAD speech segment (if at least
    1 second long) is written to 'temp_live.wav', embedded, and compared to
    `enrolled_emb` by cosine similarity. A score below THRESHOLD ends the
    loop. The module-level flag `mic_active` is set True on entry and False
    when the loop stops.

    Args:
        model: Speaker encoder passed through to `wav_to_embedding`.
        enrolled_emb: Reference embedding of the enrolled speaker.
    """
    global mic_active
    mic_active = True
    print("\n=== LIVE VERIFICATION STARTED ===")
    print("Microphone is active. Say something... (Press Ctrl+C to stop)")
    buffer_duration = 3.0  # seconds of audio per verification chunk
    buffer_size = int(buffer_duration * AUDIO_FS)
    chunk_frames = int(AUDIO_FS * 0.5)  # read half-second at a time
    audio_buffer = np.zeros(buffer_size, dtype=np.float32)
    idx = 0  # number of valid samples currently held in audio_buffer
    try:
        with sd.InputStream(samplerate=AUDIO_FS, channels=CHANNELS, dtype="float32") as stream:
            while mic_active:
                data, overflowed = stream.read(chunk_frames)
                if overflowed:
                    # Samples were dropped by the input stream; the buffer is
                    # no longer contiguous audio, so make that visible.
                    print(f"[{time.strftime('%X')}] Warning: input overflow, audio samples were dropped.")
                chunk = data.flatten()
                length = len(chunk)
                if idx + length > buffer_size:
                    # Make room by discarding the oldest `shift` samples. The
                    # destination must have the same length as the source
                    # (idx - shift); using buffer_size - shift raises a shape
                    # error whenever the buffer is only partly filled.
                    shift = idx + length - buffer_size
                    audio_buffer[:idx - shift] = audio_buffer[shift:idx]
                    idx -= shift
                audio_buffer[idx:idx+length] = chunk
                idx += length
                if idx >= buffer_size:
                    dur, speech_segment = detect_longest_fluent_segment(audio_buffer)
                    if speech_segment is not None and dur >= 1.0:
                        temp_path = "temp_live.wav"
                        save_wav(temp_path, speech_segment)
                        live_emb = wav_to_embedding(model, temp_path)
                        similarity = cosine_similarity(enrolled_emb, live_emb)
                        print(f"[{time.strftime('%X')}] Similarity score: {similarity:.4f}")
                        if similarity >= THRESHOLD:
                            print("Speaker verified. Mic remains ON.")
                        else:
                            print("Speaker NOT verified. Turning microphone OFF.")
                            mic_active = False
                            break
                    # Start filling a fresh buffer for the next verification.
                    idx = 0
    except KeyboardInterrupt:
        print("\nUser interrupted, stopping microphone.")
        mic_active = False

def main():
    """Load the speaker encoder, enroll the owner, then run live verification."""
    encoder = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir=MODEL_PATH,
    )

    enrolled = enroll_owner(encoder)
    if not enrolled:
        return

    # Reload the embedding from disk rather than reusing the in-memory one.
    reference_emb = load_embedding(ENROLL_EMBEDDING_FILE)
    if reference_emb is None:
        print("Enrollment embedding missing, exiting.")
        return

    live_verification_loop(encoder, reference_emb)
    print("Microphone turned off. Program ended.")

# Run enrollment and live verification only when executed as a script.
if __name__ == "__main__":
    main()


=== ENROLLMENT (as 'alexa') ===


Press Enter and speak for 5 seconds for enrollment... 


Recording for 5 seconds...
[16:57:27] Saved WAV file 'alexa.wav'.
[16:57:27] Extracted embedding from 'alexa.wav'.
[16:57:27] Saved embedding 'alexa.npy'.
Enrollment completed.
[16:57:27] Loaded embedding 'alexa.npy'.

=== LIVE VERIFICATION STARTED ===
Microphone is active. Say something... (Press Ctrl+C to stop)
[16:57:31] Saved WAV file 'temp_live.wav'.
[16:57:31] Extracted embedding from 'temp_live.wav'.
[16:57:31] Similarity score: 0.6646
Speaker verified. Mic remains ON.
[16:57:34] Saved WAV file 'temp_live.wav'.
[16:57:34] Extracted embedding from 'temp_live.wav'.
[16:57:34] Similarity score: 0.6364
Speaker verified. Mic remains ON.
[16:57:37] Saved WAV file 'temp_live.wav'.
[16:57:37] Extracted embedding from 'temp_live.wav'.
[16:57:37] Similarity score: 0.4682
Speaker NOT verified. Turning microphone OFF.
Microphone turned off. Program ended.


Press Enter and speak for 5 seconds for enrollment... 


Recording for 5 seconds...
[11:51:32] Saved WAV file 'alexa.wav'.
[11:51:32] Extracted embedding from 'alexa.wav'.
[11:51:32] Saved embedding 'alexa.npy'.
Enrollment completed.
[11:51:32] Loaded embedding 'alexa.npy'.

=== LIVE VERIFICATION STARTED ===
Microphone is active. Say something... (Press Ctrl+C to stop)
[11:51:35] Saved WAV file 'temp_live.wav'.
[11:51:36] Extracted embedding from 'temp_live.wav'.
[11:51:36] Similarity score: 0.5038
Speaker NOT verified. Turning microphone OFF.
Microphone turned off. Program ended.
