In [1]:
# Compare our enrolled voice recording against 10 test audio files (v1.wav to v10.wav)

import os
import time
import wave
import webrtcvad
import sounddevice as sd
from speechbrain.pretrained import SpeakerRecognition

# Local directory handed to SpeakerRecognition.from_hparams() as `savedir`.
MODEL_PATH = "pretrained_models/spkrec-ecapa-voxceleb"
# WAV file that enroll_owner() writes and main() uses as the reference recording.
ENROLLMENT_AUDIO_FILE = "enrolled_owner.wav"
# Sample rate used for the input stream, the VAD call and the saved WAV header.
AUDIO_FS = 16000
# Channel count used for the input stream and the saved WAV header.
CHANNELS = 1
# Length, in milliseconds, of each frame read from the stream and given to the VAD.
FRAME_DURATION_MS = 30
# Recording stops once non-speech that follows detected speech exceeds this many seconds.
MAX_SILENCE_DURATION_SEC = 1.0
# Passed to record_audio_vad() as `duration_limit` during enrollment.
ENROLL_DURATION = 7  # seconds

def write_wave(path, audio_bytes, sample_rate):
    """Store raw PCM data in a WAV file.

    Args:
        path: Destination file, opened for binary writing.
        audio_bytes: Frame data written to the file unchanged.
        sample_rate: Frame rate recorded in the WAV header.

    The header uses ``CHANNELS`` channels and a 2-byte (16-bit) sample width.
    """
    # (nchannels, sampwidth, framerate, nframes, comptype, compname);
    # nframes=0 and the 'NONE' compression are the writer's defaults.
    header = (CHANNELS, 2, sample_rate, 0, 'NONE', 'not compressed')
    with wave.open(path, 'wb') as wav_out:
        wav_out.setparams(header)
        wav_out.writeframes(audio_bytes)

def record_audio_vad(duration_limit=10):
    """Record from the default input device and keep only the speech frames.

    An ``int16`` input stream is opened at ``AUDIO_FS`` with ``CHANNELS``
    channels and read in frames of ``FRAME_DURATION_MS`` milliseconds. Each
    frame is classified by a ``webrtcvad.Vad`` created with mode 2; frames
    flagged as speech are appended to the result, all others are dropped.

    Recording ends when either
      * speech has been seen and the following run of consecutive non-speech
        frames exceeds ``MAX_SILENCE_DURATION_SEC`` seconds, or
      * more than ``duration_limit`` seconds have elapsed (checked after each
        frame, so the recording may overshoot by up to one frame).

    Args:
        duration_limit: Maximum recording time in seconds.

    Returns:
        The concatenated speech frames as ``bytes``, or ``None`` (after
        printing "No speech detected.") when no frame was flagged as speech.
    """
    vad = webrtcvad.Vad(2)
    frame_size = int(AUDIO_FS * FRAME_DURATION_MS / 1000)
    silence_frames = 0
    voiced_frames = bytearray()

    print("\nListening for speech...")

    # The context manager starts the stream on entry and stops/closes it on
    # exit, so the device is released even if starting, reading or the VAD
    # call raises.
    with sd.InputStream(samplerate=AUDIO_FS, channels=CHANNELS, dtype='int16') as stream:
        # A monotonic clock keeps the timeout immune to system clock changes.
        start_time = time.monotonic()
        while True:
            # The overflow flag returned by read() is intentionally ignored.
            frame, _ = stream.read(frame_size)
            frame_bytes = frame.tobytes()
            if vad.is_speech(frame_bytes, AUDIO_FS):
                voiced_frames.extend(frame_bytes)
                silence_frames = 0
            elif voiced_frames:
                # Silence only counts once some speech has been captured.
                silence_frames += 1
                if (silence_frames * FRAME_DURATION_MS) / 1000 > MAX_SILENCE_DURATION_SEC:
                    break
            if time.monotonic() - start_time > duration_limit:
                break

    if not voiced_frames:
        print("No speech detected.")
        return None
    return bytes(voiced_frames)

def enroll_owner():
    """Prompt the user, record an enrollment phrase and save it as a WAV file.

    Waits for Enter, records via ``record_audio_vad`` with a limit of
    ``ENROLL_DURATION`` seconds and, if any speech was captured, writes it to
    ``ENROLLMENT_AUDIO_FILE`` at ``AUDIO_FS``.

    Returns:
        ``True`` when the recording was saved, ``False`` when no speech was
        captured.
    """
    print("------ ENROLLMENT ------")
    prompt = f"Press Enter and then speak your enrollment phrase for {ENROLL_DURATION} seconds..."
    input(prompt)

    recording = record_audio_vad(duration_limit=ENROLL_DURATION)
    if recording:
        write_wave(ENROLLMENT_AUDIO_FILE, recording, AUDIO_FS)
        print("Enrollment complete.")
        return True

    print("Enrollment failed: no speech detected.")
    return False

def verify_files(model, enrollment_file, test_file):
    """Compare two audio files with ``model`` and print the result.

    Args:
        model: Object whose ``verify_files(a, b)`` returns a
            ``(score, prediction)`` pair; both values must expose ``.item()``.
        enrollment_file: Path of the reference recording.
        test_file: Path of the recording to compare against it.

    Prints the pair being compared, the score with four decimals and
    "Match" / "No Match" depending on the truthiness of the prediction,
    followed by a blank line. Returns ``None``.
    """
    score, prediction = model.verify_files(enrollment_file, test_file)
    verdict = ("No Match", "Match")[bool(prediction.item())]
    header = f"Comparing '{enrollment_file}' with '{test_file}':"
    detail = f"  Similarity score: {score.item():.4f} --> Prediction: {verdict}"
    # The trailing empty string reproduces the blank separator line.
    print(header, detail, "", sep="\n")

def main():
    """Enroll the owner's voice, then compare it with v1.wav ... v10.wav.

    Stops early (after printing a message) when enrollment fails. Otherwise
    loads the SpeechBrain speaker-recognition model into ``MODEL_PATH`` and
    runs ``verify_files`` for every test file that exists; missing files are
    reported and skipped.
    """
    # Step 1: record the reference voice sample.
    enrolled = enroll_owner()
    if not enrolled:
        print("Enrollment failed, exiting.")
        return

    # Step 2: load the verification model.
    model = SpeakerRecognition.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir=MODEL_PATH,
    )

    # Steps 3 and 4: walk the numbered test files and verify each against
    # the enrollment recording.
    for index in range(1, 11):
        test_file = f"v{index}.wav"
        if os.path.exists(test_file):
            verify_files(model, ENROLLMENT_AUDIO_FILE, test_file)
        else:
            print(f"Test file '{test_file}' not found, skipping.")

# Run the enrollment-and-comparison workflow only when this code executes as
# the top-level module (`__name__` is "__main__"); importing it skips the run.
if __name__ == "__main__":
    main()


  available_backends = torchaudio.list_audio_backends()
  if ismodule(module) and hasattr(module, '__file__'):
  from speechbrain.pretrained import SpeakerRecognition


------ ENROLLMENT ------


Press Enter and then speak your enrollment phrase for 7 seconds... 



Listening for speech...
Enrollment complete.


  available_backends = torchaudio.list_audio_backends()
  wrapped_fwd = torch.cuda.amp.custom_fwd(fwd, cast_inputs=cast_inputs)


Comparing 'enrolled_owner.wav' with 'v1.wav':
  Similarity score: 0.0111 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v2.wav':
  Similarity score: 0.0636 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v3.wav':
  Similarity score: 0.1283 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v4.wav':
  Similarity score: 0.0280 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v5.wav':
  Similarity score: 0.0989 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v6.wav':
  Similarity score: 0.1676 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v7.wav':
  Similarity score: 0.0410 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v8.wav':
  Similarity score: 0.0364 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v9.wav':
  Similarity score: 0.0238 --> Prediction: No Match

Comparing 'enrolled_owner.wav' with 'v10.wav':
  Similarity score: 0.0910 --> Prediction: No Match



In [8]:
from speechbrain.inference import SpeakerRecognition
from speechbrain.utils.fetching import LocalStrategy
import numpy as np
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# Load model with copy strategy to avoid Windows symlink error
model = SpeakerRecognition.from_hparams(
    source="speechbrain/spkrec-ecapa-voxceleb",
    savedir="pretrained_models/spkrec-ecapa-voxceleb",
    local_strategy=LocalStrategy.COPY
)

# Audio files to compare; every ordered pair (including each file with itself)
# is scored below.
audio_files = [f"v{i}.wav" for i in range(1, 11)]

# Threshold to decide match (can be adjusted based on use case)
threshold = 0.75

y_true = []
y_pred = []
similarity_scores = []

print("Similarity scores and matches:")

for i, ref_file in enumerate(audio_files):
    for j, test_file in enumerate(audio_files):
        # The model's own decision (`prediction`) is not used; the match is
        # decided with `threshold` instead.
        score, prediction = model.verify_files(ref_file, test_file)

        # Extract the Python float once so the comparison and the formatting
        # do not depend on tensor truthiness or repeated .item() calls.
        score_value = score.item()
        similarity_scores.append(score_value)

        # Ground truth: match if same file index
        # NOTE(review): only self-comparisons count as matches, so two
        # different files from the same speaker would be labelled as a
        # non-match - confirm this is the intended ground truth.
        true_match = 1 if i == j else 0
        y_true.append(true_match)

        # Prediction based on threshold on similarity score
        pred_match = 1 if score_value >= threshold else 0
        y_pred.append(pred_match)

        print(f"Comparing {ref_file} with {test_file}: Similarity Score = {score_value:.4f}, Match = {'Yes' if pred_match else 'No'}")

# Calculate confusion matrix and metrics.
# Fixing labels=[0, 1] keeps the matrix 2x2 (rows = true, columns = predicted)
# even if one class never appears in the data.
cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("\nConfusion Matrix:")
print(cm)
print(f"\nAccuracy:  {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall:    {recall:.4f}")
print(f"F1 Score:  {f1:.4f}")


Similarity scores and matches:




Comparing v1.wav with v1.wav: Similarity Score = 1.0000, Match = Yes
Comparing v1.wav with v2.wav: Similarity Score = 0.1279, Match = No
Comparing v1.wav with v3.wav: Similarity Score = 0.0056, Match = No
Comparing v1.wav with v4.wav: Similarity Score = -0.0206, Match = No
Comparing v1.wav with v5.wav: Similarity Score = 0.1170, Match = No
Comparing v1.wav with v6.wav: Similarity Score = 0.1304, Match = No
Comparing v1.wav with v7.wav: Similarity Score = 0.1470, Match = No
Comparing v1.wav with v8.wav: Similarity Score = 0.1142, Match = No
Comparing v1.wav with v9.wav: Similarity Score = 0.0515, Match = No
Comparing v1.wav with v10.wav: Similarity Score = 0.1420, Match = No
Comparing v2.wav with v1.wav: Similarity Score = 0.1279, Match = No
Comparing v2.wav with v2.wav: Similarity Score = 1.0000, Match = Yes
Comparing v2.wav with v3.wav: Similarity Score = 0.3496, Match = No
Comparing v2.wav with v4.wav: Similarity Score = -0.0428, Match = No
Comparing v2.wav with v5.wav: Similarity Sc

In [4]:
# Compare two recorded audio files via speaker embeddings saved as .npy files

import os
import time
import wave
import numpy as np
import webrtcvad
import sounddevice as sd
import torch
import torchaudio
from speechbrain.pretrained import EncoderClassifier

# Local directory handed to EncoderClassifier.from_hparams() as `savedir`.
MODEL_PATH = "pretrained_models/spkrec-ecapa-voxceleb"
# Recording and embedding written by enroll_owner().
ENROLL_AUDIO_FILE = "alexa.wav"
ENROLL_EMBEDDING_FILE = "alexa.npy"
# Recording and embedding written by test_speaker().
TEST_AUDIO_FILE = "siri.wav"
TEST_EMBEDDING_FILE = "siri.npy"
# Sample rate used for the input stream, the VAD call, the saved WAV header
# and as the resampling target in wav_to_embedding().
AUDIO_FS = 16000
# Channel count used for the input stream and the saved WAV header.
CHANNELS = 1
# Length, in milliseconds, of each frame read from the stream and given to the VAD.
FRAME_DURATION_MS = 30
# Recording stops once non-speech that follows detected speech exceeds this many seconds.
MAX_SILENCE_DURATION_SEC = 1.0
# Passed to record_audio_vad() as `duration_limit` during enrollment.
ENROLL_DURATION = 7  # seconds
# Passed to record_audio_vad() as `duration_limit` during the test recording.
TEST_DURATION = 7  # seconds
# verify_embeddings() reports a match when the cosine similarity is >= this value.
THRESHOLD = 0.6  # similarity threshold

def write_wave(path, audio_bytes, sample_rate):
    """Store raw PCM data in a WAV file and report how long the write took.

    Args:
        path: Destination file, opened for binary writing.
        audio_bytes: Frame data written to the file unchanged.
        sample_rate: Frame rate recorded in the WAV header.

    The header uses ``CHANNELS`` channels and a 2-byte (16-bit) sample width.
    A timestamped line with the elapsed time is printed afterwards.
    """
    began = time.time()
    # (nchannels, sampwidth, framerate, nframes, comptype, compname);
    # nframes=0 and the 'NONE' compression are the writer's defaults.
    header = (CHANNELS, 2, sample_rate, 0, "NONE", "not compressed")
    with wave.open(path, "wb") as wav_out:
        wav_out.setparams(header)
        wav_out.writeframes(audio_bytes)
    elapsed = time.time() - began
    print(f"[{time.strftime('%X')}] Wrote WAV file '{path}' in {elapsed:.2f} seconds.")

def record_audio_vad(duration_limit=10):
    """Record from the default input device and keep only the speech frames.

    An ``int16`` input stream is opened at ``AUDIO_FS`` with ``CHANNELS``
    channels and read in frames of ``FRAME_DURATION_MS`` milliseconds. Each
    frame is classified by a ``webrtcvad.Vad`` created with mode 2; frames
    flagged as speech are appended to the result, all others are dropped.

    Recording ends when either
      * speech has been seen and the following run of consecutive non-speech
        frames exceeds ``MAX_SILENCE_DURATION_SEC`` seconds, or
      * more than ``duration_limit`` seconds have elapsed (checked after each
        frame, so the recording may overshoot by up to one frame).

    A timestamped line with the total recording time is printed once the
    stream has been closed.

    Args:
        duration_limit: Maximum recording time in seconds.

    Returns:
        The concatenated speech frames as ``bytes``, or ``None`` (after
        printing "No speech detected.") when no frame was flagged as speech.
    """
    vad = webrtcvad.Vad(2)
    frame_size = int(AUDIO_FS * FRAME_DURATION_MS / 1000)
    silence_frames = 0
    voiced_frames = bytearray()
    print("\nListening for speech...")

    # The context manager starts the stream on entry and stops/closes it on
    # exit, so the device is released even if starting, reading or the VAD
    # call raises.
    with sd.InputStream(samplerate=AUDIO_FS, channels=CHANNELS, dtype="int16") as stream:
        # A monotonic clock keeps the timeout and the reported duration
        # immune to system clock changes.
        start_time = time.monotonic()
        while True:
            # The overflow flag returned by read() is intentionally ignored.
            frame, _ = stream.read(frame_size)
            frame_bytes = frame.tobytes()
            if vad.is_speech(frame_bytes, AUDIO_FS):
                voiced_frames.extend(frame_bytes)
                silence_frames = 0
            elif voiced_frames:
                # Silence only counts once some speech has been captured.
                silence_frames += 1
                if silence_frames * FRAME_DURATION_MS / 1000 > MAX_SILENCE_DURATION_SEC:
                    break
            if time.monotonic() - start_time > duration_limit:
                break

    elapsed = time.monotonic() - start_time
    print(f"[{time.strftime('%X')}] Recorded audio for {elapsed:.2f} seconds.")
    if not voiced_frames:
        print("No speech detected.")
        return None
    return bytes(voiced_frames)

def wav_to_embedding(model, wav_path):
    """Load an audio file and turn it into an embedding array.

    Args:
        model: Object providing ``encode_batch(waveform)``.
        wav_path: Audio file readable by ``torchaudio.load``.

    The waveform is resampled to ``AUDIO_FS`` when the file's rate differs,
    and averaged over its first dimension (presumably the channel axis) when
    that dimension has more than one entry. Encoding runs without gradient
    tracking. A timestamped line with the elapsed time is printed.

    Returns:
        The encoder output, squeezed, moved to CPU and converted to NumPy.
    """
    began = time.time()

    waveform, source_rate = torchaudio.load(wav_path)
    if source_rate != AUDIO_FS:
        resampler = torchaudio.transforms.Resample(source_rate, AUDIO_FS)
        waveform = resampler(waveform)

    has_multiple_rows = waveform.size(0) > 1
    if has_multiple_rows:
        waveform = waveform.mean(dim=0, keepdim=True)

    with torch.no_grad():
        encoded = model.encode_batch(waveform)
    vector = encoded.squeeze().cpu().numpy()

    elapsed = time.time() - began
    print(f"[{time.strftime('%X')}] Extracted embedding from '{wav_path}' in {elapsed:.2f} seconds.")
    return vector

def save_embedding(embedding, path):
    """Write ``embedding`` to ``path`` with ``np.save`` and print the elapsed time.

    Args:
        embedding: Array to store.
        path: Destination file for the saved array.
    """
    began = time.time()
    np.save(file=path, arr=embedding)
    elapsed = time.time() - began
    print(f"[{time.strftime('%X')}] Saved embedding '{path}' in {elapsed:.2f} seconds.")

def load_embedding(path):
    """Read an embedding saved with ``np.save``.

    Args:
        path: File to load.

    Returns:
        The loaded array, or ``None`` when ``path`` does not exist. A
        timestamped line reporting success or failure and the elapsed time
        is printed in both cases.
    """
    began = time.time()

    if not os.path.exists(path):
        elapsed = time.time() - began
        print(f"[{time.strftime('%X')}] Failed to load embedding '{path}' (file missing). Took {elapsed:.2f} seconds.")
        return None

    loaded = np.load(path)
    elapsed = time.time() - began
    print(f"[{time.strftime('%X')}] Loaded embedding '{path}' in {elapsed:.2f} seconds.")
    return loaded

def cosine_similarity(a, b):
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector (array-like). Flattened before use, so an embedding
            that still carries singleton dimensions is accepted.
        b: Second vector (array-like), flattened the same way. Must contain
            the same number of elements as ``a``.

    Returns:
        ``dot(a, b) / (||a|| * ||b||)``. When either vector has zero norm the
        quotient is undefined (it would be 0/0, i.e. NaN plus a
        RuntimeWarning), so ``0.0`` is returned instead.

    Raises:
        ValueError: If the flattened inputs differ in length (from ``np.dot``).
    """
    a = np.asarray(a).ravel()
    b = np.asarray(b).ravel()
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        # At least one vector is all zeros: no direction to compare.
        return 0.0
    return np.dot(a, b) / denominator

def enroll_owner(model):
    """Record an enrollment phrase and store its audio and embedding.

    Waits for Enter, records via ``record_audio_vad`` with a limit of
    ``ENROLL_DURATION`` seconds, writes the speech to ``ENROLL_AUDIO_FILE``,
    extracts an embedding from that file with ``model`` and saves it to
    ``ENROLL_EMBEDDING_FILE``.

    Args:
        model: Encoder passed through to ``wav_to_embedding``.

    Returns:
        ``True`` when both files were written, ``False`` when no speech was
        captured.
    """
    print("=== ENROLLMENT (as 'alexa') ===")
    input(f"Press Enter, then speak for {ENROLL_DURATION} seconds for enrollment...")
    audio_bytes = record_audio_vad(duration_limit=ENROLL_DURATION)
    if not audio_bytes:
        print("No speech detected during enrollment.")
        return False
    write_wave(ENROLL_AUDIO_FILE, audio_bytes, AUDIO_FS)
    emb = wav_to_embedding(model, ENROLL_AUDIO_FILE)
    save_embedding(emb, ENROLL_EMBEDDING_FILE)
    # Report the configured file names so the message cannot drift from the
    # constants that decide where the data is written.
    print(f"Enrollment completed and saved as '{ENROLL_AUDIO_FILE}' and '{ENROLL_EMBEDDING_FILE}'.")
    return True

def test_speaker(model):
    """Record a test phrase and store its audio and embedding.

    Waits for Enter, records via ``record_audio_vad`` with a limit of
    ``TEST_DURATION`` seconds, writes the speech to ``TEST_AUDIO_FILE``,
    extracts an embedding from that file with ``model`` and saves it to
    ``TEST_EMBEDDING_FILE``.

    Args:
        model: Encoder passed through to ``wav_to_embedding``.

    Returns:
        ``True`` when both files were written, ``False`` when no speech was
        captured.
    """
    print("\n=== TESTING (as 'siri') ===")
    input(f"Press Enter, then speak for {TEST_DURATION} seconds for verification...")
    audio_bytes = record_audio_vad(duration_limit=TEST_DURATION)
    if not audio_bytes:
        print("No speech detected during test.")
        return False
    write_wave(TEST_AUDIO_FILE, audio_bytes, AUDIO_FS)
    emb = wav_to_embedding(model, TEST_AUDIO_FILE)
    save_embedding(emb, TEST_EMBEDDING_FILE)
    # Report the configured file names so the message cannot drift from the
    # constants that decide where the data is written.
    print(f"Test completed and saved as '{TEST_AUDIO_FILE}' and '{TEST_EMBEDDING_FILE}'.")
    return True

def verify_embeddings():
    """Compare the stored enrollment and test embeddings.

    Both ``ENROLL_EMBEDDING_FILE`` and ``TEST_EMBEDDING_FILE`` are loaded
    before either is checked. The cosine similarity is printed together with
    its computation time, and a verdict line follows.

    Returns:
        ``True`` when the similarity is at least ``THRESHOLD``; ``False``
        otherwise or when either embedding could not be loaded.
    """
    loaded = [load_embedding(path) for path in (ENROLL_EMBEDDING_FILE, TEST_EMBEDDING_FILE)]
    if any(item is None for item in loaded):
        print("Cannot verify without both enrollment and test embeddings.")
        return False

    reference, candidate = loaded
    began = time.time()
    score = cosine_similarity(reference, candidate)
    elapsed = time.time() - began
    print(f"[{time.strftime('%X')}] Similarity score: {score:.4f} (computed in {elapsed:.4f}s)")

    # bool() keeps the return value a plain Python bool.
    verified = bool(score >= THRESHOLD)
    verdict = (
        "Speaker verified: same person detected."
        if verified
        else "Speaker not verified: different person detected."
    )
    print(verdict)
    return verified

def main():
    """Load the encoder, then run enrollment, the test recording and verification.

    The stages run in order, and a failed stage ends the run: the test
    recording happens only after a successful enrollment, and the comparison
    only after a successful test recording.
    """
    encoder = EncoderClassifier.from_hparams(
        source="speechbrain/spkrec-ecapa-voxceleb",
        savedir=MODEL_PATH,
    )
    # `and` short-circuits, so test_speaker() is skipped when enrollment fails.
    if enroll_owner(encoder) and test_speaker(encoder):
        verify_embeddings()

# Run the enroll / test / verify workflow only when this code executes as the
# top-level module (`__name__` is "__main__"); importing it skips the run.
if __name__ == "__main__":
    main()


=== ENROLLMENT (as 'alexa') ===


Press Enter, then speak for 7 seconds for enrollment... 



Listening for speech...
[16:42:27] Recorded audio for 6.76 seconds.
[16:42:27] Wrote WAV file 'alexa.wav' in 0.00 seconds.
[16:42:28] Extracted embedding from 'alexa.wav' in 0.66 seconds.
[16:42:28] Saved embedding 'alexa.npy' in 0.00 seconds.
Enrollment completed and saved as 'alexa.wav' and 'alexa.npy'.

=== TESTING (as 'siri') ===


Press Enter, then speak for 7 seconds for verification... 



Listening for speech...
[16:42:42] Recorded audio for 7.08 seconds.
[16:42:42] Wrote WAV file 'siri.wav' in 0.00 seconds.
[16:42:43] Extracted embedding from 'siri.wav' in 0.69 seconds.
[16:42:43] Saved embedding 'siri.npy' in 0.00 seconds.
Test completed and saved as 'siri.wav' and 'siri.npy'.
[16:42:43] Loaded embedding 'alexa.npy' in 0.01 seconds.
[16:42:43] Loaded embedding 'siri.npy' in 0.01 seconds.
[16:42:43] Similarity score: 0.7105 (computed in 0.0000s)
Speaker verified: same person detected.
