In [1]:
import sounddevice as sd
import numpy as np
import soundfile as sf
import os

def create_dataset_dir(name):
    path = f"voice_dataset/{name}"
    if not os.path.exists(path):
        os.makedirs(path)
    return path

def record_audio(filename, duration=5, fs=16000):
    print("Recording...")
    audio_data = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype=np.int16)
    sd.wait()
    sf.write(filename, audio_data, fs)
    print(f"Audio saved as {filename}")

# Main function to capture audio samples for the dataset
name = input("Enter the name of the person: ")
dataset_path = create_dataset_dir(name)
for i in range(5):  # Record 5 samples
    file_path = os.path.join(dataset_path, f"sample_{i+1}.wav")
    record_audio(file_path, duration=5)
    print(f"Sample {i+1} recorded.")


Recording...
Audio saved as voice_dataset/Siddhartha\sample_1.wav
Sample 1 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_2.wav
Sample 2 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_3.wav
Sample 3 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_4.wav
Sample 4 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_5.wav
Sample 5 recorded.


In [3]:
import numpy as np
import os
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path

# Function to compute and save embeddings
def compute_and_save_embeddings(person_name):
    dataset_path = f"voice_dataset/{person_name}"
    embeddings = []

    encoder = VoiceEncoder()

    for file in os.listdir(dataset_path):
        if file.endswith(".wav"):
            filepath = os.path.join(dataset_path, file)
            print(f"Processing {file}...")
            
            wav = preprocess_wav(Path(filepath))
            embedding = encoder.embed_utterance(wav)
            embeddings.append(embedding)

    # Save the embeddings as a .npy file (Numpy array format)
    np.save(f"{person_name}_embeddings.npy", embeddings)
    print(f"Embeddings for {person_name} saved to {person_name}_embeddings.npy")

# Main function to generate and save voice embedding
name = input("Enter the name of the person to train: ")
compute_and_save_embeddings(name)


Loaded the voice encoder model on cpu in 0.01 seconds.
Processing sample_1.wav...
Processing sample_2.wav...
Processing sample_3.wav...
Processing sample_4.wav...
Processing sample_5.wav...
Embeddings for Siddhartha saved to Siddhartha_embeddings.npy


In [9]:
from resemblyzer import VoiceEncoder, preprocess_wav
import numpy as np
import os
import sounddevice as sd  # For recording audio
import soundfile as sf  # For saving audio to a file

def record_audio_for_test(filename, duration=5, fs=16000):
    print("Recording...")
    audio_data = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype=np.int16)
    sd.wait()  # Wait until the recording is finished
    sf.write(filename, audio_data, fs)  # Save the audio data to a file
    print(f"Audio saved as {filename}")

def cosine_similarity(embedding1, embedding2):
    # Calculate cosine similarity between two embeddings
    return np.dot(embedding1, embedding2) / (np.linalg.norm(embedding1) * np.linalg.norm(embedding2))

def recognize_voice(name):
    # Load the saved user embedding
    embedding_path = f"voice_profiles/{name}_embeddings.npy"
    if not os.path.exists(embedding_path):
        print(f"Embedding for {name} not found. Please train first.")
        return

    user_embedding = np.load(embedding_path)

    # Record real-time audio for testing
    test_file = "test_sample.wav"
    record_audio_for_test(test_file, duration=5)

    # Load and compute the embedding for the test audio
    encoder = VoiceEncoder()
    wav = preprocess_wav(test_file)  # Use preprocess_wav to load the audio
    test_embedding = encoder.embed_utterance(wav)

    # Calculate cosine similarity between user and test embeddings
    similarity = cosine_similarity(np.mean(user_embedding, axis=0), test_embedding)

    # Threshold for matching (higher is better, usually around 0.8-0.9 is a good match)
    threshold = 0.8
    if similarity > threshold:
        print(f"Matched with {name} (Similarity: {similarity:.2f})")
    else:
        print(f"Not Matched (Similarity: {similarity:.2f})")

# Main function to test the voice
name = input("Enter the name of the person to recognize: ")
recognize_voice(name)


Recording...
Audio saved as test_sample.wav
Loaded the voice encoder model on cpu in 0.01 seconds.
Matched with Siddhartha (Similarity: 0.85)
