In [10]:
import pyaudio
import wave
import os

def record_audio(filename, duration=5, sample_rate=44100):
    """Record mono 16-bit audio through PyAudio and save it as a WAV file.

    Args:
        filename: Path of the WAV file to write.
        duration: Approximate recording length in seconds.
        sample_rate: Sampling rate in Hz, used for capture and the WAV header.

    Only whole 1024-sample chunks are read, so the saved audio can be slightly
    shorter than ``duration`` seconds.
    """
    chunk = 1024  # Record in chunks of 1024 samples
    audio = pyaudio.PyAudio()

    try:
        # Query the sample width while the PyAudio instance is still alive.
        sample_width = audio.get_sample_size(pyaudio.paInt16)

        # Set up recording stream
        stream = audio.open(format=pyaudio.paInt16, channels=1,
                            rate=sample_rate, input=True,
                            frames_per_buffer=chunk)
        try:
            print("Recording...")
            num_chunks = int(sample_rate / chunk * duration)
            frames = [stream.read(chunk) for _ in range(num_chunks)]
            print("Recording finished.")
        finally:
            # Release the stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the PyAudio instance, including when open() raises.
        audio.terminate()

    # Save audio file
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

if __name__ == "__main__":
    # Collect a fixed number of recordings for one person under
    # voice_dataset/<person_name>/, named 1.wav, 2.wav, ...
    person_name = input("Enter the name of the person: ")
    num_samples = 5
    dataset_dir = f"voice_dataset/{person_name}"

    # exist_ok=True replaces the separate existence check and makes
    # re-running the cell a no-op for the directory.
    os.makedirs(dataset_dir, exist_ok=True)

    for i in range(num_samples):
        filename = os.path.join(dataset_dir, f"{i+1}.wav")
        record_audio(filename, duration=5)


Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.


In [12]:
import librosa
import numpy as np
import joblib
import os

def extract_features(filename):
    """Summarise an audio file as the time-average of 13 MFCC coefficients.

    The file is loaded with librosa's default settings (no ``sr`` is passed),
    and the MFCC matrix is averaged along axis 1, giving one value per
    coefficient.
    """
    signal, rate = librosa.load(filename)
    coefficients = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13)
    averaged = coefficients.mean(axis=1)
    return averaged

def train_model(user_name):
    """Build a mean/std voice profile from a user's recordings and save it.

    Every ``.wav`` file in ``voice_dataset/<user_name>`` is passed through
    ``extract_features``; the per-dimension mean and standard deviation of
    those vectors are stored as a tuple in ``voice_model.pkl``.

    Args:
        user_name: Name of the sub-folder of ``voice_dataset`` to read.

    Raises:
        ValueError: If the folder contains no ``.wav`` files. Without this
            check the statistics would be NaN and would be saved silently.
    """
    dataset_folder = f'voice_dataset/{user_name}'

    # Sort so the processing order does not depend on the file system.
    wav_names = sorted(
        entry for entry in os.listdir(dataset_folder) if entry.endswith('.wav')
    )
    if not wav_names:
        raise ValueError(f"No .wav files found in '{dataset_folder}'")

    features = np.array([
        extract_features(os.path.join(dataset_folder, entry))
        for entry in wav_names
    ])
    mean_feature = np.mean(features, axis=0)
    std_feature = np.std(features, axis=0)

    joblib.dump((mean_feature, std_feature), 'voice_model.pkl')
    print("Model trained and saved as 'voice_model.pkl'")

if __name__ == "__main__":
    # Ask which user's recordings to summarise, then build and save the
    # mean/std profile via train_model.
    user_name = input("Enter the name of the user: ")
    train_model(user_name)


Model trained and saved as 'voice_model.pkl'


In [14]:
import librosa
import numpy as np
import joblib
import pyaudio
import wave

def record_audio(filename, duration=5, sample_rate=44100):
    """Capture mono 16-bit audio with PyAudio and write it to a WAV file.

    Args:
        filename: Destination path of the WAV file.
        duration: Approximate recording length in seconds.
        sample_rate: Sampling rate in Hz for both the stream and the WAV header.

    The stream is read in whole 1024-sample chunks, so the result can be
    slightly shorter than ``duration`` seconds.
    """
    chunk = 1024
    audio = pyaudio.PyAudio()

    try:
        # Read the sample width before the PyAudio instance is terminated.
        sample_width = audio.get_sample_size(pyaudio.paInt16)

        stream = audio.open(format=pyaudio.paInt16, channels=1,
                            rate=sample_rate, input=True,
                            frames_per_buffer=chunk)
        try:
            print("Recording...")
            frames = []
            for _ in range(0, int(sample_rate / chunk * duration)):
                frames.append(stream.read(chunk))
            print("Recording finished.")
        finally:
            # Close the stream even when a read raises.
            stream.stop_stream()
            stream.close()
    finally:
        # Release the PyAudio instance on every path, including a failed open().
        audio.terminate()

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(sample_width)
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

def extract_features(filename):
    """Return a 13-value MFCC summary of the audio stored in ``filename``.

    The audio is loaded through ``librosa.load`` without an explicit sample
    rate, 13 MFCCs are computed, and each coefficient is averaged along
    axis 1 of the MFCC matrix.
    """
    waveform, sampling_rate = librosa.load(filename)
    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=sampling_rate, n_mfcc=13)
    return mfcc_matrix.mean(axis=1)

def recognize_voice(threshold=3.0):
    """Record a sample and report whether it fits the saved voice profile.

    The profile in ``voice_model.pkl`` holds a per-feature mean and standard
    deviation. The new sample's features are compared with the mean after
    scaling each dimension by its standard deviation, and the root-mean-square
    of those scaled deviations is compared against ``threshold``.

    Args:
        threshold: Largest RMS scaled deviation still reported as a match.
            Tune this against real recordings of the user and of others.
    """
    # NOTE(review): joblib.load unpickles the file, which can execute code.
    # Only load a model file that this notebook produced.
    mean_feature, std_feature = joblib.load('voice_model.pkl')

    # Record a new sample
    filename = 'test_sample.wav'
    record_audio(filename, duration=5)

    # Predict
    feature = extract_features(filename)

    # Raw MFCC means differ in scale by orders of magnitude, so an unscaled
    # Euclidean distance is dominated by the largest coefficients. Dimensions
    # with zero spread fall back to unit scale to avoid dividing by zero.
    scale = np.where(std_feature > 0, std_feature, 1.0)
    scaled_deviation = (feature - mean_feature) / scale
    distance = np.sqrt(np.mean(scaled_deviation ** 2))

    if distance < threshold:
        print("The sample matches the user's voice.")
    else:
        print("The sample does not match the user's voice.")

if __name__ == "__main__":
    # Record one test sample and print whether it matches the saved profile.
    recognize_voice()


Recording...
Recording finished.
The sample does not match the user's voice.


In [25]:
import librosa
import numpy as np

def extract_features(filename):
    """Build one feature vector from MFCC, chroma and mel-spectrogram means.

    The file is loaded with ``sr=None``. Three feature matrices are computed
    (13 MFCCs, chroma STFT, mel spectrogram), each is averaged along axis 1,
    and the three averages are concatenated in that order.
    """
    signal, rate = librosa.load(filename, sr=None)

    # Keep the order MFCC -> chroma -> mel; it fixes the layout of the vector.
    feature_matrices = (
        librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=13),
        librosa.feature.chroma_stft(y=signal, sr=rate),
        librosa.feature.melspectrogram(y=signal, sr=rate),
    )

    return np.concatenate([matrix.mean(axis=1) for matrix in feature_matrices])


In [23]:
import numpy as np
import tensorflow as tf
import os

def prepare_data(user_name):
    """Load feature vectors and labels for one user's recordings.

    Every ``.wav`` file in ``voice_dataset/<user_name>`` is converted with
    ``extract_features`` and given the label 1.

    Args:
        user_name: Name of the sub-folder of ``voice_dataset`` to read.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays with one row / entry per
        file. All labels are 1: no negative examples are produced here.

    Raises:
        ValueError: If the folder contains no ``.wav`` files. Returning empty
            arrays would only fail later with a less helpful error.
    """
    dataset_folder = f'voice_dataset/{user_name}'

    # Sort so row order does not depend on the file system's listing order.
    wav_names = sorted(
        entry for entry in os.listdir(dataset_folder) if entry.endswith('.wav')
    )
    if not wav_names:
        raise ValueError(f"No .wav files found in '{dataset_folder}'")

    features = np.array([
        extract_features(os.path.join(dataset_folder, entry))
        for entry in wav_names
    ])
    labels = np.ones(len(wav_names), dtype=int)  # Label 1 for user's samples

    return features, labels

import tensorflow as tf
import numpy as np

def train_model(user_name):
    """Train a small dense network on a user's features and save it.

    Features and labels come from ``prepare_data(user_name)``. The network is
    trained for 30 epochs with a 10% validation split and written to
    ``voice_model.h5``.

    Args:
        user_name: Name of the sub-folder of ``voice_dataset`` to train on.
    """
    import warnings

    features, labels = prepare_data(user_name)

    # With only one label value the network can reach zero loss by always
    # predicting that value, so it cannot tell the user from anyone else.
    if np.unique(labels).size < 2:
        warnings.warn(
            "Training labels contain a single class; the model will not be "
            "able to reject other voices. Add samples with a different label.",
            stacklevel=2,
        )

    # Define a more complex model. The input shape is declared with an
    # explicit Input layer rather than the input_shape layer argument.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(features.shape[1],)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(features, labels, epochs=30, batch_size=4, validation_split=0.1)

    model.save('voice_model.h5')
    print("Model trained and saved as 'voice_model.h5'")


if __name__ == "__main__":
    # Ask which user's recordings to train on, then fit and save the network.
    user_name = input("Enter the name of the user: ")
    train_model(user_name)


Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.9012 - loss: 1.3683 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 8.2934e-17 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 5.6309e-31 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 2.6176e-26 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 3.1329e-24 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 3.4546e-37 - val_accuracy: 1.0000 - val_loss: 0.0000e



Model trained and saved as 'voice_model.h5'


In [26]:
import numpy as np
import tensorflow as tf
import librosa
import os

def extract_features(filename):
    """Return concatenated time-averages of MFCC, chroma and mel features.

    The audio is loaded with ``sr=None``; 13 MFCCs, a chroma STFT and a mel
    spectrogram are computed, averaged along axis 1, and joined in that order
    into a single 1-D vector.
    """
    waveform, sampling_rate = librosa.load(filename, sr=None)

    mfcc_matrix = librosa.feature.mfcc(y=waveform, sr=sampling_rate, n_mfcc=13)
    chroma_matrix = librosa.feature.chroma_stft(y=waveform, sr=sampling_rate)
    mel_matrix = librosa.feature.melspectrogram(y=waveform, sr=sampling_rate)

    # Average each matrix first, then join the pieces in a fixed order.
    averaged_parts = []
    for matrix in (mfcc_matrix, chroma_matrix, mel_matrix):
        averaged_parts.append(np.mean(matrix, axis=1))

    return np.concatenate(averaged_parts)

def prepare_data(user_name):
    """Collect feature vectors and labels from a user's recordings.

    Each ``.wav`` file in ``voice_dataset/<user_name>`` is run through
    ``extract_features`` and labelled 1.

    Args:
        user_name: Name of the sub-folder of ``voice_dataset`` to read.

    Returns:
        ``(features, labels)`` as NumPy arrays, one row / entry per file.
        Every label is 1; this function produces no negative examples.

    Raises:
        ValueError: If no ``.wav`` files are found, instead of returning empty
            arrays that break the caller later.
    """
    dataset_folder = f'voice_dataset/{user_name}'

    # Deterministic order regardless of how the file system lists entries.
    wav_names = sorted(
        entry for entry in os.listdir(dataset_folder) if entry.endswith('.wav')
    )
    if not wav_names:
        raise ValueError(f"No .wav files found in '{dataset_folder}'")

    features = np.array([
        extract_features(os.path.join(dataset_folder, entry))
        for entry in wav_names
    ])
    labels = np.ones(len(wav_names), dtype=int)  # Label for user's samples

    return features, labels

def train_model(user_name):
    """Fit a dense binary classifier on a user's features and save it.

    Data comes from ``prepare_data(user_name)``. Training runs for 30 epochs
    with batch size 4 and a 10% validation split; the result is saved to
    ``voice_model.h5``.

    Args:
        user_name: Name of the sub-folder of ``voice_dataset`` to train on.
    """
    import warnings

    features, labels = prepare_data(user_name)

    # A single label value lets the network minimise the loss by always
    # predicting that value, so it cannot reject any other speaker.
    if np.unique(labels).size < 2:
        warnings.warn(
            "Training labels contain a single class; the model will not be "
            "able to reject other voices. Add samples with a different label.",
            stacklevel=2,
        )

    # The input shape is declared with an explicit Input layer rather than
    # the input_shape argument on the first Dense layer.
    model = tf.keras.Sequential([
        tf.keras.Input(shape=(features.shape[1],)),
        tf.keras.layers.Dense(256, activation='relu'),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(features, labels, epochs=30, batch_size=4, validation_split=0.1)

    model.save('voice_model.h5')
    print("Model trained and saved as 'voice_model.h5'")

def recognize_voice(threshold=0.5):
    """Record a sample and classify it with the saved Keras model.

    Loads ``voice_model.h5``, records five seconds into ``test_sample.wav``,
    extracts features, and prints whether the model's output exceeds
    ``threshold``.

    Args:
        threshold: Score above which the sample is reported as a match.

    Note:
        ``record_audio`` is not defined in this cell; it must already exist in
        the kernel (for example from an earlier recording cell).
    """
    model = tf.keras.models.load_model('voice_model.h5')
    filename = 'test_sample.wav'
    record_audio(filename, duration=5)

    feature = extract_features(filename)
    feature = np.expand_dims(feature, axis=0)  # the model expects a batch axis
    prediction = model.predict(feature)

    # predict() returns one row per input; take the single output as a plain
    # float instead of relying on the truth value of a length-1 array.
    score = float(prediction[0][0])

    if score > threshold:
        print("The sample matches the user's voice.")
    else:
        print("The sample does not match the user's voice.")

if __name__ == "__main__":
    # Train on the named user's recordings, then immediately record a new
    # sample and classify it with the model that was just saved.
    user_name = input("Enter the name of the user: ")
    train_model(user_name)
    recognize_voice()


Epoch 1/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.8697 - loss: 3.8813 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 2/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 3/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 4/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 5/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000e+00
Epoch 6/30
[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 1.0000 - loss: 0.0000e+00 - val_accuracy: 1.0000 - val_loss: 0.0000



Model trained and saved as 'voice_model.h5'
Recording...
Recording finished.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
The sample matches the user's voice.


In [1]:
import sounddevice as sd
import numpy as np
import soundfile as sf
import os

def create_dataset_dir(name):
    """Ensure ``voice_dataset/<name>`` exists and return its path.

    Args:
        name: Sub-folder name to create under ``voice_dataset``.

    Returns:
        The path string ``voice_dataset/<name>``.
    """
    path = f"voice_dataset/{name}"
    # exist_ok=True avoids a separate existence check and is safe to repeat.
    os.makedirs(path, exist_ok=True)
    return path

def record_audio(filename, duration=5, fs=16000):
    """Record one channel of int16 audio with sounddevice and save it.

    Args:
        filename: Path the recording is written to via soundfile.
        duration: Recording length in seconds.
        fs: Sampling rate in Hz used for capture and for the saved file.
    """
    print("Recording...")
    frame_count = int(duration * fs)
    recording = sd.rec(frame_count, samplerate=fs, channels=1, dtype=np.int16)
    # Wait for the capture to finish before the buffer is written out.
    sd.wait()
    sf.write(filename, recording, fs)
    print(f"Audio saved as {filename}")

# Script entry point (top-level statements, not a function): ask for a
# person's name, make sure their dataset folder exists, then record five
# samples into it as sample_1.wav ... sample_5.wav.
name = input("Enter the name of the person: ")
dataset_path = create_dataset_dir(name)
for i in range(5):  # Record 5 samples
    file_path = os.path.join(dataset_path, f"sample_{i+1}.wav")
    record_audio(file_path, duration=5)
    print(f"Sample {i+1} recorded.")


Recording...
Audio saved as voice_dataset/Siddhartha\sample_1.wav
Sample 1 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_2.wav
Sample 2 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_3.wav
Sample 3 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_4.wav
Sample 4 recorded.
Recording...
Audio saved as voice_dataset/Siddhartha\sample_5.wav
Sample 5 recorded.


In [2]:
from resemblyzer import VoiceEncoder
import numpy as np
import os

def compute_embeddings(name):
    """Create and save an averaged speaker embedding for one person.

    Every ``.wav`` file in ``voice_dataset/<name>`` is embedded with
    Resemblyzer's ``VoiceEncoder``; the embeddings are averaged and written to
    ``voice_profiles/<name>_embedding.npy``.

    Args:
        name: Sub-folder of ``voice_dataset`` holding the person's recordings.

    Raises:
        ValueError: If the folder contains no ``.wav`` files. Averaging an
            empty list would otherwise save NaN as the profile.
    """
    dataset_path = f'voice_dataset/{name}'

    # Check for input files before loading the encoder.
    wav_files = sorted(
        entry for entry in os.listdir(dataset_path) if entry.endswith(".wav")
    )
    if not wav_files:
        raise ValueError(f"No .wav files found in '{dataset_path}'")

    # VoiceEncoder has no load_wav method; Resemblyzer loads and normalises
    # audio through the module-level preprocess_wav function.
    from resemblyzer import preprocess_wav

    encoder = VoiceEncoder()

    # List to store embeddings
    embeddings = []

    # Loop through voice samples in the dataset folder
    for file in wav_files:
        filepath = os.path.join(dataset_path, file)
        print(f"Processing {file}...")
        wav = preprocess_wav(filepath)
        embeddings.append(encoder.embed_utterance(wav))

    # Average embeddings to create a single profile for the user
    average_embedding = np.mean(embeddings, axis=0)

    # Create the output folder here so the function does not depend on
    # module-level code having run first.
    os.makedirs("voice_profiles", exist_ok=True)

    # Save the embedding
    np.save(f"voice_profiles/{name}_embedding.npy", average_embedding)
    print(f"Embedding for {name} saved as {name}_embedding.npy")

# Create the voice_profiles directory to save embeddings.
# exist_ok=True replaces the separate existence check and is a no-op on re-run.
os.makedirs('voice_profiles', exist_ok=True)

# Script entry point: ask whose recordings to embed, then build and save
# that person's voice profile.
name = input("Enter the name of the person to train: ")
compute_embeddings(name)


ModuleNotFoundError: No module named 'resemblyzer'