In [10]:
import pyaudio
import wave
import os

def record_audio(filename, duration=5, sample_rate=44100):
    chunk = 1024  # Record in chunks of 1024 samples
    audio = pyaudio.PyAudio()

    # Set up recording stream
    stream = audio.open(format=pyaudio.paInt16, channels=1,
                        rate=sample_rate, input=True,
                        frames_per_buffer=chunk)

    print("Recording...")
    frames = []

    for _ in range(0, int(sample_rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)

    print("Recording finished.")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    # Save audio file
    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

if __name__ == "__main__":
    person_name = input("Enter the name of the person: ")
    num_samples = 5
    dataset_dir = f"voice_dataset/{person_name}"
    
    if not os.path.exists(dataset_dir):
        os.makedirs(dataset_dir)

    for i in range(num_samples):
        filename = os.path.join(dataset_dir, f"{i+1}.wav")
        record_audio(filename, duration=5)


Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.
Recording...
Recording finished.


In [12]:
import librosa
import numpy as np
import joblib
import os

def extract_features(filename):
    y, sr = librosa.load(filename)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)

def train_model(user_name):
    dataset_folder = f'voice_dataset/{user_name}'
    features = []

    for filename in os.listdir(dataset_folder):
        if filename.endswith('.wav'):
            feature = extract_features(os.path.join(dataset_folder, filename))
            features.append(feature)

    features = np.array(features)
    mean_feature = np.mean(features, axis=0)
    std_feature = np.std(features, axis=0)

    joblib.dump((mean_feature, std_feature), 'voice_model.pkl')
    print(f"Model trained and saved as 'voice_model.pkl'")

if __name__ == "__main__":
    user_name = input("Enter the name of the user: ")
    train_model(user_name)


Model trained and saved as 'voice_model.pkl'


In [14]:
import librosa
import numpy as np
import joblib
import pyaudio
import wave

def record_audio(filename, duration=5, sample_rate=44100):
    chunk = 1024
    audio = pyaudio.PyAudio()

    stream = audio.open(format=pyaudio.paInt16, channels=1,
                        rate=sample_rate, input=True,
                        frames_per_buffer=chunk)

    print("Recording...")
    frames = []

    for _ in range(0, int(sample_rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)

    print("Recording finished.")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

def extract_features(filename):
    y, sr = librosa.load(filename)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)

def recognize_voice():
    mean_feature, std_feature = joblib.load('voice_model.pkl')
    
    # Record a new sample
    filename = 'test_sample.wav'
    record_audio(filename, duration=5)
    
    # Predict
    feature = extract_features(filename)
    distance = np.linalg.norm(feature - mean_feature)
    threshold = 1.0  # You can adjust this threshold based on your needs

    if distance < threshold:
        print("The sample matches the user's voice.")
    else:
        print("The sample does not match the user's voice.")

if __name__ == "__main__":
    recognize_voice()


Recording...
Recording finished.
The sample does not match the user's voice.


In [9]:
import librosa
import numpy as np
import os

def extract_features(filename):
    y, sr = librosa.load(filename)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)


In [15]:
import numpy as np
import tensorflow as tf
import os

def prepare_data(user_name):
    dataset_folder = f'voice_dataset/{user_name}'
    features = []
    labels = []

    for filename in os.listdir(dataset_folder):
        if filename.endswith('.wav'):
            feature = extract_features(os.path.join(dataset_folder, filename))
            features.append(feature)
            labels.append(1)  # Label 1 for user's samples

    features = np.array(features)
    labels = np.array(labels)
    
    return features, labels

import tensorflow as tf
import numpy as np

def train_model(user_name):
    features, labels = prepare_data(user_name)

    model = tf.keras.Sequential([
        tf.keras.layers.Dense(128, activation='relu', input_shape=(features.shape[1],)),
        tf.keras.layers.Dropout(0.5),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(1, activation='sigmoid')
    ])

    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    model.fit(features, labels, epochs=20, batch_size=2, validation_split=0.1)
    
    model.save('voice_model.h5')
    print(f"Model trained and saved as 'voice_model.h5'")

if __name__ == "__main__":
    user_name = input("Enter the name of the user: ")
    train_model(user_name)


Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.0000e+00 - loss: 106.8276  
Epoch 2/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.0000e+00 - loss: 55.1582 
Epoch 3/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.1750 - loss: 13.7845     
Epoch 4/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 1.0000 - loss: 9.4562e-08 
Epoch 5/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 1.0000 - loss: 5.7176e-20 
Epoch 6/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 1.0000 - loss: 8.7845e-27 
Epoch 7/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 1.0000 - loss: 1.8886e-30 
Epoch 8/10
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 1.0000 - loss: 7.2114e-33 
Epoch 9/10
[1m5/5[0m [32m━━━━━━━━━━



Model trained and saved as 'voice_model.h5'


In [16]:
import tensorflow as tf
import numpy as np
import pyaudio
import wave

def record_audio(filename, duration=5, sample_rate=44100):
    chunk = 1024
    audio = pyaudio.PyAudio()

    stream = audio.open(format=pyaudio.paInt16, channels=1,
                        rate=sample_rate, input=True,
                        frames_per_buffer=chunk)

    print("Recording...")
    frames = []

    for _ in range(0, int(sample_rate / chunk * duration)):
        data = stream.read(chunk)
        frames.append(data)

    print("Recording finished.")
    stream.stop_stream()
    stream.close()
    audio.terminate()

    with wave.open(filename, 'wb') as wf:
        wf.setnchannels(1)
        wf.setsampwidth(audio.get_sample_size(pyaudio.paInt16))
        wf.setframerate(sample_rate)
        wf.writeframes(b''.join(frames))

def extract_features(filename):
    y, sr = librosa.load(filename)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    return np.mean(mfccs, axis=1)

def recognize_voice():
    model = tf.keras.models.load_model('voice_model.h5')
    
    filename = 'test_sample.wav'
    record_audio(filename, duration=5)
    
    feature = extract_features(filename)
    feature = np.expand_dims(feature, axis=0)
    prediction = model.predict(feature)
    
    threshold = 0.5  # Try different values if needed
    if prediction[0] > threshold:
        print("The sample matches the user's voice.")
    else:
        print("The sample does not match the user's voice.")


if __name__ == "__main__":
    recognize_voice()




Recording...
Recording finished.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
The sample matches the user's voice.
