saas

In [14]:
import numpy as np
import librosa
from pydub import AudioSegment
from pydub.utils import mediainfo
from sklearn import preprocessing
def mfcc_extraction(audio_filename, #.wav filename
    hop_duration, #hop_length in seconds, e.g., 0.015s (i.e., 15ms)
    num_mfcc #number of mfcc features
    ):
    """Extract MFCC features from a .wav file.

    Parameters
    ----------
    audio_filename : path of the .wav file (read with pydub's AudioSegment.from_wav)
    hop_duration : hop between consecutive analysis frames, in seconds; it is
        converted to a sample count using the file's own frame rate
    num_mfcc : number of MFCC coefficients requested from librosa

    Returns
    -------
    The transpose of the matrix returned by librosa.feature.mfcc.
    """
    speech = AudioSegment.from_wav(audio_filename) #Read audio data from file
    sampling_rate = speech.frame_rate #sampling rate f

    #samples x(t), converted to float32 (raw PCM scale is kept, no normalisation)
    samples = np.asarray(speech.get_array_of_samples(), dtype=np.float32)

    # pydub returns multi-channel audio as one interleaved sequence
    # (L, R, L, R, ...). Passing that straight to librosa would treat it as a
    # mono signal, so average the channels into a single mono signal first.
    # Mono files are left untouched.
    num_channels = speech.channels
    if num_channels > 1:
        samples = samples.reshape(-1, num_channels).mean(axis=1)

    mfcc = librosa.feature.mfcc(
        y=samples,
        sr=sampling_rate,
        hop_length=int(sampling_rate * hop_duration),
        n_mfcc=num_mfcc)

    return mfcc.T

from sklearn.mixture import GaussianMixture
def learningGMM(features, #list of feature vectors, each feature vector is an array
 n_components, #the number of components
 max_iter, #maximum number of iterations
 random_state=None #optional seed; None keeps the original unseeded behaviour
 ):
    """Fit a Gaussian mixture model to a set of feature vectors.

    Parameters
    ----------
    features : list of feature vectors, each feature vector is an array
    n_components : the number of mixture components
    max_iter : maximum number of iterations
    random_state : optional seed forwarded to GaussianMixture so that a run
        can be reproduced; the default (None) leaves the fit unseeded

    Returns
    -------
    The fitted GaussianMixture instance.
    """
    gmm = GaussianMixture(
        n_components=n_components,
        max_iter=max_iter,
        random_state=random_state)
    gmm.fit(features)
    return gmm

In [2]:
import os
# Root folder of the dataset; training clips live in '<path>Train/<speaker>/'.
# NOTE(review): this is an absolute, machine-specific path, while later cells
# read the test set from the relative path 'SpeakerData/Test' - confirm both
# point at the same dataset.
path = 'C:/Users/User/Desktop/Class Folder/Computer Vision/SpeakerData/'
train_dir = path + 'Train/'

# One sub-folder per speaker. Keep only directories (stray files would be
# mistaken for speakers) and sort them, because the position in this list is
# used as the speaker id everywhere else and os.listdir order is arbitrary.
speakers = sorted(
    entry for entry in os.listdir(train_dir)
    if os.path.isdir(os.path.join(train_dir, entry))
)
print(speakers)

['Anthony', 'AppleEater', 'Ara', 'Argail', 'Ariyan', 'Arjuan', 'Artem', 'Arthur', 'Artk', 'Arun', 'Arvala', 'Asalkeld', 'Asladic', 'Asp', 'Azmisov', 'B', 'Bachroxx', 'Bae', 'Bahoke', 'Bareford', 'Bart', 'Bassel', 'Beady', 'Beez', 'BelmontGuy']


In [3]:
from sklearn import preprocessing
#this list is used to store the MFCC features of all training data of all speakers
#(mfcc_all_speakers[k] holds the stacked features of speakers[k])
mfcc_all_speakers = []
hop_duration = 0.015 #15ms
num_mfcc = 12
for s in speakers:
    sub_path = path + 'Train/' + s + '/'
    # Only .wav files can be read by mfcc_extraction (same filter as the test
    # loop); sorting makes the stacking order independent of the file system.
    sub_file_names = [os.path.join(sub_path, f)
                      for f in sorted(os.listdir(sub_path))
                      if f.endswith('.wav')]
    mfcc_per_file = [mfcc_extraction(fn, hop_duration, num_mfcc)
                     for fn in sub_file_names]
    if mfcc_per_file:
        # Stack once at the end instead of re-copying the growing array for
        # every file, which made the original loop quadratic.
        mfcc_one_speaker = np.vstack(mfcc_per_file)
    else:
        # No usable file for this speaker: keep the original empty placeholder
        mfcc_one_speaker = np.asarray(())
    mfcc_all_speakers.append(mfcc_one_speaker)

In [4]:
import pickle
# Write each speaker's feature array to its own '<speaker>_mfcc.fea' pickle file.
for idx, name in enumerate(speakers):
    fea_path = 'C:/Users/User/Desktop/Class Folder/Computer Vision/TrainingFeatures/' + name + '_mfcc.fea'
    with open(fea_path, 'wb') as f:
        pickle.dump(mfcc_all_speakers[idx], f)

In [5]:
# GMM settings shared by every speaker model
n_components = 5
max_iter = 50

# list of GMMs, each is for a speaker: gmms[k] is trained on mfcc_all_speakers[k]
gmms = [
    learningGMM(mfcc_all_speakers[k], n_components, max_iter)
    for k in range(len(speakers))
]

In [6]:
# Save each speaker's trained GMM as 'Models/<speaker>.gmm'.
# Create the output folder first: open(..., 'wb') does not create missing
# directories and would raise FileNotFoundError on a fresh checkout.
os.makedirs('Models', exist_ok=True)
for i in range(len(speakers)):
    with open('Models/' + speakers[i] + '.gmm', 'wb') as f: #'wb' is for binary write
        pickle.dump(gmms[i], f)

In [7]:
# Reload the saved models from 'Models/<speaker>.gmm', in the order of `speakers`.
# NOTE: pickle.load can execute arbitrary code - only load model files that
# were produced by this notebook.
gmms = []
for name in speakers:
    model_path = 'Models/' + name + '.gmm'
    with open(model_path, 'rb') as f: #'rb' is for binary read
        gmm = pickle.load(f)
    gmms.append(gmm)

In [10]:
# Define the speaker recognition method
def speaker_recognition(audio_file_name, gmms):
    """Return the position in ``gmms`` of the model that scores an audio file highest.

    Features are extracted with ``mfcc_extraction`` using the notebook-level
    ``hop_duration`` and ``num_mfcc`` settings. Each model's ``score`` is then
    evaluated on those features and the index of the largest score, as given
    by ``np.argmax``, is returned.
    """
    frames = mfcc_extraction(audio_file_name, hop_duration, num_mfcc)

    # One score per model, kept in the same order as gmms
    per_model_scores = []
    for model in gmms:
        per_model_scores.append(model.score(frames))

    # The best-scoring model identifies the speaker
    best_index = np.argmax(per_model_scores)
    return best_index

In [11]:
# Quick check on a single test clip: print the name of the recognised speaker.
test_clip = 'SpeakerData/Test/Ara/a0522.wav'
speaker_id = speaker_recognition(test_clip, gmms)
print(speakers[speaker_id])

Ara


#### Task

In [13]:
import os
import numpy as np
import pickle
from scipy.io import wavfile
from sklearn.mixture import GaussianMixture
import librosa

# Test the speaker recognition algorithm on the entire test set
correct_predictions = 0
total_samples = 0
accuracy = float('nan')  # stays NaN when no file could be evaluated

# Iterate through test files
for root, _dirs, files in os.walk('SpeakerData/Test'):
    wav_files = [file for file in files if file.endswith('.wav')]
    if not wav_files:
        continue

    # Get the true speaker from the folder name (assuming the folder name is the speaker's name)
    true_speaker = os.path.basename(root)
    if true_speaker not in speakers:
        # No model was trained for this folder, so its clips cannot be scored;
        # report and skip instead of crashing in speakers.index().
        print(f"Skipping {len(wav_files)} file(s) in '{root}': no trained speaker named '{true_speaker}'")
        continue
    true_speaker_id = speakers.index(true_speaker)

    for file in wav_files:
        total_samples += 1
        audio_file_path = os.path.join(root, file)

        # Identify the speaker using the recognition method
        predicted_speaker_id = speaker_recognition(audio_file_path, gmms)

        # Check if the prediction is correct
        if predicted_speaker_id == true_speaker_id:
            correct_predictions += 1

# Calculate and print the recognition accuracy
if total_samples == 0:
    # Avoid ZeroDivisionError when the test folder is missing or empty
    # (e.g. the notebook is run from a different working directory).
    print("No test .wav files found under 'SpeakerData/Test'; accuracy not computed.")
else:
    accuracy = (correct_predictions / total_samples) * 100
    print(f"Recognition Accuracy: {accuracy:.2f}%")

Recognition Accuracy: 92.57%
