In [61]:
import os
import numpy as np
from sklearn.mixture import GaussianMixture as GMM
from sklearn import preprocessing
from scipy.io.wavfile import read
import python_speech_features as mfcc

In [62]:
# I used the Stack Overflow answer given in the question as the reference for the delta computation
# (Ref: https://stackoverflow.com/questions/54160128/feature-extraction-using-mfcc)
def delta_calculation(feature_matrix, window_size=2):
    """Compute first-order delta (differential) coefficients of a feature matrix.

    For every frame ``t`` the delta is the regression slope over the
    neighbouring frames::

        d_t = sum_{n=1..N} n * (c[t+n] - c[t-n]) / (2 * sum_{n=1..N} n**2)

    where ``N`` is ``window_size``. Neighbour indices that fall outside the
    matrix are clamped to the first / last frame.

    Parameters
    ----------
    feature_matrix : array-like, shape (num_frames, num_features)
        One feature vector per row.
    window_size : int, optional
        Number of frames ``N`` looked at on each side of the current frame
        (default 2).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``feature_matrix``.

    Raises
    ------
    ValueError
        If ``window_size`` is smaller than 1, or if ``feature_matrix`` is not
        two-dimensional.
    """
    if window_size < 1:
        raise ValueError("window_size must be a positive integer")

    feature_matrix = np.asarray(feature_matrix, dtype=float)
    num_frames, num_features = feature_matrix.shape  # also enforces a 2-D input

    frame_indices = np.arange(num_frames)
    delta_sum = np.zeros((num_frames, num_features))
    for n in range(1, window_size + 1):
        # Clamp the neighbour indices so the edges reuse the first/last frame.
        prev_idx = np.maximum(frame_indices - n, 0)
        next_idx = np.minimum(frame_indices + n, num_frames - 1)
        delta_sum += n * (feature_matrix[next_idx] - feature_matrix[prev_idx])

    # Regression denominator: 2 * sum(n^2). For N=2 this is 10; the previous
    # 2*N*(N+1) = 12 under-scaled every delta (a unit ramp gave 0.833, not 1).
    normalizer = 2 * sum(n * n for n in range(1, window_size + 1))
    return delta_sum / normalizer
# Function to extract MFCC features
def extract_features(audio, rate):
    """Turn one audio signal into a matrix of scaled MFCCs plus their deltas.

    The signal is passed to ``mfcc.mfcc`` with the settings below, the result
    is standardised with ``preprocessing.scale``, and the delta coefficients of
    the scaled matrix are appended column-wise.

    Parameters
    ----------
    audio : samples of the signal, as accepted by ``mfcc.mfcc``
    rate : sample rate of ``audio``

    Returns
    -------
    numpy.ndarray
        The scaled MFCC matrix and its delta matrix stacked horizontally, so
        the result has twice as many columns as the MFCC matrix.
    """
    # Keyword arguments handed to mfcc.mfcc
    mfcc_settings = dict(
        winlen=0.025,
        winstep=0.01,
        numcep=20,
        nfft=1200,
        appendEnergy=True,
    )
    scaled_cepstra = preprocessing.scale(mfcc.mfcc(audio, rate, **mfcc_settings))
    return np.hstack((scaled_cepstra, delta_calculation(scaled_cepstra)))


In [63]:
# Before writing this code I studied online material on fitting GMMs to features extracted from audio samples.
# Acknowledgement: https://github.com/shivam-shukla/Speaker-Recognition-Using-GMM-MFCC-Python3

# Function to train GMM models for each speaker
def train_gmm(source_dir, model_save_dir, training_file, files_per_speaker=5, random_state=None):
    """Fit one Gaussian mixture model per speaker and save it to disk.

    ``training_file`` lists one audio path per line, relative to
    ``source_dir``. The paths are consumed in consecutive batches of
    ``files_per_speaker``; the features of each batch are stacked and a
    5-component diagonal-covariance GMM is fitted to them. The speaker name is
    the part of the batch's last path before the first ``"-"``, so the list
    must be grouped by speaker.

    Parameters
    ----------
    source_dir : str
        Directory containing the training audio.
    model_save_dir : str
        Directory that receives the models (created if missing).
    training_file : str
        Text file listing the training audio paths.
    files_per_speaker : int, optional
        Number of consecutive files that make up one speaker (default 5).
    random_state : int or None, optional
        Forwarded to the GMM; pass an integer for reproducible fits.

    Raises
    ------
    ValueError
        If ``files_per_speaker`` is smaller than 1.
    """
    if files_per_speaker < 1:
        raise ValueError("files_per_speaker must be a positive integer")

    # Create the output directory up front so a long fit is not lost to a
    # missing folder when the model is finally written.
    if model_save_dir:
        os.makedirs(model_save_dir, exist_ok=True)

    with open(training_file, 'r') as file_paths:
        # Ignore blank lines (e.g. a trailing newline): they are not audio files.
        training_paths = [line.strip() for line in file_paths if line.strip()]

    pending_features = []  # feature matrices of the current speaker's files
    for file_path in training_paths:
        sample_rate, audio_data = read(os.path.join(source_dir, file_path))
        pending_features.append(extract_features(audio_data, sample_rate))
        if len(pending_features) < files_per_speaker:
            continue

        # A full batch has been collected: train and persist this speaker's model.
        accumulated_features = np.vstack(pending_features)
        gmm_model = GMM(n_components=5, covariance_type='diag', n_init=3, random_state=random_state)
        gmm_model.fit(accumulated_features)
        speaker_name = file_path.split("-")[0]
        model_filename = f"{speaker_name}.gmm"
        # np.save pickles the model object and appends ".npy" to the name,
        # which is the extension evaluate_speakers looks for.
        np.save(os.path.join(model_save_dir, model_filename), gmm_model)
        print(f'The GMM model for speaker "{speaker_name}" has {accumulated_features.shape} as shape.')
        # Resetting for the next speaker's files
        pending_features = []

    if pending_features:
        # Previously these files were dropped without any notice.
        print(f"Warning: the last {len(pending_features)} file(s) in {training_file} "
              f"do not fill a batch of {files_per_speaker} and were not used.")


In [59]:
def evaluate_speakers(test_audio_dir, model_dir, test_file_list):
    """Identify the speaker of every listed test recording and report accuracy.

    Every ``*.npy`` file in ``model_dir`` is loaded as a trained GMM; the
    speaker name is the file name up to ``".gmm"``. Each test recording is
    scored against every model and assigned to the best-scoring one.

    Parameters
    ----------
    test_audio_dir : str
        Directory containing the test audio.
    model_dir : str
        Directory containing the saved models.
    test_file_list : str
        Text file listing one test audio path per line, relative to
        ``test_audio_dir``.

    Returns
    -------
    tuple
        ``(actual_speaker_labels, predicted_speaker_labels, accuracy)`` where
        ``accuracy`` is a percentage (0.0 when the list contains no files).

    Raises
    ------
    ValueError
        If ``model_dir`` contains no ``*.npy`` model files.
    """
    # Load GMM models saved in .npy format (sorted so the model order is stable)
    gmm_model_files = sorted(
        os.path.join(model_dir, file) for file in os.listdir(model_dir) if file.endswith('.npy')
    )
    if not gmm_model_files:
        raise ValueError(f"No trained GMM models (*.npy) found in {model_dir!r}")
    # NOTE: allow_pickle=True unpickles arbitrary objects, which can execute
    # code. Only point this at model files you produced yourself.
    gmm_models = [np.load(model_file, allow_pickle=True).item() for model_file in gmm_model_files]
    speaker_names = [os.path.basename(model_file).split(".gmm")[0] for model_file in gmm_model_files]

    with open(test_file_list, 'r') as test_files:
        # Ignore blank lines (e.g. a trailing newline): they are not audio files.
        test_paths = [line.strip() for line in test_files if line.strip()]

    actual_speaker_labels = []
    predicted_speaker_labels = []
    # Process each test file for speaker identification
    for audio_file in test_paths:
        sample_rate, audio_data = read(os.path.join(test_audio_dir, audio_file))
        feature_vector = extract_features(audio_data, sample_rate)
        # score() already returns one scalar per model (the average per-frame
        # log-likelihood), so the values can be compared directly.
        log_likelihoods = np.array([gmm.score(feature_vector) for gmm in gmm_models])
        predicted_speaker = speaker_names[int(np.argmax(log_likelihoods))]
        predicted_speaker_labels.append(predicted_speaker)

        # The ground truth must come from the file name alone. It used to be
        # truncated to len(predicted_speaker), which made the label depend on
        # the prediction and could turn a wrong prediction into a match.
        actual_speaker = audio_file.split("-")[0]
        if actual_speaker not in speaker_names:
            # Test files may separate the speaker name with "_" instead of "-".
            actual_speaker = actual_speaker.split("_")[0]
        actual_speaker_labels.append(actual_speaker)

        print(f"Processed audio file: {audio_file}")
        print(f"Identified speaker: {predicted_speaker} (Best match)")
        print(f"Actual speaker in the recording: {actual_speaker}\n")

    correct_predictions = sum(
        1 for actual, predicted in zip(actual_speaker_labels, predicted_speaker_labels)
        if actual == predicted
    )
    # Guard against an empty test list instead of dividing by zero.
    if actual_speaker_labels:
        accuracy = (correct_predictions / len(actual_speaker_labels)) * 100
    else:
        accuracy = 0.0
    print(f"Speaker identification completed for {len(actual_speaker_labels)} audio files.")
    print(f"Overall identification accuracy: {accuracy:.2f}%\n")
    return actual_speaker_labels, predicted_speaker_labels, accuracy




In [60]:

# --- Locations of the audio data, the saved models and the file lists ---
source = "GMM-Speaker-Identification-data/Speaker_data/Voice_Samples_Training"  # training audio folder
test_source = "GMM-Speaker-Identification-data/Speaker_data/Testing_Audio"  # testing audio folder
dest = "Trained_Speech_Models"  # folder the trained models are written to and read from
train_file = "Voice_Samples_Training_Path.txt"  # list of training audio paths
test_file = "Testing_audio_Path.txt"  # list of testing audio paths

# --- Fit and save one GMM per speaker ---
train_gmm(source, dest, train_file)
print("\n")

# --- Identify the speaker of every test recording and report the accuracy ---
actual_speakers, detected_speakers, accuracy = evaluate_speakers(test_source, dest, test_file)
print(f"\nGMM Model Accuracy for test audio files: {accuracy}%")
print(f"List of Actual Speakers: {actual_speakers}")
print(f"List of Detected Speakers: {detected_speakers}")



The GMM model for speaker "Abhay" has (5439, 40) as shape.
The GMM model for speaker "Eknath" has (6079, 40) as shape.
The GMM model for speaker "Rg" has (5099, 40) as shape.
The GMM model for speaker "Vaibhav" has (6334, 40) as shape.
The GMM model for speaker "Rishika" has (6079, 40) as shape.
The GMM model for speaker "ShivamY" has (8967, 40) as shape.


Processed audio file: Abhay_audio1.wav
Identified speaker: Abhay (Best match)
Actual speaker in the recording: Abhay

Processed audio file: Abhay_audio2.wav
Identified speaker: Abhay (Best match)
Actual speaker in the recording: Abhay

Processed audio file: Abhay_audio3.wav
Identified speaker: Abhay (Best match)
Actual speaker in the recording: Abhay

Processed audio file: Abhay_audio4.wav
Identified speaker: Abhay (Best match)
Actual speaker in the recording: Abhay

Processed audio file: Abhay_audio5.wav
Identified speaker: Abhay (Best match)
Actual speaker in the recording: Abhay

Processed audio file: Eknath_audio1.wav
Identified