In [1]:
""" Source: https://github.com/UNREALre/SpeechRecognitionHMM_Basics"""

"""os library provides a portable way of using operating system dependent functionality."""
import os

"""numpy library contains fundamental package for scientific computing with Python."""
import numpy as np

"""It provides many user-friendly and efficient numerical routines, such as routines for numerical integration, interpolation, 
  optimization, linear algebra, and statistics."""
import scipy

"""Warnings are provided to warn the developer of situations that aren’t necessarily exceptions."""
import warnings

"""wavefile library provides the methods to read and get information from the wav files."""
from scipy.io import wavfile

"""hmm models to learn HMMs (Hidden Markov Models)."""
from hmmlearn import hmm

"""Matplotlib is a comprehensive library for creating static, animated, and interactive visualizations in Python."""
import matplotlib.pyplot as plt

"""python_speech_features library provides common speech features for ASR including MFCCs and filterbank energies."""
from python_speech_features import mfcc

"""confusion matrix to evaluate the accuracy of a classification."""
from sklearn.metrics import confusion_matrix

"""The classification report visualizer displays the precision, recall, F1, and support scores for the model. """
from sklearn.metrics import classification_report

"""%matplotlib inline command will make plots outputs appear in the notebook"""
%matplotlib inline

#### MFCC implementation function

In [2]:
# mfcc features extraction function
def mfcc_features(input_file):
    # reading the wav file and retrieving the file information
    sample_rate, audio = wavfile.read(input_file)
    
    # getting the mfcc features using mfcc library and storing in mfcc_features
    mfcc_features = mfcc(audio, sample_rate)
    
    # returning the mfcc features
    return mfcc_features

In [3]:
# building the models for the audio samples function
def build_models(input_folder):
    
    speech_models = [] # list to hold the trained models and intializing empty list
    fpaths = []  # path of individual audio clips
    labels = []  # Folder names containing audio clips
    
    # reading the audio files and training the models
    for f in os.listdir(input_folder):
        all_obs = np.array([])
        for w in os.listdir(input_folder + '/' + f):
            labels.append(f)
            mfcc_feat = mfcc_features(input_folder + '/' + f + '/' + w)
            if len(all_obs) == 0:
                all_obs = mfcc_feat
            else:
                all_obs = np.append(all_obs, mfcc_feat, axis=0)

            # creating the instance of the ModelHMM class
            model = ModelHMM()
            
            # feeding the observations to train method of model class 
            model.train(all_obs)

            # storing all the trained models along with audio labels
            speech_models.append((model, f))

            # setting model to None to get the next observations to train
            model = None
           
    # returning the trained speech_models list    
    return speech_models
    

In [4]:
# testing the test audio dataset function
def run_test(test_files, types = 1):
    # delaring 2 lists to hold the actual audio label and predicted audio label and intializing empty list
    y_test = []
    predicted_labels = []

    
    # fetching the test files one by one 
    for test_file in test_files:

        # getting the mfcc feature of the test files and storing in features_mfcc
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            features_mfcc = mfcc_features(test_file)

        # initializing max_score and out_label
        max_score = -float('inf')
        output_label = None

        # computing the score to based on the features_mfcc and each items in the speech_mdels
        for item in speech_models:
            model, label = item

            score = model.compute_score(features_mfcc)
            # for max_score generated we will assin the label and predcited label for the audio
            if score > max_score:
                max_score = score
                predicted_label = label
        if types == 1: 
            # getting the actual label and predicted label and printing the same
            start_index = test_file.find('\\') + 1
            end_index = test_file.rfind('\\')+1
            print('Predictions: ')
            print()
            print('Audio file:',test_file, start_index, end_index)
            original_label = test_file[end_index:]
            print('Original: {}'.format(original_label.split('/')[1]))
            print('Predicted: {}'.format(predicted_label))
            print()
            y_test.append(original_label.split('/')[1])
            predicted_labels.append(predicted_label)
        else:
            # getting the actual label and predicted label and printing the same
            start_index = test_file.find('\\') + 1
            end_index = test_file.rfind('\\')+1
            original_label = test_file[end_index:]
            y_test.append(original_label.split('/')[1])
            predicted_labels.append(predicted_label)

            
    """Compute confusion matrix to evaluate the accuracy of a classification."""
    cm = confusion_matrix(y_test, predicted_labels)
    print(cm)      
    """Build a text report showing the main classification metrics."""
    print(classification_report(y_test, predicted_labels))

In [5]:
# class to train and compute score using the hmm model
class ModelHMM(object):
    
    # n_components (int) – Number of states, Maximum number of iterations to perform.
    def __init__(self, num_components=4, num_iter=1000):
        self.n_components = num_components
        self.n_iter = num_iter

        """String describing the type of covariance parameters to use. 
          ”diag” — each state uses a diagonal covariance matrix.
        """
        self.cov_type = 'diag'
        
        """The Gaussian hidden Markov model (Gaussian HMM) is a type of finite-state-space and homogeneous HMM 
           where the observation probability distribution is the normal distribution"""
        self.model_name = 'GaussianHMM'

        self.models = []
        
        # using hmm library to initialize the instance of the GaussianHMM class
        self.model = hmm.GaussianHMM(n_components=self.n_components, covariance_type=self.cov_type, n_iter=self.n_iter)

    # method to train the audio samples oberavation    
    def train(self, training_data):
        np.seterr(all='ignore')
        cur_model = self.model.fit(training_data)
        self.models.append(cur_model)

    # compute the score for the audio sample features    
    def compute_score(self, input_data):
        return self.model.score(input_data)

if __name__ == '__main__':
    # train audio dataset folder name
    input_folder = 'TrainAudio'

    # store the trained speech models
    speech_models = build_models(input_folder)

    

In [8]:
    # declaring the train files list and intializing to empty list
    train_files = []
    
    # fetching the full directory path of the each audio test files and appending to the test_files
    for root, dirs, files in os.walk(input_folder):
        for filename in files:
            filepath = os.path.join(root, filename)
            if(filepath not in train_files):
                train_files.append(filepath)
 
    # calculating the accurary and computing confusion matrix and classification matrix for the trained data set
    run_test(train_files, 2)

[[70  0  1  6 15  3]
 [ 0 81  4  1  3  6]
 [ 2  6 62  7 12  6]
 [ 2  5  5 75  7  1]
 [ 6  1  3  4 71 10]
 [ 2  4  2  2  2 83]]
              precision    recall  f1-score   support

       happy       0.85      0.74      0.79        95
       house       0.84      0.85      0.84        95
        left       0.81      0.65      0.72        95
          no       0.79      0.79      0.79        95
       right       0.65      0.75      0.69        95
         yes       0.76      0.87      0.81        95

    accuracy                           0.78       570
   macro avg       0.78      0.78      0.78       570
weighted avg       0.78      0.78      0.78       570



In [9]:
    # declaring the test files list and intializing to empty list
    test_files = []
    
    # fetching the full directory path of the each audio test files and appending to the test_files
    for root, dirs, files in os.walk('TestAudio'):
        for filename in files:
            filepath = os.path.join(root, filename)
            if(filepath not in test_files):
                test_files.append(filepath)
    
    # calculating the accurary and computing confusion matrix and classification matrix for the test data set
    run_test(test_files, 1)

Predictions: 

Audio file: TestAudio/happy/1fe4c891_nohash_1.wav 0 0
Original: happy
Predicted: happy

Predictions: 

Audio file: TestAudio/happy/2a89ad5c_nohash_1.wav 0 0
Original: happy
Predicted: happy

Predictions: 

Audio file: TestAudio/happy/2aa787cf_nohash_0.wav 0 0
Original: happy
Predicted: happy

Predictions: 

Audio file: TestAudio/happy/2a0b413e_nohash_0.wav 0 0
Original: happy
Predicted: happy

Predictions: 

Audio file: TestAudio/happy/2c6446f7_nohash_1.wav 0 0
Original: happy
Predicted: happy

Predictions: 

Audio file: TestAudio/right/1eddce1d_nohash_3.wav 0 0
Original: right
Predicted: right

Predictions: 

Audio file: TestAudio/right/1ed557b9_nohash_1.wav 0 0
Original: right
Predicted: right

Predictions: 

Audio file: TestAudio/right/1c45ceb2_nohash_0.wav 0 0
Original: right
Predicted: right

Predictions: 

Audio file: TestAudio/right/1d1fe0a0_nohash_1.wav 0 0
Original: right
Predicted: right

Predictions: 

Audio file: TestAudio/right/1bb6ed89_nohash_0.wav 0 0
Orig