<h1 style = "font-size:3rem;color:darkcyan"> Make Speech Predictions</h1>


In [3]:
# import libraries
import numpy as np
import tensorflow.keras as keras
import librosa

In [82]:
class Keyword_Recognition:
    """Predict a spoken keyword from an audio file using a saved Keras model.

    The audio is loaded with librosa, padded or cropped to a fixed number of
    samples, converted to MFCC features, and passed to the model. The index of
    the highest model output is looked up in ``self.mappings`` to get the label.
    """

    def __init__(self, model_path, sample_rate=22500, audio_dur=1, hop_length=512, n_mfcc=13, n_fft=2048):
        """Load the model and store the feature-extraction settings.

        Args:
            model_path: Path passed to ``keras.models.load_model``.
            sample_rate: Rate the audio is resampled to when loaded.
                NOTE(review): 22500 is unusual (librosa's own default is
                22050) -- confirm it matches the rate used during training.
            audio_dur: Target clip duration; ``audio_dur * sample_rate`` gives
                the fixed number of samples every clip is padded/cropped to.
            hop_length: ``hop_length`` forwarded to ``librosa.feature.mfcc``.
            n_mfcc: Number of MFCC coefficients to extract.
            n_fft: ``n_fft`` forwarded to ``librosa.feature.mfcc``.
        """
        self.model = keras.models.load_model(model_path)

        # Label for each model output index: position i in this list is the
        # keyword reported when the model's argmax is i.
        # NOTE(review): the order presumably mirrors the label encoding used
        # at training time -- confirm before editing this list.
        self.mappings = [
            "backward",
            "bed",
            "bird",
            "cat",
            "dog",
            "down",
            "eight",
            "five",
            "follow",
            "forward",
            "four",
            "go",
            "happy",
            "house",
            "learn",
            "left",
            "marvin",
            "nine",
            "no",
            "off",
            "on",
            "one",
            "right",
            "seven",
            "sheila",
            "six",
            "stop",
            "three",
            "tree",
            "two",
            "up",
            "visual",
            "wow",
            "yes",
            "zero"
        ]

        self.n_samples = int(audio_dur * sample_rate)
        self.sample_rate = sample_rate
        self.n_mfcc = n_mfcc
        self.n_fft = n_fft
        self.hop_length = hop_length

    def _preprocess(self, file_path):
        """Load an audio file and return its MFCCs for a fixed-length clip.

        Args:
            file_path: Path of the audio file to load.

        Returns:
            The transposed output of ``librosa.feature.mfcc`` computed on
            exactly ``self.n_samples`` samples (per the librosa docs this is
            frames x ``n_mfcc`` after the transpose).
        """
        audio_data, _ = librosa.load(file_path, sr=self.sample_rate, mono=True)

        n_loaded = len(audio_data)
        if n_loaded < self.n_samples:
            # Too short: zero pad at the end so every clip has the same length.
            zero_pad = np.zeros(self.n_samples - n_loaded)
            audio_data = np.append(audio_data, zero_pad)
        elif n_loaded > self.n_samples:
            # Too long: keep only the first n_samples samples.
            audio_data = audio_data[:self.n_samples]

        # extract the MFCC
        mfcc = librosa.feature.mfcc(y=audio_data,
                                    sr=self.sample_rate,
                                    n_mfcc=self.n_mfcc,
                                    n_fft=self.n_fft,
                                    hop_length=self.hop_length)

        return mfcc.T

    def predict(self, file_path):
        """Print the keyword the model predicts for one audio file.

        Args:
            file_path: Path of the audio file to classify.

        Returns:
            None. The result is reported with ``print``.
        """
        # extract MFCCs
        MFCCs = self._preprocess(file_path)

        # Add a leading batch axis and a trailing channel axis for the CNN.
        MFCCs = MFCCs[np.newaxis, ..., np.newaxis]

        # make prediction
        y_pred = self.model.predict(MFCCs)

        # Index of the highest probability. The batch holds a single clip, so
        # take element 0 explicitly instead of calling int() on a 1-element
        # array (deprecated since NumPy 1.25).
        pred_class = int(np.argmax(y_pred, axis=1)[0])

        # print the label
        print('The predicted keyword = {}'.format(self.mappings[pred_class]))



In [83]:
# Build the recognizer from the saved model file; all other settings use the class defaults.
keyword_recognition = Keyword_Recognition('model.h5')

In [84]:
# Classify a single recording; predict() prints the predicted keyword.
audio_file_path = '../../../Datasets/Speech/Digits/AudioMNIST/data/04/0_04_49.wav'
keyword_recognition.predict(file_path=audio_file_path)

The predicted keyword = zero
