In [37]:
import numpy as np
from scipy.io import wavfile
import os
from sklearn.preprocessing import LabelEncoder
import keras
from sklearn.model_selection import train_test_split
import IPython.display as ipd
import keras.regularizers
import tensorflow as tf
from keras.callbacks import LearningRateScheduler
from datetime import datetime
from tensorflow.keras.optimizers import Adam
import matplotlib.pyplot as plt
import time
from scipy.fftpack import dct
import seaborn as sns
from keras.layers import Input, Conv2D, Lambda, Dense, Flatten,MaxPooling2D
from keras.models import Model

In [88]:
def MSLFB(signal_resampled):
    pre_emphasis = 0.97
    frame_size = 0.025
    frame_stride = 0.01
    NFFT = 512  # FFT Size
    nfilt = 40  # number of filters
    emphasized_signal = np.append(signal_resampled[0], signal_resampled[1:] - pre_emphasis * signal_resampled[:-1])
    frame_length, frame_step = frame_size * sample_rate, frame_stride * sample_rate  # Convert from seconds to samples
    signal_length = len(emphasized_signal)
    frame_length = int(round(frame_length))
    frame_step = int(round(frame_step))
    num_frames = int(
        np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))  # Make sure that we have at least 1 frame

    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_signal, z)  # Pad Signal to make sure that all frames have equal number of samples without truncating any samples from the original signal
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + np.tile(
        np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    frames *= np.hamming(frame_length)
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))  # Magnitude of the FFT
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))  # Power Spectrum

    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sample_rate / 2) / 700))  # Convert Hz to Mel
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)  # Equally spaced in Mel scale
    hz_points = (700 * (10 ** (mel_points / 2595) - 1))  # Convert Mel to Hz
    bin = np.floor((NFFT + 1) * hz_points / sample_rate)

    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    for m in range(1, nfilt + 1):
        f_m_minus = int(bin[m - 1])  # left
        f_m = int(bin[m])  # center
        f_m_plus = int(bin[m + 1])  # right

        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin[m - 1]) / (bin[m] - bin[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin[m + 1] - k) / (bin[m + 1] - bin[m])
    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks_eps = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)  # Numerical Stability
    filter_banks_log = 20 * np.log10(filter_banks_eps)  # dB
    filter_banks_log -= (np.mean(filter_banks_log, axis=0) + 1e-8)
    return abs(filter_banks_log)

In [139]:
def MFCC(signal_resampled):
    num_ceps = 12
    filter_banks = MSLFB(signal_resampled)
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]  # Keep 2-13
    #mfccs = librosa.feature.mfcc(y=xf, sr=sr, n_mfcc=4)
    return mfcc

In [89]:
path = r"C:\Users\korol\Desktop\Upiter\Speech_Recogn\commands"
noise_dir = r"C:\Users\korol\Desktop\Upiter\Speech_Recogn\train\audio\_background_noise_"

audio_length = 16000
sample_rate = 16000

classes = ['down', 'go', 'left', 'no', 'off', 'on', 'right', 'stop', 'up', 'yes']

train_samples = []
train_labels = []

    
for trains in os.listdir('commands'):
     for wav_file in os.listdir(os.path.join('commands', trains)):
        if wav_file[-4:] != '.wav':
            continue
        sample_rate_init, sample = wavfile.read(os.path.join(os.path.join(path, trains), wav_file))
        signal_resampled = sample
        if len(signal_resampled) != audio_length:
            zeros_needed = audio_length - len(signal_resampled)
            signal_resampled = np.append(signal_resampled, np.zeros((zeros_needed))) # if the length is not right just pad with zeros
        train_samples.append(signal_resampled)
        train_labels.append(trains)

In [90]:
print(len(train_samples))
print(len(train_labels))

23682
23682


In [52]:
# converting labels from text to int
le = LabelEncoder()
y_all = le.fit_transform(train_labels)
y_all_hot = keras.utils.np_utils.to_categorical(y_all, len(le.classes_))
x_train, x_test, y_train_one_hot, y_test_one_hot = train_test_split(train_samples, y_all_hot, test_size=0.2, random_state=42)

In [53]:
print(len(x_train))
print(len(x_test))

18945
4737


In [56]:
x_train_mslfb = []
for ind in range(len(x_train)):
    x_train_mslfb.append(MSLFB(x_train[ind]))

In [57]:
x_test_mslfb = []
for ind in range(len(x_test)):
    x_test_mslfb.append(MSLFB(x_test[ind]))

In [60]:
x_train_mslfb_reshaped = np.reshape(x_train_mslfb, (len(x_train_mslfb), len(x_train_mslfb[0]), len(x_train_mslfb[0][0]), 1))

In [64]:
x_train_mslfb_reshaped.shape

(18945, 98, 40, 1)

In [65]:
x_test_mslfb_reshaped = np.reshape(x_test_mslfb, (len(x_test_mslfb), len(x_test_mslfb[0]), len(x_test_mslfb[0][0]), 1))

In [67]:
input_shape_mslfb = (98, 40, 1)
mslfb_input = Input(input_shape_mslfb)
drop_out_rate = 0.5

In [130]:
model_mslfb = tf.keras.models.Sequential([
    keras.layers.Input(shape=input_shape_mslfb),
    keras.layers.Resizing(32, 32),
    keras.layers.Normalization(),
    keras.layers.Conv2D(32, 3, activation='relu'),
    keras.layers.Conv2D(64, 3, activation='relu'),
    keras.layers.MaxPool2D(),
    keras.layers.Dropout(0.25),
    keras.layers.Flatten(),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dropout(0.5),
    keras.layers.Dense(10)
])

model_mslfb.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing_1 (Resizing)       (None, 32, 32, 1)         0         
                                                                 
 normalization_1 (Normalizat  (None, 32, 32, 1)        3         
 ion)                                                            
                                                                 
 conv2d_51 (Conv2D)          (None, 30, 30, 32)        320       
                                                                 
 conv2d_52 (Conv2D)          (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d_22 (MaxPoolin  (None, 14, 14, 64)       0         
 g2D)                                                            
                                                                 
 dropout_75 (Dropout)        (None, 14, 14, 64)      

In [131]:
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10):
    '''
    Wrapper function to create a LearningRateScheduler with step decay schedule.
    '''

    def schedule(epoch):
        return initial_lr * (decay_factor ** np.floor(epoch / step_size))

    return LearningRateScheduler(schedule)

In [137]:
optimizer = Adam(learning_rate=0.001)
model_mslfb.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])
lr_sched = step_decay_schedule(initial_lr=0.001, decay_factor=0.97, step_size=5)
cb_earlystop = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', min_delta=0.003, patience=30, mode='max')


In [138]:
history = model_mslfb.fit(x_train_mslfb_reshaped,
                             y_train_one_hot,
                             batch_size=64,
                             epochs=100,
                             callbacks=tf.keras.callbacks.EarlyStopping(verbose=1, patience=2),
                             validation_data=(x_test_mslfb_reshaped, y_test_one_hot),
                             verbose=1)

Epoch 1/100

KeyboardInterrupt: 

In [144]:
x_train_mfcc = []
for ind in range(len(x_train)):
    x_train_mfcc.append(MFCC(x_train[ind]))

In [145]:
x_test_mfcc = []
for ind in range(len(x_test)):
    x_test_mfcc.append(MFCC(x_test[ind]))

In [146]:
x_train_mfcc_reshaped = np.reshape(x_train_mfcc, (len(x_train_mfcc), len(x_train_mfcc[0]), len(x_train_mfcc[0][0]), 1))
x_test_mfcc_reshaped = np.reshape(x_test_mfcc, (len(x_test_mfcc), len(x_test_mfcc[0]), len(x_test_mfcc[0][0]), 1))

In [161]:
print(x_train_mfcc_reshaped.shape)

(18945, 98, 12, 1)


In [157]:
input_shape_mfcc = (98, 12, 1)
mfcc_input = Input(input_shape_mfcc)

In [158]:
model_mfcc = tf.keras.models.Sequential([
  tf.keras.layers.Conv2D(filters=16, kernel_size=(4, 4), padding='valid', activation='relu', input_shape=(x_train_mfcc_reshaped.shape[1], x_train_mfcc_reshaped.shape[2], 1)), #, kernel_regularizer=keras.regularizers.l2(0.0001)
  tf.keras.layers.Dropout(drop_out_rate),
  tf.keras.layers.Conv2D(filters=32, kernel_size=(4, 4), padding='valid', activation='relu'),
  tf.keras.layers.MaxPooling2D(2, 2),
  tf.keras.layers.Dropout(drop_out_rate),
  tf.keras.layers.Conv2D(filters=64, kernel_size=(2, 2), padding='same', activation='relu'),
  tf.keras.layers.MaxPooling2D(2, 2),
  tf.keras.layers.Dropout(drop_out_rate),
  tf.keras.layers.Conv2D(filters=128, kernel_size=(2, 2), padding='same', activation='relu'),
  tf.keras.layers.Dropout(drop_out_rate),
  tf.keras.layers.Flatten(),
  tf.keras.layers.Dense(256,  activation='relu'),
  tf.keras.layers.Dropout(drop_out_rate),
  tf.keras.layers.Dense(128,  activation='relu'),
  tf.keras.layers.Dropout(drop_out_rate),
  tf.keras.layers.Dense(10, activation='relu')
])

model_mslfb.summary()

Model: "sequential_14"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 resizing_1 (Resizing)       (None, 32, 32, 1)         0         
                                                                 
 normalization_1 (Normalizat  (None, 32, 32, 1)        3         
 ion)                                                            
                                                                 
 conv2d_51 (Conv2D)          (None, 30, 30, 32)        320       
                                                                 
 conv2d_52 (Conv2D)          (None, 28, 28, 64)        18496     
                                                                 
 max_pooling2d_22 (MaxPoolin  (None, 14, 14, 64)       0         
 g2D)                                                            
                                                                 
 dropout_75 (Dropout)        (None, 14, 14, 64)      

In [159]:
model_mfcc.compile(loss='categorical_crossentropy', optimizer=tf.keras.optimizers.Adam(), metrics=['accuracy'])

In [162]:
history = model_mfcc.fit(x_train_mfcc_reshaped,
                             y_train_one_hot,
                             batch_size=64,
                             epochs=100,
                             callbacks=[lr_sched, cb_earlystop],
                             validation_data=(x_test_mslfb_reshaped, y_test_one_hot),
                             verbose=1)

Epoch 1/100

ValueError: in user code:

    File "D:\Anaconda\lib\site-packages\keras\engine\training.py", line 1820, in test_function  *
        return step_function(self, iterator)
    File "D:\Anaconda\lib\site-packages\keras\engine\training.py", line 1804, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "D:\Anaconda\lib\site-packages\keras\engine\training.py", line 1792, in run_step  **
        outputs = model.test_step(data)
    File "D:\Anaconda\lib\site-packages\keras\engine\training.py", line 1756, in test_step
        y_pred = self(x, training=False)
    File "D:\Anaconda\lib\site-packages\keras\utils\traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "D:\Anaconda\lib\site-packages\keras\engine\input_spec.py", line 295, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_18" is incompatible with the layer: expected shape=(None, 98, 12, 1), found shape=(None, 98, 40, 1)
