In [1]:
# Mount Google Drive into the Colab runtime at /content/gdrive.
# This gives the notebook access to the files stored in your Google Drive.
from google.colab import drive
drive.mount("/content/gdrive", force_remount=True)

Mounted at /content/gdrive


In [2]:
# Change the current working directory to where you want to download the Kaggle dataset
%cd /content/gdrive/MyDrive/

/content/gdrive/MyDrive


In [3]:
# Import the relevant modules to be used later
import os
import struct
import sys
import re 
from tqdm import tqdm
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from glob import glob
import glob 
import shutil
import time
import wave
import pathlib
from IPython.display import display, Audio
import IPython

# Math
import numpy as np # linear algebra
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile as wav
from scipy.io import wavfile
import librosa
from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display
    
# Config matplotlib for inline plotting
%matplotlib inline

In [4]:
# Root directory of the dataset (speech_commands_v02) on the mounted Drive.
# The cells below treat each sub-directory of this path as one label.
train_data_dir = '/content/gdrive/MyDrive/speech_commands_v02/'

In [5]:
# Build a dataframe with one row per .wav file: its label (the name of the
# sub-directory it lives in), its file name and its full path.
# The '_background_noise_' directory is left out.
train_labels = os.listdir(train_data_dir)
# NOTE: `wav` (the list of file names) rebinds the `wav` alias imported from
# scipy.io at the top of the notebook; later cells use `wavfile` instead.
wav, labels, paths = [], [], []
for label in train_labels:
    label_dir = os.path.join(train_data_dir, label)
    # Skip the excluded directory and any plain file that sits next to the
    # label directories (os.listdir on a file would raise NotADirectoryError).
    if label == '_background_noise_' or not os.path.isdir(label_dir):
        continue
    for file_name in os.listdir(label_dir):
        # Ignore anything that is not an audio clip. The list being iterated
        # (train_labels) must not be modified here: removing entries from it
        # mid-loop skips labels and fails on the second removal.
        if not file_name.endswith('wav'):
            continue
        wav.append(file_name)
        labels.append(label)
        paths.append(train_data_dir + label + '/' + file_name)

label_df = pd.DataFrame(labels, columns=['label'])
file_df = pd.DataFrame(wav, columns=['file'])
path_df = pd.DataFrame(paths, columns=['path'])
# Columns are aligned by position: row i of each frame describes the same file
wav_df = pd.concat([label_df, file_df, path_df], axis=1)

In [6]:
# Display the dataframe of labels, file names and paths built above
wav_df

Unnamed: 0,label,file,path
0,sheila,88a487ce_nohash_0.wav,/content/gdrive/MyDrive/speech_commands_v02/sh...
1,sheila,68effe85_nohash_0.wav,/content/gdrive/MyDrive/speech_commands_v02/sh...
2,sheila,56eb74ae_nohash_0.wav,/content/gdrive/MyDrive/speech_commands_v02/sh...
3,sheila,f33660af_nohash_0.wav,/content/gdrive/MyDrive/speech_commands_v02/sh...
4,sheila,14587ff0_nohash_0.wav,/content/gdrive/MyDrive/speech_commands_v02/sh...
...,...,...,...
105824,five,2bdbe5f7_nohash_1.wav,/content/gdrive/MyDrive/speech_commands_v02/fi...
105825,five,91b03183_nohash_2.wav,/content/gdrive/MyDrive/speech_commands_v02/fi...
105826,five,64e48f55_nohash_2.wav,/content/gdrive/MyDrive/speech_commands_v02/fi...
105827,five,b5cf6ea8_nohash_10.wav,/content/gdrive/MyDrive/speech_commands_v02/fi...


In [7]:
# The 35 class names, listed in alphabetical order
labels = [
    'backward', 'bed', 'bird', 'cat', 'dog',
    'down', 'eight', 'five', 'follow', 'forward',
    'four', 'go', 'happy', 'house', 'learn',
    'left', 'marvin', 'nine', 'no', 'off',
    'on', 'one', 'right', 'seven', 'sheila',
    'six', 'stop', 'three', 'tree', 'two',
    'up', 'visual', 'wow', 'yes', 'zero',
]

In [None]:
# Histogram of the clip durations in seconds (number of samples / sample rate)
diarkeia = []
for label in labels:
    label_dir = train_data_dir + '/' + label
    for w in os.listdir(label_dir):
        if not w.endswith('.wav'):
            continue
        sample_rate, samples = wavfile.read(label_dir + '/' + w)
        diarkeia.append(float(len(samples) / sample_rate))
d = np.array(diarkeia)
plt.hist(d)

In [None]:
# Histogram of the clip lengths, measured in samples
lengths = []
for label in labels:
    label_dir = os.path.join(train_data_dir, label)
    for file_name in os.listdir(label_dir):
        if not file_name.endswith('.wav'):
            continue
        # sr=None makes librosa keep the file's own sample rate, so no
        # resampling happens and the file does not have to be opened a second
        # time with wavfile.read() just to learn that rate.
        samples, sr = librosa.load(os.path.join(label_dir, file_name), sr=None)
        lengths.append(len(samples))

l = np.array(lengths)
plt.hist(l)

In [None]:
# Function for the Feature Extraction
def feature_(samples, sr, verbose=False):
    """Compute MFCCs and their first- and second-order deltas for one signal.

    Parameters
    ----------
    samples : np.ndarray
        Audio time series, passed to ``librosa.feature.mfcc`` as ``y``.
    sr : int
        Sampling rate of ``samples``. It is forwarded to librosa instead of
        assuming a fixed 16000 Hz.
    verbose : bool, optional
        When True, print the shape of the feature matrix (debugging aid).
        Off by default so that processing many files does not flood the output.

    Returns
    -------
    np.ndarray
        The 13 MFCC rows, followed by the 13 delta rows and the 13
        delta-delta rows, stacked along axis 0.
    """
    # y/sr are passed by keyword: recent librosa releases reject them positionally
    mfcc = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=13)
    delta1 = librosa.feature.delta(mfcc, order=1)
    delta2 = librosa.feature.delta(mfcc, order=2)
    features_ = np.concatenate((mfcc, delta1, delta2), axis=0)

    if verbose:
        print(features_.shape)
    return features_

In [None]:
def maxlength(npath, lengths):
    """Append the number of samples of the audio file ``npath`` to ``lengths``.

    Parameters
    ----------
    npath : str
        Path of the audio file to measure.
    lengths : list
        List that collects the sample counts; it is modified in place.

    Returns
    -------
    list
        The same ``lengths`` object, after the append.
    """
    # sr=None loads the file at its own sample rate (no resampling), which makes
    # the extra wavfile.read() to look up that rate unnecessary. `sr` must be
    # given by keyword: recent librosa releases reject it positionally.
    samples, _ = librosa.load(npath, sr=None)
    lengths.append(len(samples))
    return lengths

In [None]:
# Sample count of every clip after padding, filled by padding()
len2 = []
def padding(npath, mlength):
    """Load an audio file, zero-pad it to ``mlength`` samples and extract features.

    Parameters
    ----------
    npath : str
        Path of the audio file.
    mlength : int
        Target length in samples. Clips that are already this long or longer
        are left unchanged (they are not truncated).

    Returns
    -------
    np.ndarray
        The feature matrix returned by ``feature_`` for the (padded) clip.

    Side effects
    ------------
    Appends the final sample count of the clip to the module-level ``len2``.
    """
    # sr=None keeps the file's own sample rate; `sr` must be passed by keyword
    samples, srate = librosa.load(npath, sr=None)

    missing = mlength - len(samples)
    if missing > 0:
        # Centre the clip. When `missing` is odd the extra zero goes on the
        # right, so the result is exactly mlength samples long.
        left = missing // 2
        samples = np.pad(samples, pad_width=(left, missing - left), mode='constant')

    len2.append(len(samples))
    # Features are extracted once, after any padding
    return np.array(feature_(samples, srate))

In [None]:
lengths = []
X_tr = []
Y_tr = []

# First pass: record the sample count of every clip (maxlength appends to `lengths`)
for path in wav_df.path:
    maxlength(path, lengths)
if not lengths:
    # Without this check an empty dataframe would only surface as a NameError
    raise ValueError('wav_df contains no audio files')
length2 = max(lengths)

# Second pass: pad every clip to the longest length and extract its features
for label, path in zip(wav_df.label, wav_df.path):
    X_tr.append(padding(path, length2))
    Y_tr.append(label)

In [None]:
import tensorflow as tf
import tensorflow as tf
from keras import utils as np_utils 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

# Standardise the features. StandardScaler works on 2-D input, so the array is
# flattened to (-1, size of the last axis), scaled, and reshaped back.
# NOTE(review): the scaler is fitted on every sample, before the train/test
# split done in a later cell, so the test data influence the statistics.
scaler = StandardScaler()
X_simple = np.array(X_tr)   # unscaled copy of the features
X_tr = np.array(X_tr)
original_shape = X_tr.shape
flattened = X_tr.reshape(-1, original_shape[-1])
X_tr = scaler.fit_transform(flattened).reshape(original_shape)
print(X_tr.shape)

In [None]:
# Encode the labels: strings -> integer ids -> one-hot rows.
# The first label is printed before and after the conversion.
print(Y_tr[0])
le = LabelEncoder()
label_ids = le.fit_transform(Y_tr)
Y_tr = tf.keras.utils.to_categorical(label_ids)
print(Y_tr[0])

In [None]:
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import *
#from keras.layers import Dense, Dropout, Flatten, Input, Conv2D, MaxPooling2D, BatchNormalization
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Nadam, Adamax
from keras.callbacks import ReduceLROnPlateau, ModelCheckpoint, EarlyStopping
from keras.constraints import maxnorm

from keras import backend as K
# Clear Keras' global state before the models below are built
K.clear_session()

import warnings
# If no warning options were supplied to the interpreter, ignore every warning
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Additionally ignore DeprecationWarning explicitly
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [None]:
# Hold out 20% of the samples as the test set (shuffled, fixed seed)
x_train, x_test, y_train, y_test = train_test_split(
    X_tr,
    Y_tr,
    test_size=0.2,
    shuffle=True,
    random_state=1,
)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [None]:
# Append a trailing axis of size 1 to the feature arrays.
# The ndim check makes the cell safe to re-run: without it every execution
# would stack one more axis onto the arrays.
# NOTE(review): the models below declare input_shape with only two dimensions
# (x_train.shape[1], x_train.shape[2]) - confirm this extra axis is needed.
if x_train.ndim == 3:
    x_train = np.expand_dims(x_train, axis=3)
if x_test.ndim == 3:
    x_test = np.expand_dims(x_test, axis=3)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

In [None]:
# Print the two dimensions that become the model input shape, one per line
second_dim = x_train.shape[1]
third_dim = x_train.shape[2]
print(second_dim)
print(third_dim)

In [None]:
# Per-sample input shape given to the first layer of every model below
dim_1 = x_train.shape[1]
dim_2 = x_train.shape[2]
input_shape = (dim_1, dim_2)

In [None]:
# CNN1D + RNN LSTM (with one cnn1d layer)

In [None]:
# One Conv1D stage followed by two stacked LSTMs and a dense classifier head.
# NOTE: later cells rebind `model`; when the notebook is run top to bottom only
# the last architecture defined is compiled and trained.
model = tf.keras.models.Sequential([
    # Convolutional stage
    Conv1D(128, kernel_size=3, strides=1, padding='same', activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2, strides=1, padding='same'),
    BatchNormalization(),
    # Recurrent stage: the first LSTM returns the full sequence for the second one
    tf.keras.layers.LSTM(150, return_sequences=True, stateful=False),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dropout(0.2),
    # Classifier head: one softmax output per entry of `labels`
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(labels), activation='softmax'),
])
model.summary()

In [None]:
# Save a diagram of the current model, including layer shapes, to model.png
keras.utils.plot_model(model, to_file='model.png', show_shapes=True)

In [None]:
# CNN1D + RNN LSTM (with two cnn1d layers)

In [None]:
# Two Conv1D stages followed by two stacked LSTMs and a dense classifier head.
# NOTE: later cells rebind `model`; when the notebook is run top to bottom only
# the last architecture defined is compiled and trained.
model = tf.keras.models.Sequential([
    # First convolutional stage
    Conv1D(256, kernel_size=3, strides=1, padding='same', activation='relu', input_shape=input_shape),
    MaxPooling1D(pool_size=2, strides=1, padding='same'),
    BatchNormalization(),
    # Second convolutional stage
    Conv1D(128, kernel_size=3, strides=1, padding='same', activation='relu'),
    MaxPooling1D(pool_size=2, strides=1, padding='same'),
    BatchNormalization(),
    # Recurrent stage: the first LSTM returns the full sequence for the second one
    tf.keras.layers.LSTM(150, return_sequences=True, stateful=False),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dropout(0.2),
    # Classifier head: one softmax output per entry of `labels`
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(labels), activation='softmax'),
])
model.summary()

In [None]:
# Save a diagram of the current model, including layer shapes, to model2.png
keras.utils.plot_model(model, to_file='model2.png', show_shapes=True)

In [None]:
# RNN lstm

In [None]:
# Two stacked LSTMs followed by a dense classifier head (no convolution).
# NOTE: a later cell rebinds `model`; when the notebook is run top to bottom
# only the last architecture defined is compiled and trained.
model = tf.keras.models.Sequential([
    # The first LSTM returns the full sequence so the second one can consume it
    tf.keras.layers.LSTM(150, return_sequences=True, stateful=False, input_shape=input_shape),
    tf.keras.layers.LSTM(100),
    # Classifier head: one softmax output per entry of `labels`
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(labels), activation='softmax'),
])
model.summary()

In [None]:
# Save a diagram of the current model, including layer shapes, to model3.png
keras.utils.plot_model(model, to_file='model3.png', show_shapes=True)

In [None]:
# RNN GRU: define a GRU-based recurrent network architecture

In [None]:
# A single GRU layer followed by a dense classifier head.
# This is the last definition of `model`, i.e. the one the cells below
# compile and train when the notebook is run top to bottom.
model = tf.keras.models.Sequential([
    keras.layers.GRU(128, input_shape=input_shape),
    # Classifier head: one softmax output per entry of `labels`
    tf.keras.layers.Dense(512, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(len(labels), activation='softmax'),
])

model.summary()

In [None]:
# Adam optimizer with a learning rate of 1e-4
learning_rate = 1e-4
opt = tf.keras.optimizers.Adam(learning_rate=learning_rate)

In [None]:
# Multi-class setup: categorical cross-entropy on the one-hot targets,
# with accuracy reported as the metric
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Stop training once val_loss has failed to improve by at least 1e-4
# for 10 consecutive epochs
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    min_delta=0.0001,
    patience=10,
    verbose=1,
)

In [None]:
# Train for up to 200 epochs; early stopping may end the run sooner.
# `callbacks` is documented as a list of callback instances, so the single
# EarlyStopping object is wrapped in a list.
# NOTE(review): the held-out test split doubles as validation data here, so
# early stopping is driven by the same data the model is later evaluated on.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    callbacks=[es],
    epochs=200,
    batch_size=32,
)

In [None]:
# Accuracy per epoch on the training data and on the validation data
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('Model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch on the training data and on the validation data
for curve in ('loss', 'val_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Evaluate the trained model on both splits. evaluate() returns the loss
# followed by the compiled metrics, so index 1 is the accuracy.
train_score = model.evaluate(x_train, y_train)
accuracy = 100*train_score[1]
print("Accuracy of our model on train data: %.4f%%" % accuracy)
#-------------------------------------------------------------------
test_score = model.evaluate(x_test, y_test)
accuracy = 100*test_score[1]
# The closing parenthesis was missing here, which made the whole cell a SyntaxError
print("Accuracy of our model on test data: %.4f%%" % accuracy)

In [None]:
# Class indices for the test set: predicted (arg-max of the model output)
# and actual (arg-max of the one-hot targets)
class_scores = model.predict(x_test)
pred = np.argmax(class_scores, axis=1)
label = np.argmax(y_test, axis=1)

print(pred)
print(label)

In [None]:
# Side-by-side table of predicted and actual class indices
df = pd.DataFrame({
    'Predicted Labels': pred.flatten(),
    'Actual Labels': label.flatten(),
})

df.head(10)

In [None]:
# Confusion matrix
from sklearn.metrics import confusion_matrix
import seaborn as sn
# Predicted and true class indices for the test set
prediction= model.predict(x_test)
y_1=prediction.argmax(axis=-1)
y_2=y_test.argmax(axis=1)
print(y_1)
print(y_2)

# Class names used to label the rows/columns of the matrix.
# NOTE(review): assumes this list is in the same order as the integer ids
# assigned by the LabelEncoder - confirm against le.classes_.
classes = [ 'backward', 'bed', 'bird', 'cat', 'dog', 'down', 'eight', 'five', 'follow', 'forward', 'four', 'go', 'happy', 'house', 'learn', 'left', 'marvin', 'nine', 'no', 'off', 'on', 'one', 'right', 'seven', 'sheila', 'six', 'stop', 'three', 'tree', 'two', 'up', 'visual', 'wow', 'yes', 'zero' ]
# labels= pins the matrix to one row/column per class, even if some class is
# missing from the test split; otherwise the matrix would be smaller than
# `classes` and building the dataframe below would fail.
conf_matrix=confusion_matrix(y_2, y_1, labels=list(range(len(classes))))
df =pd.DataFrame(conf_matrix, index=classes, columns=classes)
plt.figure(figsize = (35,35))
# fmt='d' prints the counts as plain integers instead of scientific notation
splot = sn.heatmap(df, annot=True, fmt='d')
plt.title("Confusion Matrix", fontsize=30)
plt.ylabel("True Class"     , fontsize=30)
plt.xlabel("Predicted Class", fontsize=30)
plt.show()

In [None]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 of the predictions (y_1) against the truth (y_2)
report = classification_report(y_2, y_1)
print(report)

In [None]:
from keras.models import load_model
# Persist the trained model to an HDF5 file in the current working directory
model.save(filepath="SpeechRModel_2.h5")