<a href="https://colab.research.google.com/github/JacobeCode/Neural_Network_For_Speech_Recognition/blob/main/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install theano
!pip install speechpy

# Main modules import
import keras
import librosa
import tensorflow
import os
import datetime
import speechpy

# Side modules import
import numpy as np
import theano as te
import matplotlib as plt
import pandas as pd

# Needed parts - import
from pathlib import Path
from google.colab import drive
from datetime import datetime
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout, Activation, Reshape, Flatten, LSTM, Conv2D, MaxPooling2D
from keras.optimizers import Adam, SGD
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.losses import categorical_crossentropy, binary_crossentropy, sparse_categorical_crossentropy
from keras.utils.np_utils import to_categorical
from IPython.display import Audio, display
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, LeaveOneOut, StratifiedKFold

# Root directory for files - for different directory - change here

# Python local version

# root_dir = Path.cwd()
# train_set_dir = os.path.join(Path.cwd(), "Recordings")
# train_set_list = os.listdir(train_set_dir)
# eval_set_dir = os.path.join(Path.cwd(), "Eval_Recordings")
# eval_set_list = os.listdir(eval_set_dir)

# Colab with Drive version
drive.mount('/content/gdrive/')

root_dir = '/content/gdrive/MyDrive/Speech_Recognition'
model_dir = '/content/gdrive/MyDrive/Speech_Recognition/Model'
CSV_dir = '/content/gdrive/MyDrive/Speech_Recognition/CSV'
train_set_dir = '/content/gdrive/MyDrive/Speech_Recognition/Recordings'
train_set_list = os.listdir(train_set_dir)
eval_set_dir = '/content/gdrive/MyDrive/Speech_Recognition/Eval_Recordings'
eval_set_list = os.listdir(eval_set_dir)
os.chdir(root_dir)

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [39]:
# Parameters for comfortable changes later
n_mfcc = 13                                     # Number of MFCC coefficients
win_length = 512                                # Length of MFCC window
n_batch_size = 128                              # Batch Size
n_epoch = 10                                    # Number of Epoch's to perform

add_display = 1
loss_function = 'categorical_crossentropy'      # Type of loss function
optimizer = 'SGD'                              # Type of optimizer
metric_type = 'accuracy'                        # Type of testing metric
test_percentage = 0.1                           # How much of data will be selected to test the training

In [14]:
# Data pre-processing - building database
eval_set_base = pd.DataFrame()

os.chdir(eval_set_dir)
MFCC_list = []
delta_list = []
records = []
for count, audio in enumerate(eval_set_list):
    data, fs = librosa.load(audio, sr=None)
    data = data/max(np.abs(data))
    for iteration, item in enumerate(data):
        if np.abs(item) < 0.01:
            np.delete(data, iteration)
    MFCC = librosa.feature.mfcc(y=data, sr=fs, n_mfcc=n_mfcc, hop_length=int(win_length/2), win_length=win_length, n_fft=win_length)
    MFCC_list.append(MFCC)
    records.append(audio)
MFCC_temp = []
max_len = max(np.shape(x)[1] for x in MFCC_list)
for item in MFCC_list:
    temp = []
    n_add = max_len - np.shape(item)[1]
    for coeff in item:
      temp.append(np.append(coeff, np.zeros(n_add)))
    MFCC_temp.append(temp)
    del temp
MFCC_list = MFCC_temp
del MFCC_temp

for item in MFCC_list:
    np.reshape(item, (n_mfcc, max_len, 1))

eval_set_base["Record"] = records
eval_set_base["MFCC"] = MFCC_list
os.chdir(root_dir)

del MFCC_list
del records

In [15]:
# Data pre-processing - building database
train_set_base = pd.DataFrame()

os.chdir(train_set_dir)
MFCC_list = []
labels = []
records = []
for count, audio in enumerate(train_set_list):
    data, fs = librosa.load(audio, sr=None)
    data = data/max(np.abs(data))
    for iteration, item in enumerate(data):
        if np.abs(item) < 0.01:
            np.delete(data, iteration)
    MFCC = librosa.feature.mfcc(y=data, sr=fs, n_mfcc=n_mfcc, hop_length=int(win_length/2), win_length=win_length, n_fft=win_length)
    labels.append(audio[6])
    records.append(audio)
    MFCC_list.append(MFCC)
MFCC_temp = []
for item in MFCC_list:
    temp = []
    n_add = max_len - np.shape(item)[1]
    for coeff in item:
      temp.append(np.append(coeff, np.zeros(n_add)))
    MFCC_temp.append(temp)
    del temp
MFCC_list = MFCC_temp
del MFCC_temp

for item in MFCC_list:
    np.reshape(item, (n_mfcc, max_len, 1))

train_set_base["Record"] = records
train_set_base["MFCC"] = MFCC_list
train_set_base["Labels"] = labels
os.chdir(root_dir)

del MFCC_list
del labels
del records

In [16]:
# CMVN Normalization

for count, item in enumerate(train_set_base["MFCC"]):
    train_set_base["MFCC"].replace(speechpy.processing.cmvn(np.asarray(item)))

In [17]:
for count, item in enumerate(eval_set_base["MFCC"]):
    eval_set_base["MFCC"].replace(speechpy.processing.cmvn(np.asarray(item)))

In [18]:
print("Training Recordings database : ")
display(train_set_base)

print("Eval Recordings database : ")
display(eval_set_base)

Training Recordings database : 


Unnamed: 0,Record,MFCC,Labels
0,AO1M1_2_.wav,"[[-662.9744262695312, -631.0436401367188, -568...",2
1,AO1M1_0_.wav,"[[-612.0942993164062, -573.1663818359375, -417...",0
2,AO1M1_1_.wav,"[[-571.8582763671875, -549.147216796875, -550....",1
3,AO1M1_3_.wav,"[[-595.4745483398438, -432.62554931640625, -26...",3
4,AO1M1_9_.wav,"[[-618.9461669921875, -591.8220825195312, -412...",9
...,...,...,...
215,SW1M1_8_.wav,"[[-590.7490234375, -406.0035705566406, -288.78...",8
216,SW1M1_3_.wav,"[[-614.0647583007812, -439.59356689453125, -23...",3
217,SW1M1_5_.wav,"[[-672.4749145507812, -453.1002502441406, -306...",5
218,SW1M1_7_.wav,"[[-596.5587768554688, -483.9967041015625, -410...",7


Eval Recordings database : 


Unnamed: 0,Record,MFCC
0,040.wav,"[[-638.1614990234375, -542.1619262695312, -438..."
1,058.wav,"[[-344.8790588378906, -298.9581298828125, -290..."
2,004.wav,"[[-618.7013549804688, -599.5948486328125, -599..."
3,029.wav,"[[-402.5004577636719, -392.03924560546875, -39..."
4,064.wav,"[[-449.2403259277344, -417.07275390625, -429.5..."
...,...,...
655,650.wav,"[[-487.6944274902344, -455.0632629394531, -414..."
656,647.wav,"[[-548.4116821289062, -543.112060546875, -549...."
657,643.wav,"[[-615.047607421875, -604.5725708007812, -602...."
658,639.wav,"[[-609.5606689453125, -601.455322265625, -310...."


In [40]:
model = Sequential()
model.add(Conv2D(32, (6,6), padding="valid", activation="relu", input_shape=(n_mfcc, max_len, 1)))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Conv2D(16, (3,3), padding="valid", activation="relu", input_shape=(n_mfcc/2, max_len/2, 1)))
model.add(MaxPooling2D(pool_size=(2,2)))
model.add(Flatten(input_shape=(n_mfcc/4, max_len/4, 1)))
model.add(Activation('relu'))
model.add(Dense(128))
model.add(Dropout(0.5))
model.add(Activation('relu'))
model.add(Dense(10))
model.add(Activation('softmax'))

if optimizer == 'Adam':
    adam = Adam(learning_rate=0.001)
    model.compile(loss=loss_function, metrics=[metric_type, loss_function], optimizer=adam)
elif optimizer == 'SGD':
    sgd = SGD(learning_rate=0.001)
    model.compile(loss=loss_function, metrics=[metric_type, loss_function], optimizer=sgd)

model.summary()

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv2d_8 (Conv2D)           (None, 8, 121, 32)        1184      
                                                                 
 max_pooling2d_8 (MaxPooling  (None, 4, 60, 32)        0         
 2D)                                                             
                                                                 
 conv2d_9 (Conv2D)           (None, 2, 58, 16)         4624      
                                                                 
 max_pooling2d_9 (MaxPooling  (None, 1, 29, 16)        0         
 2D)                                                             
                                                                 
 flatten_4 (Flatten)         (None, 464)               0         
                                                                 
 activation_13 (Activation)  (None, 464)              

In [41]:
X = np.array(train_set_base['MFCC'].tolist())
y_true = train_set_base['Labels'].tolist()

label_encoder = LabelEncoder()
y_true = to_categorical(label_encoder.fit_transform(y_true))

acc_per_fold = []
loss_per_fold = []
i = 0
kf = KFold(n_splits = 11)

# X_train, X_test, y_train, y_test = train_test_split(X, y_true, test_size=test_percentage, random_state=42)

for train_index, test_index in kf.split(X,  y_true):
    # print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y_true[train_index], y_true[test_index]
    start = datetime.now()
    history = model.fit(X_train, y_train, batch_size=n_batch_size, epochs=n_epoch, validation_data=(X_test, y_test), verbose=add_display)
    duration = datetime.now() - start

    results = model.evaluate(X_test, y_test, verbose=add_display)
    
    print(f'Score for fold {i}: {model.metrics_names[0]} of {results[0]}; {model.metrics_names[1]} of {results[1]*100}%')
    acc_per_fold.append(results[1] * 100)
    loss_per_fold.append(results[0])
    i += 1

print('Accuracy: ')
print(np.mean(acc_per_fold))
print('Loss: ')
print(np.mean(loss_per_fold))
del start

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 0: loss of 2.253592014312744; accuracy of 25.0%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 1: loss of 2.20428729057312; accuracy of 15.000000596046448%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 2: loss of 2.005000114440918; accuracy of 30.000001192092896%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 3: loss of 1.7763532400131226; accuracy of 44.999998807907104%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Score for fold 4: loss of 1.8676681518554688; accuracy of 40.00000059604645%
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/

In [37]:
model.save(model_dir)



In [38]:
model = load_model(model_dir)
X = np.array(eval_set_base['MFCC'].tolist())
predict = model.predict(X)

values_from_prediction = []
for item in predict:
    values_from_prediction.append(np.max(item))
class_predictions = np.argmax(predict, axis=-1)
print(class_predictions)

rd={"file": eval_set_base['Record'],
        "prediction":class_predictions,
      "values":values_from_prediction}
results_to_csv=pd.DataFrame(data=rd)
print(results_to_csv)
os.chdir(CSV_dir)
results_to_csv.to_csv('results_ia.csv', index=False, header = False, sep = ',')
os.chdir(root_dir)

[6 8 6 2 7 4 2 2 6 0 8 1 6 6 5 6 4 8 8 9 5 5 6 7 0 5 0 3 9 6 9 7 9 7 6 3 2
 9 5 9 4 9 7 3 6 9 7 9 9 0 6 6 6 9 8 6 7 9 9 4 9 1 2 6 9 6 7 8 9 8 5 9 9 4
 0 1 8 1 1 9 1 2 9 4 7 4 5 9 0 3 4 7 9 2 2 3 8 1 7 4 2 9 0 3 7 9 5 2 2 0 1
 6 3 1 3 5 9 4 7 5 4 8 8 6 7 6 7 5 2 2 1 6 9 7 3 1 3 0 7 2 1 8 0 9 2 5 3 9
 7 5 2 9 5 5 7 6 8 2 3 5 5 1 3 6 5 9 1 0 0 7 3 1 9 8 5 5 6 2 7 0 7 6 3 5 7
 0 0 0 7 2 8 6 4 8 9 7 9 4 7 8 1 9 9 0 9 1 9 9 9 9 3 3 9 2 8 0 1 7 7 4 9 3
 5 9 5 9 6 8 7 4 2 9 8 2 0 9 3 9 9 8 3 5 6 1 2 7 0 7 3 2 2 4 0 4 8 7 6 3 6
 9 7 7 0 7 0 9 7 3 4 8 9 2 8 8 1 7 2 5 6 8 7 0 1 4 0 5 8 9 9 2 2 9 5 2 9 7
 7 3 9 6 2 3 4 9 5 9 0 3 3 9 0 4 4 9 2 6 4 6 9 6 0 4 6 4 7 7 8 6 3 7 7 6 0
 9 9 9 7 8 5 2 9 4 1 4 2 9 2 7 7 9 7 5 7 0 7 4 9 7 6 0 5 2 5 4 9 7 2 8 9 0
 3 7 5 2 2 6 2 1 3 5 0 9 6 6 9 9 9 2 7 8 4 5 5 6 7 0 9 0 9 5 7 8 7 9 8 2 0
 0 6 5 5 7 4 0 9 8 8 8 4 2 3 5 7 1 3 7 9 2 2 6 5 3 7 4 6 9 2 5 1 6 5 6 9 2
 0 5 2 9 4 7 5 4 1 6 2 0 1 9 0 0 1 4 3 0 9 9 2 7 7 2 9 8 9 2 5 9 1 2 6 4 9
 6 9 9 9 3 5 2 7 9 1 9 9 