# Dataset

# Importing libraries

In [34]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau, LearningRateScheduler
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.layers.recurrent import LSTM

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)

# Dataset Preparation

In [12]:
# Paths for data.
Ravdess = "../dataset/Ravdess/audio_speech_actors_01-24/"
Crema = "../dataset/Crema/"
Tess = "../dataset/Tess/"
Savee = "../dataset/Savee/"

# Ravdees

In [13]:
ravdess_directory_list = os.listdir(Ravdess)

file_emotion = []
file_path = []
for dir in ravdess_directory_list:
    # as their are 24 different actors in our previous directory we need to extract files for each actor.
    actor = os.listdir(Ravdess + dir)
    for file in actor:
        part = file.split('.')[0]
        part = part.split('-')
        # third number in each file represents the emotion associated to that file.
        file_emotion.append(int(part[2]))
        # storing the path of the file
        file_path.append(Ravdess + dir + '/' + file)

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])

# dataframe of the emotions and path combined.
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# changing integers to actual emotions.
Ravdess_df.Emotions.replace({1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}, inplace=True)

# shuffling the data
Ravdess_df = Ravdess_df.sample(frac=1).reset_index(drop=True)
Ravdess_df.head(10)

Unnamed: 0,Emotions,Path
0,happy,../dataset/Ravdess/audio_speech_actors_01-24/A...
1,happy,../dataset/Ravdess/audio_speech_actors_01-24/A...
2,neutral,../dataset/Ravdess/audio_speech_actors_01-24/A...
3,disgust,../dataset/Ravdess/audio_speech_actors_01-24/A...
4,fear,../dataset/Ravdess/audio_speech_actors_01-24/A...
5,sad,../dataset/Ravdess/audio_speech_actors_01-24/A...
6,fear,../dataset/Ravdess/audio_speech_actors_01-24/A...
7,surprise,../dataset/Ravdess/audio_speech_actors_01-24/A...
8,neutral,../dataset/Ravdess/audio_speech_actors_01-24/A...
9,calm,../dataset/Ravdess/audio_speech_actors_01-24/A...


# Crema

In [14]:
crema_directory_list = os.listdir(Crema)

file_emotion = []
file_path = []

for file in crema_directory_list:
    # storing file paths
    file_path.append(Crema + file)
    # storing file emotions
    part=file.split('_')
    if part[2] == 'SAD':
        file_emotion.append('sad')
    elif part[2] == 'ANG':
        file_emotion.append('angry')
    elif part[2] == 'DIS':
        file_emotion.append('disgust')
    elif part[2] == 'FEA':
        file_emotion.append('fear')
    elif part[2] == 'HAP':
        file_emotion.append('happy')
    elif part[2] == 'NEU':
        file_emotion.append('neutral')
    else:
        file_emotion.append('Unknown')

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Crema_df = pd.concat([emotion_df, path_df], axis=1)
Crema_df = Crema_df.sample(frac=1).reset_index(drop=True)
Crema_df

Unnamed: 0,Emotions,Path
0,angry,../dataset/Crema/1067_IEO_ANG_LO.wav
1,fear,../dataset/Crema/1054_IEO_FEA_MD.wav
2,fear,../dataset/Crema/1075_TIE_FEA_XX.wav
3,disgust,../dataset/Crema/1041_IEO_DIS_MD.wav
4,happy,../dataset/Crema/1086_IWW_HAP_XX.wav
...,...,...
7437,fear,../dataset/Crema/1076_WSI_FEA_XX.wav
7438,happy,../dataset/Crema/1028_IEO_HAP_HI.wav
7439,happy,../dataset/Crema/1041_TIE_HAP_XX.wav
7440,angry,../dataset/Crema/1031_DFA_ANG_XX.wav


# TESS

In [15]:
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

for dir in tess_directory_list:
    directories = os.listdir(Tess + dir)
    for file in directories:
        part = file.split('.')[0]
        part = part.split('_')[2]
        if part=='ps':
            file_emotion.append('surprise')
        else:
            file_emotion.append(part)
        file_path.append(Tess + dir + '/' + file)

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Tess_df = pd.concat([emotion_df, path_df], axis=1)
Tess_df = Tess_df.sample(frac=1).reset_index(drop=True)

Tess_df.head()

Unnamed: 0,Emotions,Path
0,disgust,../dataset/Tess/OAF_disgust/OAF_bath_disgust.wav
1,sad,../dataset/Tess/YAF_sad/YAF_soup_sad.wav
2,happy,../dataset/Tess/OAF_happy/OAF_mess_happy.wav
3,angry,../dataset/Tess/OAF_angry/OAF_whip_angry.wav
4,neutral,../dataset/Tess/OAF_neutral/OAF_chalk_neutral.wav


# Savee

In [16]:
savee_directory_list = os.listdir(Savee)

file_emotion = []
file_path = []

for file in savee_directory_list:
    file_path.append(Savee + file)
    part = file.split('_')[1]
    ele = part[:-6]
    if ele=='a':
        file_emotion.append('angry')
    elif ele=='d':
        file_emotion.append('disgust')
    elif ele=='f':
        file_emotion.append('fear')
    elif ele=='h':
        file_emotion.append('happy')
    elif ele=='n':
        file_emotion.append('neutral')
    elif ele=='sa':
        file_emotion.append('sad')
    else:
        file_emotion.append('surprise')

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Savee_df = pd.concat([emotion_df, path_df], axis=1)
Savee_df = Savee_df.sample(frac=1).reset_index(drop=True)

Savee_df.head()

Unnamed: 0,Emotions,Path
0,sad,../dataset/Savee/JK_sa04.wav
1,angry,../dataset/Savee/JE_a10.wav
2,sad,../dataset/Savee/JK_sa14.wav
3,sad,../dataset/Savee/DC_sa12.wav
4,fear,../dataset/Savee/JE_f10.wav


In [17]:
# creating Dataframe using all the 4 dataframes we created so far.
data_path = pd.concat([Ravdess_df, Crema_df, Tess_df, Savee_df], axis = 0)
data_path.to_csv("../utils/data_path.csv",index=False)
data_path.head()

Unnamed: 0,Emotions,Path
0,happy,../dataset/Ravdess/audio_speech_actors_01-24/A...
1,happy,../dataset/Ravdess/audio_speech_actors_01-24/A...
2,neutral,../dataset/Ravdess/audio_speech_actors_01-24/A...
3,disgust,../dataset/Ravdess/audio_speech_actors_01-24/A...
4,fear,../dataset/Ravdess/audio_speech_actors_01-24/A...


In [18]:
# To make our model to learn and adapt to real-life situations, as noise stimulates the real-life situations
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

# Stretching the audio file will include making it fast and slow, allowing our model to detect emotions in both slow and fast speech.
def stretch(data, rate=0.8):
    return librosa.effects.time_stretch(data, rate=rate)

# Time shifting stimulates the start of the audio, allowing our model to learn when the audio starts in an audio file.
def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

# To help model with high and low pitches
def pitch(data, sampling_rate, pitch_factor=0.7):
    return librosa.effects.pitch_shift(data,sr=sampling_rate, n_steps=pitch_factor)

# Taking any example and checking for techniques.
path = np.array(data_path.Path)[1]
data, sample_rate = librosa.load(path)

In [19]:
def extract_features(data):
    # ZCR
    # The zero-crossing rate is the rate at which the audio signal changes from positive to negative or vice versa. It gives an idea of how noisy or percussive the signal is.
    # This feature is useful when playing with audios having high disturbance or unvoiced sounds.
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally

    # Chroma_stft
    # This feature is useful for identifying harmonic and pitch-related information, which can help distinguish different emotions in speech (e.g., sadness may have a different harmonic pattern than excitement).
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    # MFCCs are highly effective in speech and emotion recognition because they model how humans perceive sound, focusing on the most important features of speech.
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally

    # Root Mean Square Value
    # RMS is useful for distinguishing between different speech emotions. For example, speech with higher energy may indicate excitement or anger, while lower energy may indicate calmness or sadness.
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    # The Mel spectrogram captures the timbre and texture of the audio, which can be crucial for recognizing the emotional content of speech. For instance, anger or excitement might have more energy in higher-frequency bands, while sadness might be more present in lower frequencies.
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally

    return result

def get_features(path):
    # duration and offset are used to take care of the no audio in start and the ending of each audio files as seen above.
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # without augmentation
    res1 = extract_features(data)
    result = np.array(res1)

    # data with noise
    noise_data = noise(data)
    res2 = extract_features(noise_data)
    result = np.vstack((result, res2)) # stacking vertically

    # data with stretching and pitching
    new_data = stretch(data)
    data_stretch_pitch = pitch(new_data, sample_rate)
    res3 = extract_features(data_stretch_pitch)
    result = np.vstack((result, res3)) # stacking vertically

    return result

# Splitting Data
We are augmenting our dataset with 3 different techniques 
* Without Augmentation 
* Data with noise
* Data with stretching and pitching

In [20]:
%%time
X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        # appending emotion 3 times as we have made 3 augmentation techniques on each audio file.
        Y.append(emotion)

CPU times: total: 9min 17s
Wall time: 30min 53s


In [21]:
len(X), len(Y), data_path.Path.shape

(36486, 36486, (12162,))

In [24]:
Features = pd.DataFrame(X)
Features['labels'] = Y
Features.to_csv('../utils/features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,123,124,125,126,127,128,129,130,131,132,133,134,135,136,137,138,139,140,141,142,143,144,145,146,147,148,149,150,151,152,153,154,155,156,157,158,159,160,161,labels
0,0.219157,0.691433,0.667136,0.615862,0.636741,0.612271,0.563172,0.589636,0.577268,0.538605,0.57355,0.620119,0.679081,-389.310089,44.604862,-22.185759,8.823205,-6.492015,-17.190641,-12.072454,-10.507295,-9.845922,1.190978,-13.18554,-5.775838,-0.716264,-6.928237,-5.893147,-8.138218,-1.948143,-0.507945,1.597394,4.096745,0.028427,0.010388,0.002234,0.000901,0.000676,0.001357,0.024204,...,0.033903,0.024785,0.025461,0.007359,0.003066,0.005026,0.002904,0.002267,0.00351,0.008899,0.010553,0.015689,0.009416,0.011137,0.004788,0.004688,0.003358,0.003425,0.003716,0.003637,0.002551,0.003321,0.004074,0.003392,0.002228,0.001928,0.002257,0.003255,0.001603,0.001273,0.001556,0.003427,0.005109,0.006119,0.007552,0.005767,0.003263,0.001742,0.000233,happy
1,0.329454,0.745107,0.776924,0.755002,0.763561,0.764032,0.670636,0.659072,0.648103,0.624234,0.634966,0.659839,0.711413,-214.096323,14.352454,-2.522039,0.929195,-3.139962,-7.03721,-5.042682,-5.556265,-4.761097,-2.794272,-4.224562,-3.806318,-1.70788,-3.459285,-4.11184,-3.74604,-2.256215,-0.051007,0.845458,2.895086,0.03285,0.016199,0.008063,0.008627,0.009107,0.010071,0.028292,...,0.042021,0.03336,0.033674,0.013887,0.010415,0.011627,0.009836,0.009249,0.010294,0.016287,0.017275,0.023458,0.016971,0.018805,0.012488,0.011836,0.009995,0.010639,0.010178,0.010675,0.009386,0.010632,0.011498,0.009794,0.009208,0.009298,0.009217,0.010439,0.008639,0.008308,0.008255,0.010496,0.012512,0.014699,0.015044,0.012488,0.010466,0.008846,0.007468,happy
2,0.180624,0.690494,0.630426,0.612377,0.548845,0.602829,0.568806,0.583996,0.58641,0.561694,0.527813,0.561941,0.615647,-446.122284,41.60664,-25.104706,9.550559,-8.946218,-17.385601,-14.084661,-11.910576,-8.615192,1.579874,-13.617373,-1.983365,-2.542864,-7.308558,-6.744491,-5.923739,0.574276,1.544911,3.857584,5.22477,0.012973,0.002613,0.000618,0.000283,0.000284,0.000142,0.005552,...,0.004571,0.007452,0.007168,0.006677,0.003708,0.001116,0.000891,0.001125,0.000409,0.000523,0.001202,0.00273,0.0021,0.002061,0.002036,0.001869,0.00137,0.001009,0.000703,0.000822,0.000992,0.000917,0.000643,0.001072,0.000881,0.000537,0.000461,0.000383,0.000578,0.000453,0.000333,0.000258,0.000428,0.001138,0.001421,0.001315,0.001306,0.000455,4.3e-05,happy
3,0.155834,0.617245,0.597271,0.566049,0.538231,0.530569,0.443821,0.479569,0.547259,0.514224,0.483463,0.486443,0.576259,-327.932617,35.423176,-20.308094,4.352321,-11.217619,-3.791472,-22.04365,-15.588261,-11.681346,-19.524824,1.544646,-12.672573,-11.721163,-2.973376,-12.30769,-3.50448,-11.080645,-2.704814,-1.517587,-7.291881,0.051731,0.017751,0.001004,0.000718,0.000682,0.000677,0.004671,...,0.042382,0.007467,0.009406,0.004731,0.004817,0.008351,0.014432,0.026922,0.043614,0.063168,0.056716,0.070787,0.056882,0.041463,0.014382,0.010063,0.010356,0.007357,0.006951,0.004216,0.0058,0.005278,0.004526,0.004202,0.005765,0.005739,0.004184,0.003794,0.004106,0.004206,0.00471,0.004816,0.005614,0.006275,0.007023,0.006955,0.005516,0.002728,0.000197,happy
4,0.251135,0.719829,0.702356,0.649794,0.644915,0.627301,0.527844,0.532246,0.599953,0.570876,0.549004,0.576111,0.687266,-195.275082,19.604292,-3.163328,-0.12798,-4.280029,-2.259832,-9.010055,-7.46566,-4.56293,-7.164631,0.478484,-3.60318,-4.710583,-2.745778,-6.293137,-2.911897,-5.670152,-2.328435,-1.08583,-6.819318,0.054536,0.025172,0.008426,0.008803,0.007774,0.007983,0.012188,...,0.049392,0.015066,0.01705,0.012675,0.012021,0.015368,0.022189,0.035432,0.049493,0.068756,0.064688,0.079747,0.068307,0.049623,0.02127,0.017627,0.017413,0.014199,0.014324,0.011496,0.013734,0.012734,0.011704,0.01177,0.01302,0.013213,0.011557,0.011967,0.011737,0.011617,0.011539,0.012486,0.013517,0.013621,0.014454,0.014626,0.012705,0.010159,0.007395,happy


In [26]:
# Features = pd.read_csv("../utils/features.csv")
X = Features.iloc[: ,:-1].values
Y = Features['labels'].values

In [27]:
# As this is a multiclass classification problem onehotencoding our Y.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [28]:

# First split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0, shuffle=True)

# Further split train into train and validation sets
x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.2, random_state=0, shuffle=True)

# Check the shapes of the resulting splits
x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape


((23350, 162), (23350, 8), (5838, 162), (5838, 8), (7298, 162), (7298, 8))

In [29]:
# scaling our data with sklearn's Standard scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((23350, 162), (23350, 8), (7298, 162), (7298, 8))

In [30]:
# making our data compatible to model.
x_train = np.expand_dims(x_train, axis=2)
# x_test = np.expand_dims(x_test, axis=2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((23350, 162, 1), (23350, 8), (7298, 162), (7298, 8))

# Outlining our model
we will use 1D convolutional neural networks which are great for the data related to time-series data like audios, videos. In our case we are dealing with audio files, which can be treated as time-series data where features like MFCC, chroma etc. are extracted over different time windows. 1 CNN work particularly well for such data because they can effectively capture pattern and features along the time-axis, identifying important temporal features like pitch, tone, and rhythm that help differentiate emotions

In [35]:
FIRST_LAYER_FILTER = 256
SECOND_LAYER_FILTER = 256
THIRD_LAYER_FILTER = 128
FOURTH_LAYER_FILTER = 64
FIRST_HIDDEN_LAYER_INPUT = 32
SECOND_HIDDEN_LAYER_INPUT = 8
KERNEL_SIZE = 5
POOL_SIZE = 5
model = Sequential(
	[
		# Applying a convolutional layer with 256 filters, with kernel size of 5 and activation function "relu" which is used to induce non-linearity to make it non-linear which is essential to deal with real-life situations
		Conv1D(FIRST_LAYER_FILTER, kernel_size=KERNEL_SIZE, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)),
		# Pooling is used to downsampling the feature maps, summarizing importand information by taking the maximum value in each window of KERNEL_SIZE, which helps to reduce to computational complexity prevents overffitting, while preserving important patterns.
		MaxPooling1D(pool_size=POOL_SIZE, strides=2, padding='same'),
		# Stacking Convolutional layers allows the model to learn hierarchical features. The inital layer capture low-level features while deeper layers capture more abstract features.
		# You progressively reduce the number of filters (256 → 128 → 64), making the model lighter as the features become more abstract.
		BatchNormalization(),
		Conv1D(SECOND_LAYER_FILTER, kernel_size=KERNEL_SIZE, strides=1, padding='same', activation='relu'),
		MaxPooling1D(pool_size=POOL_SIZE, strides=2, padding='same'),
		BatchNormalization(),
		Conv1D(THIRD_LAYER_FILTER, kernel_size=KERNEL_SIZE, strides=1, padding='same', activation='relu'),
		MaxPooling1D(pool_size=POOL_SIZE, strides=2, padding='same'),
		# The regularization technique randomly turns off 20% of neurons during training, helping to prevent overfitting by encouriging the network to be more robust and not rely on specific neurons.
		Dropout(0.3),
		BatchNormalization(),
		LSTM(64, return_sequences=True),
		LSTM(64, return_sequences=False),
		Dropout(0.4),
		# Flattening converts the 3D output from the Conv1D and MaxPooling layers into a 1D vector, which can be passed to fully connected (Dense) layers. This transformation is necessary for classification
		Flatten(),
		# Here starts our hidden-layers
		Dense(units=FIRST_HIDDEN_LAYER_INPUT, activation='relu'),
		# Turning off 30% neurons
		Dropout(0.5),
		Dense(units=SECOND_HIDDEN_LAYER_INPUT, activation='softmax'),
	]
			)
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])
model.summary()


ValueError: Only instances of `keras.Layer` can be added to a Sequential model. Received: <tensorflow.python.keras.layers.recurrent.LSTM object at 0x00000299263109B0> (of type <class 'tensorflow.python.keras.layers.recurrent.LSTM'>)

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Dropout, Flatten, Dense, LSTM, BatchNormalization

model = Sequential()

# First Convolution Block
# Applying a convolutional layer with 256 filters, with kernel size of 5 and activation function "relu" which is used to induce non-linearity to make it non-linear which is essential to deal with real-life situations
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train.shape[1], 1)))
# Pooling is used to downsampling the feature maps, summarizing importand information by taking the maximum value in each window of KERNEL_SIZE, which helps to reduce to computational complexity prevents overffitting, while preserving important patterns.
model.add(MaxPooling1D(pool_size=5, strides=2, padding='same'))
# Stacking Convolutional layers allows the model to learn hierarchical features. The inital layer capture low-level features while deeper layers capture more abstract features.
# You progressively reduce the number of filters (256 → 128 → 64), making the model lighter as the features become more abstract.
model.add(BatchNormalization())

# Second Convolution Block
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides=2, padding='same'))
model.add(BatchNormalization())

# Third Convolution Block
model.add(Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=5, strides=2, padding='same'))
# The regularization technique randomly turns off 20% of neurons during training, helping to prevent overfitting by encouriging the network to be more robust and not rely on specific neurons.
model.add(Dropout(0.3))
model.add(BatchNormalization())

# LSTM Layer for sequential feature extraction
model.add(LSTM(64, return_sequences=True))
model.add(LSTM(64, return_sequences=False))
model.add(Dropout(0.4))

# Fully Connected Layers
# Flattening converts the 3D output from the Conv1D and MaxPooling layers into a 1D vector, which can be passed to fully connected (Dense) layers. This transformation is necessary for classification

model.add(Flatten())
model.add(Dense(32, activation='relu'))
# Turning off 50% neurons

model.add(Dropout(0.5))
model.add(Dense(8, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


In [37]:
def scheduler(epoch, lr):
    if epoch > 10:  # Reduce learning rate after 10 epochs
        lr = lr * 0.5
    return lr
lr_scheduler = LearningRateScheduler(scheduler)
history = model.fit(x_train, y_train, 
                    validation_data=(x_val, y_val),
                    epochs=50, batch_size=32, 
                    callbacks=[lr_scheduler])

Epoch 1/50
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m68s[0m 81ms/step - accuracy: 0.2360 - loss: 1.8808 - val_accuracy: 0.1742 - val_loss: 1.9832 - learning_rate: 0.0010
Epoch 2/50
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 81ms/step - accuracy: 0.3377 - loss: 1.6320 - val_accuracy: 0.1694 - val_loss: 2.2374 - learning_rate: 0.0010
Epoch 3/50
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 82ms/step - accuracy: 0.4012 - loss: 1.5296 - val_accuracy: 0.1889 - val_loss: 2.0431 - learning_rate: 0.0010
Epoch 4/50
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m58s[0m 79ms/step - accuracy: 0.4396 - loss: 1.4386 - val_accuracy: 0.1528 - val_loss: 2.2664 - learning_rate: 0.0010
Epoch 5/50
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m57s[0m 78ms/step - accuracy: 0.4567 - loss: 1.3958 - val_accuracy: 0.2122 - val_loss: 2.1200 - learning_rate: 0.0010
Epoch 6/50
[1m730/730[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37

In [39]:
keras.saving.save_model(model=model, filepath="../api/utils/speech_recognition_model.keras")