Importing libraries

In [1]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Sequential
from keras.layers import InputLayer, Dense, LSTM, Dropout



Preprocess the data

In [2]:
# Root folder of the audio dataset. Set the EMOTION_DATA_PATH environment
# variable to point somewhere else; when it is unset the original local
# path is used, so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "EMOTION_DATA_PATH",
    "C:/Users/User/Downloads/emotion_detection_audio",
)

In [3]:
#extracting features from audio files
def extract_features(file_name, n_mfcc=40):
    """Return a fixed-length MFCC feature vector for one audio file.

    The file is loaded with ``librosa.load`` (``res_type='kaiser_fast'``),
    MFCCs are computed, and the transposed MFCC matrix is averaged along its
    first axis, so clips of any length collapse to one value per coefficient
    (assuming librosa's usual ``(n_mfcc, frames)`` layout -- confirm against
    the installed version).

    Args:
        file_name: Path of the audio file to load.
        n_mfcc: Number of MFCC coefficients to compute. Defaults to 40, the
            value that was previously hard-coded.

    Returns:
        1-D numpy array of averaged MFCC values.
    """
    audio, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    mfcc_matrix = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)
    return np.mean(mfcc_matrix.T, axis=0)

In [4]:
#parsing the filename and extract emotion, gender, etc.
def parse_filename(file_name):
    """Extract the emotion code and speaker gender from an audio file name.

    The name is expected to consist of dash-separated integer fields, e.g.
    ``03-01-05-01-02-01-12.wav``: the third field is read as the emotion code
    and the last field (before the first dot) as the actor number. Even actor
    numbers are treated as female, odd ones as male.

    Args:
        file_name: Bare file name or a full path. Any directory part is
            dropped first so that dashes in folder names cannot shift the
            field positions.

    Returns:
        Tuple ``(emotion, gender)`` where ``emotion`` is an int and ``gender``
        is ``'female'`` or ``'male'``.

    Raises:
        IndexError: If the name has fewer than three dash-separated fields.
        ValueError: If the emotion or actor field is not an integer.
    """
    base_name = os.path.basename(file_name)
    parts = base_name.split('-')
    emotion = int(parts[2])
    actor = int(parts[-1].split('.')[0])
    gender = 'female' if actor % 2 == 0 else 'male'
    return emotion, gender

In [5]:
# Loading data and extracting features
data = []
labels = []
for root, dirs, files in os.walk(DATA_PATH):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # the directory list in place (which os.walk honours) and the file list
    # fixes the traversal order, so the sample order -- and with it the seeded
    # train/test split further down -- is the same on every run and machine.
    dirs.sort()
    for file in sorted(files):
        if not file.endswith('.wav'):
            continue
        emotion, gender = parse_filename(file)
        if gender != 'female':  # Filter for female voices
            continue
        data.append(extract_features(os.path.join(root, file)))
        labels.append(emotion)

In [6]:
# Turn the collected feature vectors and emotion codes into numpy arrays
X, y = np.asarray(data), np.asarray(labels)


In [7]:
# Encoding labels
# Fit on the raw `labels` list rather than on `y` itself: with
# `y = le.fit_transform(y)` a second run of this cell would re-encode the
# already-encoded values, leaving `le.classes_` as 0..n-1 instead of the
# original emotion codes.
le = LabelEncoder()
y = le.fit_transform(labels)

In [8]:
# Persist the label encoder classes for later use in the app
np.save(file='classes.npy', arr=le.classes_)

In [10]:
# Splitting data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [11]:
# Reshaping X for the LSTM model by adding a trailing axis of size 1.
# An explicit reshape (instead of np.expand_dims on the same name) keeps this
# cell idempotent: re-running it leaves already-reshaped arrays unchanged
# rather than appending yet another axis.
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

In [12]:
# Build the stacked-LSTM classifier (compiled and trained in the cells below)
model = Sequential([
    InputLayer(shape=(X_train.shape[1], 1)),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(len(le.classes_), activation='softmax'),
])

In [13]:
# Configure training (Adam, sparse categorical cross-entropy, accuracy metric)
# and print the layer overview.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [14]:
# Train for 50 epochs, reporting metrics on the held-out split after each
# epoch, then write the trained model to disk.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=32,
    epochs=50,
)
model.save('emotion_detection_model.keras')

Epoch 1/50
36/36 ━━━━━━━━━━━━━━━━━━━━ 16s 118ms/step - accuracy: 0.1695 - loss: 2.0606 - val_accuracy: 0.2049 - val_loss: 1.9792
Epoch 2/50
36/36 ━━━━━━━━━━━━━━━━━━━━ 8s 211ms/step - accuracy: 0.2465 - loss: 1.9539 - val_accuracy: 0.2951 - val_loss: 1.8706
Epoch 3/50
36/36 ━━━━━━━━━━━━━━━━━━━━ 3s 82ms/step - accuracy: 0.2634 - loss: 1.8775 - val_accuracy: 0.3542 - val_loss: 1.7253
Epoch 4/50
36/36 ━━━━━━━━━━━━━━━━━━━━ 3s 83ms/step - accuracy: 0.3310 - loss: 1.8092 - val_accuracy: 0.3160 - val_loss: 1.7585
Epoch 5/50
36/36 ━━━━━━━━━━━━━━━━━━━━ 3s 80ms/step - accuracy: 0.3115 - loss: 1.7828 - val_accuracy: 0.3333 - val_loss: 1.7563
Epoch 6/50
36/36 ━━━━━━━━━━━━━━━━━━━━ 3s 77ms/step - accuracy: 0.3088 - loss: 1.7786 - val_accuracy: 0.3333 - val_loss: 1.6895
Epoch 7/50
[1m36/36[0m [32m━