In [1]:
!pip install librosa
!pip install resampy




[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\NILOY CHOWDHURY\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: C:\Users\NILOY CHOWDHURY\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import glob
import os

import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical

In [3]:
# Root folder of the speech recordings. Set the RAVDESS_DATA_PATH environment
# variable to point at a different location instead of editing the notebook; the
# fallback is the original local path. Joining with "" guarantees a trailing path
# separator, so code that appends a pattern by plain string concatenation keeps
# working whatever value is supplied.
DATA_PATH = os.path.join(
    os.environ.get("RAVDESS_DATA_PATH", "X:/ravdess/Audio_Speech_Actors_01-24/"),
    "",
)

In [4]:
# Two-digit emotion code (as found in the recording file names) -> emotion label.
# Codes are consecutive and zero-padded, starting at "01".
emotion_map = {
    "{:02d}".format(code): label
    for code, label in enumerate(
        (
            "neutral",
            "calm",
            "happy",
            "sad",
            "angry",
            "fearful",
            "disgust",
            "surprised",
        ),
        start=1,
    )
}

def extract_features(file_path, max_pad_len=174, n_mfcc=40):
    """Load an audio file and return a fixed-width MFCC matrix.

    Parameters
    ----------
    file_path : str
        Path of the audio file, passed straight to ``librosa.load``.
    max_pad_len : int, default 174
        Number of columns (MFCC frames) in the returned matrix. Shorter
        clips are zero-padded on the right, longer clips are truncated.
    n_mfcc : int, default 40
        Number of MFCC coefficients to compute (rows of the result).

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n_mfcc, max_pad_len)``.

    Any exception raised by librosa while reading the file propagates to
    the caller.
    """
    # No explicit `sr` is given, so librosa's default loading settings apply;
    # 'kaiser_fast' selects the faster resampler.
    audio, sample_rate = librosa.load(file_path, res_type='kaiser_fast')
    mfcc = librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=n_mfcc)

    # Truncate first (a no-op for short clips), then zero-pad whatever is
    # still missing so every clip ends up with exactly max_pad_len frames.
    mfcc = mfcc[:, :max_pad_len]
    missing = max_pad_len - mfcc.shape[1]
    if missing > 0:
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, missing)), mode='constant')
    return mfcc

features = []
labels = []

# Build the recursive pattern with os.path.join so it works whether or not
# DATA_PATH ends in a separator. sorted() fixes the sample order, which otherwise
# depends on the filesystem; without it the seeded train/validation split is not
# reproducible across machines.
wav_pattern = os.path.join(DATA_PATH, "**", "*.wav")

for file in sorted(glob.glob(wav_pattern, recursive=True)):
    try:
        file_name = os.path.basename(file)
        # The emotion code is the third dash-separated field of the file name.
        emotion_code = file_name.split("-")[2]
        emotion = emotion_map.get(emotion_code)
        if emotion:
            mfcc = extract_features(file)
            features.append(mfcc)
            labels.append(emotion)
    except Exception as e:
        # Best effort: report files that cannot be parsed or decoded and carry on.
        print("Error processing {}: {}".format(file, e))

# Fail here with a clear message instead of much later with an obscure shape error.
if not features:
    raise FileNotFoundError(
        "No usable .wav files found under {!r}; check DATA_PATH.".format(DATA_PATH)
    )


In [5]:
# Turn the collected per-file MFCC matrices and emotion names into numpy arrays
X, y = np.asarray(features), np.asarray(labels)


In [6]:
# One-hot encode the emotion names: the fitted LabelEncoder maps each distinct
# label to an integer id, and to_categorical expands those ids into one-hot rows.
# `le` is kept so the ids can be mapped back to names later.
le = LabelEncoder().fit(y)
y_encoded = to_categorical(le.transform(y))


In [7]:
# Hold out 20% of the samples for validation. Stratifying on the string labels
# keeps the class proportions equal in both parts; the fixed seed makes the
# split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [8]:
# Two stacked LSTM layers with dropout, followed by a small dense classifier with
# one softmax output per emotion class.
#
# The input shape is declared with an explicit Input layer: passing `input_shape`
# to the first LSTM of a Sequential model makes Keras emit a UserWarning.
#
# NOTE(review): X appears to be laid out as (samples, n_mfcc, frames), so the LSTM
# steps over the MFCC-coefficient axis rather than over time — confirm this is
# intended (any change must be mirrored wherever the saved model is used).
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(128, return_sequences=True),  # full sequence out, feeds the next LSTM
    Dropout(0.3),
    LSTM(64),  # only the last step's output
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dense(y_encoded.shape[1], activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


  super().__init__(**kwargs)


In [9]:
# Save the model to ../models/voice_emotion_model.h5, overwriting the file only
# when validation accuracy reaches a new maximum.
checkpoint_path = os.path.join("..", "models", "voice_emotion_model.h5")
checkpoint = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True,
)

In [10]:
# Train for 40 epochs in batches of 32, evaluating on the validation split after
# every epoch; the checkpoint callback handles saving. The returned History is
# kept so later cells can inspect or plot the loss/accuracy curves, and so the
# cell does not just echo the object's repr.
history = model.fit(
    X_train,
    y_train,
    epochs=40,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[checkpoint],
)


Epoch 1/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - accuracy: 0.1698 - loss: 2.0690



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 35ms/step - accuracy: 0.1703 - loss: 2.0685 - val_accuracy: 0.2257 - val_loss: 2.0122
Epoch 2/40
[1m33/36[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 20ms/step - accuracy: 0.3758 - loss: 1.8738



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.3750 - loss: 1.8698 - val_accuracy: 0.2604 - val_loss: 1.8779
Epoch 3/40
[1m34/36[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 20ms/step - accuracy: 0.4948 - loss: 1.5606



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.4939 - loss: 1.5589 - val_accuracy: 0.2778 - val_loss: 1.8461
Epoch 4/40
[1m35/36[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - accuracy: 0.5842 - loss: 1.2595



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.5835 - loss: 1.2586 - val_accuracy: 0.2986 - val_loss: 1.8449
Epoch 5/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.7260 - loss: 0.8879 - val_accuracy: 0.2882 - val_loss: 2.0805
Epoch 6/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8031 - loss: 0.6471 - val_accuracy: 0.2986 - val_loss: 2.3107
Epoch 7/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.8243 - loss: 0.5108



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.8245 - loss: 0.5105 - val_accuracy: 0.3090 - val_loss: 2.4241
Epoch 8/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.8773 - loss: 0.3991 - val_accuracy: 0.2917 - val_loss: 2.4998
Epoch 9/40
[1m33/36[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 19ms/step - accuracy: 0.9196 - loss: 0.2850



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9177 - loss: 0.2904 - val_accuracy: 0.3299 - val_loss: 2.5957
Epoch 10/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9100 - loss: 0.2769 - val_accuracy: 0.3090 - val_loss: 2.5696
Epoch 11/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9074 - loss: 0.2829 - val_accuracy: 0.2812 - val_loss: 3.0187
Epoch 12/40
[1m34/36[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 19ms/step - accuracy: 0.9622 - loss: 0.1683



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9615 - loss: 0.1686 - val_accuracy: 0.3368 - val_loss: 2.8692
Epoch 13/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9507 - loss: 0.1441 - val_accuracy: 0.3160 - val_loss: 3.1243
Epoch 14/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9778 - loss: 0.1009 - val_accuracy: 0.3021 - val_loss: 3.5503
Epoch 15/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9611 - loss: 0.1421 - val_accuracy: 0.2917 - val_loss: 3.4543
Epoch 16/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9649 - loss: 0.1377 - val_accuracy: 0.2951 - val_loss: 3.6420
Epoch 17/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9690 - loss: 0.0933 - val_accuracy: 0.3056 - val_loss: 3.5545
Epoch 18/40
[1m36/36[0m [32m━━━━━━━━━



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9879 - loss: 0.0367 - val_accuracy: 0.3542 - val_loss: 3.7598
Epoch 33/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9945 - loss: 0.0270 - val_accuracy: 0.3264 - val_loss: 3.7775
Epoch 34/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9908 - loss: 0.0271 - val_accuracy: 0.3403 - val_loss: 3.7074
Epoch 35/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 23ms/step - accuracy: 0.9821 - loss: 0.0457 - val_accuracy: 0.3542 - val_loss: 3.8389
Epoch 36/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - accuracy: 0.9831 - loss: 0.0534 - val_accuracy: 0.3403 - val_loss: 3.9180
Epoch 37/40
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step - accuracy: 0.9822 - loss: 0.0648 - val_accuracy: 0.3507 - val_loss: 3.8674
Epoch 38/40
[1m36/36[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x1f24f1eb910>