In [5]:
import librosa
import numpy as np
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import joblib
from sklearn.metrics import classification_report
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout
from keras.models import Sequential
from keras.layers import Conv1D, Activation, Dropout, MaxPooling1D, Flatten, Dense
from keras.optimizers import RMSprop
import tensorflow as tf

# Emotion codes -> labels. The code is the third "-"-separated field of each
# audio file name (see load_data).
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

# Subset of labels kept for training; 'angry' and 'disgust' files are skipped.
observed_emotions = ['neutral', 'calm', 'happy', 'sad', 'fearful', 'surprised']

# Raw string: the previous literal relied on the invalid escapes "\S" and "\d"
# being passed through unchanged, which newer Python versions warn about.
# The resulting value is identical.
data_directory = r"E:\SER\data\ravdess"
def extract_feature(data, sr, mfcc=True, chroma=True, mel=True):
    """Build one flat feature vector for an audio signal.

    Each enabled feature is computed with librosa, transposed, and averaged
    over axis 0, then the averaged vectors are concatenated in the fixed order
    MFCC (n_mfcc=40) -> chroma -> mel spectrogram.

    Args:
        data: audio samples passed straight to librosa.
        sr: sampling rate passed straight to librosa.
        mfcc: include the MFCC block.
        chroma: include the chroma block (computed from the magnitude STFT).
        mel: include the mel-spectrogram block.

    Returns:
        1-D NumPy array; empty when all three flags are False.
    """
    # Seed with an empty array so the "nothing enabled" case still stacks.
    pieces = [np.array([])]

    if mfcc:
        mfcc_matrix = librosa.feature.mfcc(y=data, sr=sr, n_mfcc=40)
        pieces.append(np.mean(mfcc_matrix.T, axis=0))

    if chroma:
        # Chroma is derived from the magnitude of the short-time Fourier transform.
        magnitude = np.abs(librosa.stft(data))
        chroma_matrix = librosa.feature.chroma_stft(S=magnitude, sr=sr)
        pieces.append(np.mean(chroma_matrix.T, axis=0))

    if mel:
        mel_matrix = librosa.feature.melspectrogram(y=data, sr=sr)
        pieces.append(np.mean(mel_matrix.T, axis=0))

    return np.hstack(pieces)

def noise(data, noise_factor):
    """Return a copy of ``data`` with scaled standard-normal noise added.

    Args:
        data: 1-D array (or sequence) of audio samples.
        noise_factor: multiplier applied to the noise before it is added.

    Returns:
        NumPy array of the same length as ``data``, cast back to the element
        type of the input. Empty input yields an empty array instead of
        raising.
    """
    samples = np.asarray(data)
    # Drawn from NumPy's global RNG; seed it beforehand for repeatable output.
    perturbation = np.random.randn(len(samples))
    augmented_data = samples + noise_factor * perturbation
    # Use the array dtype rather than type(data[0]), which fails on empty input.
    return augmented_data.astype(samples.dtype)

def shift(data, sampling_rate, shift_max, shift_direction):
    """Roll ``data`` by a random number of samples and silence the wrapped part.

    The offset is drawn uniformly from ``[0, int(sampling_rate * shift_max))``
    using NumPy's global RNG.

    Args:
        data: array of audio samples (not modified; np.roll returns a new array).
            Presumably 1-D — TODO confirm for other shapes.
        sampling_rate: samples per second.
        shift_max: maximum shift, in seconds.
        shift_direction: ``'right'`` negates the offset, ``'both'`` negates it
            with probability 1/2, any other value keeps it positive. A
            positive offset moves samples toward higher indices (np.roll
            semantics); the direction names are kept as-is for compatibility.

    Returns:
        New array of the same shape with the wrapped-around samples set to 0.
        A zero offset returns the signal unchanged.
    """
    # Explicit int: the product is a float for fractional shift_max values.
    offset = np.random.randint(int(sampling_rate * shift_max))
    if shift_direction == 'right':
        offset = -offset
    elif shift_direction == 'both':
        if np.random.randint(0, 2) == 1:
            offset = -offset

    augmented_data = np.roll(data, offset)
    if offset > 0:
        augmented_data[:offset] = 0
    elif offset < 0:
        augmented_data[offset:] = 0
    # offset == 0: nothing wrapped around, so nothing is silenced. The previous
    # else-branch executed augmented_data[0:] = 0 and wiped the entire clip.
    return augmented_data

def load_data(save=False):
    """Load the audio files, augment them, and extract features.

    Every ``*.wav`` under ``data_directory/Actor_*/`` whose emotion (looked up
    from the third "-"-separated field of the file name) is in
    ``observed_emotions`` contributes three samples: the original signal, a
    noisy copy (``noise`` with factor 0.001) and a shifted copy (``shift`` with
    up to 0.25 s, direction ``'right'``).

    Args:
        save: when True, write the features to ``Xfinal.npy`` and the labels to
            ``yfinal.npy`` in the current working directory.

    Returns:
        Tuple ``(features, labels)``: a NumPy array with one row per sample and
        a list of emotion strings in the same order.

    Raises:
        KeyError: if a file name carries an emotion code missing from ``emotions``.
        IndexError: if a file name has fewer than three "-"-separated fields.
    """
    x, y = [], []

    # Sorted so the sample order (and any seeded split built on it) does not
    # depend on the filesystem's directory listing order.
    for file in sorted(glob.glob(data_directory + "/Actor_*/*.wav")):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]

        # Decide from the name alone, before paying for audio decoding and
        # feature extraction on a file that would be discarded anyway.
        if emotion not in observed_emotions:
            continue

        data, sr = librosa.load(file)

        # Original signal.
        x.append(extract_feature(data, sr))
        y.append(emotion)

        # Augmentation 1: additive noise.
        n_data = noise(data, 0.001)
        x.append(extract_feature(n_data, sr))
        y.append(emotion)

        # Augmentation 2: random time shift.
        s_data = shift(data, sr, 0.25, 'right')
        x.append(extract_feature(s_data, sr))
        y.append(emotion)

    features = np.array(x)

    # Honour the flag: previously the files were written unconditionally.
    if save:
        np.save('Xfinal.npy', features)
        np.save('yfinal.npy', y)

    return features, y
    

# Load and process the data: X is the feature array and y the list of emotion
# labels returned by load_data. With save=True the arrays are also written to
# Xfinal.npy / yfinal.npy in the working directory.
X, y = load_data(save=True)


In [6]:
# Show the feature array returned by load_data (the repr is abbreviated with "...").
X

array([[-7.04823425e+02,  6.53288727e+01, -9.19415855e+00, ...,
         1.34345183e-16,  1.62977596e-16,  3.77389475e-17],
       [-4.45094574e+02,  2.10709743e+01,  6.55960560e+00, ...,
         7.13321861e-05,  6.89793815e-05,  6.78238212e-05],
       [-7.08801819e+02,  6.61232224e+01, -9.42120075e+00, ...,
         1.34336342e-16,  1.62974168e-16,  3.77350630e-17],
       ...,
       [-5.33215759e+02,  5.07854843e+01, -2.39176464e+01, ...,
         1.03407571e-12,  1.01320916e-12,  9.97647169e-13],
       [-4.03522736e+02,  2.37669430e+01, -7.50971651e+00, ...,
         6.69460060e-05,  7.25114514e-05,  7.23951671e-05],
       [-5.33193542e+02,  5.07531815e+01, -2.39222736e+01, ...,
         1.09925599e-12,  1.07700415e-12,  1.06058001e-12]])

In [7]:
# Turn the label list into a NumPy array.
y = np.asarray(y)

# Hold out 25% of the samples for testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=9
)

# Fit the encoder on the observed emotion names, then map both label sets
# to integer codes.
label_encoder = LabelEncoder().fit(observed_emotions)
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

# Keep the fitted encoder on disk.
joblib.dump(label_encoder, 'label_encoderfinal.pkl')

# Append a trailing axis of length 1 at position 2 (the Conv1D input shape
# used later is (n_features, 1)).
XProcessed = x_train[:, :, np.newaxis]
XTestProcessed = x_test[:, :, np.newaxis]

In [8]:

# Class names in the order of the integer codes produced by label_encoder.
# This is NOT the declaration order of observed_emotions, so it must be taken
# from the encoder for the output layer size and the report labels to line up.
class_names = list(label_encoder.classes_)

# Define the model
model = Sequential()

# First Conv1D layer
model.add(Conv1D(64, 5, padding='same', input_shape=(XProcessed.shape[1], 1)))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(MaxPooling1D(pool_size=4))

# Second Conv1D layer
model.add(Conv1D(128, 5, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.1))
model.add(MaxPooling1D(pool_size=4))

# Third Conv1D layer
model.add(Conv1D(256, 5, padding='same'))
model.add(Activation('relu'))
model.add(Dropout(0.1))

# Flatten and Dense layers: one output unit per encoded class
model.add(Flatten())
model.add(Dense(len(class_names)))
model.add(Activation('softmax'))

# Compile the model with RMSprop optimizer; the sparse loss matches the
# integer-encoded labels
opt = RMSprop(learning_rate=0.0005, rho=0.9, epsilon=1e-07)
model.compile(loss='sparse_categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

# Print model summary
model.summary()

# Train the model
# NOTE(review): the test split doubles as validation data here, so the
# validation metrics and the final test metrics come from the same samples.
history = model.fit(XProcessed, y_train_encoded, epochs=200, validation_data=(XTestProcessed, y_test_encoded), batch_size=63)

# Evaluate the model
loss, accuracy = model.evaluate(XTestProcessed, y_test_encoded)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

# Make predictions: pick the class with the highest softmax score per sample
y_pred = model.predict(XTestProcessed)
y_pred = np.argmax(y_pred, axis=1)

# Report per-class metrics. Passing observed_emotions as target_names attached
# the wrong name to each row; the names and explicit label ids now follow the
# encoder's code order.
print(classification_report(
    y_test_encoded,
    y_pred,
    labels=np.arange(len(class_names)),
    target_names=class_names,
))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.1961 - loss: 6.1126 - val_accuracy: 0.2765 - val_loss: 1.7713
Epoch 2/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.2591 - loss: 1.8760 - val_accuracy: 0.2525 - val_loss: 1.6318
Epoch 3/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.3271 - loss: 1.6035 - val_accuracy: 0.3573 - val_loss: 1.4673
Epoch 4/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.3868 - loss: 1.4762 - val_accuracy: 0.3548 - val_loss: 1.4291
Epoch 5/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.4310 - loss: 1.4000 - val_accuracy: 0.4975 - val_loss: 1.2954
Epoch 6/200
[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.4452 - loss: 1.3446 - val_accuracy: 0.4432 - val_loss: 1.3271
Epoch 7/200
[1m38/38[0m [

In [9]:
# Write the trained model to a .h5 file and confirm where it went.
model_path = 'emotion_recognition_modelv3.h5'
model.save(model_path)
print(f"Model and weights saved to '{model_path}'")



Model and weights saved to 'emotion_recognition_modelv3.h5'
