In [1]:
import os
import pandas as pd
import librosa
import numpy as np

# Function to extract features from an audio file
def process_audio(file, target_length=227):
    y, sr = librosa.load(file)
    y_trimmed, _ = librosa.effects.trim(y, top_db=50)
    S = librosa.feature.melspectrogram(y=y_trimmed, sr=sr, n_mels=19)
    S_db_mel = librosa.amplitude_to_db(S, ref=np.max)
    if S_db_mel.shape[1] < target_length:
        pad_width = target_length - S_db_mel.shape[1]
        S_db_mel_padded = np.pad(S_db_mel, ((0, 0), (0, pad_width)), mode='constant')
    else:
        S_db_mel_padded = S_db_mel[:, :target_length]
    return S_db_mel_padded.mean(axis=1)

# Function to process the dataset
def process_dataset(dataset_path, output_path):
    processed_df = pd.DataFrame()
    for root, dirs, files in os.walk(dataset_path):
        for file in files:
            if file.endswith(".wav"):
                audio_path = os.path.join(root, file)
                try:
                    mean_mel = process_audio(audio_path)
                    mean_row_df = pd.DataFrame(mean_mel).T
                    gender = 'male' if int(file.split('-')[-1][:-4]) % 2 else 'female'
                    emotion_code = int(file.split('-')[2])
                    emotion_dict = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad', 5: 'angry', 6: 'fearful', 7: 'disgust', 8: 'surprised'}
                    emotion = emotion_dict.get(emotion_code, 'unknown')
                    label = f"{gender}_{emotion}"
                    mean_row_df['label'] = label
                    processed_df = pd.concat([processed_df, mean_row_df])
                except Exception as e:
                    print(f"Error processing file {audio_path}: {e}")
    processed_df.reset_index(drop=True, inplace=True)
    output_csv_path = os.path.join(output_path, 'audio_features.csv')
    processed_df.to_csv(output_csv_path, index=False)
    print(f"Data processed and saved to: {output_csv_path}")
    return processed_df

# Paths
dataset_path = "C://Users//Malhan//Downloads//Ravdess Emotional Speech Audio//Audio Dataset"
output_path = "C://Users//Malhan//Downloads//Ravdess Emotional Speech Audio//Processed Audio Data"

# Process the dataset and get the DataFrame
processed_df = process_dataset(dataset_path, output_path)


Data processed and saved to: C://Users//Malhan//Downloads//Ravdess Emotional Speech Audio//Processed Audio Data\audio_features.csv


In [2]:
processed_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,label
0,-12.165576,-12.582055,-12.896970,-17.279921,-21.065340,-21.372866,-20.442114,-20.379951,-21.045950,-22.823767,-21.380089,-22.583340,-22.569992,-23.998085,-24.554270,-24.754057,-25.188608,-25.313219,-25.334679,male_neutral
1,-13.709365,-14.521650,-15.083750,-19.586294,-23.286217,-23.857920,-22.800055,-22.581064,-23.502352,-25.162014,-23.905432,-24.596926,-24.338203,-25.689928,-26.718462,-26.907103,-27.197077,-27.355879,-27.297461,male_neutral
2,-9.256921,-9.541002,-10.184114,-13.826313,-18.049000,-17.980541,-17.347843,-17.646027,-18.865736,-20.293119,-19.221397,-20.412535,-19.114946,-20.176924,-20.728050,-21.195278,-21.624374,-21.950111,-21.873751,male_neutral
3,-9.608938,-10.248178,-11.057754,-14.719895,-18.234875,-18.587791,-17.916666,-17.698530,-18.534472,-20.366220,-19.249313,-20.273186,-19.133587,-19.754866,-20.547983,-20.714357,-21.346714,-21.478708,-21.411345,male_neutral
4,-13.958900,-15.159245,-15.918657,-20.991602,-24.609772,-24.992729,-24.361069,-24.561304,-25.265135,-26.704437,-25.508638,-26.574772,-26.761263,-27.633867,-27.872135,-27.829931,-28.261316,-28.654547,-28.688662,male_calm
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2875,-22.687885,-16.430719,-15.784697,-18.620865,-19.126484,-19.967772,-22.622166,-21.887785,-21.567331,-24.261831,-25.358107,-25.323982,-26.971121,-25.734680,-25.531927,-27.240358,-27.565830,-28.136839,-28.539036,female_surprised
2876,-29.419840,-25.114035,-22.438528,-22.986082,-26.773340,-28.039248,-29.221439,-30.792112,-31.859699,-32.621799,-32.943989,-33.477905,-34.123672,-33.255352,-33.617828,-35.612583,-35.413616,-35.321392,-35.367447,female_surprised
2877,-30.038107,-24.517479,-21.878922,-21.634138,-24.825340,-28.372612,-29.436012,-29.490381,-30.143545,-30.744141,-31.707172,-31.909689,-32.671314,-32.251190,-33.040981,-35.752705,-35.430252,-35.437866,-35.605213,female_surprised
2878,-27.577820,-20.152512,-23.304667,-24.678059,-25.355543,-26.895813,-29.012003,-29.816372,-30.375233,-31.069624,-31.872177,-30.541704,-32.415806,-31.125731,-30.670326,-31.351460,-32.763256,-33.113979,-33.606762,female_surprised


In [3]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Encode labels
label_encoder = LabelEncoder()
processed_df['label_encoded'] = label_encoder.fit_transform(processed_df['label'])


In [5]:
# Prepare features and labels
X = processed_df.drop(['label', 'label_encoded'], axis=1)
y = processed_df['label_encoded']

In [6]:
# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [7]:
# Reshape for RNN
X_scaled = X_scaled.reshape(X_scaled.shape[0], 1, X_scaled.shape[1])


In [8]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [42]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization


In [43]:
# Number of unique labels
num_labels = y.nunique()

In [44]:
# Build the RNN model
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(LSTM(128, return_sequences=True))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(LSTM(128))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(num_labels, activation='softmax'))

In [45]:

# Compile the model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [51]:
# Train the model
history = model.fit(X_train, y_train, epochs=1800, batch_size=64, validation_data=(X_test, y_test), verbose=2)


Epoch 1/1800
36/36 - 0s - loss: 1.3602 - accuracy: 0.5577 - val_loss: 1.0197 - val_accuracy: 0.6580 - 248ms/epoch - 7ms/step
Epoch 2/1800
36/36 - 0s - loss: 1.2222 - accuracy: 0.5829 - val_loss: 0.9696 - val_accuracy: 0.6528 - 216ms/epoch - 6ms/step
Epoch 3/1800
36/36 - 0s - loss: 1.1523 - accuracy: 0.6076 - val_loss: 0.9757 - val_accuracy: 0.6580 - 227ms/epoch - 6ms/step
Epoch 4/1800
36/36 - 0s - loss: 1.2022 - accuracy: 0.6003 - val_loss: 0.9717 - val_accuracy: 0.6649 - 228ms/epoch - 6ms/step
Epoch 5/1800
36/36 - 0s - loss: 1.2410 - accuracy: 0.5781 - val_loss: 0.9811 - val_accuracy: 0.6580 - 223ms/epoch - 6ms/step
Epoch 6/1800
36/36 - 0s - loss: 1.2570 - accuracy: 0.5929 - val_loss: 0.9906 - val_accuracy: 0.6684 - 226ms/epoch - 6ms/step
Epoch 7/1800
36/36 - 0s - loss: 1.1810 - accuracy: 0.6059 - val_loss: 0.9715 - val_accuracy: 0.6615 - 201ms/epoch - 6ms/step
Epoch 8/1800
36/36 - 0s - loss: 1.2161 - accuracy: 0.5972 - val_loss: 0.9919 - val_accuracy: 0.6719 - 199ms/epoch - 6ms/step


In [53]:
# Print final accuracy
final_train_accuracy = accuracy[-1]
final_val_accuracy = val_accuracy[-1]
print(f"Final Training Accuracy: {final_train_accuracy*100:.2f}%")
print(f"Final Validation Accuracy: {final_val_accuracy*100:.2f}%")


Final Training Accuracy: 68.40%
Final Validation Accuracy: 78.12%


In [54]:
def predict_emotion_from_file(file_path, model, scaler, label_encoder):
    # Process audio file
    features = process_audio(file_path)
    features_scaled = scaler.transform([features])
    features_scaled = features_scaled.reshape(1, 1, -1)

    # Predict emotion
    prediction = model.predict(features_scaled)
    predicted_label = np.argmax(prediction, axis=1)[0]
    predicted_emotion = label_encoder.inverse_transform([predicted_label])

    return predicted_emotion[0]

# Example usage
real_time_audio_file = 'C://Users//Malhan//Downloads//Test Audios//03-01-05-01-02-01-19.wav'
predicted_emotion = predict_emotion_from_file(real_time_audio_file, model, scaler, label_encoder)
print("Predicted Emotion:", predicted_emotion)

Predicted Emotion: male_disgust


In [55]:
model.save("C://Users//Malhan//Downloads//Test Audios//emotion_recognition_model.h5")

  saving_api.save_model(


In [56]:
import joblib

# Assuming 'label_encoder' is your LabelEncoder object
# Save the label encoder to a file
joblib.dump(label_encoder, 'C://Users//Malhan//Downloads//Test Audios//label_encoder.pkl')


['C://Users//Malhan//Downloads//Test Audios//label_encoder.pkl']