<a href="https://colab.research.google.com/github/Israt1063/8.VocalEmotionNet-Speech-Emotion-Recognition-using-MLPClassifier-on-RAVDESS-Dataset/blob/main/8_VocalEmotionNet_Speech_Emotion_Recognition_using_MLPClassifier_on_RAVDESS_Dataset.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

| 🔍 Reason                                     | ✅ Explanation                                                                                           |
| --------------------------------------------- | ------------------------------------------------------------------------------------------------------- |
| **1. Suitable for MFCC features**             | MFCCs are **continuous numerical vectors** that are ideal input for MLPs.                               |
| **2. Learns complex patterns**                | MLP can learn **non-linear mappings** between audio features and emotion labels.                        |
| **3. Easy to implement with sklearn**         | You don’t need to build or tune a deep model from scratch. `sklearn`’s `MLPClassifier` makes it simple. |
| **4. Works well on small-to-medium datasets** | The RAVDESS speech-audio subset used here has **1440 samples**: too few to train a deep network from scratch, but enough for a small MLP. |
| **5. Handles multi-class classification**     | RAVDESS has 8 emotion classes – MLP supports **multi-class softmax** out of the box.                    |
| **6. No need for manual feature engineering** | Once MFCCs are extracted, MLP can learn directly from them without extra rules.                         |



In [22]:
# Step 1: Import necessary libraries
import os
import numpy as np
import librosa
import soundfile
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import requests
import zipfile


In [23]:
# Step 2: Download the RAVDESS dataset zip file from a public URL (direct link).
# The cell is idempotent: the download is skipped when the archive is already
# present in the working directory.
dataset_url = "https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip?download=1"
dataset_path = "ravdess.zip"

if not os.path.exists(dataset_path):
    print("Downloading RAVDESS dataset...")
    # Download to a temporary name first, so an interrupted transfer is never
    # mistaken for a complete archive by the existence check above.
    partial_path = dataset_path + ".part"
    # stream=True avoids holding the whole archive in memory at once.
    with requests.get(dataset_url, stream=True, timeout=60) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as a .zip.
        r.raise_for_status()
        with open(partial_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)
    os.replace(partial_path, dataset_path)
    print("Download completed.")
else:
    print("Dataset already downloaded.")

Downloading RAVDESS dataset...
Download completed.


In [24]:
# Step 3: Extract dataset zip if not extracted.
# The archive unpacks its Actor_XX folders directly into the working directory
# (no wrapping "Audio_Speech_Actors_01-24" folder is created), so the presence
# of Actor_* directories is what signals a finished extraction. The original
# folder name is still checked in case the data was unpacked there by hand.
extract_folder = "Audio_Speech_Actors_01-24"
already_extracted = os.path.isdir(extract_folder) or any(
    entry.startswith("Actor_") and os.path.isdir(entry)
    for entry in os.listdir()
)

if not already_extracted:
    print("Extracting dataset...")
    with zipfile.ZipFile(dataset_path, "r") as zip_ref:
        zip_ref.extractall()
    print("Extraction done.")
else:
    print("Dataset already extracted.")

Extracting dataset...
Extraction done.


In [27]:
# Diagnostic: show what the working directory contains after extraction.
cwd_entries = os.listdir()
print("Current directory contents:")
print(cwd_entries)

# Report the expected extraction folder if present; otherwise list every entry
# whose name begins with "Audio" to help locate the data.
expected_folder = 'Audio_Speech_Actors_01-24'
if expected_folder not in cwd_entries:
    print("Folders starting with 'Audio':")
    for name in (entry for entry in cwd_entries if entry.startswith('Audio')):
        print(name)
else:
    print("Extracted folder found: Audio_Speech_Actors_01-24")


Current directory contents:
['.config', 'ravdess.zip', 'Actor_09', 'Actor_22', 'Actor_10', 'Actor_15', 'Actor_17', 'Actor_11', 'Actor_21', 'Actor_02', 'Actor_24', 'Actor_14', 'Actor_18', 'Actor_07', 'Actor_20', 'Actor_04', 'data', 'Actor_01', 'Actor_05', 'Actor_12', 'Actor_13', 'Actor_16', 'Actor_19', 'Actor_06', 'Actor_23', 'Actor_08', 'Actor_03', 'sample_data']
Folders starting with 'Audio':


In [31]:
!pip uninstall librosa -y
!pip install librosa==0.9.2


Found existing installation: librosa 0.11.0
Uninstalling librosa-0.11.0:
  Successfully uninstalled librosa-0.11.0
Collecting librosa==0.9.2
  Downloading librosa-0.9.2-py3-none-any.whl.metadata (8.2 kB)
Collecting resampy>=0.2.2 (from librosa==0.9.2)
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading librosa-0.9.2-py3-none-any.whl (214 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m214.3/214.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m69.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy, librosa
Successfully installed librosa-0.9.2 resampy-0.4.3


In [1]:
import os

# Root folder holding the Actor_XX directories
# (use "data" instead if the actor folders were moved inside data).
base_path = "."

# Keep only the entries whose name marks them as an actor folder.
actor_dirs = list(
    filter(lambda entry: entry.startswith("Actor_"), os.listdir(base_path))
)
print("Actor folders found:", actor_dirs)


Actor folders found: ['Actor_09', 'Actor_22', 'Actor_10', 'Actor_15', 'Actor_17', 'Actor_11', 'Actor_21', 'Actor_02', 'Actor_24', 'Actor_14', 'Actor_18', 'Actor_07', 'Actor_20', 'Actor_04', 'Actor_01', 'Actor_05', 'Actor_12', 'Actor_13', 'Actor_16', 'Actor_19', 'Actor_06', 'Actor_23', 'Actor_08', 'Actor_03']


In [2]:
import glob

# Collect every .wav file one level below each Actor_* folder.
# glob returns matches in arbitrary, filesystem-dependent order; sorting makes
# the sample order (and anything derived from it, such as a seeded train/test
# split) identical from run to run and machine to machine.
wav_files = sorted(glob.glob(os.path.join(base_path, "Actor_*", "*.wav")))
print("Total .wav files:", len(wav_files))


Total .wav files: 1440


In [3]:
# Two-digit emotion code (as it appears in the file names) -> emotion label.
# Codes run consecutively from "01", so they are generated from the label order.
emotion_map = {
    f"{code:02d}": label
    for code, label in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}


In [4]:
from collections import defaultdict

# Tally how many clips exist per emotion. The emotion code is the third
# dash-separated field of the file name; codes missing from emotion_map are
# counted under "unknown".
emotion_counts = defaultdict(int)
for wav_path in wav_files:
    name_fields = os.path.basename(wav_path).split("-")
    label = emotion_map.get(name_fields[2], "unknown")
    emotion_counts[label] += 1

print(dict(emotion_counts))


{'sad': 192, 'surprised': 192, 'happy': 192, 'fearful': 192, 'disgust': 192, 'angry': 192, 'neutral': 96, 'calm': 192}


In [5]:
import librosa

# Sanity pass: try to decode every file and collect the ones that either fail
# to load or decode to an empty signal.
corrupted = []
for file in wav_files:
    try:
        # Deliberately not bound to `y`: that name holds the label vector later
        # in the notebook, and re-running this cell must not overwrite it.
        waveform, sample_rate = librosa.load(file)
        if waveform.shape[0] == 0:
            corrupted.append(file)
    except Exception as e:
        # Best-effort scan: keep going, but record why the file was rejected.
        print(f"Could not read {file}: {e}")
        corrupted.append(file)

print(f"Corrupted files: {len(corrupted)}")


Corrupted files: 0


In [6]:
import numpy as np

def extract_features(file_path, n_mfcc=40):
    """Return the time-averaged MFCC vector of one audio file.

    The file is loaded with librosa (using the 'kaiser_fast' resampler), its
    MFCCs are computed, and each coefficient is averaged over all frames so
    that clips of any duration yield a vector of the same length.

    Parameters
    ----------
    file_path : str
        Path of the audio file to read.
    n_mfcc : int, optional
        Number of MFCC coefficients to compute (default 40, the value the
        notebook was originally hard-coded to).

    Returns
    -------
    numpy.ndarray or None
        1-D array with ``n_mfcc`` entries, or ``None`` if loading or feature
        extraction raised; the error is printed rather than propagated so a
        single bad file does not abort a batch run.
    """
    try:
        audio, sr = librosa.load(file_path, res_type='kaiser_fast')
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # mfccs is (n_mfcc, frames); averaging the transpose over axis 0
        # collapses the time axis.
        mfccs_scaled = np.mean(mfccs.T, axis=0)
        return mfccs_scaled
    except Exception as e:
        print(f"Error: {file_path}, {e}")
        return None


In [7]:
# Build the feature matrix and label vector, skipping any file for which
# feature extraction returned None.
features, labels = [], []
for wav_path in wav_files:
    mfcc_vector = extract_features(wav_path)
    if mfcc_vector is None:
        continue
    features.append(mfcc_vector)
    # The third dash-separated field of the file name is the emotion code.
    code = os.path.basename(wav_path).split("-")[2]
    labels.append(emotion_map[code])

X = np.array(features)
y = np.array(labels)

print("Features shape:", X.shape)
print("Labels shape:", y.shape)


Features shape: (1440, 40)
Labels shape: (1440,)


In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# NOTE(review): the notebook title and introduction describe an MLPClassifier,
# but this cell trains a RandomForestClassifier. Confirm which model is intended.

# Fixed seed so the split, the forest, and the reported accuracy are repeatable.
random_seed = 42

# Hold out 20% for testing. stratify=y keeps the class proportions equal in
# both partitions, which matters because "neutral" has half as many clips as
# the other emotions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=random_seed, stratify=y
)

model = RandomForestClassifier(random_state=random_seed)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
acc = accuracy_score(y_test, predictions)
print("Accuracy:", acc)


Accuracy: 0.6284722222222222
