In [1]:
import zipfile

In [2]:
with zipfile.ZipFile("speech-emotion-recognition-ravdess-data.zip", "r") as zip_ref:
    zip_ref.extractall()

In [3]:
# !pip install librosa soundfile numpy scikit-learn pyaudio
#Import libraries
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [None]:
#Define extract_feature. Follow the steps below.
"""Define a function extract feature that takes in mfcc(Mel Frequency Ceptral Coefficient), chroma and mel.
a) open soundfile with soundfile.SoundFile so that it is automatically closed once we're done.
b) Read from it and name it X
c) Get sample rate
d) If chroma is True, get the Short-Time Fourier Transform of X
e) Result is an empty np.array
f) For each feature out of the three, make a corresponding feature from librosa.feature.
g) Get the mean value
h) Call the function hstack() from numpy with result and feature value
i) Store in results=np.hstack()
j) Return the result"""

In [4]:
# extract_feature function
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma:
            stft = np.abs(librosa.stft(X))
            result = np.array([])
        if mfcc:
            mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, mfccs))
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    return result
        

In [5]:
emotions = {
    "01": "Neutral",
    "02": "Calm",
    "03": "Happy",
    "04": "Sad",
    "05": "Angry",
    "06": "Fearful",
    "07": "Disgust",
    "08": "Surprised"


    
}
observed_emotions = ["Calm", "Happy", "Fearful", "Disgust"]

In [28]:
"""Load data with a load_data function that takes test_size(0.2)
use glob to get all the pathnames for the sound file in the dataset
X, y are empty lists
Using the emotions dict, the number is turned into an emotion and then checked to see whether it is on the observed_emotions list. 
If not, it moves to the next file
and calls to extract_feature stored in the feature.
Append feature to X and emotion to y
Call the function train_test_split of X and y, test_size, and random_state"""

def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob("C:\\Users\\Admin\\Downloads\\speech-emotion-recognition-ravdess-data\\Actor_*\\*.wav*"):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
        
    return train_test_split(np.array(x), y, test_size=test_size, random_state=42)

In [7]:
x_train, x_test, y_train, y_test = load_data(test_size=0.25)
print(x_train.shape[0], x_test.shape[0])

576 192


In [8]:
print(f"Features Extracted: {x_train.shape[1]}")

Features Extracted: 180


In [12]:
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate="adaptive", max_iter=1000)
model.fit(x_train, y_train)

In [13]:
y_pred = model.predict(x_test)
y_pred

array(['Disgust', 'Calm', 'Fearful', 'Disgust', 'Happy', 'Happy', 'Calm',
       'Calm', 'Disgust', 'Fearful', 'Disgust', 'Fearful', 'Calm',
       'Fearful', 'Disgust', 'Calm', 'Fearful', 'Calm', 'Fearful',
       'Fearful', 'Disgust', 'Calm', 'Fearful', 'Fearful', 'Calm',
       'Fearful', 'Calm', 'Disgust', 'Fearful', 'Fearful', 'Calm',
       'Disgust', 'Fearful', 'Happy', 'Fearful', 'Fearful', 'Calm',
       'Calm', 'Calm', 'Happy', 'Disgust', 'Happy', 'Disgust', 'Calm',
       'Calm', 'Disgust', 'Calm', 'Disgust', 'Disgust', 'Disgust',
       'Fearful', 'Calm', 'Fearful', 'Calm', 'Disgust', 'Fearful',
       'Disgust', 'Fearful', 'Fearful', 'Fearful', 'Fearful', 'Disgust',
       'Calm', 'Calm', 'Calm', 'Disgust', 'Fearful', 'Happy', 'Fearful',
       'Fearful', 'Fearful', 'Fearful', 'Calm', 'Fearful', 'Happy',
       'Calm', 'Fearful', 'Fearful', 'Fearful', 'Fearful', 'Calm', 'Calm',
       'Happy', 'Fearful', 'Fearful', 'Happy', 'Fearful', 'Fearful',
       'Calm', 'Calm', 'Cal

In [14]:
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 60.94%


In [15]:
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import ADASYN

"""Preprocessing"""
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)


"""Model training"""
model = MLPClassifier(alpha=0.01, batch_size=128, hidden_layer_sizes=(300,150), learning_rate="adaptive", max_iter=2000, 
                      early_stopping=True, random_state=42)
model.fit(x_train_scaled, y_train)

"""Evaluation"""
y_pred = model.predict(x_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))
print(classification_report(y_test, y_pred))

Accuracy: 72.40%
              precision    recall  f1-score   support

        Calm       0.86      0.78      0.82        54
     Disgust       0.74      0.81      0.78        43
     Fearful       0.77      0.51      0.62        47
       Happy       0.58      0.79      0.67        48

    accuracy                           0.72       192
   macro avg       0.74      0.72      0.72       192
weighted avg       0.74      0.72      0.72       192



In [16]:
"""The accuracy score is not as high as I want. Let me try other models. Lets reduce dimensionality first"""
from sklearn.decomposition import PCA
pca = PCA(n_components=10)
xpca_train = pca.fit_transform(x_train_scaled)
xpca_test = pca.transform(x_test_scaled)


In [None]:
"""Hyperparameter tuning"""
from sklearn.model_selection import GridSearchCV
param_grid={
    "hidden_layer_sizes": [(100,), (100,50), (300,)],
    "alpha": [0.0001, 0.001, 0.01],
    "learning_rate": ["constant", "adaptive"],
    "batch_size": [64, 128, 256],
    "max_iter": [2000, 10_000]
}
grid = GridSearchCV(MLPClassifier(random_state=42), param_grid, cv=3, scoring="accuracy")
grid.fit(x_train, y_train)
grid.best_params_
grid.best_estimator_

In [17]:
"""Try different algorithms"""
"""RandomForest"""
from sklearn.ensemble import RandomForestClassifier
random = RandomForestClassifier(random_state=42)
random.fit(x_train_scaled, y_train)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 72.40%


In [18]:
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.fit(y_test)
print("Classes:", label_encoder.classes_)

Classes: ['Calm' 'Disgust' 'Fearful' 'Happy']


In [22]:
"""XGBClassifier"""
from xgboost import XGBClassifier
xgb = XGBClassifier(eval_metric="mlogloss", random_state=42)
xgb.fit(x_train_scaled, y_train_encoded)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 72.40%


In [20]:
"""Make predictions and convert predictions back to normal"""
y_pred_encoded = xgb.predict(x_test)
y_pred = label_encoder.inverse_transform(y_pred_encoded)

In [23]:
from sklearn.ensemble import VotingClassifier
model_ens = VotingClassifier(estimators=[
    ("mlp", MLPClassifier(batch_size=128, hidden_layer_sizes=(300,), max_iter=500, random_state=42)),
    ("random", RandomForestClassifier(n_estimators=100, random_state=42)),
    ("xgb", XGBClassifier(eval_metric="mlogloss", random_state=42))
], voting="soft")
model_ens.fit(x_train_scaled, y_train)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 72.40%


In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

In [24]:
"""Lets try stacking classifier and see it's performance"""
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

stacks = StackingClassifier(
    estimators=[
        ("mlp", MLPClassifier(batch_size=128, hidden_layer_sizes=(300,), max_iter=1000, random_state=42)),
        ("random", RandomForestClassifier(n_estimators=100, random_state=42)),
        ("xgb", XGBClassifier(eval_metric="mlogloss", random_state=42))
], 
    final_estimator=LogisticRegression()
)
stacks.fit(x_train, y_train_encoded)

In [25]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 72.40%


In [None]:
"""To my conclusion, preprocessing data using the StandardScaler(), using the MLPClassifier gives the best accuracy score at 72.40% for this dataset. 
Other models have proven to give me the same accuracy scores even if I use them together."""