# Speech Emotion Recognition using Librosa


#### RAVDESS Dataset
This is the Ryerson Audio-Visual Database of Emotional Speech and Song dataset, and is free to download. This dataset has 7356 files rated by 247 individuals 10 times on emotional validity, intensity, and genuineness. The entire dataset is 24.8GB from 24 actors.

Dataset on Google Drive: https://drive.google.com/file/d/1wWsrN2Ep7x6lWqOXfr4rpKGYrJhWc8z7/view

In [2]:
#Connect your Drive with Colab
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
#Check where your Dataset Zip File is
!ls '/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data'


Actor_01  Actor_04  Actor_07  Actor_10	Actor_13  Actor_16  Actor_19  Actor_22
Actor_02  Actor_05  Actor_08  Actor_11	Actor_14  Actor_17  Actor_20  Actor_23
Actor_03  Actor_06  Actor_09  Actor_12	Actor_15  Actor_18  Actor_21  Actor_24


In [4]:
# #Unzip the file contents
# !unzip '/content/drive/My Drive/Important Extras/Data Science Works/_Data Science Work/Speech Emotion Recognition/speech-emotion-recognition-ravdess-data.zip'

In [5]:
#You can see the zip folder has been extracted
!ls

drive  sample_data


In [6]:
#Install Librosa and SoundFile to your Machine
!pip install librosa soundfile



In [7]:
#Import All Important Libraries
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

In [8]:
def extract_feature(file_name, mfcc, chroma, mel, max_length=100):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        result = np.array([])
        if mfcc:
            mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
            mfccs_mean = np.mean(mfccs, axis=0)
            mfccs_padded = pad_or_truncate(mfccs_mean, max_length)
            result = np.hstack((result, mfccs_padded))
        if chroma:
            stft = np.abs(librosa.stft(X))
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
            chroma_padded = pad_or_truncate(chroma, max_length)
            result = np.hstack((result, chroma_padded))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
            mel_padded = pad_or_truncate(mel, max_length)
            result = np.hstack((result, mel_padded))
    return result

def pad_or_truncate(feature, max_length):
    if len(feature) < max_length:
        # If feature is shorter than max_length, pad with zeros
        return np.pad(feature, (0, max_length - len(feature)), mode='constant')
    else:
        # If feature is longer than max_length, truncate
        return feature[:max_length]




In [9]:
#Define the motions dictionary
emotions = {
    '01':'neutral',
    '02':'calm',
    '03':'happy',
    '04':'sad',
    '05':'angry',
    '06':'fearful',
    '07':'disgust',
    '08':'surprised'
}

#Emotions we want to observe
observed_emotions = ['calm', 'happy', 'fearful', 'disgust']

In [10]:
# Load the data and extract features for each sound file
def load_data(data_dir, test_size=0.2):
    x, y = [], []
    for folder in glob.glob(os.path.join(data_dir, 'Actor_*')):
        print(folder)
        for file in glob.glob(os.path.join(folder, '*.wav')):
            file_name = os.path.basename(file)
            emotion = emotions[file_name.split('-')[2]]
            if emotion not in observed_emotions:
                continue
            feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
            x.append(feature)
            y.append(emotion)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

# Specify the path to your dataset directory
data_dir = '/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data'

# Load the data
x_train, x_test, y_train, y_test = load_data(data_dir)

# Shape of train and test set and number of features extracted
print((x_train.shape[0], x_test.shape[0]))
print(f'Features extracted: {x_train.shape[1]}')


/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_23
/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_20
/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_24
/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_22
/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_19
/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_18
/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_21
/content/drive/MyDrive/Speech_Emotion_Detection-

In [11]:
# #Load the data and extract features for each sound file
# def load_data(test_size = 0.2):
#   x, y = [], []
#   for folder in glob.glob('/content/Actor_*'):
#     print(folder)
#     for file in glob.glob(folder + '/*.wav'):
#       file_name = os.path.basename(file)
#       emotion = emotions[file_name.split('-')[2]]
#       if emotion not in observed_emotions:
#         continue
#       feature = extract_feature(file, mfcc = True, chroma = True, mel = True)
#       x.append(feature)
#       y.append(emotion)
#   return train_test_split(np.array(x), y, test_size = test_size, random_state = 9)


In [12]:
!pip install pydub

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1


In [13]:
from sklearn.neural_network import MLPClassifier

# Initialize MLPClassifier with expected parameters
model = MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
                      beta_2=0.999, early_stopping=False, epsilon=1e-08,
                      hidden_layer_sizes=(300,), learning_rate='adaptive',
                      learning_rate_init=0.001, max_fun=15000, max_iter=500,
                      momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
                      power_t=0.5, random_state=None, shuffle=True, solver='adam',
                      tol=0.0001, validation_fraction=0.1, verbose=False,
                      warm_start=False)

# Now you can fit the model, predict, and evaluate as usual


In [14]:
# #Initialise Multi Layer Perceptron Classifier
model = MLPClassifier(alpha = 0.01, batch_size = 256, epsilon = 1e-08, hidden_layer_sizes = (300,), learning_rate = 'adaptive', max_iter = 500)

In [15]:
model.fit(x_train, y_train)

In [16]:
#Predict for the test set
y_pred = model.predict(x_test)

In [17]:
y_pred

array(['fearful', 'calm', 'calm', 'happy', 'happy', 'calm', 'fearful',
       'happy', 'calm', 'happy', 'fearful', 'happy', 'fearful', 'fearful',
       'fearful', 'calm', 'happy', 'calm', 'fearful', 'happy', 'calm',
       'fearful', 'fearful', 'calm', 'fearful', 'happy', 'happy', 'happy',
       'happy', 'calm', 'happy', 'calm', 'happy', 'happy', 'happy',
       'calm', 'happy', 'happy', 'calm', 'calm', 'happy', 'happy',
       'happy', 'happy', 'calm', 'happy', 'calm', 'calm', 'happy',
       'happy', 'calm', 'happy', 'happy', 'calm', 'happy', 'happy',
       'fearful', 'happy', 'happy', 'happy', 'calm', 'happy', 'calm',
       'fearful', 'happy', 'happy', 'happy', 'happy', 'happy', 'calm',
       'happy', 'calm', 'calm', 'calm', 'happy', 'calm', 'calm', 'happy',
       'happy', 'happy', 'fearful', 'calm', 'calm', 'disgust', 'calm',
       'disgust', 'happy', 'calm', 'calm', 'calm', 'calm', 'fearful',
       'happy', 'calm', 'disgust', 'happy', 'calm', 'calm', 'calm',
       'happy'

In [18]:
#Calculate Accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 45.45%


In [19]:
from sklearn.metrics import accuracy_score, f1_score

In [20]:
import pandas as pd
df=pd.DataFrame({'Actual': y_test, 'Predicted':y_pred})
df.head(20)

Unnamed: 0,Actual,Predicted
0,happy,fearful
1,calm,calm
2,happy,calm
3,happy,happy
4,fearful,happy
5,calm,calm
6,happy,fearful
7,happy,happy
8,disgust,calm
9,happy,happy


In [21]:
import os

# Define the directory path
directory_path = '/content/drive/MyDrive/Speech_Emotion_Detection-master/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_01'

# List all files in the directory
files = os.listdir(directory_path)

# Print the list of files
print("Files in the directory:")
for file in files:
    print(file)


Files in the directory:
03-01-01-01-02-02-01.wav
03-01-01-01-02-01-01.wav
03-01-01-01-01-02-01.wav
03-01-03-02-01-02-01.wav
03-01-02-01-01-02-01.wav
03-01-04-01-01-01-01.wav
03-01-03-01-01-01-01.wav
03-01-02-02-01-01-01.wav
03-01-02-02-02-01-01.wav
03-01-02-01-02-02-01.wav
03-01-03-02-02-01-01.wav
03-01-03-02-02-02-01.wav
03-01-02-02-01-02-01.wav
03-01-03-01-01-02-01.wav
03-01-03-01-02-01-01.wav
03-01-03-01-02-02-01.wav
03-01-02-01-02-01-01.wav
03-01-03-02-01-01-01.wav
03-01-02-01-01-01-01.wav
03-01-02-02-02-02-01.wav
03-01-05-01-01-02-01.wav
03-01-04-01-02-02-01.wav
03-01-05-02-02-01-01.wav
03-01-04-01-01-02-01.wav
03-01-06-01-01-01-01.wav
03-01-04-02-01-02-01.wav
03-01-05-02-01-01-01.wav
03-01-05-01-02-02-01.wav
03-01-04-02-02-01-01.wav
03-01-04-02-02-02-01.wav
03-01-05-02-02-02-01.wav
03-01-05-01-01-01-01.wav
03-01-04-02-01-01-01.wav
03-01-04-01-02-01-01.wav
03-01-05-02-01-02-01.wav
03-01-05-01-02-01-01.wav
03-01-06-02-02-01-01.wav
03-01-07-01-01-01-01.wav
03-01-07-02-01-01-01.wav
0

In [22]:
import pickle
# Writing different model files to file
with open( 'modelForPrediction1.sav', 'wb') as f:
    pickle.dump(model,f)

In [25]:
filename = 'modelForPrediction1.sav'
loaded_model = pickle.load(open(filename, 'rb')) # loading the model file from the storage

feature=extract_feature("/content/drive/MyDrive/Speech_Emotion_Detection-master/speech-emotion-recognition-ravdess-data/Actor_01/03-01-01-01-01-01-01.wav", mfcc=True, chroma=True, mel=True)

feature=feature.reshape(1,-1)

prediction=loaded_model.predict(feature)
prediction

array(['calm'], dtype='<U7')