In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix



In [2]:
def extract_feature(file_name, mfcc, chroma, mel):
    """Build a 1-D feature vector for a single audio file.

    Each enabled feature family is computed with librosa, averaged over the
    time axis, and the per-family means are concatenated in the fixed order
    MFCC -> chroma -> mel.

    Parameters
    ----------
    file_name : str or path-like
        Audio file that ``soundfile`` can open.
    mfcc : bool
        Include the time-averaged MFCCs (40 coefficients).
    chroma : bool
        Include the time-averaged chromagram computed from the STFT magnitude.
    mel : bool
        Include the time-averaged mel spectrogram.

    Returns
    -------
    numpy.ndarray
        Concatenated feature means; an empty array when every flag is false.
    """
    # Only the samples and the rate are needed, so the file handle is released
    # before the (comparatively slow) feature computation starts.
    with soundfile.SoundFile(file_name) as sound_file:
        signal = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile returns multi-channel audio as (frames, channels); librosa does
    # not expect that layout, so downmix to mono by averaging the channels.
    if signal.ndim > 1:
        signal = signal.mean(axis=1)

    # Local names are kept distinct from the boolean flags so the flags are
    # never overwritten by the arrays they control.
    parts = []
    if mfcc:
        mfcc_mean = np.mean(
            librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40).T, axis=0
        )
        parts.append(mfcc_mean)
    if chroma:
        magnitude = np.abs(librosa.stft(signal))
        chroma_mean = np.mean(
            librosa.feature.chroma_stft(S=magnitude, sr=sample_rate).T, axis=0
        )
        parts.append(chroma_mean)
    if mel:
        # `y` must be passed by keyword: recent librosa releases reject
        # positional audio input.
        mel_mean = np.mean(
            librosa.feature.melspectrogram(y=signal, sr=sample_rate).T, axis=0
        )
        parts.append(mel_mean)

    # Seeding with an empty float64 array keeps the original result dtype and
    # yields an empty vector when no feature family was requested.
    return np.hstack([np.array([])] + parts)


In [3]:
# Emotion codes (third dash-separated field of a RAVDESS file name) paired
# with their human-readable labels.
emotions = dict(zip(
    ('01', '02', '03', '04', '05', '06', '07', '08'),
    ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'),
))

# Only clips labelled with one of these emotions are kept for training/testing.
observed_emotions = [
    'calm',
    'happy',
    'fearful',
    'disgust',
]


In [4]:
def load_data(
    test_size=0.2,
    data_glob=r"E:\Final Project\speech-emotion-recognition-ravdess-data\Actor_*\*.wav",
):
    """Extract features for every observed-emotion clip and split train/test.

    Parameters
    ----------
    test_size : float or int
        Forwarded to ``train_test_split`` as the size of the test partition.
    data_glob : str
        Glob pattern selecting the ``.wav`` files to load. The default is the
        original dataset location, written as a raw string so the Windows
        backslashes are not parsed as (invalid) escape sequences.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as produced by
        ``train_test_split`` with ``random_state=9``.

    Raises
    ------
    ValueError
        If no file matching ``data_glob`` carries an observed emotion.
    KeyError
        If a file name's third dash-separated field is not a known emotion code.
    """
    x, y = [], []
    # glob returns files in an OS-dependent order; sorting makes the seeded
    # split below reproducible from one machine to the next.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        # The third dash-separated field of the file name is the emotion code.
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)

    if not x:
        # Fail with an actionable message instead of the generic "train set
        # will be empty" error raised by train_test_split.
        raise ValueError(
            f"No audio files with an observed emotion matched {data_glob!r}"
        )
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)


In [5]:
# Extract features for every observed-emotion clip and hold out 25% of the
# samples as the test partition.
x_train,x_test,y_train,y_test=load_data(test_size=0.25)


In [6]:
# Show (training samples, test samples) as a tuple.
print((len(x_train), len(x_test)))

(576, 192)


In [7]:
# Number of columns in the training matrix, i.e. the length of each feature vector.
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


In [8]:
 # Train a random-forest classifier (100 trees, fixed seed) on the training split.
classifier = RandomForestClassifier(random_state=0, n_estimators=100).fit(x_train, y_train)

# Predict an emotion label for every held-out test sample.
c_p = classifier.predict(x_test)

# Print overall accuracy, the per-class report, then the confusion matrix,
# each on its own line.
print(
    accuracy_score(y_true=y_test, y_pred=c_p),
    classification_report(y_test, c_p),
    confusion_matrix(y_test, c_p),
    sep="\n",
)


0.703125
              precision    recall  f1-score   support

        calm       0.75      0.91      0.83        57
     disgust       0.65      0.67      0.66        48
     fearful       0.71      0.65      0.68        37
       happy       0.68      0.54      0.60        50

    accuracy                           0.70       192
   macro avg       0.70      0.69      0.69       192
weighted avg       0.70      0.70      0.70       192

[[52  3  0  2]
 [ 7 32  2  7]
 [ 5  4 24  4]
 [ 5 10  8 27]]


In [9]:
 # Retrain the random-forest classifier with a smaller ensemble (20 trees, same seed).
classifier = RandomForestClassifier(random_state=0, n_estimators=20).fit(x_train, y_train)

# Predict an emotion label for every held-out test sample.
c_p = classifier.predict(x_test)

# Print overall accuracy, the per-class report, then the confusion matrix,
# each on its own line.
print(
    accuracy_score(y_true=y_test, y_pred=c_p),
    classification_report(y_test, c_p),
    confusion_matrix(y_test, c_p),
    sep="\n",
)


0.671875
              precision    recall  f1-score   support

        calm       0.74      0.86      0.80        57
     disgust       0.63      0.65      0.64        48
     fearful       0.62      0.68      0.65        37
       happy       0.65      0.48      0.55        50

    accuracy                           0.67       192
   macro avg       0.66      0.67      0.66       192
weighted avg       0.67      0.67      0.67       192

[[49  7  0  1]
 [ 7 31  2  8]
 [ 5  3 25  4]
 [ 5  8 13 24]]
