In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [51]:
# Extract info from audio files
def extract_audio_features(file_name, mfcc, chroma, mel):
    """
    mfcc represents the short term power spectrum of the sound
    chroma is the pitch
    mel is the spectrogram frequency
    """
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate
        if chroma:
            fourier = np.abs(librosa.stft(X))
            
        # compile the three features into a result    
        result = np.array([])

        if mfcc:
            pwr_spec = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result = np.hstack((result, pwr_spec)) # add to result
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=fourier, 
                                                        sr=sample_rate,
                                                        ).T, axis=0)
            result = np.hstack((result, chroma))
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
            result = np.hstack((result, mel))
    return result

In [135]:
# Define the emotion labels in the RAVDESS dataset
emotions = {
    '01': 'neutral',
    '02': 'calm',    
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',    
    '08': 'surprised' 
    }
# we are looking at a subset of emotions of interest    
observed_emotions = ('sad', 'happy', 'fearful', 'surprised' )


In [49]:
# Load the files in. 
# We needed the previous functions to make sense of the filename and info

def load_data(test_size=0.2):
    x,y = [], []
    # use the glob library to parse through files with wildcard
    files = [file for file in glob.glob("..\data\Actor_*\*.wav")]
    for file in files:
        file_name=os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_audio_features(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)

    return train_test_split(np.array(x), y, test_size=test_size, random_state=11)


In [92]:
# Note this is a rather conservative split given the amount of data.
# First I will run it as is but then I want to train on more of the data 
# Could use data augmentation to beef it up.
X_train, X_test, y_train, y_test = load_data(test_size=0.20)

In [93]:
print(type(X_train), type(y_test))
print('shapes of data:', (np.array(X_train).shape, np.array(X_test).shape, np.array(y_test).shape), np.shape(y_train))
print('The first number represents how many files, the second is how many feautures extracted' )

<class 'numpy.ndarray'> <class 'list'>
shapes of data: ((614, 180), (154, 180), (154,)) (614,)
The first number represents how many files, the second is how many feautures extracted


In [94]:
# The method shows here is from sklearn's neural network... could potentially try our own later
# Multi Layer Perceptron Classifier
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(500,), learning_rate='adaptive', max_iter=500, verbose=True)


In [95]:
model.fit(X_train, y_train)

Iteration 1, loss = 20.89579859
Iteration 2, loss = 9.48402521
Iteration 3, loss = 9.23242576
Iteration 4, loss = 7.20703311
Iteration 5, loss = 3.76408598
Iteration 6, loss = 4.07899478
Iteration 7, loss = 3.28271669
Iteration 8, loss = 3.79703887
Iteration 9, loss = 2.02600256
Iteration 10, loss = 2.34710678
Iteration 11, loss = 1.77742139
Iteration 12, loss = 1.43767627
Iteration 13, loss = 1.59180284
Iteration 14, loss = 1.33044353
Iteration 15, loss = 1.23131094
Iteration 16, loss = 1.17443287
Iteration 17, loss = 1.11062840
Iteration 18, loss = 1.05289812
Iteration 19, loss = 1.03571937
Iteration 20, loss = 1.04958473
Iteration 21, loss = 0.98199835
Iteration 22, loss = 1.06285670
Iteration 23, loss = 1.11102844
Iteration 24, loss = 0.97210655
Iteration 25, loss = 1.03939402
Iteration 26, loss = 0.94622273
Iteration 27, loss = 0.93425317
Iteration 28, loss = 0.82279339
Iteration 29, loss = 0.79999273
Iteration 30, loss = 0.75767514
Iteration 31, loss = 0.74844225
Iteration 32, lo

MLPClassifier(alpha=0.01, batch_size=256, hidden_layer_sizes=(500,),
              learning_rate='adaptive', max_iter=500, verbose=True)

In [98]:
y_pred = model.predict(X_test)
print('Accuracy:', round(accuracy_score(y_true=y_test, y_pred=y_pred), ndigits=3))

Accuracy: 0.708


In [129]:
# Let's try reading in our own sample
audio_sample = extract_audio_features('C:\\Users\\jonma\\Programming\\speech-emotion\\fearful-sample.wav', mfcc=True, chroma=True, mel=True)

In [128]:
model.predict(np.array(audio_sample).reshape(1, -1))
# Hooray!!! It worked!

array(['fearful'], dtype='<U7')

In [130]:
audio_sample1 = extract_audio_features('C:\\Users\\jonma\\Programming\\speech-emotion\\disgust-sample.wav', mfcc=True, chroma=True, mel=True)
model.predict(np.array(audio_sample1).reshape(1, -1))


array(['fearful'], dtype='<U7')

In [136]:
audio_sample2 = extract_audio_features('C:\\Users\\jonma\\Programming\\speech-emotion\\extra-disgust-sample.wav', mfcc=True, chroma=True, mel=True)
model.predict(np.array(audio_sample2).reshape(1, -1))

array(['fearful'], dtype='<U7')