In [22]:
# pip install librosa soundfile numpy scikit-learn pyaudio pydub
import librosa
import soundfile
import os
import glob
import pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import pyaudio
import wave

In [23]:
# RAVDESS emotion codes. The third dash-separated field of each file name is
# looked up in this table to obtain the emotion label.
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Subset of emotions kept for training; files carrying any other label are
# skipped when the dataset is loaded.
observed_emotions = ["calm", "happy", "fearful", "disgust"]

In [24]:
# Initialize the Multi Layer Perceptron Classifier.
# random_state pins the weight initialisation and mini-batch shuffling so that
# a fresh "Restart & Run All" reproduces the same fitted model and accuracy.
# NOTE(review): per the scikit-learn docs, learning_rate='adaptive' is only
# used when solver='sgd'; with the default solver it has no effect - confirm
# whether solver='sgd' was intended.
model = MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08,
                      hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500,
                      random_state=9)


In [25]:
# Recording user audio: capture RECORD_SECONDS of microphone input and save it
# as a WAV file named "Sauda.wav" in the current working directory.

# Number of frames read from the input stream per call
CHUNK = 1024

# Sampling resolution (16-bit integer samples)
FORMAT = pyaudio.paInt16

# Number of audio channels
CHANNELS = 1

# Sampling rate, in Hz
RATE = 44100

# Recording duration, in seconds
RECORD_SECONDS = 5

p = pyaudio.PyAudio()
frames = []
try:
    # Query the sample width while the PyAudio session is still open.
    sample_width = p.get_sample_size(FORMAT)

    stream = p.open(format=FORMAT,
                    channels=CHANNELS,
                    rate=RATE,
                    input=True,
                    frames_per_buffer=CHUNK)
    try:
        # Starting recording
        print(" * recording")

        for i in range(0, int(RATE / CHUNK * RECORD_SECONDS)):
            data = stream.read(CHUNK)
            frames.append(data)

        # Finishing recording
        print(" * finish")
    finally:
        # Release the input stream even if a read fails part-way through.
        stream.stop_stream()
        stream.close()
finally:
    # Always shut the audio session down so the device is not left locked.
    p.terminate()

# The context manager closes the file even if writing the frames fails.
with wave.open("Sauda.wav", 'wb') as wf:
    wf.setnchannels(CHANNELS)
    wf.setsampwidth(sample_width)
    wf.setframerate(RATE)
    wf.writeframes(b''.join(frames))


 * recording
 * finish


In [26]:
#DataFlair - Extract features (mfcc, chroma, mel) from a sound file

from librosa.feature import melspectrogram


def extract_feature(file_name, mfcc, chroma, mel):
    """Build a 1-D feature vector for one audio file.

    Parameters
    ----------
    file_name : path of the audio file, opened with ``soundfile.SoundFile``.
    mfcc : if truthy, append the time-averaged MFCCs (``n_mfcc=40``).
    chroma : if truthy, append the time-averaged chroma computed from the
        magnitude STFT.
    mel : if truthy, append the time-averaged mel spectrogram.

    Returns
    -------
    numpy.ndarray
        Concatenation of the requested feature groups, in the order
        mfcc, chroma, mel. Empty when every flag is falsy.
    """
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate = sound_file.samplerate

    # soundfile yields a 2-D (frames, channels) array for multi-channel files,
    # whereas the librosa calls below are given a single mono signal. Average
    # the channels so stereo recordings work without rewriting them on disk.
    if X.ndim > 1:
        X = X.mean(axis=1)

    # Start from an empty float array so the result is always a 1-D ndarray,
    # even when no feature group is requested.
    parts = [np.array([])]
    if mfcc:
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfccs.T, axis=0))
    if chroma:
        # The magnitude spectrogram is only needed for the chroma features.
        stft = np.abs(librosa.stft(X))
        chroma_bins = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_bins.T, axis=0))
    if mel:
        mel_spec = librosa.feature.melspectrogram(y=X, sr=sample_rate)
        parts.append(np.mean(mel_spec.T, axis=0))

    return np.hstack(parts)

In [27]:
# Load the data and extract features for each sound file
def load_data(test_size=0.2):
    """Unfinished draft of the dataset loader.

    Iterates over the RAVDESS ``Actor_*`` wav files and looks up each file's
    emotion label, but does nothing with it: no features are extracted,
    ``test_size`` is unused and the function implicitly returns None.
    Later cells in this notebook redefine ``load_data`` and shadow this draft.
    """
    features, labels = [], []
    for wav_path in glob.glob("D:\\AR_VR\\RAVDESS_Dataset\\Actor_*\\*.wav"):
        # The emotion code is the third dash-separated field of the file name.
        emotion_code = os.path.basename(wav_path).split("-")[2]
        emotion = emotions[emotion_code]
        
        

In [28]:
from pydub import AudioSegment

In [29]:
def load_data(test_size=0.2):
    """Load the observed-emotion samples, converting stereo files to mono first.

    WARNING: this variant modifies the dataset on disk. Any matching file that
    has more than one channel is re-exported as mono over the original file.
    Files that are already mono, and files whose emotion is not in
    ``observed_emotions``, are left untouched.

    Parameters
    ----------
    test_size : forwarded to ``train_test_split``.

    Returns
    -------
    The result of ``train_test_split(np.array(x), y, ...)`` with
    ``random_state=9``, where ``x`` holds one feature vector per file and
    ``y`` the matching emotion labels.
    """
    x, y = [], []
    for file in glob.glob("D:\\AR_VR\\RAVDESS_Dataset\\Actor_*\\*.wav"):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]

        # Filter before touching the file so skipped recordings are never rewritten.
        if emotion not in observed_emotions:
            continue

        sound = AudioSegment.from_wav(file)
        # Only re-encode when needed; rewriting mono files on every call is
        # wasted I/O and repeatedly overwrites the source data.
        if sound.channels != 1:
            sound.set_channels(1).export(file, format="wav")

        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)

    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [30]:
#DataFlair - Load the data and extract features for each sound file
def load_data(test_size=0.2, data_glob="D:\\AR_VR\\RAVDESS_Dataset\\Actor_*\\*.wav"):
    """Extract features for every observed-emotion file and split train/test.

    Parameters
    ----------
    test_size : forwarded to ``train_test_split``.
    data_glob : glob pattern selecting the wav files to load. Defaults to the
        original hard-coded dataset location so existing calls are unchanged.

    Returns
    -------
    The result of ``train_test_split(np.array(x), y, ...)`` with
    ``random_state=9``, where ``x`` holds one feature vector per file and
    ``y`` the matching emotion labels.

    Raises
    ------
    ValueError
        If no file matching ``data_glob`` carries an observed emotion.
    """
    x, y = [], []
    # glob order depends on the filesystem; sort so the fixed random_state
    # yields the same split on every machine.
    for file in sorted(glob.glob(data_glob)):
        file_name = os.path.basename(file)
        # The emotion code is the third dash-separated field of the file name.
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        x.append(extract_feature(file, mfcc=True, chroma=True, mel=True))
        y.append(emotion)

    if not x:
        # Fail with an actionable message rather than an opaque split error.
        raise ValueError(
            "No audio files for the observed emotions matched {!r}".format(data_glob)
        )

    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [31]:
# Load the dataset and hold out 25% of the samples for testing.
(x_train, x_test, y_train, y_test) = load_data(test_size=0.25)


#import sklearn.model_selection as model_selection

#x_train, x_test, y_train, y_test = model_selection.train_test_split(np.array(x), y, test_size=test_size, random_state=9)

In [32]:
# Number of samples in the training and testing partitions
print((len(x_train), len(x_test)))

(987, 329)


In [33]:
 # Get the number of features extracted
# Length of each feature vector (second axis of the training matrix)
print(f'Features extracted: {np.shape(x_train)[1]}')

Features extracted: 180


In [34]:
def trainModel():
    """Fit the notebook-level ``model`` and report its accuracy on the test split.

    Reads the module-level ``model``, ``x_train``, ``y_train``, ``x_test`` and
    ``y_test``. Defined as a function so the interactive menu's "train model"
    option has something to call.

    Returns
    -------
    tuple
        ``(y_pred, accuracy)``: the predictions for ``x_test`` and the
        accuracy as a fraction.
    """
    model.fit(x_train, y_train)
    predictions = model.predict(x_test)

    score = accuracy_score(y_true=y_test, y_pred=predictions)
    print("Accuracy: {:.2f}%".format(score*100))
    return predictions, score


# Train once now; y_pred and accuracy stay available to the cells below.
y_pred, accuracy = trainModel()

Accuracy: 80.55%


In [35]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels and columns the predicted labels; with no explicit
# `labels` argument scikit-learn orders them by sorted label value.
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
cm

array([[84,  4,  0, 19],
       [ 2, 35,  1, 12],
       [ 6,  2, 67, 11],
       [ 2,  1,  4, 79]], dtype=int64)

In [36]:
# record your audio and predict emotion
def _record_audio(output_path):
    """Record RECORD_SECONDS of microphone input and save it as a WAV file.

    Uses the module-level CHUNK, FORMAT, CHANNELS, RATE and RECORD_SECONDS
    recording settings.
    """
    audio = pyaudio.PyAudio()
    try:
        sample_width = audio.get_sample_size(FORMAT)
        stream = audio.open(format=FORMAT,
                            channels=CHANNELS,
                            rate=RATE,
                            input=True,
                            frames_per_buffer=CHUNK)
        try:
            print(" * recording")
            n_reads = int(RATE / CHUNK * RECORD_SECONDS)
            frames = [stream.read(CHUNK) for _ in range(n_reads)]
            print(" * finish")
        finally:
            # Release the input stream even if a read fails part-way through.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()

    with wave.open(output_path, 'wb') as wf:
        wf.setnchannels(CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(RATE)
        wf.writeframes(b''.join(frames))


def record_predictAudio():
    """Record a clip from the microphone and print the emotion predicted for it.

    The previous version called itself on its first line, recursing until
    RecursionError without ever recording. The clip is now recorded to the
    same path that is subsequently read for feature extraction.
    """
    file = "D:\\AR_VR\\RAVDESS_Dataset\\Sauda.wav" #Recorded audio filepath
    _record_audio(file) #Record audio to predict

    featurePredictAudio = extract_feature(file, mfcc=True, chroma=True, mel=True) #extract features of recorded audio
    # The model expects a 2-D array: one row per sample.
    y_predictAudio = model.predict(np.array([featurePredictAudio]))
    print("Emotion Predicted: {}".format(y_predictAudio))

In [37]:
# predict on pre-recorded audio
def predictAudio():
    """Prompt for an audio file path and print the emotion predicted for it.

    Surrounding whitespace and quotes are removed from the entered path. If
    the path does not point to an existing file, a message is printed and the
    function returns without predicting, so the interactive menu keeps running.
    """
    # Pasted or drag-and-dropped paths often carry surrounding spaces or quotes.
    file = input("Please enter path to your file.\n").strip().strip('"\'')
    if not os.path.isfile(file):
        print("File not found: {}".format(file))
        return

    featurePredictAudio = extract_feature(file, mfcc=True, chroma=True, mel=True) #extract features of recorded audio
    # The model expects a 2-D array: one row per sample.
    y_predictAudio = model.predict(np.array([featurePredictAudio]))
    print("Emotion Predicted: {}".format(y_predictAudio))

In [38]:
# Interactive menu: keep prompting until the user chooses to quit.
# NOTE(review): option 1 calls trainModel(), which must be defined by an
# earlier cell before that option is chosen - confirm it exists.
while True:
    raw_choice = input("Enter 1 to create and train model. \nEnter 2 to record and predict audio. \nEnter 3 to predict on pre-recorded audio. \nEnter 4 to quit. \n")
    try:
        choice = int(raw_choice)
    except ValueError:
        # A typo should re-prompt instead of crashing the cell.
        print("Invalid choice. Please enter a number from 1 to 4.")
        continue

    if choice == 1:
        trainModel()
    elif choice == 2:
        record_predictAudio()
    elif choice == 3:
        predictAudio()
    elif choice == 4:
        # Leave the loop rather than calling quit(), which would shut down the
        # Jupyter kernel and discard the trained model.
        break
    else:
        print("Invalid choice. Please enter a number from 1 to 4.")

Emotion Predicted: ['fearful']
Emotion Predicted: ['happy']
Emotion Predicted: ['fearful']
Emotion Predicted: ['happy']
Emotion Predicted: ['calm']
Emotion Predicted: ['happy']
