In [1]:
import librosa
import soundfile
import os, glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
import IPython
import pyaudio
import wave


1 - Librosa provides building blocks to create audio retrieval system.
2 - SoundFile reads and writes sound files supported in many platforms or operating systems.
3 - OS provides functions for creating and removing a directory, fetching its contents, 
    changing and identifying the current directory.
4 - GLOB returns all file paths matching a specific pattern.
5 - IPython provides a rich toolkit to helps make the most out of using Python interactively.
6 - Pyaudio is used to play and record audio in any operating system.
7 - Wave writes audio data in raw format to a file-like object and reads the attributes of a WAV file.


In [2]:
#Displaying emotions in the RAVDESS dataset
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}

#Displaying the emotions to be observed
obs_emo = ['calm', 'happy', 'fearful', 'disgust']

In [3]:
#Initializing the MLPClassifier
model = MLPClassifier(alpha=0.01, batch_size='auto',epsilon=1e-08,
                      hidden_layer_sizes=100, learning_rate='adaptive', learning_rate_init=0.001, max_iter=500)

A Perceptron is an Artificial Neuron

It is the simplest possible Neural Network

Neural Networks are the building blocks of Machine Learning.
The original Perceptron was designed to take a number of binary inputs, and produce one binary output (0 or 1).

The idea was to use different weights to represent the importance of each input, and that the sum of the values 
should be greater than a threshold value before making a decision like true or false (0 or 1).

Perceptron Inputs are called nodes, whereby each node has a value and weight
Node values are the binary input values (0,1)
Node Weights shows the stregth of each node
Activation Function amps result(weighted sum) into values like 1 or 0

Multi-Layer Perceptron is used for sophisticated decision making


In [4]:
#Recording the user's audio
def recordAudio():
    #CHUNCK is the number of frames the signals are split into
    chunck = 1024 #Recoding in chuncks of 1024samples (block size)
    sample_format = pyaudio.paInt16 #16bits per sample, data type format
    #Each frame will have 1 sample as "channels=1"
    channels = 1
    #fs = sampling frequency
    #fs is the number of audio samples collected in 1 second
    fs = 48100  # Record at 44100 samples per 1 second //as per ravdess dataset the frequecy is 48kHz
    seconds = 5
    filename = "Predict-Record-Audio.wav"
    
    # Creating an interface to PortAudio
    p = pyaudio.PyAudio()
    
    print("Recording...")
    
    #Start recording
    stream = p.open(format=sample_format, channels=channels, rate=fs,
                   frames_per_buffer=chunck, input=True)
    
    #Initializing an array to store frames
    frames = []
    
    #Storing data in chuncks for 5seconds
    for i in range(0, int(fs / chunck * seconds)):
        data = stream.read(chunck)
        ft = frames.append(data)
   
        
    #Terminating and shutting down the stream/Recording
    stream.stop_stream()
    stream.close()
    
    #Terminating the PortAudio interface
    p.terminate()
    
    print("Recording Complete.")
    
    #Saving the recorded data as a .wav file
    wf = wave.open(filename, 'wb')
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(fs)
    wf.writeframes(b''.join(frames))
    wf.close()

In [5]:
#Play the audio file
def play(file):
    chunck = 1024
    wf = wave.open(file, 'rb')
    
    p = pyaudio.PyAudio()
    
    #To record or play audio, open a stream on the desired device
    stream = p.open(format = p.get_format_from_width(wf.getsampwidth()),
                   channels=wf.getnchannels(),
                   rate = wf.getframerate(),
                   output=True)
    data = wf.readframes(chunck)
    
    while len(data) > 0:
        stream.write(data)
        data = wf.readframes(chunck)
    stream.stop_stream()
    stream.close()
    
    p.terminate()


Fourier Transform is a mathematical concept that can be used in the conversion of a continuous
signal from its original time-domain state to a frequency-domain state.
We will be using Fourier Transforms (FT) to convert audio signals to a 
frequency-centric representation.

Fourier Transforms is a mathematical concept that can decompose this signal and bring out the
individual frequencies. This is vital for understanding all the frequencies that are combined
together to form the sound we hear. Fourier Transform (FT) gives all the frequencies present
in the signal and also shows the magnitude of each frequency.


In [6]:
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float64")
        sample_rate = sound_file.samplerate

        """
        Short Time Fourier Transform (STFT). 
        STFTs can be used as a way of quantifying the change of a nonstationary signal's frequency
        and phase content over time.
        """

        if chroma:
            stft = np.abs(librosa.stft(X))
        result = np.array([])
        
        if mfcc:
            mfcc = np.mean(librosa.feature.mfcc(y=X, sr = sample_rate, n_mfcc = 40).T, axis=0)
        result = np.hstack((result, mfcc))
        
        if chroma:
            chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma))
        
        if mel:
            mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel))
    
    return result

In [7]:
#Loading data and extracting features for each sound file
def load_data(test_size=0.2):
    x, y = [], []
    path = "C:/Users/Geraldine/Desktop/new-folder/Dataset/speech-emotion-recognition-ravdess-data/Actor_15/*.wav"
    for file in glob.glob(path):
              
        file_name = os.path.basename(file)
        emo = emotions[file_name.split("-")[2]]
        
        
        if emo not in obs_emo:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emo)
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)
# load_data()

In [8]:
# Training the model is the program is started
# def trainModel():
    
#Splitting the dataset into train and test datasets
x_train, x_test, y_train, y_test = load_data(test_size = 0.25)
    
#Acquiring the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))
    
#Acquiring the number of features extracted
print(f'Feature Extracted: {x_train.shape[1]}')
    
#Training the model
model.fit(x_train, y_train)
    
#Predicting the test accuracy
y_pred = model.predict(x_test)
    
#Getting the model's score/accuracy
accuracy = accuracy_score(y_true = y_test, y_pred=y_pred)
    
#Displaying the accuracy
print("Accuracy : {:.2f}%".format(accuracy*100))
    

 -1.83105469e-04 -2.13623047e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
 1.83105469e-04 1.83105469e-04] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
 -3.05175781e-05 -3.05175781e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
  0.00000000e+00  0.00000000e+00] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
 0.00000000e+00 

 1.52587891e-04 9.15527344e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
 6.10351562e-05 9.15527344e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
  6.10351562e-05  6.10351562e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
 3.05175781e-05 6.10351562e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T, axis=0)
 9.15527344e-05 6.10351562e-05] as keyword args. From version 0.10 passing these as positional arguments will result in an error
  mel = np.mean(librosa.feature.melspectrogra

(24, 8)
Feature Extracted: 180
Accuracy : 87.50%


In [9]:
#Predicting on-predict audion

def predictAudio(path):
    file = "C:/Users/Geraldine/Desktop/new-folder/audios/"+path
    print(file)
    IPython.display.Audio(file)
    x_predictAudio = []
    featurePredictAudio = extract_feature(file, mfcc=True, chroma=True, mel=True) #extract features of recorded audio
    x_predictAudio.append(featurePredictAudio)
    y_predictAudio = model.predict(np.array(x_predictAudio))
    print(y_predictAudio)
    for i in y_predictAudio:
        prediction = i
    return prediction

In [10]:
#Recording the user's audio

def record_predictAudio():
    x_predictAudio = []
    recordAudio()#Recording user's audio for prediction
    file = "C:/Users/Geraldine/Desktop/RT-SER/Predict-Record-Audio.wav" #file path to the recorded audio
    
    #Extracting the features of the recorded audio
    featurePredAudio = extract_feature(file, mfcc=True, chroma=True, mel=True)
    x_predictAudio.append(featurePredAudio)
    y_predictAudio = model.predict(np.array(x_predictAudio))
    print(y_predictAudio)
    for i in y_predictAudio:
        prediction = i
        print(prediction)
    return prediction

In [11]:
#Playing the audio recorded


play('Predict-Record-Audio.wav')

In [12]:
import joblib

#Saving the model
filename = 'ser_model.sav'

joblib.dump(model, filename)

['ser_model.sav']

In [13]:
#Loading the model from disc
loaded_model = joblib.load(filename)
result = loaded_model.score(x_test, y_test)
print(result)

0.875


In [None]:
# app = Flask(__name__)
# @app.route('/', methods=["POST", "GET"])
# def home():
#     print('hell')
    
# if __name__ == '__main__':
#     app.run(host="0.0.0.0", port=5000, debug=True)

from werkzeug.wrappers import Request, Response
from flask import *

app = Flask(__name__)

@app.route("/")
def hello():
    return render_template("index.html");

@app.route("/record_pred.html")
def recPred():
    predicted = record_predictAudio()
    return render_template("record_pred.html", predicted = predicted);

# @app.route("/record_pred.html")
# def recPred():
#     record_predictAudio()
#     return render_template("record_pred.html");

@app.route("/input_aud.html", methods=('GET', 'POST'))
def inputPred():
    if request.method == 'POST':
        file_path = request.form.get('file')
    print("C:/Users/Geraldine/Desktop/new-folder/audios")
    predicted = predictAudio(file_path)
    return render_template("input_aud.html", predicted = predicted);

@app.route("/inputForm.html", methods=('GET', 'POST'))
def inputForm():
    if request.method == 'POST':
        file_path = request.form.get('file')
        print("C:/Users/Geraldine/Desktop/new-folder/audios/"+file_path)
    return render_template("inputForm.html");

if __name__ == '__main__':
    from werkzeug.serving import run_simple
    run_simple('localhost', 9000, app)

 * Running on http://localhost:9000/ (Press CTRL+C to quit)
