# Emotion Recognition in Voice Recordings
The aim of our project is to use logistic regression to classify a person's emotional state from a recording of them speaking.  

## Dataset
The dataset we used is “The Ryerson Audio-Visual Database of Emotional Speech and Song (RAVDESS)”  
https://zenodo.org/record/1188976  

The database contains 24 professional actors (12 female, 12 male), vocalizing two lexically-matched statements in a neutral North American accent. Speech includes calm, happy, sad, angry, fearful, surprise, and disgust expressions, and song contains calm, happy, sad, angry, and fearful emotions. Each expression is produced at two levels of emotional intensity (normal, strong), with an additional neutral expression. All conditions are available in three modality formats: Audio-only (16bit, 48kHz .wav), Audio-Video (720p H.264, AAC 48kHz, .mp4), and Video-only (no sound). We used only the speech files, not the song files, and only the audio-only recordings, not the videos.

The speech set contains 1440 files: 60 trials per actor x 24 actors = 1440. The label for each file is taken from its filename, which consists of a 7-part numerical identifier (e.g., 02-01-06-01-02-01-12.mp4). These identifiers define the stimulus characteristics, in order: Modality (01 = full-AV, 02 = video-only, 03 = audio-only). Vocal channel (01 = speech, 02 = song). Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised). Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion. Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door"). Repetition (01 = 1st repetition, 02 = 2nd repetition). Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).


## Dependencies

In [1]:
import tensorflow as tf
import librosa.display
import matplotlib.pyplot as plt
import numpy as np
import os
import random
import timeit

## First Attempt - Distinguishing Between Happy and Sad Recordings
At first we tried logistic regression with only two classes - happy and sad.

In [14]:
# Collect the name of every entry in the audio folder and put them in a
# random order, so the train/test split below is not tied to listing order.
filenames = os.listdir('audio')
random.shuffle(filenames)

# Split the dataset: the first 70% of the shuffled files are used for
# training and the remaining 30% for testing.
num_files = len(filenames)
num_train = int(num_files*0.7)
num_test = num_files - num_train

# Each actor contributes 60 speech files, hence the division by 60.
print("Number of files =",num_files,",Number of actors =",num_files // 60)
print("Number of train examples =",num_train,",Number of test examples =",num_test)

Number of files = 600 ,Number of actors = 10
Number of train examples = 420 ,Number of test examples = 180


In [35]:
data_x_train = []
data_x_test = []
data_y_train = []
data_y_test = []

# Maps the 8th character of the filename (the second digit of the emotion
# field) to the binary class label. Following the RAVDESS naming scheme,
# '3' (03) is a happy recording -> label 1 and '4' (04) is a sad recording
# -> label 0. Files with any other emotion are skipped in this experiment.
emotion_to_label = {'3': 1, '4': 0}


def extract_mfcc_features(filename):
    """Return the feature vector of one audio file from the 'audio' folder.

    Loads 2.5 seconds of audio starting 0.5 seconds into the file, resampled
    to 44.1 kHz, computes 13 Mel-frequency cepstral coefficients (MFCCs) and
    averages them along axis 0.

    NOTE(review): librosa documents the MFCC output as (n_mfcc, frames), so
    the mean over axis 0 presumably leaves one value per frame; recordings
    shorter than offset + duration would then give shorter vectors - confirm
    all files are long enough.
    """
    data, sampling_rate = librosa.load("audio/" + filename, sr=22050*2, res_type='kaiser_fast', duration=2.5, offset=0.5)
    return np.mean(librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=13), axis=0)


start_time = timeit.default_timer()

# For each of the training files that is a happy or sad recording, extract
# its features into data_x_train and store the matching label in data_y_train.
for filename in filenames[:num_train]:
    label = emotion_to_label.get(filename[7])
    if label is not None:
        data_x_train.append(extract_mfcc_features(filename))
        data_y_train.append(label)

# Do the same for the testing examples.
for filename in filenames[num_train:]:
    label = emotion_to_label.get(filename[7])
    if label is not None:
        data_x_test.append(extract_mfcc_features(filename))
        data_y_test.append(label)

stop_time = timeit.default_timer()
print('Loading time:', stop_time - start_time, "Seconds")

Loading time: 7.687271296661947 Seconds


In [36]:
# The label placeholder fed to TensorFlow has shape (None, 1), while
# data_y_train / data_y_test are flat lists of shape (None, ).
# Wrapping every label in its own one-element list gives the required shape,
# ex: [0, 1, 1, 1, 0] => [[0], [1], [1], [1], [0]]
data_y_train_correct = [[label] for label in data_y_train]
data_y_test_correct = [[label] for label in data_y_test]

In [41]:
# Pair each example's features with its label as a (features, label) tuple.
# Keeping both in one element is necessary so that the examples can be
# shuffled after each training epoch without losing track of their labels.
data_xy_train = list(zip(data_x_train, data_y_train_correct))
data_xy_test = list(zip(data_x_test, data_y_test_correct))

In [45]:
def getXvalues(data_xy):
    """Return a list with the features (first element) of every pair in data_xy."""
    return [pair[0] for pair in data_xy]

def getYvalues(data_xy):
    """Return a list with the label (second element) of every pair in data_xy."""
    return [pair[1] for pair in data_xy]

def logistic_fun(z):
    """Return the logistic sigmoid 1 / (1 + e^(-z)) of z."""
    denominator = 1.0 + np.exp(-z)
    return 1 / denominator

In [53]:
features = len(data_xy_train[0][0])
eps = 1e-12  # added inside the logarithms so a saturated sigmoid never yields log(0)

# Model: a single sigmoid unit, y = sigmoid(x W + b).
x = tf.placeholder(tf.float32, [None, features])
y_ = tf.placeholder(tf.float32, [None, 1])
W = tf.Variable(tf.zeros([features,1]))
b = tf.Variable(tf.zeros([1]))
y = tf.nn.sigmoid(tf.matmul(x,W) + b)

# Binary cross-entropy. Both terms are needed: the first penalises low
# predictions on label-1 examples, the second penalises high predictions on
# label-0 examples. With only the first term, predicting 1 for everything
# would minimise the loss.
loss = -tf.reduce_mean(y_*tf.log(y + eps) + (1 - y_)*tf.log(1 - y + eps))
update = tf.train.GradientDescentOptimizer(0.00001).minimize(loss)

sess = tf.Session()
sess.run(tf.global_variables_initializer())

loss_history_train = []
loss_history_test = []
accuracy_history = []

start_time = timeit.default_timer()

# The test examples are never shuffled, so their feed and labels are built once.
test_feed = {x: getXvalues(data_xy_test), y_: getYvalues(data_xy_test)}
test_labels = np.ravel(getYvalues(data_xy_test))

for i in range(0,10):
    train_feed = {x: getXvalues(data_xy_train), y_: getYvalues(data_xy_train)}
    sess.run(update, feed_dict=train_feed) #BGD
    # The training loss is recorded after the update step of this epoch.
    train_loss = sess.run(loss, feed_dict=train_feed)
    loss_history_train.append(train_loss)
    random.shuffle(data_xy_train)

    if i%10 == 0:
        # Evaluate on the TEST examples: one loss value and one prediction
        # per example, obtained from the same graph used for training.
        test_loss, test_pred = sess.run([loss, y], feed_dict=test_feed)
        loss_history_test.append(test_loss)
        test_pred = np.ravel(test_pred)

        # A prediction is right when it falls on the label's side of 0.5;
        # a prediction of exactly 0.5 counts as wrong for either label.
        right = int(np.sum((test_labels == 0) & (test_pred < 0.5))
                    + np.sum((test_labels == 1) & (test_pred > 0.5)))
        # Divide by the number of evaluated examples, not by num_test, which
        # also counts the test files of emotions that were skipped.
        accuracy_history.append(right/len(data_xy_test))

stop_time = timeit.default_timer()
print('runtime: ', stop_time - start_time)

runtime:  0.14309279376493578
