# Data preparation section

In [1]:
from IPython.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
installed_packages = [
    "tensorflow",
    "google",
    "librosa",
]

for package in installed_packages:
    !pip install {package}

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable


In [3]:
import numpy as np
import pandas as pd
import tensorflow as tf
import librosa

In [4]:
import glob

files = glob.glob("../data/raw/RAVDESS/*/*.wav")

In [5]:
file_name_to_emotion_mapping = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

In [6]:
emotion_to_number_mapping = {emotion: int(number_string)-1 for number_string, emotion in file_name_to_emotion_mapping.items()}
emotion_to_number_mapping

{'neutral': 0,
 'calm': 1,
 'happy': 2,
 'sad': 3,
 'angry': 4,
 'fearful': 5,
 'disgust': 6,
 'surprised': 7}

In [7]:
emotion_to_positivity_mapping = {
    "neutral": "positive",
    "calm": "positive",
    "happy": "positive",
    "sad": "negative",
    "angry": "negative",
    "fearful": "negative",
    "disgust": "negative",
    "surprised": "positive",
}

##### Data reading and normalisation

In [9]:
from keras.preprocessing.sequence import pad_sequences

input_data = []
max_length = 0

for file in files:
    data, sampling_rate = librosa.load(file)
    input_data.append(data)
    max_length = max(max_length, len(data))
    
padded_input_data = pad_sequences(input_data, padding="post", dtype="float32")
speech_data = np.array(padded_input_data)
# display(speech_data)
# display(np.shape(speech_data))

In [10]:
normalised_speech_data=(speech_data-speech_data.mean())/speech_data.std()
# normalised_speech_data.dropna(axis='columns', inplace=True)

In [11]:
emotion_labels = [file_name_to_emotion_mapping[file.split("-")[-5]] for file in files]

##### Splitting the data into training, validation and test sets

In [25]:
from sklearn.model_selection import train_test_split

speeches = normalised_speech_data
emotions = emotion_labels

speeches_train_and_val, speeches_test, emotions_train_and_val, emotions_test = train_test_split(speeches, emotions, test_size=0.2, random_state=42, stratify=emotions)
speeches_train, speeches_val, emotions_train, emotions_val = train_test_split(speeches_train_and_val, emotions_train_and_val, test_size=0.25, random_state=42, stratify=emotions_train_and_val)

In [26]:
emotions_binary_train = [emotion_to_positivity_mapping[emotion] for emotion in emotions_train]
emotions_binary_val = [emotion_to_positivity_mapping[emotion] for emotion in emotions_val]
emotions_binary_test = [emotion_to_positivity_mapping[emotion] for emotion in emotions_test]

In [27]:
from collections import Counter

print(f"{len(speeches_train)=}\t\t{len(emotions_train)=}")
print(f"{len(speeches_val)=}\t\t{len(emotions_val)=}")
print(f"{len(speeches_test)=}\t\t{len(emotions_test)=}")

print(f"{Counter(emotions_train)=}")
print(f"{Counter(emotions_val)=}")
print(f"{Counter(emotions_test)=}")

len(speeches_train)=864		len(emotions_train)=864
len(speeches_val)=288		len(emotions_val)=288
len(speeches_test)=288		len(emotions_test)=288
Counter(emotions_train)=Counter({'calm': 116, 'angry': 115, 'fearful': 115, 'sad': 115, 'happy': 115, 'surprised': 115, 'disgust': 115, 'neutral': 58})
Counter(emotions_val)=Counter({'sad': 39, 'disgust': 39, 'angry': 39, 'calm': 38, 'surprised': 38, 'fearful': 38, 'happy': 38, 'neutral': 19})
Counter(emotions_test)=Counter({'happy': 39, 'fearful': 39, 'surprised': 39, 'sad': 38, 'calm': 38, 'angry': 38, 'disgust': 38, 'neutral': 19})


# Classifier comparison section

##### MNIST example

In [15]:
mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()

In [16]:
# x_train = tf.keras.utils.normalize(x_train)
# y_train = tf.keras.utils.normalize(y_train)

In [17]:
ex_model = tf.keras.models.Sequential()
ex_model.add(tf.keras.layers.Flatten())
ex_model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
ex_model.add(tf.keras.layers.Dense(128, activation=tf.nn.relu))
ex_model.add(tf.keras.layers.Dense(10, activation=tf.nn.softmax))

ex_model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

ex_model.fit(x_train, y_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1f490370c10>

In [18]:
ex_val_loss, ex_val_accuracy = ex_model.evaluate(x_test, y_test)
print(f"Loss: {ex_val_loss}, accuracy: {ex_val_accuracy}")

Loss: 0.2002231627702713, accuracy: 0.9470999836921692


##### Emotions

In [28]:
print(speeches_train.shape)

(864, 116247)


In [29]:
# speeches_train = tf.keras.utils.normalize(speeches_train[:, :10_000], axis=1)
# speeches_val = tf.keras.utils.normalize(speeches_val[:, :10_000], axis=1)
# speeches_test = tf.keras.utils.normalize(speeches_test[:, :10_000], axis=1)

In [30]:
emotions_train = np.array([emotion_to_number_mapping[emotion] for emotion in emotions_train])
emotions_val = np.array([emotion_to_number_mapping[emotion] for emotion in emotions_val])
emotions_test = np.array([emotion_to_number_mapping[emotion] for emotion in emotions_test])

In [31]:
speeches_train[0]

array([-5.1960973e-03, -8.5753230e-03, -6.9387625e-03, ...,
        1.2487776e-05,  1.2487776e-05,  1.2487776e-05], dtype=float32)

In [34]:
model = tf.keras.models.Sequential()
model.add(tf.keras.layers.Dense(1000, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(100, activation=tf.nn.relu))
model.add(tf.keras.layers.Dense(8, activation=tf.nn.softmax))

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

model.fit(speeches_train, emotions_train, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1f493505db0>

In [35]:
val_loss, val_accuracy = model.evaluate(speeches_val, emotions_val)
print(f"Loss: {val_loss}, accuracy: {val_accuracy}")

Loss: 2.75772762298584, accuracy: 0.1770833283662796
