# Part 3 of RAVDESS Emotional Speech Recognition project by Anuj Soni (mcsnipe97)

## Modelling, training, testing
---
We will build a baseline CNN1D model.

Firstly, I want to thank the author for their excellent dataset; without it, writing this notebook would not have been possible. 

The link for the dataset:
https://www.kaggle.com/uwrfkaggler/ravdess-emotional-speech-audio

In [None]:
from tensorflow.keras import Sequential, regularizers
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model, model_from_json
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from tensorflow.keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.utils import to_categorical

import sklearn as sk
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from matplotlib.pyplot import specgram
%matplotlib inline
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd

In [None]:
# Load the feature table from pathFeature.csv into a DataFrame.
df = pd.read_csv('pathFeature.csv')

In [None]:
# Hold out 25% of the rows for testing. The 'path', 'labels' and 'source'
# columns are dropped from the inputs; 'labels' is the prediction target.
X_train, X_test, Y_train, Y_test = train_test_split(
    df.drop(columns=['path', 'labels', 'source']),
    df['labels'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

In [None]:
# Standardise each feature column with statistics computed on the training
# split only, so no information from the test split leaks into preprocessing.
Xmean = np.mean(X_train, axis=0)
Xstd = np.std(X_train, axis=0)

X_train = (X_train - Xmean) / Xstd
# The test split has to go through the very same transform; otherwise the
# network is validated on inputs scaled differently from its training data.
X_test = (X_test - Xmean) / Xstd

In [None]:
# Convert both splits to plain NumPy arrays.
X_train = np.array(X_train)
Y_train = np.array(Y_train)
X_test = np.array(X_test)
Y_test = np.array(Y_test)


# One hot encode the dataset
lb = LabelEncoder()
# Learn the label -> integer mapping from the training labels only.
Y_train = to_categorical(lb.fit_transform(Y_train))
# Re-use that mapping for the test labels. Refitting on the test split could
# assign different integers (e.g. when a class is absent from it), and pinning
# num_classes keeps the one-hot width identical to Y_train.
Y_test = to_categorical(lb.transform(Y_test), num_classes=len(lb.classes_))

In [None]:
# Pickle the lb object for future use.
filename = 'labels.p'
# The context manager closes the file even if pickle.dump raises.
with open(filename, 'wb') as outf:
    pickle.dump(lb, outf)

In [None]:
# Show the label names held by the fitted encoder.
print(lb.classes_)

Because we will use a CNN for our baseline model, we need to specify the 3rd dimension, which for us is 1. It's 1 because we're building a 1D CNN and not a 2D CNN. If we used the MFCC data in its entirety, we could feed that through as the input data, thus making the network a 2D CNN.

In [None]:
# Insert a new axis of length 1 at position 2 of both splits.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
X_train.shape

In [None]:
# Size of X_train's second axis; the model cell uses it as the first entry of input_shape.
X_train.shape[1]

# Modelling

In [None]:
# Baseline 1D CNN: stacked Conv1D layers (256 -> 128 -> 64 filters, kernel size 8,
# 'same' padding) with two pooling steps, followed by a softmax classifier.
model = Sequential()

# Stage 1: two 256-filter convolutions, then batch-norm, dropout and pooling.
model.add(Conv1D(256, 8, padding='same', input_shape=(X_train.shape[1], 1)))  # X_train.shape[1] = No. of columns (features)
model.add(Activation('relu'))

model.add(Conv1D(256, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=8))

# Stage 2: four 128-filter convolutions; only the last one is followed by
# batch-norm, dropout and pooling.
model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))

model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))

model.add(Conv1D(128, 8, padding='same'))
model.add(Activation('relu'))

model.add(Conv1D(128, 8, padding='same'))
model.add(BatchNormalization())
model.add(Activation('relu'))
model.add(Dropout(0.25))
model.add(MaxPooling1D(pool_size=8))

# Stage 3: two 64-filter convolutions.
model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))

model.add(Conv1D(64, 8, padding='same'))
model.add(Activation('relu'))

# Classifier head: one output unit per one-hot column of Y_train, so the layer
# always matches the number of target classes instead of a hard-coded 14.
model.add(Flatten())
model.add(Dense(Y_train.shape[1]))  # Target Class
model.add(Activation('softmax'))

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` is kept to preserve the original training schedule, but
# it may be rejected by newer Keras optimizers - confirm against the installed version.
opt = keras.optimizers.RMSprop(learning_rate=0.00001, decay=1e-6)
model.summary()


In [None]:
# Compile with the optimizer from the model cell and categorical cross-entropy,
# then train for 100 epochs. The test split is passed as validation data, so the
# 'val_*' entries of the returned history are computed on that split.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
modelHistory = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=100,
    batch_size=16,
)

In [None]:
# Draw loss and accuracy for both splits on one shared axis, one curve per
# history key. The plotting order must match the legend entries below.
plt.figure(figsize=(12, 6))
curves = modelHistory.history
for key in ('loss', 'accuracy', 'val_loss', 'val_accuracy'):
    plt.plot(curves[key])
plt.title('Model Loss Graph')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend(['train loss', 'train acc', 'test loss', 'test acc'])
plt.show()

In [None]:
# Save the trained model under ./models/, creating the directory if it is missing.
modelName = 'Emotion.h5'
saveDir = os.path.join(os.getcwd(), 'models')
os.makedirs(saveDir, exist_ok=True)
modelPath = os.path.join(saveDir, modelName)
model.save(modelPath)

print(f'Saved trained model and weights at {modelPath}')

In [None]:
# Write the model's JSON description (the output of model.to_json()) to model.json.
modelJson = model.to_json()
with open('model.json', 'w') as jFile:
    jFile.write(modelJson)

In [None]:
# Rebuild the model from its JSON description, then load the weights from the
# saved model file. The context manager closes model.json even if read() raises.
with open('model.json', 'r') as jFile:
    modelJson = jFile.read()

loadedModel = model_from_json(modelJson)

loadedModel.load_weights('models/Emotion.h5')
print('Model Loaded!')

In [None]:
## Keras Optimizer

# The reloaded model has to be compiled before evaluate() can be called.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
# NOTE(review): `decay` may be rejected by newer Keras optimizers - confirm
# against the installed version.
opt = keras.optimizers.RMSprop(learning_rate=0.00001, decay=1e-6)
loadedModel.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
score = loadedModel.evaluate(X_test, Y_test, verbose=0)

# score[1] is the value of the metric named by metrics_names[1]; shown as a percentage.
print(f'{loadedModel.metrics_names[1]}: {score[1]*100}')

In [None]:
# Run the reloaded model on the test split and keep, for every sample, the
# index of the highest-scoring output.
classScores = loadedModel.predict(X_test, batch_size=16, verbose=1)

preds = np.argmax(classScores, axis=1)
preds

In [None]:
# Map the predicted class indices back to their label names.
preds = preds.astype(int).flatten()
preds = lb.inverse_transform(preds)
preds = pd.DataFrame({'predictedvalues': preds})

# Actual labels: undo the one-hot encoding, then decode with the same encoder.
# The test targets are stored in `Y_test`; `y_test` was never defined.
actual = Y_test.argmax(axis=1)
actual = actual.astype(int).flatten()
actual = lb.inverse_transform(actual)
actual = pd.DataFrame({'actualvalues': actual})

# Let's combine both of them into a single dataframe
finaldf = actual.join(preds)
finaldf[170:180]

In [None]:
# Persist the predictions frame. It is named `finaldf`; `finalDF` does not
# exist until Predictions.csv is read back in a later cell.
finaldf.to_csv('Predictions.csv', index=True)
# Number of test samples assigned to each predicted label.
finaldf.groupby('predictedvalues').count()

# Getting the Confusion Matrix

In [None]:
def buildConfusionMatrix(confusion_matrix, class_names, figsize=(12,12), fontsize=12):
    """Draw a confusion matrix, as returned by sklearn.metrics.confusion_matrix, as a heatmap.

    The heatmap is drawn on a newly created matplotlib figure, which is left as
    the current figure.

    Arguments
    ---------
    confusion_matrix: numpy array
        The numpy array object returned from a call to sklearn.metrics.confusion_matrix.
        Inside this function the parameter name shadows the sklearn function
        of the same name.
    class_names: list
        An ordered list of class names, in the order they index the given confusion matrix.
    figsize: tuple
        A 2-long tuple, the first value determining the horizontal size of the outputted figure,
        the second determining the vertical size. Defaults to (12,12).
    fontsize: int
        Font size for axes labels. Defaults to 12.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If seaborn raises ValueError while drawing the heatmap with the
        integer ("d") annotation format.
    """
    df = pd.DataFrame(confusion_matrix, index=class_names, columns=class_names)
    plt.figure(figsize=figsize)
    try:
        heatmap = sns.heatmap(df, annot=True, fmt="d")
    except ValueError as err:
        # Keep the original exception attached for easier debugging.
        raise ValueError("Confusion matrix values must be integers.") from err

    heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(), rotation=0, ha='right', fontsize=fontsize)
    heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(), rotation=45, ha='right', fontsize=fontsize)
    plt.ylabel('True Labels')
    plt.xlabel('Predicted Labels')


def gender(row):
    """Return the gender prefix of a '<gender>_<emotion>' label.

    The previous condition chained bare string literals with ``or``
    (``row == 'female_happy' or 'female_fear' ...``), which is always truthy,
    so every label was reported as 'female'.

    Arguments
    ---------
    row: str
        A label such as 'male_happy' or 'female_sad'.

    Returns
    -------
    str or None
        'female' or 'male' when the label starts with that prefix followed by
        an underscore, otherwise None.
    """
    if not isinstance(row, str):
        return None
    prefix, separator, _ = row.partition('_')
    if separator and prefix in ('female', 'male'):
        return prefix
    return None

In [None]:
# Reload the saved predictions. The columns were written as 'actualvalues' and
# 'predictedvalues' (all lower-case), so they must be read back under those names.
finalDF = pd.read_csv('Predictions.csv')
classes = finalDF.actualvalues.unique()
classes.sort()

# Confusion Matrix
# Passing labels=classes pins the row/column order to `classes`, so the matrix
# always has the shape and ordering that buildConfusionMatrix labels it with.
cMatrix = confusion_matrix(finalDF.actualvalues, finalDF.predictedvalues, labels=classes)

print(accuracy_score(finalDF.actualvalues, finalDF.predictedvalues))

buildConfusionMatrix(cMatrix, class_names=classes)

In [None]:
# Per-class precision/recall report, with the sorted distinct actual labels
# used as the display names.
classes = np.sort(finaldf.actualvalues.unique())
report = classification_report(finaldf.actualvalues, finaldf.predictedvalues, target_names=classes)
print(report)

## My thoughts
---

The gender separation turns out to be a crucial implementation detail for accurately classifying emotions. Upon closer inspection of the confusion matrix, it seems that females tend to express emotions in a more obvious manner, for lack of a better word, whilst males tend to be very placid or subtle. This is probably why the error rate amongst males is really high. For example, male happy and male angry get mixed up quite often.