In [1]:
# todo: check all imports --> put everything on top and label
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract data from the audio files, as we will see later.
! apt-get update
! apt-get install -y libsndfile-dev
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import tensorflow.keras
! pip install np_utils
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 

! pip install pyAudioAnalysis
! pip install eyed3

In [95]:
from keras.layers import Bidirectional
from tensorflow.keras.layers import Dense, LSTM, Flatten, Dropout, BatchNormalization
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from pyAudioAnalysis import audioBasicIO
from pyAudioAnalysis import ShortTermFeatures
import matplotlib.pyplot as plt

In [60]:
## todo: reference repository

# Paths for data.
def preprocess_ravdess_data(ravdess="/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/",
                            output_csv="data_path.csv"):
    """Index the RAVDESS speech recordings and label each file with its emotion.

    The dataset root is expected to contain one sub-directory per actor. Inside
    each of them, the third dash-separated field of a file name is the integer
    emotion code, e.g. ``03-01-05-01-01-01-01.wav`` -> ``5`` -> ``'angry'``.

    Args:
        ravdess: Root directory holding the per-actor sub-directories.
        output_csv: Where the resulting table is written (without the index).
            Pass ``None`` to skip writing.

    Returns:
        pandas.DataFrame with the columns ``Emotions`` (emotion name, or the raw
        integer code if it is not one of the eight known codes) and ``Path``
        (path to the audio file), one row per ``.wav`` file.
    """
    emotion_names = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                     5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}

    file_emotion = []
    file_path = []
    # os.listdir returns entries in arbitrary order; sorting keeps the table
    # (and the CSV written from it) reproducible between runs.
    for actor_dir in sorted(os.listdir(ravdess)):
        actor_path = os.path.join(ravdess, actor_dir)
        if not os.path.isdir(actor_path):
            # Stray files next to the actor folders are not part of the dataset.
            continue
        for file_name in sorted(os.listdir(actor_path)):
            if not file_name.lower().endswith('.wav'):
                continue
            fields = file_name.split('.')[0].split('-')
            # third part in each file name represents the emotion of that file.
            code = int(fields[2])
            # Mapping the code here avoids a chained in-place `replace` on a
            # column accessor, which is not guaranteed to update the frame.
            file_emotion.append(emotion_names.get(code, code))
            file_path.append(os.path.join(actor_path, file_name))

    data_path = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
    if output_csv is not None:
        data_path.to_csv(output_csv, index=False)

    return data_path

In [96]:
def extract__short_term_features(path, n_frames=100): # with py audioanalysis
    """Compute a fixed-width short-term feature matrix for one audio file.

    The file is read and mixed down to mono with pyAudioAnalysis, then
    ``ShortTermFeatures.feature_extraction`` is run with a window of
    ``0.050 * Fs`` samples and a step of ``0.025 * Fs`` samples (50 ms / 25 ms
    if ``Fs`` is the sampling rate in Hz) and ``deltas=False``.

    Args:
        path: Path of the audio file to read.
        n_frames: Number of frame columns in the returned matrix. Longer
            recordings are truncated, shorter ones are zero-padded on the right.

    Returns:
        numpy.ndarray of shape ``(n_features, n_frames)``, where ``n_features``
        is however many feature rows the extractor produced.
    """
    [Fs, data] = audioBasicIO.read_audio_file(path)  # read the wav file
    data = audioBasicIO.stereo_to_mono(data)
    results, feature_names = ShortTermFeatures.feature_extraction(data, Fs, 0.050*Fs, 0.025*Fs, deltas=False)

    n_features, n_available = results.shape
    if n_available >= n_frames:
        return results[:, :n_frames]

    # Size the padding from the extractor output instead of assuming a fixed
    # number of feature rows.
    padded = np.zeros((n_features, n_frames))
    padded[:, :n_available] = results
    return padded


# The rest of the notebook refers to this function with a single underscore;
# keep both spellings bound to the same implementation.
extract_short_term_features = extract__short_term_features


In [97]:
# Get the features of all files using pyAudioAnalysis
def extract_features(data_path, output_csv='features.csv'):
    """Extract short-term features for every file listed in ``data_path``.

    Each row of the per-file feature matrix becomes one row of the returned
    table, labelled with that file's emotion.

    Args:
        data_path: DataFrame with a ``Path`` column (audio file paths) and an
            ``Emotions`` column (label of each file).
        output_csv: Where the feature table is written (without the index).
            Pass ``None`` to skip writing.

    Returns:
        pandas.DataFrame whose leading columns hold the feature values and whose
        last column, ``labels``, holds the emotion of the originating file.
    """
    # NOTE(review): iterating the matrix yields one sample per *feature row*
    # (its values across frames), not one per frame - confirm this is intended.
    X, Y = [], []
    for path, emotion in zip(data_path.Path, data_path.Emotions):
        feature = extract__short_term_features(path)
        X.extend(feature)
        Y.extend([emotion] * len(feature))
    features = pd.DataFrame(X)
    features['labels'] = Y
    if output_csv is not None:
        features.to_csv(output_csv, index=False)
    return features

In [98]:
## todo: two different modes: generate the data or load it from csv files
## Read the data from the csv files - py_audio
def load_data_and_features():
    """Load the precomputed feature table and file index from CSV.

    Returns:
        tuple: ``(features, data_path)`` - the DataFrames read from
        ``features_py_audio.csv`` and ``data_path_py_audio.csv``.
    """
    features_csv = '../input/py-audio-features-and-data/features_py_audio.csv'
    data_path_csv = '../input/py-audio-features-and-data/data_path_py_audio.csv'
    return pd.read_csv(features_csv), pd.read_csv(data_path_csv)

In [99]:
def prepare_training_data(data_available):
    """Build the scaled train/test arrays used to fit the speech model.

    Args:
        data_available: If true, load the precomputed feature table via
            ``load_data_and_features``; otherwise index the RAVDESS files and
            extract the features from the audio.

    Returns:
        tuple: ``(x_train, x_test, y_train, y_test)``. The ``x`` arrays are
        standardised and carry a trailing axis of size 1; the ``y`` arrays are
        one-hot encoded labels.

    Side effects:
        Binds the fitted ``OneHotEncoder`` to the module-level name ``encoder``.
    """
    # The evaluation cells of this notebook read `encoder` at notebook scope to
    # turn predictions back into label names, so it must outlive this call.
    global encoder

    if data_available:
        features, data_path = load_data_and_features()
    else:
        data_path = preprocess_ravdess_data()
        features = extract_features(data_path)

    # Every column except the last holds feature values; the last is the label.
    X = features.iloc[:, :-1].values
    Y = features['labels'].values

    # As this is a multiclass classification problem, one-hot encode Y.
    encoder = OneHotEncoder()
    Y = encoder.fit_transform(np.array(Y).reshape(-1, 1)).toarray()

    # NOTE(review): the split is row-wise, so rows that originate from the same
    # audio file can land on both sides - confirm this is acceptable.
    x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)

    # Fit the scaler on the training rows only and reuse it for the test rows.
    scaler = StandardScaler()
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    # Add a trailing channel axis so the arrays match the model's 3-D input.
    x_train = np.expand_dims(x_train, axis=2)
    x_test = np.expand_dims(x_test, axis=2)
    return x_train, x_test, y_train, y_test

In [100]:
def create_speech_model(input_shape):
    """Assemble the two-branch (bidirectional LSTM + Conv1D) classifier.

    Args:
        input_shape: Length of each input sequence; the model input has shape
            ``(input_shape, 1)``.

    Returns:
        An uncompiled ``keras.Model`` whose output is an 8-unit softmax layer.
    """
    model_input = keras.Input(shape=(input_shape, 1))

    # Low level features: two stacked bidirectional LSTMs with dropout.
    recurrent = layers.Bidirectional(layers.LSTM(256, return_sequences=True))(model_input)
    recurrent = layers.Dropout(0.2)(recurrent)
    recurrent = layers.Bidirectional(layers.LSTM(256))(recurrent)
    recurrent = layers.Dropout(0.2)(recurrent)
    recurrent = layers.Dense(8, activation='relu')(recurrent)

    # High level features: four identical convolutions, then pooling.
    conv = model_input
    for _ in range(4):
        conv = layers.Conv1D(256, kernel_size=3, strides=1, padding='same', activation='relu')(conv)
    conv = layers.MaxPooling1D(pool_size=2, strides=2, padding='same')(conv)
    conv = layers.Dropout(0.2)(conv)
    conv = layers.Dense(8, activation='relu')(conv)
    conv = layers.Flatten()(conv)

    # Concatenate low and high level features.
    merged = layers.concatenate([recurrent, conv], axis=1)
    merged = layers.Flatten()(merged)

    predictions = layers.Dense(8, activation='softmax')(merged)

    return keras.Model(model_input, predictions)

In [None]:
# Training cell: prepare the data, build and compile the model, then fit it.
# True -> prepare_training_data loads the precomputed CSV files instead of
# extracting the features from the audio.
data_available=True
# Number of training epochs passed to model.fit below.
epochs = 500
x_train, x_test, y_train, y_test=prepare_training_data(data_available)
# x_train has a trailing axis of size 1, so shape[1] is the sequence length
# the model input is built with.
model=create_speech_model(x_train.shape[1])
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])
model.summary()
# Learning-rate schedule driven by the *training* loss (monitor='loss').
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)
# NOTE(review): the test split doubles as validation data here, so the
# validation curves are not an independent hold-out estimate.
history=model.fit(x_train, y_train, batch_size=64, epochs=epochs, validation_data=(x_test, y_test), callbacks=[rlrp])

In [None]:
## Just some visualizations - nothing final yet - todo

print("Accuracy of our model on test data : " , model.evaluate(x_test,y_test)[1]*100 , "%")

train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

# Take the x-axis from the recorded history instead of rebinding `epochs`:
# the cell stays re-runnable, `epochs` remains the integer that model.fit
# expects, and the axis still matches if training stopped early.
epoch_axis = range(len(train_loss))

fig, ax = plt.subplots(1, 2, figsize=(20, 6))

# Left panel: loss curves.
ax[0].plot(epoch_axis, train_loss, label='Training Loss')
ax[0].plot(epoch_axis, test_loss, label='Testing Loss')
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

# Right panel: accuracy curves.
ax[1].plot(epoch_axis, train_acc, label='Training Accuracy')
ax[1].plot(epoch_axis, test_acc, label='Testing Accuracy')
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()

In [None]:
# Predict on the test data and turn both predictions and targets back into
# label names.
# NOTE(review): `encoder` must already exist at notebook scope when this cell
# runs - confirm it is defined there on a fresh kernel.
pred_test = model.predict(x_test)
# pred_test holds the model's softmax outputs; presumably inverse_transform
# picks the highest-scoring category per row - confirm against scikit-learn.
y_pred = encoder.inverse_transform(pred_test)

# NOTE(review): this rebinds y_test from one-hot rows to label names, so the
# cell cannot be run twice and earlier cells that expect one-hot targets
# (e.g. model.evaluate) no longer work after it.
y_test = encoder.inverse_transform(y_test)

In [None]:
# Side-by-side table of the decoded predictions and the true labels.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})

df.head(10)

In [None]:
# The encoder was fitted on a single label column, so its class names are the
# first (and only) entry of categories_.
class_names = encoder.categories_[0]

# Passing labels= fixes the row/column order and keeps the matrix square even
# if a class is missing from both the test labels and the predictions.
cm_counts = confusion_matrix(y_test, y_pred, labels=class_names)
cm = pd.DataFrame(cm_counts, index=class_names, columns=class_names)

plt.figure(figsize = (12, 10))
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# Print scikit-learn's text classification report for the decoded test labels
# (y_test) against the decoded predictions (y_pred).
print(classification_report(y_test, y_pred))