# Speech Emotion Recognition

In [None]:
import pandas as pd
import numpy as np
import tensorflow
import os
import sys

# librosa is a Python library for analyzing audio and music; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.utils import to_categorical

from keras.callbacks import ModelCheckpoint

import warnings
# Unless warning options were supplied explicitly (sys.warnoptions is
# non-empty), ignore every warning for the rest of the session.
# NOTE(review): this also hides FutureWarnings from pandas/seaborn/keras, so
# deprecated calls further down fail silently instead of being reported.
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is ignored regardless of the check above.
warnings.filterwarnings("ignore", category=DeprecationWarning)

## Data Preparation

In [None]:
# Root directories of the two datasets.
# NOTE: `Ravdess` ends with a trailing slash while `Savee` does not, so the
# two cannot be concatenated with file names in the same way.
Savee = "/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL"
Ravdess = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"

##  <center> 1. Ravdess Dataframe </center>

In [None]:
# Build a (Emotions, Path) dataframe for every RAVDESS recording.
ravdess_directory_list = os.listdir(Ravdess)

# The third dash-separated field of a RAVDESS file name is the emotion code.
ravdess_emotion_names = {
    1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
    5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise',
}

file_emotion = []
file_path = []
for actor_dir in ravdess_directory_list:
    # The dataset root holds one sub-directory per actor; collect the files
    # of each actor in turn.
    actor_path = os.path.join(Ravdess, actor_dir)
    for file in os.listdir(actor_path):
        part = file.split('.')[0].split('-')
        file_emotion.append(int(part[2]))
        file_path.append(os.path.join(actor_path, file))

Ravdess_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})

# Change the integer codes to emotion names. The result is assigned back
# instead of calling replace(inplace=True) on `Ravdess_df.Emotions`: that
# chained in-place call does not update the dataframe when pandas
# Copy-on-Write is active, and the warning about it is silenced above.
Ravdess_df['Emotions'] = Ravdess_df['Emotions'].replace(ravdess_emotion_names)
Ravdess_df.head()

##  <center> 2. SAVEE Dataframe </center>


In [None]:
# Build a (Emotions, Path) dataframe for every SAVEE recording.
savee_directory_list = os.listdir(Savee)

# Emotion prefix found in the file name -> emotion label. Any prefix that is
# not listed here is treated as 'surprise'.
savee_emotion_names = {
    'a': 'angry',
    'd': 'disgust',
    'f': 'fear',
    'h': 'happy',
    'n': 'neutral',
    'sa': 'sad',
}

file_emotion = []
file_path = []

for file in savee_directory_list:
    # `Savee` has no trailing slash, so the path must be joined with a
    # separator; plain string concatenation yields ".../ALL<file>".
    file_path.append(os.path.join(Savee, file))
    part = file.split('_')[1]
    # Drop the last six characters of the second '_'-separated field
    # (presumably a two-digit take number plus ".wav" -- confirm against the
    # dataset) to leave the emotion prefix.
    ele = part[:-6]
    file_emotion.append(savee_emotion_names.get(ele, 'surprise'))

Savee_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
Savee_df.head()

In [None]:
# Combine the RAVDESS and SAVEE dataframes into one table of (Emotions, Path).
# ignore_index=True gives the result a fresh 0..n-1 index; without it both
# inputs keep their own labels and the combined index contains duplicates.
data_path = pd.concat([Ravdess_df, Savee_df], axis=0, ignore_index=True)
data_path.to_csv("data_path.csv", index=False)
data_path.head()


## Data Visualisation and Exploration

In [None]:


# Bar chart of how many recordings exist per emotion.
count_fig, count_ax = plt.subplots(figsize=(10, 6))
sns.countplot(x='Emotions', data=data_path, palette='viridis', ax=count_ax)
count_ax.set_title('Count of Emotions', fontsize=16)
count_ax.set_xlabel('Emotions', fontsize=12)
count_ax.set_ylabel('Count', fontsize=12)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=45)  # tilt the category labels so they do not overlap
count_fig.tight_layout()  # fit the axes inside the figure area
plt.show()


In [None]:


def create_waveplot(data, sr, e):
    """Show the waveform of an audio signal.

    data: audio samples, passed to librosa.display.waveshow.
    sr:   sampling rate of `data`.
    e:    emotion name, used only in the plot title.
    """
    _fig, axes = plt.subplots(figsize=(10, 3))
    axes.set_title(f'Waveplot for audio with {e} emotion', size=15)
    librosa.display.waveshow(data, sr=sr, ax=axes)
    plt.show()

def create_spectrogram(data, sr, e):
    """Show a spectrogram of an audio signal using matplotlib's specgram.

    data: audio samples.
    sr:   sampling rate of `data`, passed to specgram as Fs.
    e:    emotion name, used only in the plot title.
    """
    _fig, axes = plt.subplots(figsize=(10, 3))
    axes.specgram(data, Fs=sr)
    axes.set_title(f'Spectrogram for audio with {e} emotion', size=15)
    axes.set_xlabel('Time')
    axes.set_ylabel('Frequency')
    plt.show()

# Example: plot and play the second recording labelled 'fear'.
emotion = 'fear'
matching_paths = data_path.loc[data_path.Emotions == emotion, 'Path']
path = matching_paths.iloc[1]
data, sampling_rate = librosa.load(path)
for draw in (create_waveplot, create_spectrogram):
    draw(data, sampling_rate, emotion)
Audio(path)


In [None]:
# Plot and play the second recording labelled 'angry'.
emotion = 'angry'
matching_paths = data_path.loc[data_path.Emotions == emotion, 'Path']
path = matching_paths.iloc[1]
data, sampling_rate = librosa.load(path)
for draw in (create_waveplot, create_spectrogram):
    draw(data, sampling_rate, emotion)
Audio(path)

In [None]:
# Plot and play the second recording labelled 'sad'.
emotion = 'sad'
matching_paths = data_path.loc[data_path.Emotions == emotion, 'Path']
path = matching_paths.iloc[1]
data, sampling_rate = librosa.load(path)
for draw in (create_waveplot, create_spectrogram):
    draw(data, sampling_rate, emotion)
Audio(path)

In [None]:
# Plot and play the second recording labelled 'happy'.
# `data`, `sampling_rate` and `path` from this cell are reused by the
# augmentation demo cells below.
emotion = 'happy'
matching_paths = data_path.loc[data_path.Emotions == emotion, 'Path']
path = matching_paths.iloc[1]
data, sampling_rate = librosa.load(path)
for draw in (create_waveplot, create_spectrogram):
    draw(data, sampling_rate, emotion)
Audio(path)

## Data Augmentation


In [None]:
# Define augmentation functions
def noise(data):
    """Return `data` with Gaussian noise added.

    The noise amplitude is a random fraction (up to 3.5 %) of the largest
    sample value in `data`. The input array is not modified.
    """
    peak = np.amax(data)
    amplitude = 0.035 * np.random.uniform() * peak
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=0.8):
    """Time-stretch `data` by `rate` using librosa.effects.time_stretch."""
    stretched = librosa.effects.time_stretch(data, rate=rate)
    return stretched

def shift(data):
    """Return `data` circularly shifted by a random number of samples.

    The offset is drawn uniformly from [-5, 5), scaled by 1000 and truncated
    to an integer; samples rolled off one end reappear at the other.
    """
    offset = int(np.random.uniform(low=-5, high=5) * 1000)
    rolled = np.roll(data, offset)
    return rolled

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Pitch-shift `data` with librosa.effects.pitch_shift.

    `pitch_factor` is forwarded as the `n_steps` argument.
    """
    shifted = librosa.effects.pitch_shift(
        data, sr=sampling_rate, n_steps=pitch_factor
    )
    return shifted

#### 1. Simple Audio

In [None]:

# Waveform of the unmodified clip loaded in the last example cell.
wave_fig, wave_ax = plt.subplots(figsize=(14, 4))
librosa.display.waveshow(y=data, sr=sampling_rate, ax=wave_ax)
wave_ax.set_title('Waveform')
wave_ax.set_xlabel('Time (s)')
wave_ax.set_ylabel('Amplitude')
plt.show()

# Play the audio
Audio(path)

#### 2. Noise Injection

In [None]:
# Waveform and playback of the clip after noise injection.
x = noise(data)
wave_fig, wave_ax = plt.subplots(figsize=(14, 4))
librosa.display.waveshow(y=x, sr=sampling_rate, ax=wave_ax)
Audio(x, rate=sampling_rate)

#### 3. Stretching

In [None]:
import librosa.effects

# Waveform and playback of the clip time-stretched at rate 1.5, using the
# stretch() helper defined with the other augmentation functions.
x = stretch(data, rate=1.5)
wave_fig, wave_ax = plt.subplots(figsize=(14, 4))
librosa.display.waveshow(y=x, sr=sampling_rate, ax=wave_ax)
Audio(x, rate=sampling_rate)

#### 4. Shifting

In [None]:
# Waveform and playback of the clip after a random circular shift.
x = shift(data)
wave_fig, wave_ax = plt.subplots(figsize=(14, 4))
librosa.display.waveshow(y=x, sr=sampling_rate, ax=wave_ax)
Audio(x, rate=sampling_rate)

#### 5. Pitch

In [None]:
# Waveform and playback of the clip pitch-shifted by n_steps=7, using the
# pitch() helper defined with the other augmentation functions.
x = pitch(data, sampling_rate, pitch_factor=7)
wave_fig, wave_ax = plt.subplots(figsize=(14, 4))
librosa.display.waveshow(y=x, sr=sampling_rate, ax=wave_ax)
Audio(x, rate=sampling_rate)


## Feature Extraction


In [None]:
def extract_features(data, sampling_rate):
    """Return one flat feature vector for an audio signal.

    Each librosa feature is averaged over its frames, and the averages are
    concatenated in this order: zero-crossing rate, chroma STFT, 13 MFCCs,
    RMS energy, 128-band mel spectrogram.
    """
    def frame_mean(feature_matrix):
        # Average each feature row across all frames.
        return np.mean(feature_matrix.T, axis=0)

    zcr = frame_mean(librosa.feature.zero_crossing_rate(y=data))

    magnitude = np.abs(librosa.stft(data))
    chroma_stft = frame_mean(
        librosa.feature.chroma_stft(S=magnitude, sr=sampling_rate)
    )

    mfcc = frame_mean(
        librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=13)
    )

    rms = frame_mean(librosa.feature.rms(y=data))

    mel = frame_mean(
        librosa.feature.melspectrogram(y=data, sr=sampling_rate, n_mels=128)
    )

    # The leading empty array keeps the result dtype the same as when the
    # pieces are stacked one at a time onto an empty array.
    return np.hstack((np.array([]), zcr, chroma_stft, mfcc, rms, mel))


In [None]:
def get_features(path):
    """Load one audio file and return features for three variants of it.

    The rows of the returned 2-D array are, in order: the original clip, the
    clip with noise added, and the clip stretched then pitch-shifted.
    On any error the problem is printed and None is returned.
    """
    try:
        # Load 2.5 s starting 0.6 s into the file, to skip the silence at the
        # start and end of each recording.
        data, sampling_rate = librosa.load(path, duration=2.5, offset=0.6)

        original_row = extract_features(data, sampling_rate)
        noisy_row = extract_features(noise(data), sampling_rate)
        stretched_pitched = pitch(stretch(data), sampling_rate)
        stretched_pitched_row = extract_features(stretched_pitched, sampling_rate)

        return np.vstack((original_row, noisy_row, stretched_pitched_row))
    except Exception as e:
        print(f"Error processing {path}: {e}")
        return None


In [None]:
# Extract features for every file; each file yields one row per variant
# returned by get_features, and every row gets the file's emotion label.
X, Y = [], []
for path, emotion in zip(data_path.Path, data_path.Emotions):
    features = get_features(path)
    if features is None:
        # get_features already reported the failure; skip this file.
        continue
    X.extend(features)
    Y.extend([emotion] * len(features))

X = np.array(X)
Y = np.array(Y)


In [None]:
# Row counts of features and labels, next to the number of source files.
X.shape[0], Y.shape[0], data_path.Path.shape

In [None]:
# Report the array shapes of the feature matrix and the label vector.
for label, array in (("Features", X), ("Labels", Y)):
    print(f"{label} shape: {array.shape}")


In [None]:
import plotly.express as px
import plotly.graph_objects as go
from yellowbrick.features import PCA as YBPCA

# Interactive histogram of recordings per emotion.
histogram_options = dict(
    x='Emotions',
    title='Count of Emotions',
    labels={'Emotions': 'Emotions'},
    color='Emotions',
    color_discrete_sequence=px.colors.qualitative.Vivid,
)
fig = px.histogram(data_path, **histogram_options)

# Centre the title and name both axes.
layout_options = dict(
    title_text='Count of Emotions',
    title_x=0.5,
    xaxis_title='Emotions',
    yaxis_title='Count',
)
fig.update_layout(**layout_options)

fig.show()


In [None]:
# Zero-pad every feature vector on the right up to the longest vector's
# length, so that all rows have the same length.
feature_lengths = [len(row) for row in X]
max_length = max(feature_lengths)
X_padded = np.array([
    np.pad(row, (0, max_length - row_length), mode='constant')
    for row, row_length in zip(X, feature_lengths)
])

print(f"Padded features shape: {X_padded.shape}")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the string labels as integers and draw a 2-D PCA projection of the
# (unpadded) feature matrix X, coloured by class.
label_encoder = LabelEncoder()
label_encoder.fit(Y)
Y_encoded = label_encoder.transform(Y)
encoded_classes = np.unique(Y_encoded)
visualizer = YBPCA(scale=True, projection=2, classes=encoded_classes)
visualizer.fit_transform(X, Y_encoded)
visualizer.show()


In [None]:
# Box plot of every feature column, with all individual points drawn.
padded_frame = pd.DataFrame(X_padded)
fig = px.box(data_frame=padded_frame, points="all", title="Distribution of Features")

layout_options = dict(
    title_text='Distribution of Features',
    title_x=0.5,
    xaxis_title='Feature Index',
    yaxis_title='Value',
)
fig.update_layout(**layout_options)

fig.show()


In [None]:
# Put features and labels into one table (labels as the last column) and
# save it to features.csv.
Features = pd.DataFrame(X).assign(labels=Y)
Features.to_csv('features.csv', index=False)
Features.head()

## Data Preparation


In [None]:
# Split the table back into a feature matrix (every column except the last)
# and the label vector.
feature_columns = Features.columns[:-1]
X = Features[feature_columns].to_numpy()
Y = Features['labels'].to_numpy()


In [None]:
# Multiclass problem: one-hot encode the emotion labels. `encoder` is kept
# for decoding predictions later on.
encoder = OneHotEncoder()
label_column = np.asarray(Y).reshape(-1, 1)
Y = encoder.fit_transform(label_column).toarray()

In [None]:
# Shuffle and split the rows into train and test sets (fixed seed).
# NOTE(review): get_features emits several augmented rows per recording, and
# this row-level shuffle can put variants of the same recording in both the
# train and the test set, which inflates test accuracy. Consider a
# group-aware split keyed on the source file.
split_options = dict(random_state=0, shuffle=True)
x_train, x_test, y_train, y_test = train_test_split(X, Y, **split_options)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [None]:
# Standardise the features: fit the scaler on the training rows only, then
# apply the same transform to both train and test.
scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [None]:
# Append a trailing axis of size 1 so each sample has the
# (features, 1) shape that the Conv1D input below is declared with.
x_train = x_train[:, :, np.newaxis]
x_test = x_test[:, :, np.newaxis]
tuple(part.shape for part in (x_train, y_train, x_test, y_test))

In [None]:
import pickle
# Save the fitted OneHotEncoder. Despite the "tokenizer" in the file name,
# the object written here is `encoder`.
encoder_bytes = pickle.dumps(encoder, protocol=pickle.HIGHEST_PROTOCOL)
with open('speechmodel_tokenizer.pickle', 'wb') as encoder_file:
    encoder_file.write(encoder_bytes)

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

# Write two more pickle files: the Tokenizer object and the label encoder.
# NOTE(review): `Tokenizer` here is the imported class itself, not a fitted
# tokenizer instance, so textmodel_tokenizer.pickle holds no vocabulary; and
# text_Encoder.pickle stores the same `encoder` that is already saved to
# speechmodel_tokenizer.pickle. Confirm whether either file is still needed.
pickle_targets = (
    ('textmodel_tokenizer.pickle', Tokenizer),
    ('text_Encoder.pickle', encoder),
)
for file_name, payload in pickle_targets:
    with open(file_name, 'wb') as out_file:
        pickle.dump(payload, out_file, protocol=pickle.HIGHEST_PROTOCOL)

## Modelling

In [None]:
from keras.models import Sequential
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
from keras.callbacks import ReduceLROnPlateau
import numpy as np

# Size the output layer from the one-hot label matrix instead of hard-coding
# the number of emotion classes.
num_classes = y_train.shape[1]

# 1-D CNN: four Conv1D + MaxPooling1D stages, then a small dense head.
model = Sequential([
    Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu',
           input_shape=(x_train.shape[1], 1)),
    MaxPooling1D(pool_size=5, strides=2, padding='same'),

    Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'),
    MaxPooling1D(pool_size=5, strides=2, padding='same'),

    Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'),
    MaxPooling1D(pool_size=5, strides=2, padding='same'),
    Dropout(0.2),

    Conv1D(64, kernel_size=5, strides=1, padding='same', activation='relu'),
    MaxPooling1D(pool_size=5, strides=2, padding='same'),

    Flatten(),
    Dense(units=32, activation='relu'),
    Dropout(0.3),

    # One softmax unit per emotion class.
    Dense(units=num_classes, activation='softmax'),
])

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Multiply the learning rate by 0.4 when the training loss has not improved
# for 2 epochs.
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)

# y_train / y_test were already one-hot encoded by the OneHotEncoder above.
# Passing them through to_categorical again would append another axis
# (n, classes) -> (n, classes, 2), which no longer matches the model output,
# so they are used as they are. The *_one_hot names are kept as aliases.
y_train_one_hot = y_train
y_test_one_hot = y_test

# Fit the model.
# NOTE(review): the next cell calls model.fit again on this same model, which
# continues training from these weights and overwrites `history`.
history = model.fit(x_train, y_train_one_hot, batch_size=64, epochs=50, validation_data=(x_test, y_test_one_hot), callbacks=[rlrp])


In [None]:
# Rebuild the learning-rate callback and train for 100 epochs; `history` is
# replaced by the history of this run.
lr_reducer_options = dict(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)
rlrp = ReduceLROnPlateau(**lr_reducer_options)

fit_options = dict(
    batch_size=64,
    epochs=100,
    validation_data=(x_test, y_test),
    callbacks=[rlrp],
)
history = model.fit(x_train, y_train, **fit_options)



In [None]:
# Save the trained model to disk under the name 'Speech_Emotion_model.h5'.
model.save('Speech_Emotion_model.h5')


In [None]:
# Report test accuracy, then plot loss and accuracy curves for the most
# recent training run.
print("Accuracy of our model on test data : " , model.evaluate(x_test,y_test)[1]*100 , "%")

train_acc = history.history['accuracy']
train_loss = history.history['loss']
test_acc = history.history['val_accuracy']
test_loss = history.history['val_loss']

# Take the x-axis from the recorded history instead of a fixed 50: the
# preceding fit runs 100 epochs, and mismatched lengths make plot() fail.
epochs = range(len(train_loss))

fig , ax = plt.subplots(1,2)
fig.set_size_inches(20,6)

# Left panel: loss.
ax[0].plot(epochs , train_loss , label = 'Training Loss')
ax[0].plot(epochs , test_loss , label = 'Testing Loss')
ax[0].set_title('Training & Testing Loss')
ax[0].legend()
ax[0].set_xlabel("Epochs")

# Right panel: accuracy.
ax[1].plot(epochs , train_acc , label = 'Training Accuracy')
ax[1].plot(epochs , test_acc , label = 'Testing Accuracy')
ax[1].set_title('Training & Testing Accuracy')
ax[1].legend()
ax[1].set_xlabel("Epochs")
plt.show()

In [None]:
# Predict on the test set, then turn both the predictions and the one-hot
# test labels back into emotion names.
# NOTE: this overwrites y_test with decoded labels, so the cell should only
# be run once per split.
pred_test = model.predict(x_test)
y_pred, y_test = map(encoder.inverse_transform, (pred_test, y_test))

In [None]:
# Predicted vs. actual labels side by side.
df = pd.DataFrame({
    'Predicted Labels': y_pred.flatten(),
    'Actual Labels': y_test.flatten(),
})

df.head(10)

In [None]:
# Confusion matrix of decoded test labels vs. predictions, as a heatmap.
# categories_ holds one array per encoded column; only one column was
# encoded, so the class names are its first entry.
class_names = encoder.categories_[0]

# Passing labels= fixes the row/column order and keeps the matrix the same
# size as class_names even if some class is absent from y_test and y_pred.
cm = confusion_matrix(np.ravel(y_test), np.ravel(y_pred), labels=class_names)
plt.figure(figsize = (12, 10))
cm = pd.DataFrame(cm , index = class_names , columns = class_names)
sns.heatmap(cm, linecolor='white', cmap='Blues', linewidth=1, annot=True, fmt='')
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.show()

In [None]:
# Print scikit-learn's text report of per-class metrics; y_test and y_pred
# are the decoded label arrays produced by the prediction cell above.
print(classification_report(y_test, y_pred))