#  <center> Speech Emotion Recognition <center>

# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning) 



## Data Preparation

In [2]:
# Paths for data.
dataset_path = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"

### Feature Extraction

In [3]:
# Adding 3 types of data augmentation techniques
def noise(data):
    noise_amp = 0.035*np.random.uniform()*np.amax(data)
    data = data + noise_amp*np.random.normal(size=data.shape[0])
    return data

def stretch(audio_data):
    return librosa.effects.time_stretch(audio_data, rate=0.8)

def shift(data):
    shift_range = int(np.random.uniform(low=-5, high = 5)*1000)
    return np.roll(data, shift_range)

In [4]:
# Function to Load data
def load_data(file_path):
    audio, sampling_rate = librosa.load(file_path, duration=2.5, offset=0.6)
    return audio

In [5]:
# Function to preprocess the data
def input_label_define(dataset_path):
    y=[]
    X=[]
    gender=[]
    actors = []
    for folder in os.listdir(dataset_path):
        for file in os.listdir(os.path.join(dataset_path, folder)):
            if file.endswith('.wav'): 
                emotion = file.split('-')[2]
                if int(emotion) == 1:
                    label = 'Neutral'
                elif int(emotion) == 2:
                    label = 'Calm'
                elif int(emotion) == 3:
                    label = 'Happy'
                elif int(emotion) == 4:
                    label = 'Sad'
                elif int(emotion) == 5:
                    label = 'Angry'
                elif int(emotion) == 6:
                    label = 'Fearful'
                elif int(emotion) == 7:
                    label = 'Disgust'
                elif int(emotion) == 8:
                    label = 'Surprised'
                else:
                    label = 'UNK'
                actor = int(file.split('-')[-1].split('.')[0])
                         
                file_path = os.path.join(dataset_path, folder, file)
                audio_array=load_data(file_path)
                
                # gender
                gender_value=file_path.split("-")[-1].split(".")[0]
                if (int(gender_value)%2)==0:
                    gender_feature=0
                else:
                    gender_feature=1
                
                X.append(load_data(file_path))
                y.append(label)
                gender.append(gender_feature)
                actors.append(actor)
                
                # data with noise
                X.append(noise(audio_array))
                y.append(label)
                gender.append(gender_feature)
                actors.append(actor)
                
                # Data with shift
                X.append(shift(audio_array))
                y.append(label)
                gender.append(gender_feature)
                actors.append(actor)
                
                # data with stretching and pitching
                X.append(stretch(audio_array))
                y.append(label)
                gender.append(gender_feature)
                actors.append(actor)
            
    return X,y,gender, actors

In [6]:
X_r_aug, y_r_aug, gender, actors= input_label_define(dataset_path)

In [7]:
np.array(X_r_aug).shape,np.array(y_r_aug).shape,np.array(y_r_aug).shape

((5760,), (5760,), (5760,))

In [None]:
# Saving the raw input data
# import pickle

# open a file in binary write mode
# with open('RAVDESS_raw_aug.pkl', 'wb') as f:
    # write the array to the file using pickle.dump()
#     pickle.dump(np.array(X_r_aug), f)

### Extract Accoustic features

In [None]:
def acoustic_features(data):
    # ZCR
    result = np.array([])
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=data).T, axis=0)
    result=np.hstack((result, zcr)) # stacking horizontally

    # Chroma_stft
    stft = np.abs(librosa.stft(data))
    chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
    result = np.hstack((result, chroma_stft)) # stacking horizontally

    # MFCC
    mfcc = np.mean(librosa.feature.mfcc(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mfcc)) # stacking horizontally

    # Root Mean Square Value
    rms = np.mean(librosa.feature.rms(y=data).T, axis=0)
    result = np.hstack((result, rms)) # stacking horizontally

    # MelSpectogram
    mel = np.mean(librosa.feature.melspectrogram(y=data, sr=sample_rate).T, axis=0)
    result = np.hstack((result, mel)) # stacking horizontally
    
    return result

In [None]:
sampling_rate=22050

In [None]:
X_ac = []
for i in X_r_aug:
    X_ac.append(acoustic_features(i))

In [None]:
y_ac=y_r_aug
np.array(X_ac).shape,np.array(y_ac).shape

In [None]:
# Accoustic features saved
# np.save('RAVDESS_Aug_Acoustic.npy',np.array(X_ac))
# np.save('RAVDESS_Aug_Acoustic_emotion.npy',np.array(y_ac))

### Extract Statistical Features

In [None]:
import os, glob
import librosa
import numpy as np
import scipy

# Function to extract features from each audio file
def statistical_features(audio):
    mean = np.mean(audio)
    variance = np.var(audio)
    skewness = scipy.stats.skew(audio)
    kurtosis = scipy.stats.kurtosis(audio)
    audio_rms = librosa.feature.rms(y=audio)
    audio_rms_mean = audio_rms.mean()
    spectral_centroids = librosa.feature.spectral_centroid(y=audio, sr=sampling_rate)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=audio, sr=sampling_rate)[0]
    mfccs = librosa.feature.mfcc(y=audio, sr=sampling_rate, n_mfcc=13)
    mfcc = mfccs.mean()
    return [mean, variance, skewness, kurtosis, audio_rms_mean, spectral_centroids.mean(), spectral_bandwidth.mean(), mfcc]
  

In [None]:
X_stat = []
for i in X_r_aug:
    X_stat.append(statistical_features(i))
np.array(X_stat).shape

In [None]:
# Saving statistical features
# np.save('RAVDESS_Aug_Stat.npy',np.array(X_stat))

## Skip Preprocessing

In [8]:
X_stat = np.load('/kaggle/input/ravdess-revathi-npy/RAVDESS_Aug_Stat.npy')
X_ac = np.load('/kaggle/input/ravdess-revathi-npy/RAVDESS_Aug_Acoustic.npy')
y_ac = np.load('/kaggle/input/ravdess-revathi-npy/RAVDESS_Augmented_emotion.npy')

In [9]:
Features = pd.DataFrame(X_ac)
Features['labels'] = y_ac
Features.to_csv('features.csv', index=False)
Features.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,153,154,155,156,157,158,159,160,161,labels
0,0.237382,0.682273,0.670519,0.614793,0.579039,0.566803,0.637206,0.667451,0.663883,0.669903,...,0.000234,0.000235,0.000138,0.000128,0.000299,0.00036,0.000275,0.000173,2.896729e-05,Surprised
1,0.331642,0.727662,0.734769,0.731634,0.710158,0.717446,0.743283,0.713234,0.700561,0.683346,...,0.001221,0.001236,0.001138,0.001159,0.001292,0.001365,0.001208,0.001139,0.001052731,Surprised
2,0.330164,0.730809,0.734809,0.727598,0.707369,0.71733,0.74534,0.724385,0.699645,0.686211,...,0.001076,0.001074,0.000989,0.001015,0.001213,0.001241,0.001193,0.001121,0.0008812787,Surprised
3,0.329427,0.728282,0.734317,0.737076,0.720081,0.723116,0.740559,0.714165,0.689678,0.682365,...,0.001137,0.001108,0.001044,0.001099,0.001215,0.001272,0.001126,0.001112,0.0009334161,Surprised
4,0.256113,0.621621,0.580278,0.598515,0.628296,0.598015,0.597272,0.650352,0.698636,0.681863,...,1.8e-05,1.6e-05,1.3e-05,1.2e-05,7e-06,8e-06,8e-06,5e-06,5.802853e-07,Neutral


In [10]:
Features.shape

(5760, 163)

In [11]:
# Combining both statistic and deep features and gender to create a common dataframe
stat_col_index=len(Features.columns)
stat_features=8
X_stat_df = pd.DataFrame(X_stat,columns=list(range(stat_col_index,stat_col_index+stat_features)))

gender_col_index=len(Features.columns)+stat_features
X_gender_df=pd.DataFrame(gender,columns=list(range(gender_col_index,gender_col_index+1)))
# stacking horizontally for getting all combinations
X_ac_stat_nogen = pd.concat([Features, X_stat_df], axis=1)

X_ac_stat = pd.concat([Features, X_stat_df,X_gender_df], axis=1)

In [12]:
np.array(X_ac_stat).shape

(5760, 172)

## Data Preparation

In [13]:
X = X_ac_stat.drop('labels',axis=1).values
Y = X_ac_stat['labels'].values

In [14]:
# As this is a multiclass classification problem onehotencoding our Y.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

In [15]:
# splitting data
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2,random_state=0, shuffle=True,stratify=Y)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4608, 171), (4608, 8), (1152, 171), (1152, 8))

In [16]:
# Normalizing our data with sklearn's Standard scaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4608, 171), (4608, 8), (1152, 171), (1152, 8))

In [17]:
# making our data compatible to model.
x_train = np.expand_dims(x_train, axis=2)
x_test = np.expand_dims(x_test, axis=2)
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((4608, 171, 1), (4608, 8), (1152, 171, 1), (1152, 8))

In [18]:
# Acoustic feature segregation for deep feature extraction
x_train_ac = x_train[:, :162]
x_test_ac=x_test[:, :162]

x_train_stat = x_train[:, 162:170]
x_test_stat=x_test[:, 162:170]

x_train_gen=x_train[:,-1]
x_test_gen=x_test[:,-1]

In [19]:
x_train_ac.shape,x_train_stat.shape,x_train_gen.shape

((4608, 162, 1), (4608, 8, 1), (4608, 1))

In [20]:
x_train_stat.shape,x_test_stat.shape,x_test_gen.shape

((4608, 8, 1), (1152, 8, 1), (1152, 1))

In [21]:
x_train_gen.shape,x_test_gen.shape

((4608, 1), (1152, 1))

## Modelling

In [22]:
model=Sequential()
model.add(Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=(x_train_ac.shape[1], 1)))
model.add(MaxPooling1D(pool_size=5, strides = 2, padding = 'same'))

model.add(Flatten())
model.add(Dense(units=32, activation='relu'))
model.add(Dropout(0.3))

model.add(Dense(units=8, activation='softmax'))
model.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d (Conv1D)             (None, 162, 256)          1536      
                                                                 
 max_pooling1d (MaxPooling1D  (None, 81, 256)          0         
 )                                                               
                                                                 
 flatten (Flatten)           (None, 20736)             0         
                                                                 
 dense (Dense)               (None, 32)                663584    
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 8)                 264       
                                                        

In [23]:
model.input

<KerasTensor: shape=(None, 162, 1) dtype=float32 (created by layer 'conv1d_input')>

In [24]:
x_train_ac.shape,y_train.shape

((4608, 162, 1), (4608, 8))

In [25]:
x_test_ac.shape,y_test.shape

((1152, 162, 1), (1152, 8))

In [26]:
rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=2, min_lr=0.0000001)
history=model.fit(x_train_ac, y_train, batch_size=64, epochs=50, validation_data=(x_test_ac, y_test), callbacks=[rlrp])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [27]:
# predicting on test data.
pred_test = model.predict(x_test_ac)
# y_pred = encoder.inverse_transform(pred_test)

# y_test = encoder.inverse_transform(y_test)
# from sklearn.metrics import accuracy_score
# accuracy = accuracy_score(y_test, pred_test)



In [28]:
model.evaluate(x_test_ac,y_test)



[0.5902487635612488, 0.7881944179534912]

### Getting the features from the Dense layer of conv1d model

In [29]:
model.layers[-3]

<keras.layers.core.dense.Dense at 0x730a5eb0da20>

In [30]:
from tensorflow.keras.models import Model

conv1d_feature_model=Model(inputs=model.input,outputs=model.layers[-3].output)

In [31]:
conv1d_features=conv1d_feature_model.predict(x_train_ac)



In [32]:
conv1d_features_test=conv1d_feature_model.predict(x_test_ac)



### Concatenating with statistical features

In [None]:
x_train_gen.shape,y_train.shape

In [None]:
x_train_stat.shape

In [None]:
# Getting back old shape of xtrain and xtest
x_train_stat = np.squeeze(x_train_stat)

# x_train_gen = np.squeeze(x_train_gen)
x_train_stat.shape,y_train.shape,x_train_gen.shape

In [None]:
conv1d_features.shape,x_train_stat.shape,x_train_gen.shape

In [None]:
x_train_concat=np.concatenate((conv1d_features,x_train_stat,x_train_gen),axis=1)

In [None]:
x_train_concat.shape

In [None]:
x_test_stat = np.squeeze(x_test_stat)

In [None]:
conv1d_features_test.shape,x_test_stat.shape,x_test_gen.shape

In [None]:
x_test_concat=np.concatenate((conv1d_features_test,x_test_stat,x_test_gen),axis=1)

In [None]:
x_test_concat.shape

## 1) Output of extracted features to Machine Learning Classification model

### Classification using Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [None]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)

In [None]:
rfc.fit(x_train_concat, y_train)

In [None]:
y_pred = rfc.predict(x_test_concat)

In [None]:
y_pred.shape,y_test.shape

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [None]:
accuracy

### Just based on Deep Learning Features

In [None]:
rfc.fit(conv1d_features, y_train)

In [None]:
y_pred = rfc.predict(conv1d_features_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [None]:
accuracy

### Just based on Statistical Features

In [None]:
rfc.fit(x_train_stat, y_train)

In [None]:
y_pred = rfc.predict(x_test_stat)

In [None]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)

In [None]:
accuracy

## 2) Extracted features to Deep Learning model 

### Just Based on Deep Features

In [None]:
pd.DataFrame(conv1d_features)

In [None]:
conv1d_features.shape,y_train.shape

In [None]:
model_classify = Sequential()
model_classify.add(Dense(12, input_shape=(32,), activation='relu'))
model_classify.add(Dense(8, activation='softmax'))
model_classify.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

In [None]:
model_classify.fit(conv1d_features,y_train,validation_split=0.2,epochs=50)

### Based on Statistic and Deep Features

In [None]:
x_test_stat.shape

In [None]:
stat_deep.shape

In [None]:
model_classify = Sequential()
model_classify.add(Dense(12, input_shape=(40,), activation='relu'))
model_classify.add(Dense(8, activation='softmax'))
model_classify.compile(optimizer = 'adam' , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model_classify.fit(stat_deep,y_train,validation_split=0.2,epochs=50)

## Cross Validation

In [33]:
# Experimenting by reducing the learning rate
from keras.optimizers import Adam

opt = keras.optimizers.Adam(learning_rate=0.0001)

In [34]:
x_train_stat = np.squeeze(x_train_stat)
x_test_stat = np.squeeze(x_test_stat)
stat_deep=np.concatenate((conv1d_features,x_train_stat,x_train_gen),axis=1)
stat_deep_test = np.concatenate((conv1d_features_test,x_test_stat,x_test_gen),axis=1)

In [35]:
X_final = np.concatenate((stat_deep, stat_deep_test), axis=0)
y_final = np.concatenate((y_train, y_test), axis=0)

In [36]:
final_df = pd.DataFrame({'X':list(X_final), 'y':list(y_final), 'actor':actors})
final_df.head()

Unnamed: 0,X,y,actor
0,"[4.397921562194824, 0.0, 0.0, 0.0, 0.0, 0.0, 0...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",2
1,"[0.8107146620750427, 0.0, 0.0, 0.6349496841430...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0]",2
2,"[0.0, 0.0, 0.0, 3.413447618484497, 0.0, 0.0, 0...","[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2
3,"[0.0, 0.0, 0.0, 0.24801909923553467, 0.0, 0.0,...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]",2
4,"[0.0, 0.0, 5.0113701820373535, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0]",2


In [37]:
def create_classification_model():
    model = Sequential()
    model.add(Dense(12, input_shape=(41,), activation='relu'))
    model.add(Dense(8, activation='softmax'))
    return model

In [38]:
def train_evaluate(model, X_train, y_train, X_val, y_val):
    
    model.compile(optimizer = opt , loss = 'categorical_crossentropy' , metrics = ['accuracy'])
    model.fit(X_train, y_train, epochs=200, validation_data=(X_val, y_val), verbose=0)
    model.evaluate(X_val, y_val)
    
    return model.evaluate(X_val, y_val)

In [39]:
actors_in_fold = []
train_folds = []
test_fold = []
for i in range(6):
    actors_in_fold.append(list(range((4*i)+1, (4*i)+5)))
    all_folds = list(range(6))
    all_folds.remove(i)
    train_folds.append(all_folds)
    test_fold.append(i)

In [40]:
print('actors_in_fold')
print(actors_in_fold)
print('train_folds')
print(train_folds)
print('test_folds')
print(test_fold)

actors_in_fold
[[1, 2, 3, 4], [5, 6, 7, 8], [9, 10, 11, 12], [13, 14, 15, 16], [17, 18, 19, 20], [21, 22, 23, 24]]
train_folds
[[1, 2, 3, 4, 5], [0, 2, 3, 4, 5], [0, 1, 3, 4, 5], [0, 1, 2, 4, 5], [0, 1, 2, 3, 5], [0, 1, 2, 3, 4]]
test_folds
[0, 1, 2, 3, 4, 5]


In [42]:
def cross_valiadate_actors(df):
    for i in range(len(train_folds)):
        print('Fold '+str(i+1)+'/'+str(6))
        train_fold_actors = []
        for j in train_folds[i]:
            train_fold_actors.extend(actors_in_fold[j])
        test_fold_actors = actors_in_fold[i]
        train_df = df[df['actor'].isin(train_fold_actors)]
        test_df = df[df['actor'].isin(test_fold_actors)]
        model = create_classification_model()
        X_train = np.stack(train_df['X'].tolist())
        y_train = np.stack(train_df['y'].tolist())
        X_val = np.stack(test_df['X'].tolist())
        y_val = np.stack(test_df['y'].tolist())
        train_evaluate(model, X_train, y_train, X_val, y_val)
        print('Done')

In [43]:
cross_valiadate_actors(final_df)

Fold 1/6
Done
Fold 2/6
Done
Fold 3/6
Done
Fold 4/6
Done
Fold 5/6
Done
Fold 6/6
Done


## Modeling

In [None]:
model_classify = Sequential()
model_classify.add(Dense(12, input_shape=(40,), activation='relu'))
model_classify.add(Dense(8, activation='softmax'))
model_classify.compile(optimizer = opt , loss = 'categorical_crossentropy' , metrics = ['accuracy'])

model_classify.fit(stat_deep,y_train,validation_split=0.2,epochs=200)