In [2]:
import soundfile # to read audio file
import numpy as np
import librosa # to extract speech features
import glob
import os
import pickle # to save model after training
from sklearn.model_selection import train_test_split # for splitting training and testing
from sklearn.neural_network import MLPClassifier # multi-layer perceptron model
from sklearn.metrics import accuracy_score # to measure how good we are
import pandas as pd
from tqdm import tqdm

In [3]:
df= pd.read_csv("EMOVO/data.csv")
files_list=[]
for  (index ,path) in zip(df.index,df.file_name):
  files_list.append('EMOVO/EMOVO_dataset/'+path)

In [4]:
#add white noise to the original signal
def noise_addition(data,noise_percentage_factor=0.035):
    noise = np.random.normal(0, data.std(), data.size)
    augmented_data = data + noise * noise_percentage_factor
    return augmented_data

#lower the pitch of the original signal
def pitch_scaling(data, sr, num_semitones=-2):
    return librosa.effects.pitch_shift(y = data, sr = sr, n_steps = num_semitones)

#increase the pitch of the original signal
def pitch_scaling2(data, sr, num_semitones=2):
    return librosa.effects.pitch_shift(y = data, sr = sr, n_steps = num_semitones)

In [5]:
#mfccs and deltas extraction
import math

def extract_embeddings(path):
    y, sr = librosa.load(path)
    duration=librosa.get_duration(y=y,sr=sr)
    #reduce the duration of files longer than 3 seconds
    if(duration > 3.0):
        y,sr=librosa.load(path, offset=(duration/2)-1.5 , duration = 3)
    else:
        y,sr=librosa.load(path)
    return y

def extract_embeddings_with_noise(path):
    y, sr = librosa.load(path)
    duration=librosa.get_duration(y=y,sr=sr)
    #reduce the duration of files longer than 3 seconds
    if(duration > 3.0):
        y,sr=librosa.load(path, offset=(duration/2)-1.5 , duration = 3)
        y=noise_addition(y)
    else:
        y,sr=librosa.load(path)
        y=noise_addition(y)
    
    return y


def extract_embeddings_with_pitch_scaling(path):
    y, sr = librosa.load(path)
    duration=librosa.get_duration(y=y,sr=sr)
    #reduce the duration of files longer than 3 seconds
    if(duration > 3.0):
        y,sr=librosa.load(path, offset=(duration/2)-1.5 , duration = 3)
        y=pitch_scaling(y,sr)
    else:
        y,sr=librosa.load(path)
        y=pitch_scaling(y,sr)
    return y

def extract_embeddings_pitch_scaling2(path):
    y, sr = librosa.load(path)
    duration=librosa.get_duration(y=y,sr=sr)
    #reduce the duration of files longer than 3 seconds
    if(duration > 3.0):
        y,sr=librosa.load(path, offset=(duration/2)-1.5 , duration = 3)
        y=pitch_scaling2(y,sr)
    else:
        y,sr=librosa.load(path)
        y=pitch_scaling2(y,sr)
  
    
    return y

def create_padding(length):
  diff = 66150 - length
  if diff % 2 == 0:
    return (int(diff/2), int(diff/2))
  else:
    return (int(diff/2) +1 , int(diff/2))

In [6]:
audio_list=[]
for i in tqdm(files_list):
  y = extract_embeddings(i)
  y= np.pad(y, create_padding(len(y)), 'constant', constant_values=0)
  audio_list.append(y)

audio_list_pitch_shifting1=[]
for i in tqdm(files_list):
    y=extract_embeddings_with_pitch_scaling(i)
    y= np.pad(y, create_padding(len(y)), 'constant', constant_values=0)
    audio_list_pitch_shifting1.append(y)

audio_list_pitch_shifting2=[]
for i in tqdm(files_list):
    y=extract_embeddings_pitch_scaling2(i)
    y= np.pad(y, create_padding(len(y)), 'constant', constant_values=0)
    audio_list_pitch_shifting2.append(y)

audio_list_noise_addition=[]
for i in tqdm(files_list):
    y=extract_embeddings_with_noise(i)
    y= np.pad(y, create_padding(len(y)), 'constant', constant_values=0)
    audio_list_noise_addition.append(y)

100%|██████████| 588/588 [00:11<00:00, 50.79it/s]
100%|██████████| 588/588 [00:27<00:00, 21.04it/s]
100%|██████████| 588/588 [00:19<00:00, 29.79it/s]
100%|██████████| 588/588 [00:06<00:00, 90.84it/s] 


In [7]:
df1 = pd.DataFrame({'data': audio_list, 'label': df["label"]})
df2 = pd.DataFrame({'data': audio_list_pitch_shifting1, 'label': df["label"]})
df3 = pd.DataFrame({'data': audio_list_pitch_shifting2, 'label': df["label"]})
df4 = pd.DataFrame({'data': audio_list_noise_addition, 'label': df["label"]})

df = pd.concat([df1, df2, df3, df4])

In [9]:
df["label"].unique()

array(['surprise', 'neutrality', 'sadness', 'fear', 'disgust', 'joy',
       'anger'], dtype=object)

In [11]:
int2emotion = {
    "sor":"surprise",
    "neu": "neutrality",
    "tri": "sadness",
    "pau": "fear",
    "dis": "disgust",
    "gio": "joy",
    "rab": "anger"
}

def load_data(test_size=0.2):
    X, y = [], []
    for i in tqdm(range(len(df))):
        row = df.iloc[i]
        features = row['data']
        emotion = row['label']
        # add to data
        X.append(features)
        y.append(emotion)
    # split the data to training and testing and return it
    return train_test_split(np.array(X), y, test_size=test_size, random_state=7)

In [12]:
# load RAVDESS dataset, 75% training 25% testing
X_train, X_test, y_train, y_test = load_data(test_size=0.25)

100%|██████████| 2352/2352 [00:00<00:00, 41528.64it/s]


In [13]:
# best model, determined by a grid search
model_params = {
    'alpha': 0.01,
    'batch_size': 256,
    'epsilon': 1e-08,
    'hidden_layer_sizes': (300,),
    'learning_rate': 'adaptive',
    'max_iter': 100,
    'verbose':2,
    'early_stopping' : True,
}
# initialize Multi Layer Perceptron classifier
# with best parameters ( so far )
model = MLPClassifier(**model_params)

In [14]:
X_train[0]

array([-0.00049104, -0.02937746, -0.07408743, ..., -0.01282057,
       -0.01485587, -0.01174409])

In [15]:
model.fit(X_train,y_train)

Iteration 1, loss = 1.85949679
Validation score: 0.355932
Iteration 2, loss = 1.08633344
Validation score: 0.372881
Iteration 3, loss = 0.82390344
Validation score: 0.372881
Iteration 4, loss = 0.64092929
Validation score: 0.361582
Iteration 5, loss = 0.50530869
Validation score: 0.355932
Iteration 6, loss = 0.40237883
Validation score: 0.355932
Iteration 7, loss = 0.32099843
Validation score: 0.361582
Iteration 8, loss = 0.25942557
Validation score: 0.367232
Iteration 9, loss = 0.21409601
Validation score: 0.361582
Iteration 10, loss = 0.17840478
Validation score: 0.361582
Iteration 11, loss = 0.15230092
Validation score: 0.361582
Iteration 12, loss = 0.13173723
Validation score: 0.361582
Iteration 13, loss = 0.11681613
Validation score: 0.361582
Validation score did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.


In [16]:
y_pred = model.predict(X_test)

# calculate the accuracy
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 41.33%
