<a href="https://colab.research.google.com/github/MeidanGR/GooglePlayStore_AppsAnalysisBI/blob/main/EmotionRecognition_DL_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Speech Emotion Recognition algorithm**
*Using Deep Learning (LSTM) model*

B.Sc. Final project by Meidan Greenberg; Linoy Hadad;

Instructor: Dr. Dima Alberg

# UNDER DEVELOPMENT.

# **PACKAGES & GOOGLE AUTH**

In [1]:
%%capture
!pip install soundfile
!pip install noisereduce
!pip install pydub
!pip install pywt

In [2]:
import numpy as np
import librosa
import pywt
import noisereduce as nr
import IPython.display as ipd #Audio player
import os
import pandas as pd
import sklearn
from pydub import AudioSegment, effects


  from tqdm.autonotebook import tqdm


In [28]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


# **LOADING FULL DATA**
The speech emotion audio databases used:

- RAVDASS: https://zenodo.org/record/1188976#.X4sE0tDXKUl
  - 1440 files = 24 actors x 60 trails per actor
  - 8 Emotions (neutral, calm, happy, sad, angry, fearful, disgust, surprised).
-TESS: https://tspace.library.utoronto.ca/handle/1807/24487
  - 2800 files = 2 actors x 200 pharses x 7 emotions
  - 7 Emotions (neutral, happiness, sadness, anger, fear, disgust, pleasant surprise)
   - ('calm' is not a part of this DB).


## **RAVDESS Database**

All of  RAVDESS files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 02-01-06-01-02-01-12.mp4). The format which carry only emotion expressed by speech is taken as: 03-01-X-X-X-X-X.wav, as the 8 emotions are stated in the 3rd part. For additional information see the link above.

 
## **TESS Database**

The TESS Database file name contain the emotion by text, for e.g. "YAF_youth_happy.wav". Therefore a find_emotion function has been executed.

---

# **FEATURE EXTRACTION**
...


In [17]:
#Normalization test for one sample.


print('Original: librosa.load')
x,sr = librosa.load('/content/drive/My Drive/AudioFiles/RAVDESS/Actor_16/03-01-08-02-01-01-16.wav', sr = None, duration = None)
print(np.shape(x))
ipd.display(ipd.Audio(data = x, rate=sr))

from pydub import AudioSegment, effects  
rawsound = AudioSegment.from_file('/content/drive/My Drive/AudioFiles/RAVDESS/Actor_16/03-01-08-02-01-01-16.wav', "wav",duration = None) 
normalizedsound = effects.normalize(rawsound, headroom = 0)  

print('Normalized: AudioSegment.from_file')
samples = normalizedsound.get_array_of_samples()
print(np.shape(samples))
normalizedsound

#ipd.display(ipd.Audio(data = rawsound, rate=sr))
ipd.display(ipd.Audio(data = samples, rate=sr))


Original: librosa.load
(169770,)


Normalized: AudioSegment.from_file
(169770,)


In [4]:
#Padding function in order to length equalization of audio files.
def padding(array, frames):
    h = array.shape[0]

    a = (frames - h) // 2
    aa = frames - a - h

    return np.pad(array, pad_width=(a, aa), mode='constant')

#Emotion kind validation function for TESS database, due to the emotions are written within the file names.
def find_emotion(name): 
        if('neutral' in name): return "01"
        elif('happy' in name): return "03"
        elif('sad' in name): return "04"
        elif('angry' in name): return "05"
        elif('fear' in name): return "06"
        elif('disgust' in name): return "07"
        elif('ps' in name): return "08"
        else: return "-1"


In [None]:
import time
tic = time.perf_counter()

#Initizlizing data lists
audio_data = []
sample_rate = []
mfcc = []
zcr = []
rms = []
dwt = []
pitch = []
emotions = []

filename=[]
frames=[]

#Running over BOTH databases for frames & features extraction (x)
folder_path = '/content/drive/My Drive/AudioFiles'


for subdir, dirs, files in os.walk(folder_path):
  for file in files: 
  # Loading file frames, normalizing to -3 dBFS, and padding all audio files to about 5 secs.
      rawsound = AudioSegment.from_file(os.path.join(subdir,file), "wav",duration = None) 
      normalizedsound = effects.normalize(rawsound, headroom = -3) #-3 dBFS
      normal_x = np.array(normalizedsound.get_array_of_samples(), dtype = float)
      padded_x = padding(normal_x, 253053) #Padding to maximum frame number in the database to get length equalization: 5.2719 * 48000 = 253053

      _, sr = librosa.load(path = os.path.join(subdir,file), sr = None) # sr (the sample rate) is used for librosa's features extraction, _ is irrelevant.

  # Features extraction
      f0 = librosa.yin(padded_x, fmin=20, fmax = 20000, sr = sr, frame_length=2048, win_length=None, hop_length=None) # Pitch
      f1 = librosa.feature.rms(padded_x , S=None, frame_length=2048, hop_length=512, center=True, pad_mode='reflect') # Energy - Root Mean Square
      f2 = librosa.feature.zero_crossing_rate(padded_x , frame_length=2048, hop_length=512,center=True) # ZCR
      f3 = librosa.feature.mfcc(padded_x, sr=sr, S=None, n_mfcc=13,dct_type=2, norm='ortho', lifter=0) # MFCC
      cA, cD = pywt.dwt(padded_x, 'db2', 'sym') # DWT
     

  # Emotion extraction (y)      
      if (find_emotion(file) != "-1"): #TESS database validation
        name = find_emotion(file)
      else: 
        name = file[6:8]               #RAVDESS database validation

  # Filling the data lists for each iteration (each file)
      audio_data.append(padded_x.T) 
      sample_rate.append(sr) 

      filename.append(file)
      frames.append(np.shape(padded_x.T)[0])

      pitch.append(f0)
      rms.append(f1)
      zcr.append(f2)
      mfcc.append(f3)
      dwt.append(cD)
    
      emotions.append(name)  

toc = time.perf_counter()
print(f"Running time: {(toc - tic)/60:0.4f} minutes")

In [30]:
df = pd.DataFrame(columns= ['name', 'frames','x','sample_rate', 'mfccs', 'zcr', 'rms', 'dwt', 'pitch', 'y']) #DataFrame is used to 1) Visualization, 
#                                                                          2) Converting emotion number into a name using 'map' function.

df.name = filename
df.frames = frames
df.x = audio_data
df.sample_rate = sample_rate
df.mfccs = mfcc
df.zcr = zcr
df.rms = rms
df.dwt = dwt
df.pitch = pitch

df.y = emotions
df.y = df.y.map({'01' : 'neutral', '02' : 'calm', '03' : 'happy', '04' : 'sad', '05' : 'angry',
                             '06' : 'fearful', '07' : 'disgust', '08' : 'suprised'})

print(df.info())
df.sample(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4233 entries, 0 to 4232
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   name         4233 non-null   object
 1   frames       4233 non-null   int64 
 2   x            4233 non-null   object
 3   sample_rate  4233 non-null   int64 
 4   mfccs        4233 non-null   object
 5   zcr          4233 non-null   object
 6   rms          4233 non-null   object
 7   dwt          4233 non-null   object
 8   pitch        4233 non-null   object
 9   y            4233 non-null   object
dtypes: int64(2), object(8)
memory usage: 330.8+ KB
None


Unnamed: 0,name,frames,x,sample_rate,mfccs,zcr,rms,dwt,pitch,y
2522,OAF_keg_ps.wav,49348,"[0.0, -6.0, -17.0, -17.0, -17.0, -17.0, -28.0,...",24414,"[[543.2524766508576, 624.0596834034671, 717.95...","[[0.05615234375, 0.17626953125, 0.2978515625, ...","[[212.07310520932398, 330.2676530577132, 704.6...","[3.6742346141747673, -4.665544431833574, -4.44...","[30.512894267567617, 24414.0, 1943.44232881705...",suprised
3001,03-01-04-01-02-01-07.wav,169770,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",48000,"[[524.8260894876497, 518.9104447938847, 501.97...","[[0.0400390625, 0.060546875, 0.0771484375, 0.0...","[[26.472789685826463, 25.73247279703216, 24.28...","[0.0, 0.0, 0.0, 0.0, -25.114071483515776, 6.72...","[60.14594824653552, 60.515631741888356, 77.816...",sad
3558,03-01-07-01-02-02-14.wav,180981,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",48000,"[[537.6236863120865, 537.6236863120865, 537.62...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[24000.0, 24000.0, 24000.0, 24000.0, 24000.0, ...",disgust
2667,OAF_luck_fear.wav,39555,"[0.0, -55.0, -44.0, -65.0, -65.0, -65.0, -65.0...",24414,"[[549.7070601642288, 560.1196578861142, 628.40...","[[0.00048828125, 0.02490234375, 0.11962890625,...","[[720.5046876202906, 589.5293073284981, 574.05...","[33.6804839632687, 6.913784732241904, -2.71759...","[23.865102639296186, 749.5343698326512, 24414....",fearful
2958,03-01-07-01-01-01-05.wav,208208,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",48000,"[[503.27605800983207, 511.0084001344432, 512.4...","[[0.03759765625, 0.076171875, 0.076171875, 0.0...","[[70.68988653685723, 56.12918014082559, 55.667...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[1534.5383591232626, 4039.9907441797, 4014.529...",disgust


In [31]:
dferr = df[df.frames > 48000*5.5]
dferr

Unnamed: 0,name,frames,x,sample_rate,mfccs,zcr,rms,dwt,pitch,y


In [33]:
frameslist=[]
for i in range(4233):
  frameslist.append(np.shape(df.x[i]))

lst2 = [item[0] for item in frameslist]
lst2

np.max(lst2)

#Maximum audio frames length, for padding definition.
np.max(frames)

253053

# AUDIO & EMOTION CHECKS

In [27]:
#Enter a row num between [0-4236]
row = 3828

print('File Name:', df.name[row])
print('Emotion:', df.y[row])
print('Frames:', np.shape(df.x[row])[0])

ipd.display(ipd.Audio(data = df.x[row], rate=df.sample_rate[row]))
print(np.shape(df.x[row]))


Emotion: happy
File Name: 03-01-03-01-02-01-20.wav


(333132,)


# **TRAIN & TEST SETS SPLIT**

In [13]:



from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(df.x, df.y, test_size = 0.3, random_state = 42)
