In [1]:
import os

import pandas as pd
import seaborn as sns

from IPython.display import Audio
import librosa
import pyworld as pw
import numpy as np

In [2]:
# Lookup tables that turn the 1-based integer codes parsed from a recording's
# file name into human-readable labels.
emo_dict = dict(enumerate(
    ["neutral", "calm", "happy", "sad", "angry",
     "fearful", "disgust", "surprised"],
    start=1,
))

# Emotional intensity code -> label.
inten_dict = dict(enumerate(['normal', 'strong'], start=1))

# Statement code -> spoken sentence.
stat_dict = dict(enumerate(
    ["Kids are talking by the door", "Dogs are sitting by the door"],
    start=1,
))

In [3]:
def read_RAVDESS_from_dir(data_path='.'):
    """Walk ``data_path`` and build a metadata table of RAVDESS recordings.

    Every non-hidden file name is expected to be seven dash-separated numeric
    fields followed by an extension (e.g. ``03-01-05-01-02-01-16.wav``).
    Fields 3 to 7 are decoded here; the first two fields are ignored.

    Parameters
    ----------
    data_path : str, default '.'
        Directory that is searched recursively with ``os.walk``.

    Returns
    -------
    pandas.DataFrame
        One row per file, in ``os.walk`` traversal order, with the columns
        ``Statement``, ``Path``, ``Emotion``, ``Emotion intensity``,
        ``Gender``, ``Actor`` (the string exactly as it appears in the file
        name) and ``Repetition`` (int). When no file is found the frame is
        empty but still carries these columns.

    Raises
    ------
    IndexError, ValueError, KeyError
        If a non-hidden file name does not follow the seven-field pattern or
        carries a code that is missing from the lookup dictionaries.
    """
    columns = ['Statement', 'Path', 'Emotion',
               'Emotion intensity', 'Gender', 'Actor', 'Repetition']

    # Collect plain dicts and build the frame once at the end: calling
    # pd.concat inside the loop re-copies all previous rows on every file.
    records = []
    for dirname, _, filenames in os.walk(data_path):
        for filename in filenames:
            # Hidden files (.DS_Store, .gitkeep, ...) are never recordings and
            # have no dash-separated fields to parse.
            if filename.startswith('.'):
                continue

            file_path = os.path.join(dirname, filename)
            identifiers = filename.split('.')[0].split('-')

            # Emotion: 01 = neutral, 02 = calm, 03 = happy, 04 = sad,
            # 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised.
            emotion = emo_dict[int(identifiers[2])]

            # Intensity: 01 = normal, 02 = strong
            # (there is no strong intensity for 'neutral').
            emotion_intensity = inten_dict[int(identifiers[3])]

            statement = stat_dict[int(identifiers[4])]
            repetition = int(identifiers[5])
            actor = identifiers[6]

            # Odd numbered actors are male, even numbered actors are female.
            gender = 'female' if int(actor) % 2 == 0 else 'male'

            records.append({"Emotion": emotion,
                            "Emotion intensity": emotion_intensity,
                            "Gender": gender,
                            "Path": file_path,
                            "Statement": statement,
                            "Actor": actor,
                            "Repetition": repetition})

    # `columns=` fixes the column order and keeps the schema for an empty result.
    return pd.DataFrame(records, columns=columns)

In [4]:
# Folder that is scanned recursively for recordings (relative to the
# notebook's working directory).
data_path = './audio_speech_actors_01-24'

# Build the per-file metadata table and preview its first rows.
load_ravdess_df = read_RAVDESS_from_dir(data_path=data_path)
load_ravdess_df.head(n=5)

Unnamed: 0,Statement,Path,Emotion,Emotion intensity,Gender,Actor,Repetition
0,Dogs are sitting by the door,./audio_speech_actors_01-24/Actor_16/03-01-05-...,angry,normal,female,16,1
1,Dogs are sitting by the door,./audio_speech_actors_01-24/Actor_16/03-01-06-...,fearful,normal,female,16,2
2,Kids are talking by the door,./audio_speech_actors_01-24/Actor_16/03-01-06-...,fearful,strong,female,16,2
3,Kids are talking by the door,./audio_speech_actors_01-24/Actor_16/03-01-05-...,angry,strong,female,16,1
4,Kids are talking by the door,./audio_speech_actors_01-24/Actor_16/03-01-07-...,disgust,normal,female,16,1


In [5]:
# Pick one recording by its row label and play it straight from disk.
# NOTE(review): the original note says row 1436 is a "surprised" clip; row
# order follows os.walk, so the label may point elsewhere on another machine.
sample = load_ravdess_df.loc[1436, "Path"]
Audio(sample)

In [6]:
# TODO: confirm these loading settings match the SER part
# (open questions from the original note: offset 0.5 vs 0, sr 48000 vs 16000).
sample_wave, sr = librosa.load(path=sample, sr=16000, offset=0, duration=3)

# Number of samples = duration * sr (3 * 16000 = 48000 here).
print(sample_wave.shape)

Audio(data=sample_wave, rate=sr)

(48000,)


In [7]:
# Show the sampling rate returned by librosa.load (requested as sr=16000).
# NOTE(review): the original note asks whether this matches fs = 16000 used in
# the reference paper — confirm.
sr

16000

In [9]:
# FFT length used for the spectral features.
FFT_SIZE = 1024
# Number of bins per spectral frame derived from FFT_SIZE.
SP_DIM = 1 + FFT_SIZE // 2

# Width of one feature record, laid out as [sp, ap, f0, en, emo_feats, s].
# NOTE(review): the meaning of `en`, `emo_feats` (256 values) and `s` is not
# visible in this notebook section — confirm against the feature writer.
FEAT_DIM = sum((SP_DIM, SP_DIM, 1, 1, 256, 1))

# All features are saved as `float32`, i.e. 4 bytes per value.
RECORD_BYTES = 4 * FEAT_DIM

# Small positive constant.
EPSILON = 1e-10

1285

In [68]:
# WORLD vocoder analysis followed by re-synthesis of the loaded sample.

# dtype: float32 --> double before handing the waveform to pyworld.
sample_wave = sample_wave.astype(np.double)
sample_x = sample_wave

_sample_f0, t = pw.dio(sample_x, sr)             # raw pitch extractor
f0 = pw.stonemask(sample_x, _sample_f0, t, sr)   # pitch refinement

# Use the shared FFT_SIZE constant rather than repeating the literal 1024, so
# the feature width (sp/ap columns) always agrees with SP_DIM.
sp = pw.cheaptrick(sample_x, f0, t, sr, fft_size=FFT_SIZE)  # extract smoothed spectrogram
ap = pw.d4c(sample_x, f0, t, sr, fft_size=FFT_SIZE)         # extract aperiodicity

y = pw.synthesize(f0, sp, ap, sr)  # synthesize an utterance using the parameters

In [69]:
# Length of the re-synthesized waveform; the recorded output (48080,) is
# slightly longer than the 48000-sample input.
y.shape

(48080,)

In [71]:
# Listen to the re-synthesized utterance at the analysis sampling rate.
Audio(data=y, rate=sr)

In [74]:
# Shapes of the extracted features, one per line:
# f0 (pitch track), sp (spectral envelope), ap (aperiodicity).
print(f0.shape, sp.shape, ap.shape, sep="\n")

(601,)
(601, 513)
(601, 513)


In [73]:
# Preview the first five frames of the spectral envelope.
sp[:5]

array([[2.84115332e-08, 2.84226345e-08, 2.84554022e-08, ...,
        4.08028124e-13, 3.98974128e-13, 3.95993278e-13],
       [1.18527917e-08, 1.18529241e-08, 1.18533858e-08, ...,
        3.41284327e-15, 3.28450328e-15, 3.24191459e-15],
       [3.73725027e-09, 3.73768153e-09, 3.73894269e-09, ...,
        7.54835273e-15, 7.27702015e-15, 7.18992226e-15],
       [2.32593897e-09, 2.32597053e-09, 2.32605625e-09, ...,
        3.80731527e-15, 3.79022928e-15, 3.78308760e-15],
       [1.91751817e-09, 1.91851393e-09, 1.92152114e-09, ...,
        5.20591436e-15, 4.95725211e-15, 4.87774413e-15]])

In [72]:
# Display the refined f0 contour returned by pw.stonemask.
# NOTE(review): the leading zeros in the recorded output presumably mark
# unvoiced/silent frames — confirm against the pyworld documentation.
f0

array([  0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.        ,
         0.        ,   0.        ,   0.        ,   0.  