# **RAVDESS LECTURA DE DATOS**

Este notebook contiene una serie de métodos cuyo objetivo es la generación de datos que se usarán en el entrenamiento de las redes.

Proyecto Fin de Máster
<br>
Luisa Sanchez Avivar
    _luisasanavi@gmail.com_

In [1]:
# IMPORT LIBRARIES

# Processing
import librosa
import librosa.display
import numpy as np
import random
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import IPython.display as ipd

# Files
import os
import joblib
import pickle

# Machine Learning
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix
import keras
from keras.utils import np_utils
from tensorflow.keras.utils import to_categorical
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
import tensorflow as tf

# ####### TEST ####### 
# Scipy
from scipy import signal
from scipy.io import wavfile

In [2]:
# Root of the project inside the mounted Google Drive; every other path below
# is concatenated onto it, so it must keep its trailing slash.
GPATH = '/content/drive/My Drive/Master/Asignaturas/2 Cuatrimestre/Proyecto/Code/'

# Locations relative to GPATH.
AUDIO_DATA_PATH = 'data/'
LPATH_DATA = 'SpeechEmotionRecognition/data/procesed/'
LPATH_IMG = 'SpeechEmotionRecognition/data/spectrograms/'

# Name of a single recording kept at hand as an example.
SAMPLE_FILE = '03-01-01-01-01-01-01.wav'

# Maps from the numeric codes found in the file names to readable labels.
EMOTION_MAP = {
  1: 'neutral',
  2: 'calm',
  3: 'happy',
  4: 'sad',
  5: 'angry',
  6: 'fear',
  7: 'disgust',
  8: 'surprise',
}
INTENSITY_MAP = {
  1: 'normal',
  2: 'strong',
}

In [3]:
# Put the foreignSER folder first on the module search path so that modules
# stored there can be imported by later cells.
# NOTE(review): the saved output of the `!ls` cell below reports this directory
# as missing - confirm that it exists before relying on imports from it.
import sys  
sys.path.insert(0, GPATH + 'SpeechEmotionRecognition/data/foreignSER/')



In [8]:
# import processDataSets

# Debug listing of the foreignSER folder that was added to sys.path.
# NOTE(review): the saved output of this cell is "No such file or directory";
# the path is also hard-coded here instead of being derived from GPATH.
!ls '/content/drive/My Drive/Master/Asignaturas/2 Cuatrimestre/Proyecto/Code/SpeechEmotionRecognition/data/foreignSER/'

ls: cannot access '/content/drive/My Drive/Master/Asignaturas/2 Cuatrimestre/Proyecto/Code/SpeechEmotionRecognition/data/foreignSER/': No such file or directory


In [5]:
# Mount Google Drive in the Colab runtime; every path built from GPATH lives
# under this mount point.
# NOTE(review): this cell sits after cells that already read from Drive, so a
# top-to-bottom "Restart & Run All" reaches them before the mount happens.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
# Build the index of the original recordings: one row per audio file with its
# emotion, speaker gender, intensity and full path.
dir_list = sorted(os.listdir(GPATH + AUDIO_DATA_PATH))

print(dir_list)

emotion = []
gender = []
intensity = []
path = []

# Extract the metadata encoded in each audio file name.
for actor_dir in dir_list:
  actor_path = os.path.join(GPATH + AUDIO_DATA_PATH, actor_dir)
  if not os.path.isdir(actor_path):
    # Stray files next to the per-actor folders are not part of the dataset.
    continue
  # os.listdir returns entries in arbitrary order; sorting keeps the row
  # indices (reused later as image file names) stable between runs.
  for filename in sorted(os.listdir(actor_path)):
    stem, extension = os.path.splitext(filename)
    info_vector = stem.split('-')
    # Names are expected to hold seven dash-separated numeric fields; anything
    # else (hidden files, notes, ...) is skipped instead of breaking the parse.
    if extension.lower() != '.wav' or len(info_vector) != 7:
      continue
    n_emotion = int(info_vector[2])
    n_intensity = int(info_vector[3])
    n_gender = int(info_vector[6])
    path.append(os.path.join(actor_path, filename))
    emotion.append(n_emotion)
    intensity.append(n_intensity)
    # The seventh field is mapped to gender by parity: even -> female, odd -> male.
    gender.append('female' if n_gender % 2 == 0 else 'male')

# Build the data frame in a single step and translate the numeric codes.
EnglishSpeech_df = pd.DataFrame({
  'emotion': emotion,
  'gender': gender,
  'intensity': intensity,
  'path': path,
})
EnglishSpeech_df['emotion'] = EnglishSpeech_df['emotion'].map(EMOTION_MAP)
EnglishSpeech_df['intensity'] = EnglishSpeech_df['intensity'].map(INTENSITY_MAP)


print("Size of the dataset: {} \n".format(len(EnglishSpeech_df)))
class_distribution = EnglishSpeech_df['emotion'].value_counts()
print(class_distribution)

['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05', 'Actor_06', 'Actor_07', 'Actor_08', 'Actor_09', 'Actor_10', 'Actor_11', 'Actor_12', 'Actor_13', 'Actor_14', 'Actor_15', 'Actor_16', 'Actor_17', 'Actor_18', 'Actor_19', 'Actor_20', 'Actor_21', 'Actor_22', 'Actor_23', 'Actor_24']
Size of the dataset: 1440 

happy       192
fear        192
surprise    192
calm        192
disgust     192
angry       192
sad         192
neutral      96
Name: emotion, dtype: int64


In [20]:
# Read the files of the directory that holds the augmented data. Each
# sub-directory name is used as the emotion label of the files it contains.
LPATH_AUGMENTED = GPATH  + 'RAVDESS/augmented/'

# NOTE: `dir_list`, `emotion` and `path` deliberately reuse the names of the
# cell that indexes the original recordings, so this cell overwrites them.
dir_list = sorted(os.listdir(LPATH_AUGMENTED))
emotion = []
path = []

for emotion_dir in dir_list:
  emotion_path = os.path.join(LPATH_AUGMENTED, emotion_dir)
  if not os.path.isdir(emotion_path):
    # Ignore stray files stored next to the per-emotion folders.
    continue
  # os.listdir returns entries in arbitrary order; sorting keeps the row
  # indices (used as spectrogram file names) reproducible between runs.
  for filename in sorted(os.listdir(emotion_path)):
    emotion.append(emotion_dir)
    path.append(os.path.join(emotion_path, filename))

RAVDESS_augmented_df = pd.DataFrame({'emotion': emotion, 'path': path})

Unnamed: 0,emotion,path
0,angry,/content/drive/My Drive/Master/Asignaturas/2 C...
1,angry,/content/drive/My Drive/Master/Asignaturas/2 C...
2,angry,/content/drive/My Drive/Master/Asignaturas/2 C...
3,angry,/content/drive/My Drive/Master/Asignaturas/2 C...
4,angry,/content/drive/My Drive/Master/Asignaturas/2 C...


### **OPERACIONES CON TÉCNICAS DE _DATA AUGMENTATION_**

In [None]:
def white_noise(data, noise_factor=0.005):
  '''
  Add Gaussian white noise to an audio signal.

  Arguments
  ---------
    data: np.array
      One-dimensional audio signal.
    noise_factor: float
      Amplitude applied to the standard-normal noise before it is added.
      Defaults to 0.005, the value this function always used.

  Return
  -------
    data_wn: np.array
      Noisy copy of the signal, with the same length as `data`.
  '''
  wn_spectrum = np.random.randn(len(data))
  data_wn = data + noise_factor * wn_spectrum
  return data_wn

def shift_audio_sample(data, f_low = -5, f_high = 5, spec = 1):
  '''
  Circularly shift an audio signal by a random number of samples.

  The offset is drawn uniformly between `f_low` and `f_high`, multiplied by
  `spec` and truncated to an integer; samples pushed past one end of the
  array re-enter at the other end (np.roll). With the default arguments the
  shift is only a handful of samples.

  Arguments
  ---------
    data: np.array
      Audio signal to shift.
    f_low, f_high: float
      Bounds of the uniform distribution the offset is drawn from.
    spec: float
      Scale factor applied to the drawn offset.

  Return
  -------
    np.array with the shifted signal.
  '''
  random_offset = np.random.uniform(low=f_low, high = f_high)
  n_positions = int(random_offset * spec)
  return np.roll(data, n_positions)

def pitch_shift(data, bins_per_octave=12, pitch_pm = 2, sr=16000):
  '''
  Shift the pitch of an audio signal by a random number of steps.

  The number of steps is drawn uniformly from [0, 2 * pitch_pm), so the shift
  is never downwards.

  Arguments
  ---------
    data: np.array
      Audio signal; it is converted to float64 before being processed.
    bins_per_octave: int
      Number of steps per octave handed to librosa.
    pitch_pm: float
      Half of the upper bound of the random number of steps.
    sr: int
      Sampling rate reported to librosa. Defaults to 16000, the value that
      used to be hard-coded.
      NOTE(review): the callers in this notebook load audio with librosa's
      default sampling rate - confirm whether 16000 is intended.

  Return
  -------
    data_pitch: np.array
      Pitch-shifted signal.
  '''
  pitch_change = pitch_pm * 2 * np.random.uniform()
  # `sr` is passed by keyword: recent librosa releases only accept the signal
  # itself as a positional argument.
  data_pitch = librosa.effects.pitch_shift(data.astype('float64'),
                                           sr=sr,
                                           n_steps=pitch_change,
                                           bins_per_octave=bins_per_octave)
  return data_pitch


## **CARACTERÍSTICAS 1D**

In [16]:
def get_features(df, modifier):
  '''
  Extract the features of every audio track listed in a dataframe.

  Arguments
  ---------
    df: DataFrame
      Dataframe whose 'path' column holds the location of each audio track.
    modifier: Function
      Callable that receives one path and returns the features of that file.

  Return
  -------
    data: DataFrame
      Single-column ('data') dataframe with one row per input row, in the same
      order, indexed 0..n-1. Each cell holds whatever `modifier` returned.

  '''
  # Iterate over the values rather than over index labels so that dataframes
  # with a filtered or shuffled index are handled too.
  paths = df['path'].tolist()
  features = [modifier(pathfile) for pathfile in tqdm(paths)]

  # Build the frame once instead of growing it row by row; the explicit object
  # dtype keeps each feature vector inside a single cell.
  data = pd.DataFrame({'data': pd.Series(features, dtype=object)})

  return data

def _mean_mfcc(pathfile, augment=None, n_mfcc=40):
  '''
  Load an audio file, optionally transform the signal, and return the mean of
  each MFCC coefficient over time.

  Arguments
  ---------
    pathfile: str
      Path of the audio file.
    augment: Function or None
      Callable applied to the loaded signal before the MFCC is computed;
      None leaves the signal untouched.
    n_mfcc: int
      Number of MFCC coefficients to compute.

  Return
  -------
    data_features: np.array
      One value per coefficient (length `n_mfcc`).
  '''
  X, sample_rate = librosa.load(pathfile, res_type='kaiser_fast')
  if augment is not None:
    X = augment(X)
  mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc)
  # Averaging the transposed matrix over axis 0 collapses the time axis.
  return np.mean(mfcc.T, axis = 0)


def get_features_single_file(pathfile):
  '''
  Extract the features of a single audio track using MFCC through librosa.

  Arguments
  ---------
    pathfile: str
      Path of the file the features are extracted from.

  Return
  -------
    data_features: np.array with the time-averaged MFCC (40 values).

  '''
  return _mean_mfcc(pathfile)



def get_features_white_noise(pathfile):
  '''
  Extract the MFCC features of a single audio track after adding white noise
  to it with white_noise().

  Arguments
  ---------
    pathfile: str
      Path of the file the features are extracted from.

  Return
  -------
    data_features: np.array with the time-averaged MFCC (40 values).

  '''
  return _mean_mfcc(pathfile, augment=white_noise)


def get_features_shiftted(pathfile):
  '''
  Extract the MFCC features of a single audio track after shifting the signal
  with shift_audio_sample().

  Arguments
  ---------
    pathfile: str
      Path of the file the features are extracted from.

  Return
  -------
    data_features: np.array with the time-averaged MFCC (40 values).

  '''
  return _mean_mfcc(pathfile, augment=shift_audio_sample)


def get_features_pitch(pathfile):
  '''
  Extract the MFCC features of a single audio track after modulating its
  pitch with pitch_shift().

  Arguments
  ---------
    pathfile: str
      Path of the file the features are extracted from.

  Return
  -------
    data_features: np.array with the time-averaged MFCC (40 values).

  '''
  return _mean_mfcc(pathfile, augment=pitch_shift)



## **CARACTERÍSTICAS 2D: ESPECTROGRAMAS**

In [15]:

def get_spectrograms(df, output_path, get_spectrogram):
  '''
  Generate and store one spectrogram per row of a dataframe.

  Arguments
  ---------
  df: DataFrame
    Dataframe with a 'path' column (audio file) and an 'emotion' column.

  output_path: str
    Directory handed to `get_spectrogram` as the place to store its output.

  get_spectrogram: Function
    Callable invoked as get_spectrogram(path, emotion, output_path, index),
    where index is the 0-based position of the row. It is expected to write
    the image itself; its return value is ignored.

  '''
  # Work on the column values so rows are visited by position, regardless of
  # the labels carried by the dataframe index.
  paths = df['path'].tolist()
  emotions = df['emotion'].tolist()

  for index, (pathfile, emotion_name) in enumerate(zip(tqdm(paths), emotions)):
    get_spectrogram(pathfile, emotion_name, output_path, index)



def _save_spectrogram_image(spectrogram, sample_rate, target_dir, filename):
  '''
  Render a 2D feature matrix as a borderless image and write it to disk.

  Arguments
  ---------
  spectrogram: np.array
    Matrix to draw with librosa.display.specshow.

  sample_rate: int
    Sampling rate of the audio the matrix was computed from.

  target_dir: str
    Directory of the image; it is created when it does not exist.

  filename: str
    Name of the image file inside `target_dir`.
  '''
  # exist_ok avoids the check-then-create race of testing for the directory first.
  os.makedirs(target_dir, exist_ok=True)

  fig = plt.figure(figsize=(12,4))
  try:
    # A single axes covering the whole figure, with ticks and frame hidden.
    ax = plt.Axes(fig, [0., 0., 1., 1.])
    ax.set_axis_off()
    fig.add_axes(ax)
    librosa.display.specshow(spectrogram, sr=sample_rate, x_axis='time', y_axis='mel', ax=ax)
    fig.savefig(os.path.join(target_dir, filename), bbox_inches='tight', transparent=True, pad_inches=-0.05)
  finally:
    # Always release the figure: this runs once per audio file, so figures left
    # open after an error would pile up in memory.
    plt.close(fig)


def save_melspectrogram(pathfile, emotionName, output_path, index):
  '''
  Generate a Mel spectrogram image from an audio file and save it as
  <output_path>/<emotionName>/ravdess_mel_<index>.jpg

  Arguments
  ---------
  pathfile: str
    Path of the audio file.

  emotionName: str
    Name of the sub-directory (the emotion label) the image is stored in.

  output_path: str
    Root directory where the generated image is saved.

  index: int
    Number appended to the image file name.
  '''
  X, sample_rate = librosa.load(pathfile, res_type='kaiser_fast')
  features_melspectrogram = librosa.feature.melspectrogram(y=X, sr=sample_rate, n_mels=128, fmax=8000)
  # Convert the power spectrogram to decibels before drawing it.
  melspectrogram_data = librosa.power_to_db(features_melspectrogram)

  _save_spectrogram_image(melspectrogram_data,
                          sample_rate,
                          os.path.join(output_path, emotionName),
                          "ravdess_mel_" + str(index) + ".jpg")


def save_mfccspectrograma(pathfile, emotionName, output_path, index):
  '''
  Generate an MFCC image from an audio file and save it as
  <output_path>/<emotionName>/ravdess_mfccspectrogram_<index>.jpg

  Arguments
  ---------
  pathfile: str
    Path of the audio file.

  emotionName: str
    Name of the sub-directory (the emotion label) the image is stored in.

  output_path: str
    Root directory where the generated image is saved.

  index: int
    Number appended to the image file name.
  '''
  X, sample_rate = librosa.load(pathfile, res_type='kaiser_fast')
  # The signal is passed as `y=`: recent librosa releases reject it as a
  # positional argument.
  features_mfccspectrogram = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=20)

  _save_spectrogram_image(features_mfccspectrogram,
                          sample_rate,
                          os.path.join(output_path, emotionName),
                          "ravdess_mfccspectrogram_" + str(index) + ".jpg")




In [17]:
def concat_features(*features, random_state=None):
  '''
  Combine several feature sets into one shuffled, labelled dataframe.

  Arguments
  ---------
    *features: DataFrame
      Feature sets whose 'data' column holds one feature vector per row, as
      returned by get_features(). Rows are matched by index with the rows of
      the global EnglishSpeech_df.
    random_state: int, RandomState instance or None
      Seed forwarded to the shuffle. None (the default) keeps the previous
      behaviour of a different order on every call; pass an int to make the
      result reproducible.

  Return
  -------
    DataFrame with one column per feature value plus 'gender' and 'emotion',
    holding the rows of every feature set in shuffled order.

  '''
  # Gender and emotion are attached to every set so it can be split later on.
  labels = EnglishSpeech_df[['gender', 'emotion']]

  features_list = []
  for feature in features:
    # Expand each feature vector into one column per value.
    feature_df = pd.DataFrame(feature['data'].values.tolist())
    feature_spec = pd.concat((feature_df, labels), axis = 1)
    # Missing values are replaced by 0 (rows are kept, not dropped).
    feature_spec = feature_spec.fillna(0)
    features_list.append(feature_spec)

  features_complete_df = pd.concat(features_list, ignore_index = True)
  return shuffle(features_complete_df, random_state=random_state)


def read_features():
  '''
  Extract every feature set of EnglishSpeech_df and store each one in a file.

  Four sets are computed, in this order: standard (no data augmentation),
  white noise, shifted signal and pitch modulation. Each set is pickled under
  GPATH + LPATH_DATA as soon as it is computed. Saving is best effort: when a
  file cannot be written the error is reported and the features are still
  returned.

  Return
  -------
    tuple (features_standard, features_wn, features_shiftted, features_pitch)
  '''
  # (extractor, output file, message printed once the file has been written)
  extraction_plan = (
    (get_features_single_file, 'features_standard_RAVDESS.pkl', "Standard features into file"),
    (get_features_white_noise, 'features_wn_RAVDESS.pkl', "White Noise features into file"),
    (get_features_shiftted, 'features_shiftted_RAVDESS.pkl', "Shiftted into file"),
    (get_features_pitch, 'features_pitch_RAVDESS.pkl', "Pitch Tunning features into file"),
  )

  extracted = []
  for extractor, filename, done_message in extraction_plan:
    features = get_features(EnglishSpeech_df, extractor)
    try:
      # The context manager closes the file even when pickling fails.
      with open(GPATH + LPATH_DATA + filename, 'wb') as handle:
        pickle.dump(features, handle)
    except Exception as ex:
      print("Could not save {}: {}".format(filename, ex))
    else:
      # Only announce the file when it really was written.
      print(done_message)
    extracted.append(features)

  return tuple(extracted)
  

### **ESPECTROGRAMAS DE MEL**

In [None]:
# Output folder for the Mel spectrogram images (one sub-folder per emotion is
# created inside it by save_melspectrogram).
melpath = GPATH + LPATH_IMG + 'mel/'
print(melpath)
# Alternative run over the original (non-augmented) recordings, kept disabled:
# get_spectrograms(EnglishSpeech_df, melpath , save_melspectrogram)

# Generate one Mel spectrogram image per row of the augmented dataset.
get_spectrograms(RAVDESS_augmented_df, melpath , save_melspectrogram)

  0%|          | 0/1440 [00:00<?, ?it/s]

/content/drive/My Drive/Master/Asignaturas/2 Cuatrimestre/Proyecto/Code/SpeechEmotionRecognition/data/spectrograms/mel/


100%|██████████| 1440/1440 [11:34<00:00,  2.07it/s]


### **ESPECTROGRAMAS MFCC**

In [22]:
# Alternative run over the original (non-augmented) recordings, kept disabled:
# get_spectrograms(EnglishSpeech_df, GPATH + LPATH_IMG + 'mfcc/', save_mfccspectrograma) 

# Generate one MFCC image per row of the augmented dataset, stored under the
# 'mfcc/' folder of the spectrogram directory.
get_spectrograms(RAVDESS_augmented_df, GPATH + LPATH_IMG + 'mfcc/', save_mfccspectrograma) 



  0%|          | 0/1440 [00:00<?, ?it/s][A[A

  0%|          | 1/1440 [00:01<43:31,  1.81s/it][A[A

  0%|          | 2/1440 [00:02<38:39,  1.61s/it][A[A

  0%|          | 3/1440 [00:04<35:49,  1.50s/it][A[A

  0%|          | 4/1440 [00:05<33:07,  1.38s/it][A[A

  0%|          | 5/1440 [00:06<31:25,  1.31s/it][A[A

  0%|          | 6/1440 [00:07<30:01,  1.26s/it][A[A

  0%|          | 7/1440 [00:08<28:57,  1.21s/it][A[A

  1%|          | 8/1440 [00:10<30:45,  1.29s/it][A[A

  1%|          | 9/1440 [00:11<28:45,  1.21s/it][A[A

  1%|          | 10/1440 [00:12<28:33,  1.20s/it][A[A

  1%|          | 11/1440 [00:13<27:47,  1.17s/it][A[A

  1%|          | 12/1440 [00:14<27:40,  1.16s/it][A[A

  1%|          | 13/1440 [00:15<27:31,  1.16s/it][A[A

  1%|          | 14/1440 [00:16<27:21,  1.15s/it][A[A

  1%|          | 15/1440 [00:17<26:47,  1.13s/it][A[A

  1%|          | 16/1440 [00:19<26:58,  1.14s/it][A[A

  1%|          | 17/1440 [00:20<27:21,  1.15s/i

### **ESPECTROGRAMAS ??**