In [17]:

# Standard Python Operations 
import pandas as pd 
import numpy as np
import math
import os
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm, tqdm_pandas

# Audio Extraction and Augmentation tools
from scipy import signal 
import librosa 
import librosa.display 
import IPython.display as ipd

# Machine Learning 
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# Neural Networks 
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.random import set_seed
from keras import regularizers
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.models import load_model
from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import ReduceLROnPlateau

In [13]:
# Setting string representing path for folder containing audio files
Ravdess = '/Users/stephen/Emotion_Detection-/data/RAVDESS/'

# Turn Path string into Path Object 
ravdess_directory_list = os.listdir(Ravdess)
ravdess_directory_list.sort

# Create empty lists to store audio paths and their labels associated with each file 
file_emotion = []
file_path = []
# Since each actor is in there own folder we must iterate through each folder 
for dir in ravdess_directory_list:
    # as their are 24 different actors in our previous directory we need to extract files for each actor.
    actor = os.listdir(Ravdess + dir)
    for file in actor:
        # splitting up file name to decode labels 
        part = file.split('.')[0]
        part = part.split('-')
        # third part in each file represents the emotion associated to that file.
        file_emotion.append(int(part[2]))
        file_path.append(Ravdess + dir + '/' + file)
        
# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# changing integers to actual emotions.
Ravdess_df.Emotions.replace({1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}, inplace=True)
Ravdess_df.head()

Unnamed: 0,Emotions,Path
0,angry,/Users/stephen/Emotion_Detection-/data/RAVDESS...
1,fear,/Users/stephen/Emotion_Detection-/data/RAVDESS...
2,fear,/Users/stephen/Emotion_Detection-/data/RAVDESS...
3,angry,/Users/stephen/Emotion_Detection-/data/RAVDESS...
4,disgust,/Users/stephen/Emotion_Detection-/data/RAVDESS...


In [16]:
def noise(data, SNR=10):   
    """
    Adds Additive White Gausian Noise to signal 
    https://medium.com/analytics-vidhya/adding-noise-to-audio-clips-5d8cee24ccb

    the higher the SNR the lower the noise amplitude is. 
    finds RMS value of signal 
    Finds RMS value of noise 
    Uses population formula for standard deviation

    Args:
        data (np.darray): audio time series 
        SNR (int, optional): signal to noise ratio. Defaults to 10.

    Returns:
        (np.darray) : [Return original signal plus random noise]
    """
    RMS_s = math.sqrt(np.mean(data**2))
    RMS_n = math.sqrt(RMS_s**2/(pow(10,SNR/10)))
    STD_n = RMS_n 
    noise = np.random.normal(0, STD_n, data.shape[0])
    new_data = data + noise
    return new_data

# Time Strectching 
def stretch(data, rate=0.90):
    """
    Stretches audio data using librosa 
    librosa documentation: https://librosa.org/doc/main/generated/librosa.effects.time_stretch.html

    Args:
        data (np.ndarray): audio time series
        rate (float, optional): Stretch factor. If rate > 1, then the signal is sped up. If rate < 1,
                                         then the signal is slowed down.. Defaults to 0.90.

    Returns:
        (np.ndarray): audio time series stretched by the specified rate
    """
    return librosa.effects.time_stretch(data, rate)

def shift(data):
    """
    Randomly shift timing of audio data 

    Args:
        data (np.ndarray): audio time series

    Returns:
        (np.ndarray): audio time series randomly shifted in time
    """
    shift_range = int(np.random.uniform(low=-1, high = 1)*1000)
    return np.roll(data, shift_range)

# Pre Emphasis 
def pre_emphasis(data, coef=1):
    """
    Adds Pre-Emphasis to audio signal 
    info on Pre-Emphasis in speech recognition https://wiki.aalto.fi/display/ITSP/Pre-emphasis
    librosa Pre-Emphasis documentation https://librosa.org/doc/main/generated/librosa.effects.preemphasis.html

    Args:
        data (np.ndarray): audio time series
        coef (int, optional): Pre-emphasis coefficient.. Defaults to 1.

    Returns:
        (np.ndarray): pre-emphasized signal
    """
    return librosa.effects.preemphasis(data, coef=1)

def pitch(data, sr=16000, n_steps=3):
    """
    Apply a pitch shift to an audio time series.
    Librosa pitch shift documentation http://librosa.org/doc/main/generated/librosa.effects.pitch_shift.html
    
    Args:
        data (np.ndarray): audio time series
        sr (int, optional): sample rate. Defaults to 16000.
        n_steps (int, optional): Shift by n_steps semitones. Defaults to 3.

    Returns:
        (np.ndarray): The pitch-shifted audio time-series
    """
    return librosa.effects.pitch_shift(data, sr, n_steps=1, bins_per_octave=24)

In [None]:
# augment data 

def aug_data(data, sample_rate):
    
    

In [None]:
# extract features for 2D CNN

def prepare_data(df, n_mfcc, noise=True, stretch=True, shift=True, pitch=True, pre_emphasis=True, mfcc):
    X = np.empty(shape=(df.shape[0], n_mfcc, 216, 1))
    input_length = sampling_rate * audio_duration

    cnt = 0
    for fname in tqdm(df.path):
        file_path = fname
        data, _ = librosa.load(file_path, sr=sampling_rate
                               ,res_type="kaiser_fast"
                               ,duration=2.5
                               ,offset=0.5
                              )

        # Random offset / Padding
        if len(data) > input_length:
            max_offset = len(data) - input_length
            offset = np.random.randint(max_offset)
            data = data[offset:(input_length+offset)]
        else:
            if input_length > len(data):
                max_offset = input_length - len(data)
                offset = np.random.randint(max_offset)
            else:
                offset = 0
            data = np.pad(data, (offset, int(input_length) - len(data) - offset), "constant")

        # Augmentation? 
        if noise == True:
            