In [1]:
import glob
import os
import librosa
import numpy as np

### Extracting Features

#### MFCCs
Extract MFCC features from the audio signal.

In [23]:
def extract_mfcc(X, sample_rate, n_mfcc=42):
    """Summarise an audio signal as its time-averaged MFCCs.

    Parameters:
    - X: audio time series, handed to librosa as `y`.
    - sample_rate: sampling rate of X, handed to librosa as `sr`.
    - n_mfcc: number of MFCC coefficients to compute (default 42).

    Returns:
    - 1D array/vector with the mean of every coefficient over the time
      frames (shape: n_mfcc,).
    """
    coefficients = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=n_mfcc)
    # Put the time frames on axis 0, then average them away.
    frames_first = coefficients.T
    return frames_first.mean(axis=0)

#### Chroma
Extract Chroma features from the audio signal.

In [26]:
def extract_chroma(X, sample_rate):
    """Summarise an audio signal as its time-averaged chroma features.

    Parameters:
    - X: audio time series.
    - sample_rate: sampling rate of X, handed to librosa as `sr`.

    Returns:
    - 1D array/vector with the mean of every chroma bin over the time
      frames (shape: 12, with librosa's default number of chroma bins).
    """
    # Magnitude of the STFT (Short-Time Fourier Transform) of the signal.
    magnitude = np.abs(librosa.stft(X))
    chromagram = librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)
    # Put the time frames on axis 0, then average them away.
    return chromagram.T.mean(axis=0)

#### Mel
Extract Mel-spectrogram features from the audio signal.

In [29]:
def extract_mel(X, sample_rate):
    """Summarise an audio signal as its time-averaged Mel-spectrogram.

    Parameters:
    - X: audio time series, handed to librosa as `y`.
    - sample_rate: sampling rate of X, handed to librosa as `sr`.

    Returns:
    - 1D array/vector with the mean of every Mel band over the time
      frames (shape: 128, with librosa's default number of Mel bands).
    """
    spectrogram = librosa.feature.melspectrogram(y=X, sr=sample_rate)
    # Put the time frames on axis 0, then average them away.
    return spectrogram.T.mean(axis=0)

#### Combining the functions
Extract audio features (MFCC, Chroma, Mel-spectrogram) from an audio file.

Parameters:
- file_name: str, path to the audio file.
- mfcc: bool, whether to extract MFCC features (default True).
- chroma: bool, whether to extract Chroma features (default True).
- mel: bool, whether to extract Mel-spectrogram features (default True).    

In [32]:
def extract_features(file_name, mfcc=True, chroma=True, mel=True):
    """Load an audio file and build one concatenated feature vector.

    Parameters:
    - file_name: str, path to the audio file.
    - mfcc: bool, whether to include the MFCC features (default True).
    - chroma: bool, whether to include the Chroma features (default True).
    - mel: bool, whether to include the Mel-spectrogram features (default True).

    Returns:
    - numpy array with the enabled feature groups laid end to end in the
      order MFCC, Chroma, Mel; an empty array when every flag is off.
    """
    signal, rate = librosa.load(os.path.join(file_name), res_type='kaiser_fast')

    # Seed with an empty array so the result is an array even if nothing is enabled.
    pieces = [np.array([])]

    if mfcc:
        pieces.append(extract_mfcc(signal, rate))
    if chroma:
        pieces.append(extract_chroma(signal, rate))
    if mel:
        pieces.append(extract_mel(signal, rate))

    return np.hstack(pieces)  # numpy array, concatenated feature vector.

In [34]:
# Emotion codes and their names. loadData looks these codes up from the third
# dash-separated field of RAVDESS file names; TESS file names carry the emotion
# name directly.
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised',
}

# Emotions that we want to predict. 'neutral' and 'calm' are left out here;
# add them to this list to predict all eight emotions.
observed_emotions = [
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]

### Loading the Data and Feature engineering
    Load and Split the Dataset
    The loadData function below gathers the feature vectors and labels for every audio file. When the data is split, let’s keep 25% of everything as the test set.

In [37]:
# TESS file-name suffixes that differ from the label names used in `observed_emotions`.
_TESS_LABEL_ALIASES = {"ps": "surprised", "fear": "fearful"}


def _ravdess_emotion(file_name):
    """Map a RAVDESS file name to its emotion via the third dash-separated field."""
    return emotions[file_name.split("-")[2]]


def _tess_emotion(file_name):
    """Map a TESS file name to its emotion: text after the last underscore, before the first dot."""
    label = file_name.split("_")[-1].split(".")[0]
    return _TESS_LABEL_ALIASES.get(label, label)


def _collect_samples(pattern, emotion_of, x, y, mfccs, chromas, mels):
    """Append features and labels to x / y for every observed-emotion file matching `pattern`."""
    # glob returns paths in arbitrary, OS-dependent order; sorting keeps the
    # sample order (and any later seeded split) reproducible across machines.
    for file in sorted(glob.glob(pattern)):
        emotion = emotion_of(os.path.basename(file))
        if emotion not in observed_emotions:
            continue  # skip before the (expensive) feature extraction
        x.append(extract_features(file, mfcc=mfccs, chroma=chromas, mel=mels))
        y.append(emotion)


def loadData(speeches = True, songs = True, tess = True, mfccs = True, chromas = False, mels = False):
    """Load features and emotion labels from the RAVDESS and TESS audio files.

    Only files whose emotion is listed in `observed_emotions` are kept. Within
    each dataset the files are processed in sorted path order, so repeated
    runs return the samples in the same order.

    Parameters:
    - speeches: bool, include the RAVDESS speech files (default True).
    - songs: bool, include the RAVDESS song files (default True).
    - tess: bool, include the TESS files (default True).
    - mfccs: bool, extract MFCC features (default True).
    - chromas: bool, extract Chroma features (default False).
    - mels: bool, extract Mel-spectrogram features (default False).

    Returns:
    - dict with "X" (list of feature vectors) and "y" (list of emotion
      names), aligned index by index.

    Raises:
    - KeyError / IndexError if a RAVDESS file name has no known emotion code
      in its third dash-separated field.
    """
    # Initializing the feature and output labels
    x, y = [], []

    # (enabled?, glob pattern, file name -> emotion), in loading order
    sources = (
        (speeches, 'Audio_Speech_Actors_01-24/Actor_*/*.wav', _ravdess_emotion),
        (songs, 'Audio_Song_Actors_01-24/Actor_*/*.wav', _ravdess_emotion),
        (tess, 'TESS_Toronto_emotional_speech_set_data/*/*.wav', _tess_emotion),
    )
    for enabled, pattern, emotion_of in sources:
        if enabled:
            _collect_samples(pattern, emotion_of, x, y, mfccs, chromas, mels)

    return {"X": x, "y": y}