In [1]:
import csv
import glob
import os

import librosa
import librosa.display
import matplotlib.pyplot as plt
import noisereduce as nr
import numpy as np
import pandas as pd
from scipy.stats import kurtosis
from tqdm import tqdm

In [2]:
# Root folder of the audio corpus; the .wav files are looked up in its "wav/" subfolder.
AUDIO_DIR = "../../EMO-DB_Dataset/"
# CSV file that receives one row of hand-crafted features per processed audio file.
PREPROCESSED_EXTRACTED_FEATURES_FILE = './traditional/preprocessed_extracted_features_emodb.csv'
# Output folder for the spectrogram PNG images (note the trailing slash).
PREPROCESSED_SPECTROGRAM_IMAGES_DIR = './deep_learning/'

In [3]:
def preprocess_audio(y, sr):
    """Clean up a waveform before feature extraction.

    Two steps are applied in order:

    1. ``noisereduce.reduce_noise`` with a 2048-sample FFT, a hop of 512
       samples, ``prop_decrease=.75`` and ``time_constant_s=1``.
    2. ``librosa.effects.trim`` with ``top_db=30`` to drop leading and
       trailing silence.

    Args:
        y: Audio samples to clean.
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        The denoised, trimmed samples. The trim interval reported by librosa
        is discarded.
    """
    denoised = nr.reduce_noise(
        y=y,
        sr=sr,
        n_fft=2048,
        hop_length=512,
        prop_decrease=.75,
        time_constant_s=1,
    )
    trimmed, _interval = librosa.effects.trim(denoised, top_db=30)
    return trimmed

def spikes(data):
    """Count the values of ``data`` that lie at or above a mean-based threshold.

    The threshold is ``mean + std * 2 / 100``, where mean and standard
    deviation are taken over every element of ``data``.

    Args:
        data: Array-like of numbers with any number of dimensions. It is
            flattened in row-major order before counting, so 1-D and 2-D
            feature matrices give the same result as an element-wise loop.

    Returns:
        int: Number of elements ``>= threshold``; ``0`` for empty input.
    """
    # ravel() flattens any rank; concatenating rows only handled 2-D input.
    flat = np.asarray(data).ravel()
    if flat.size == 0:
        # Avoid NumPy's "mean of empty slice" warning; nothing can be a spike.
        return 0

    # NOTE(review): "* 2 / 100" adds only 2 % of one standard deviation to the
    # mean. The formula is kept as-is; confirm it was not meant to be 2 * std.
    threshold = np.mean(flat) + np.std(flat) * 2 / 100

    # Vectorised comparison instead of a Python-level loop over every sample.
    return int(np.count_nonzero(flat >= threshold))

In [4]:
# Readable emotion name for each single-letter emotion code found in a file name.
# Only the codes listed here are supported.
emotion_label = dict(
    W='anger',
    F='happy',
    N='neutral',
    T='sad',
)

# Numeric class id for the same codes (note: neutral is 3 and sad is 2).
emotion_number = dict(
    W=0,
    F=1,
    N=3,
    T=2,
)

In [5]:
def extract_features(audio_file, emotion, preprocess=True):
    """Compute the hand-crafted feature row for one audio file.

    The file is loaded at 16 kHz, optionally cleaned with
    ``preprocess_audio`` and summarised with statistics over its zero-crossing
    rate, mel spectrogram, chroma, spectral bandwidth, RMS energy and 20 MFCCs.

    Args:
        audio_file: Path of the audio file to analyse.
        emotion: Emotion code; must be a key of ``emotion_label`` and
            ``emotion_number``.
        preprocess: When true, run ``preprocess_audio`` before extracting
            features.

    Returns:
        list[str]: 38 stringified values: file stem, duration before and
        after preprocessing, emotion label, emotion id, then 33 feature values.

    Raises:
        KeyError: If ``emotion`` is not a known emotion code.

    Note:
        The ``thpercentile25_*`` / ``thpercentile75_*`` features use the 25th
        and 75th percentiles (``np.percentile`` takes ``q`` on a 0-100 scale).
        Earlier outputs computed with ``q=0.25`` / ``q=0.75`` differ.
    """
    # Platform-independent file stem; splitting on "\\" only works on Windows.
    file = os.path.splitext(os.path.basename(audio_file))[0]

    y, sr = librosa.load(audio_file, sr=16000)

    bef_prep_duration = librosa.get_duration(y=y, sr=sr)

    if preprocess:
        y = preprocess_audio(y, sr)

    # Measured after preprocessing so the value reflects denoising/trimming;
    # equals the original duration when preprocess is False.
    aft_prep_duration = librosa.get_duration(y=y, sr=sr)

    # n_mfcc is spelled out because coefficients up to index 19 are read below.
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20)
    zcr = librosa.feature.zero_crossing_rate(y=y)
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=127)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    rms = librosa.feature.rms(y=y)

    # Keep this order in sync with the CSV header columns.
    row = [
        # Metadata
        file,
        bef_prep_duration,
        aft_prep_duration,
        emotion_label[emotion],
        emotion_number[emotion],
        # Spectral / energy descriptors
        np.min(zcr),                      # min_zcr
        spikes(zcr),                      # spikes_zcr
        np.var(mel_spect),                # var_mel_spect
        spikes(mel_spect),                # spikes_mel_spect
        np.percentile(chroma_stft, 25),   # thpercentile25_chroma_stft
        spikes(chroma_stft),              # spikes_chroma_stft
        np.mean(spec_bw),                 # mean_spec_bw
        np.max(spec_bw),                  # max_spec_bw
        np.percentile(rms, 25),           # thpercentile25_rms
        # MFCC descriptors (mfcc[k] is coefficient number k + 1)
        np.var(mfcc[0]),                  # var_mfcc1
        np.var(mfcc[2]),                  # var_mfcc3
        np.max(mfcc[4]),                  # max_mfcc5
        np.var(mfcc[4]),                  # var_mfcc5
        np.median(mfcc[4]),               # median_mfcc5
        spikes(mfcc[5]),                  # spikes_mfcc6
        np.percentile(mfcc[6], 75),       # thpercentile75_mfcc7
        np.max(mfcc[6]),                  # max_mfcc7
        np.var(mfcc[7]),                  # var_mfcc8
        np.sum(mfcc[9]),                  # sum_mfcc10
        np.max(mfcc[9]),                  # max_mfcc10
        np.percentile(mfcc[10], 75),      # thpercentile75_mfcc11
        np.max(mfcc[10]),                 # max_mfcc11
        np.sum(mfcc[11]),                 # sum_mfcc12
        kurtosis(mfcc[11]),               # kurtosis_mfcc12
        np.mean(mfcc[12]),                # mean_mfcc13
        np.mean(mfcc[14]),                # mean_mfcc15
        spikes(mfcc[15]),                 # spikes_mfcc16
        kurtosis(mfcc[16]),               # kurtosis_mfcc17
        np.mean(mfcc[16]),                # mean_mfcc17
        kurtosis(mfcc[17]),               # kurtosis_mfcc18
        spikes(mfcc[18]),                 # spikes_mfcc19
        np.mean(mfcc[18]),                # mean_mfcc19
        np.mean(mfcc[19]),                # mean_mfcc20
    ]

    # Format each value on its own. Joining into one string and splitting on
    # whitespace would misalign columns if the file name contains a space.
    return [f'{value}' for value in row]

In [6]:
def extract_dl_features(audio_file, emotion, preprocess=True):
    """Render a spectrogram image of one audio file and save it as a PNG.

    The file is loaded at 16 kHz, optionally cleaned with
    ``preprocess_audio``, converted to a dB-scaled STFT magnitude
    (``n_fft=2048``, ``hop_length=512``, referenced to the maximum) and drawn
    without axes or frame using the ``viridis_r`` colour map. The image is
    written to
    ``PREPROCESSED_SPECTROGRAM_IMAGES_DIR + "<stem>-<emotion id>-spec_img.png"``.

    Args:
        audio_file: Path of the audio file to render.
        emotion: Emotion code; must be a key of ``emotion_number``.
        preprocess: When true, run ``preprocess_audio`` before rendering.

    Returns:
        None. The only result is the PNG written to disk.

    Raises:
        KeyError: If ``emotion`` is not a known emotion code.
    """
    y, sr = librosa.load(audio_file, sr=16000)

    if preprocess:
        y = preprocess_audio(y, sr)

    spec = librosa.amplitude_to_db(np.abs(librosa.stft(y, n_fft=2048, hop_length=512)), ref=np.max)

    # Platform-independent file stem. Splitting on "\\" only works on Windows
    # and would otherwise embed the whole input path in the output name.
    filename = os.path.splitext(os.path.basename(audio_file))[0]
    out_path = f'{PREPROCESSED_SPECTROGRAM_IMAGES_DIR}{filename}-{emotion_number[emotion]}-spec_img.png'

    fig = plt.figure(dpi=100)
    try:
        ax = fig.add_subplot()
        # Hide axes and frame so the saved image contains only the spectrogram.
        ax.axes.get_xaxis().set_visible(False)
        ax.axes.get_yaxis().set_visible(False)
        ax.set_frame_on(False)
        librosa.display.specshow(spec, sr=sr, hop_length=512, ax=ax, cmap="viridis_r")
        fig.savefig(out_path, bbox_inches='tight', pad_inches=0, dpi=100)
    finally:
        # Always release the figure, even if drawing or saving fails; this is
        # called once per file and leaked figures would accumulate in memory.
        fig.clf()
        plt.close(fig)


In [7]:
# Column names of the feature CSV: five metadata columns followed by
# 33 audio descriptors, in the order extract_features() returns its values.
final_headers = [
    # Metadata
    'File',
    'Original_Duration',
    'Duration',
    'Emotion',
    'Emotion_Id',
    # Spectral / energy descriptors
    'min_zcr',
    'spikes_zcr',
    'var_mel_spect',
    'spikes_mel_spect',
    'thpercentile25_chroma_stft',
    'spikes_chroma_stft',
    'mean_spec_bw',
    'max_spec_bw',
    'thpercentile25_rms',
    # MFCC descriptors
    'var_mfcc1',
    'var_mfcc3',
    'max_mfcc5',
    'var_mfcc5',
    'median_mfcc5',
    'spikes_mfcc6',
    'thpercentile75_mfcc7',
    'max_mfcc7',
    'var_mfcc8',
    'sum_mfcc10',
    'max_mfcc10',
    'thpercentile75_mfcc11',
    'max_mfcc11',
    'sum_mfcc12',
    'kurtosis_mfcc12',
    'mean_mfcc13',
    'mean_mfcc15',
    'spikes_mfcc16',
    'kurtosis_mfcc17',
    'mean_mfcc17',
    'kurtosis_mfcc18',
    'spikes_mfcc19',
    'mean_mfcc19',
    'mean_mfcc20',
]

In [8]:
def process_data(audio_dir, headers, proc_feat_dataset, preprocess=True):
    """Extract features and spectrogram images for every supported audio file.

    Every ``*.wav`` file in the ``wav`` subfolder of ``audio_dir`` is visited
    in sorted order. Files whose emotion code is not a key of
    ``emotion_number`` are skipped. For each remaining file, one feature row
    is appended to the CSV and one spectrogram PNG is written by
    ``extract_dl_features``.

    Args:
        audio_dir: Folder containing the ``wav`` subfolder; a trailing path
            separator is optional.
        headers: Column names written as the first CSV row.
        proc_feat_dataset: Path of the CSV file to create (overwritten if it
            already exists).
        preprocess: Forwarded to ``extract_features`` and
            ``extract_dl_features``.

    Returns:
        None.
    """
    # Sorted so that the CSV row order is reproducible between runs.
    wav_paths = sorted(glob.glob(os.path.join(audio_dir, 'wav', '*.wav')))

    # The context manager guarantees the CSV is flushed and closed, even if
    # processing a file raises part-way through.
    with open(proc_feat_dataset, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(headers)

        print("Processing audio files from all subjects:")
        for file_path in tqdm(wav_paths):
            # The emotion code is the sixth character from the end of the
            # path, i.e. the letter before the final character of the stem
            # ("...Xy.wav" -> "X").
            emotion = file_path[-6:-5]

            if emotion not in emotion_number:
                continue

            processed_data = extract_features(file_path, emotion, preprocess=preprocess)
            writer.writerow(processed_data)

            extract_dl_features(file_path, emotion, preprocess=preprocess)

In [9]:
# Run the whole pipeline with preprocessing enabled: writes the feature CSV and
# one spectrogram image per audio file with a supported emotion code.
process_data(AUDIO_DIR, final_headers, PREPROCESSED_EXTRACTED_FEATURES_FILE, preprocess=True)

Processing audio files from all subjects:


100%|██████████| 535/535 [01:09<00:00,  7.65it/s]
