In [11]:
import sys
import os
# Add the directory three levels up to sys.path so the Audio_Sentiment_Analysis package can be imported
sys.path.append(os.path.abspath('./../../../'))
import csv
import glob
import librosa
import librosa.display
import numpy as np
import tensorflow as tf
import pickle
import noisereduce as nr
import matplotlib.pyplot as plt
from tqdm import tqdm
from Audio_Sentiment_Analysis.utils.Configuration import Configuration

# Shorthand for tf.data's AUTOTUNE constant.
AUTOTUNE = tf.data.AUTOTUNE

# Directory three levels above the current working directory; resolved once and
# reused for every path that is anchored there.
_REPO_ROOT = os.path.abspath('./../../../')
AUDIO_DIR = f"{_REPO_ROOT}/IEMOCAP_Dataset"
CONFIG_FILE = f"{_REPO_ROOT}/Audio_Sentiment_Analysis/iemocap/config.json"

# CSV outputs (written relative to the current working directory).
EXTRACTED_FEATURES_FILE = 'extracted_features_iemocap.csv'
DENOISED_EXTRACTED_FEATURES_FILE = 'denoised_extracted_features_iemocap.csv'
ALL_EXTRACTED_FEATURES_FILE = 'all_extracted_features_iemocap.csv'
DENOISED_ALL_EXTRACTED_FEATURES_FILE = 'denoised_all_extracted_features_iemocap.csv'
RAW_AUDIO_FILES = 'raw_audio_files.csv'

# Output directories for per-utterance spectrogram artefacts.
AUDIO_SPECTROGRAMS_DIR = './../data/spectrograms/'
AUDIO_MFCCS_DIR = './../data/mfccs/'
AUDIO_MEL_SPECTROGRAMS_DIR = './../data/mel_spectrograms/'
AUDIO_MEL_SPECTROGRAM_IMAGES_DIR = './../data/mel_spectrogram_images/'
AUDIO_MEL_SPECTROGRAM_IMAGES_DIR_2 = './../data/mel_spectrogram_images_2/'

# Project configuration (the extraction functions read config.n_mels from it).
config = Configuration.load_json(CONFIG_FILE)

# Render matplotlib figures at 300 dpi by default.
plt.rcParams['figure.dpi'] = 300

## Extracting the 24 features selected from the eNTERFACE05 dataset study

In [2]:
# Metadata columns that open every CSV row produced in this notebook.
headers_raw = [
    'File', 'Interaction', 'Gender', 'Duration', 'Emotion', 'Emotion_Id',
    'Valence', 'Activation', 'Dominance',
]

# The 24 selected features, as an unordered set.
chosen_features = {
    'std_chroma_stft',
    'mean_zcr', 'min_zcr',
    'var_mel_spect',
    'spikes_spec_cent', 'min_spec_cent',
    'var_spec_bw',
    'mean_spec_cont',
    'var_mfcc1', 'max_mfcc1', 'var_mfcc2', 'var_mfcc3', 'var_mfcc4',
    'max_mfcc5', 'max_mfcc6', 'min_mfcc7', 'var_mfcc9', 'max_mfcc9',
    'max_mfcc10', 'max_mfcc13', 'var_mfcc14', 'var_mfcc15',
    'min_mfcc17', 'min_mfcc19',
}

# Column order for the full feature CSV: metadata first, then the statistics of
# each spectral feature. chroma_stft has no min/max column and rms has no min column.
all_headers = headers_raw + [
    f'{stat}_{feature}'
    for feature, skipped in (
        ('mel_spect', ()),
        ('chroma_stft', ('min', 'max')),
        ('rms', ('min',)),
        ('spec_cent', ()),
        ('spec_bw', ()),
        ('spec_rolloff', ()),
        ('spec_cont', ()),
        ('zcr', ()),
    )
    for stat in ('spikes', 'mean', 'min', 'max', 'var', 'std', 'sum')
    if stat not in skipped
]
# Seven statistics for each of the 20 MFCC coefficients (numbered from 1).
for i in range(1, 21):
    all_headers += [
        f'{stat}_mfcc{i}'
        for stat in ('spikes', 'mean', 'min', 'max', 'var', 'std', 'sum')
    ]

# Column order for the selected-feature CSV (must match the row built by extract_features).
headers = headers_raw + [
    'std_chroma_stft', 'mean_zcr', 'min_zcr',
    'var_mel_spect', 'spikes_spec_cent', 'min_spec_cent',
    'var_spec_bw', 'mean_spec_cont',
    'var_mfcc1', 'max_mfcc1', 'var_mfcc2', 'var_mfcc3', 'var_mfcc4',
    'max_mfcc5', 'max_mfcc6', 'min_mfcc7', 'var_mfcc9', 'max_mfcc9',
    'max_mfcc10', 'max_mfcc13', 'var_mfcc14', 'var_mfcc15',
    'min_mfcc17', 'min_mfcc19',
]

# Three-letter emotion code -> human-readable label ('xxx' and 'oth' share "other").
emotion_label = dict(
    ang="angry", hap="happy", sad="sad", neu="neutral",
    fru="frustrated", exc="excited", fea="fearful",
    sur="surprised", dis="disgusted", xxx="other", oth="other",
)

# Three-letter emotion code -> integer class id ('oth' and 'xxx' share id 9).
emotion_number = dict(
    ang=0, hap=1, sad=2, neu=3, fru=4,
    exc=5, fea=6, sur=7, dis=8, oth=9, xxx=9,
)

In [3]:
def spikes(data):
    """Count the values of ``data`` that lie at or above ``mean + 0.02 * std``.

    Parameters
    ----------
    data : array-like
        Numeric values of any dimensionality. The input is flattened first, so
        the mean, standard deviation and count are taken over all elements.

    Returns
    -------
    int
        Number of elements greater than or equal to the threshold.
    """
    values = np.asarray(data).ravel()
    # The threshold sits 2% of one standard deviation above the mean.
    threshold = np.mean(values) + np.std(values) * 2 / 100
    # A single vectorised comparison replaces the per-element Python loop, which
    # was slow on large feature matrices and failed on inputs of rank > 2.
    return int(np.count_nonzero(values >= threshold))

In [5]:
def extract_features(audio_file, duration, emotion, valence, activation, dominance, writer, noisereduce=False):
    """Compute the selected features of one utterance and write them as one CSV row.

    The row holds, in order: the file stem, "improvised"/"scripted",
    "Male"/"Female", ``duration``, the emotion label and id, ``valence``,
    ``activation``, ``dominance``, and then the feature values in the same order
    as the feature columns of the module-level ``headers`` list.

    Parameters
    ----------
    audio_file : str
        Path of the wav file to analyse; it is loaded at 16 kHz.
    duration : float
        Utterance duration, written to the row unchanged.
    emotion : str
        Emotion code; must be a key of ``emotion_label`` and ``emotion_number``.
    valence, activation, dominance
        Annotation values, written to the row unchanged.
    writer
        Object with a ``writerow`` method (e.g. a ``csv.writer``).
    noisereduce : bool, optional
        When true, the waveform is passed through ``nr.reduce_noise`` before the
        features are computed.

    Raises
    ------
    KeyError
        If ``emotion`` is missing from ``emotion_label`` or ``emotion_number``.
    """
    # Stem of the wav file (no directory, no extension).
    file = os.path.splitext(os.path.basename(audio_file))[0]
    name_parts = file.split("_")
    # The second "_"-separated token starts with "impro" for improvised sessions.
    acting = "improvised" if name_parts[1][:5] == "impro" else "scripted"
    # The last token starts with 'M' for male speakers; anything else counts as female.
    gender = "Male" if name_parts[-1][0] == 'M' else "Female"

    y, sr = librosa.load(audio_file, sr=16000)
    if noisereduce:
        y = nr.reduce_noise(y=y, sr=sr)

    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y=y)
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=config.n_mels)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spec_cont = librosa.feature.spectral_contrast(y=y, sr=sr)
    mfcc = librosa.feature.mfcc(y=y, sr=sr)

    # (statistic, 1-based MFCC coefficient) pairs, in output column order.
    mfcc_columns = [
        (np.var, 1), (np.max, 1), (np.var, 2), (np.var, 3), (np.var, 4),
        (np.max, 5), (np.max, 6), (np.min, 7), (np.var, 9), (np.max, 9),
        (np.max, 10), (np.max, 13), (np.var, 14), (np.var, 15),
        (np.min, 17), (np.min, 19),
    ]

    # Build the row as a list so every value stays in its own column; the
    # previous join-into-a-string-then-split() approach broke on any value that
    # contained whitespace.
    row = [
        file, acting, gender, duration, emotion_label[emotion], emotion_number[emotion],
        valence, activation, dominance,
        np.std(chroma_stft), np.mean(zcr), np.min(zcr),
        np.var(mel_spect), spikes(spec_cent), np.min(spec_cent),
        np.var(spec_bw), np.mean(spec_cont),
    ]
    row.extend(stat(mfcc[coefficient - 1]) for stat, coefficient in mfcc_columns)

    writer.writerow(row)

In [6]:
def extract_all_features(audio_file, duration, emotion, valence, activation, dominance, writer, noisereduce=False):
    """Compute every summary statistic of one utterance and write them as one CSV row.

    The row holds, in order: the file stem, "improvised"/"scripted",
    "Male"/"Female", ``duration``, the emotion label and id, ``valence``,
    ``activation``, ``dominance``, and then the statistics of each feature in the
    same order as the module-level ``all_headers`` list.

    Parameters
    ----------
    audio_file : str
        Path of the wav file to analyse; it is loaded at 16 kHz.
    duration : float
        Utterance duration, written to the row unchanged.
    emotion : str
        Emotion code; must be a key of ``emotion_label`` and ``emotion_number``.
    valence, activation, dominance
        Annotation values, written to the row unchanged.
    writer
        Object with a ``writerow`` method (e.g. a ``csv.writer``).
    noisereduce : bool, optional
        When true, the waveform is passed through ``nr.reduce_noise`` before the
        features are computed.

    Raises
    ------
    KeyError
        If ``emotion`` is missing from ``emotion_label`` or ``emotion_number``.
    """
    # Stem of the wav file (no directory, no extension).
    file = os.path.splitext(os.path.basename(audio_file))[0]
    name_parts = file.split("_")
    # The second "_"-separated token starts with "impro" for improvised sessions.
    acting = "improvised" if name_parts[1][:5] == "impro" else "scripted"
    # The last token starts with 'M' for male speakers; anything else counts as female.
    gender = "Male" if name_parts[-1][0] == 'M' else "Female"

    y, sr = librosa.load(audio_file, sr=16000)
    if noisereduce:
        y = nr.reduce_noise(y=y, sr=sr)

    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=config.n_mels)
    rms = librosa.feature.rms(y=y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_cont = librosa.feature.spectral_contrast(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y=y)

    # Statistics written per feature, in column order. chroma_stft has no
    # min/max column and rms has no min column.
    all_stats = (spikes, np.mean, np.min, np.max, np.var, np.std, np.sum)
    chroma_stats = (spikes, np.mean, np.var, np.std, np.sum)
    rms_stats = (spikes, np.mean, np.max, np.var, np.std, np.sum)

    feature_plan = [
        (mel_spect, all_stats),
        (chroma_stft, chroma_stats),
        (rms, rms_stats),
        (spec_cent, all_stats),
        (spec_bw, all_stats),
        (spec_rolloff, all_stats),
        (spec_cont, all_stats),
        (zcr, all_stats),
    ]
    # One group of statistics per MFCC coefficient (one row of the mfcc matrix each).
    feature_plan.extend((coefficient, all_stats) for coefficient in mfcc)

    # Build the row as a list so every value stays in its own column; the
    # previous join-into-a-string-then-split() approach broke on any value that
    # contained whitespace.
    row = [
        file, acting, gender, duration, emotion_label[emotion], emotion_number[emotion],
        valence, activation, dominance,
    ]
    for values, stats in feature_plan:
        row.extend(stat(values) for stat in stats)

    writer.writerow(row)

In [7]:
def process_data(audio_dir, headers, proc_feat_dataset=EXTRACTED_FEATURES_FILE, all_features=False, noisereduce=False):
    """Extract features for every annotated utterance and store them in a CSV file.

    Every ``*.txt`` file matching ``<audio_dir>/Session[0-9]*/dialog/EmoEvaluation/``
    is scanned. Each line starting with ``[`` is treated as one utterance
    annotation and is passed, with the path of its wav file, to
    ``extract_features`` or ``extract_all_features``.

    Parameters
    ----------
    audio_dir : str
        Dataset root containing the ``Session*`` directories.
    headers : list of str
        Header row written at the top of the CSV.
    proc_feat_dataset : str, optional
        Path of the CSV file to create (overwritten if it exists).
    all_features : bool, optional
        When true, ``extract_all_features`` is used and only utterances labelled
        'ang', 'neu', 'exc', 'hap' or 'sad' are written. When false,
        ``extract_features`` is used for every utterance.
    noisereduce : bool, optional
        Forwarded to the extraction function.
    """
    # Emotion codes kept when extracting the full feature set.
    all_features_emotions = {'ang', 'neu', 'exc', 'hap', 'sad'}

    # Context managers make sure the CSV is flushed and closed even when an
    # extraction fails half-way through.
    with open(proc_feat_dataset, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)

        print("Processing audio files from all dialogs:")
        # Sorted so that the row order does not depend on the file system.
        evaluation_files = sorted(glob.glob(audio_dir + '/Session[0-9]*/dialog/EmoEvaluation/*.txt'))
        for file_path in tqdm(evaluation_files):
            # Dialog name = evaluation file name without ".txt"; os.path handles
            # both Windows and POSIX separators.
            dialog = os.path.splitext(os.path.basename(file_path))[0]

            with open(file_path) as emo_eval_file:
                for line in emo_eval_file:
                    if not line.startswith('['):
                        continue

                    # Whitespace-split annotation line: "[start", "-", "end]",
                    # utterance name, emotion code, "[valence,", "activation,", "dominance]".
                    args = line.split()
                    duration = float(args[2][:-1]) - float(args[0][1:])
                    utterance, emotion = args[3], args[4]
                    valence, activation, dominance = args[5][1:-1], args[6][:-1], args[7][:-1]

                    # The fifth character of the utterance name selects the Session directory.
                    # Use the audio_dir argument (not the AUDIO_DIR global) so the
                    # function works for any dataset root.
                    audio_file = f'{audio_dir}/Session{utterance[4]}/sentences/wav/{dialog}/{utterance}.wav'

                    if all_features:
                        if emotion in all_features_emotions:
                            extract_all_features(
                                audio_file, duration, emotion, valence, activation, dominance, writer, noisereduce)
                    else:
                        extract_features(
                            audio_file, duration, emotion, valence, activation, dominance, writer, noisereduce)

In [9]:
# Selected-feature CSV from the unprocessed audio (written to the default EXTRACTED_FEATURES_FILE).
process_data(AUDIO_DIR, headers)

Processing audio files from all dialogs:


100%|██████████| 151/151 [06:17<00:00,  2.50s/it]


In [12]:
# Same selected features, computed after noise reduction, written to DENOISED_EXTRACTED_FEATURES_FILE.
process_data(AUDIO_DIR, headers, DENOISED_EXTRACTED_FEATURES_FILE, noisereduce=True)

Processing audio files from all dialogs:


100%|██████████| 151/151 [11:13<00:00,  4.46s/it]


In [None]:
# Full feature set; in this mode process_data only keeps ang/neu/exc/hap/sad utterances.
process_data(AUDIO_DIR, all_headers, ALL_EXTRACTED_FEATURES_FILE, all_features=True)

Processing audio files from all dialogs:


100%|██████████| 151/151 [08:39<00:00,  3.44s/it]


In [None]:
# Full feature set computed after noise reduction, written to DENOISED_ALL_EXTRACTED_FEATURES_FILE.
process_data(AUDIO_DIR, all_headers, DENOISED_ALL_EXTRACTED_FEATURES_FILE, all_features=True, noisereduce=True)

In [None]:
def extract_raw_data(audio_dir, headers, raw_audio_dataset):
    """Write one CSV row of metadata (no acoustic features) per annotated utterance.

    Every ``*.txt`` file matching ``<audio_dir>/Session[0-9]*/dialog/EmoEvaluation/``
    is scanned, and each line starting with ``[`` is treated as one utterance
    annotation. A row holds: the wav file path, "improvised"/"scripted",
    "Male"/"Female", the duration, the emotion label and id, and the valence,
    activation and dominance values.

    Parameters
    ----------
    audio_dir : str
        Dataset root containing the ``Session*`` directories.
    headers : list of str
        Header row written at the top of the CSV.
    raw_audio_dataset : str
        Path of the CSV file to create (overwritten if it exists).

    Raises
    ------
    KeyError
        If an emotion code is missing from ``emotion_label`` or ``emotion_number``.
    """
    # Context managers make sure the CSV is flushed and closed on every exit path.
    with open(raw_audio_dataset, 'w', newline='') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(headers)

        print("Processing audio files from all dialogs:")
        # Sorted so that the row order does not depend on the file system.
        evaluation_files = sorted(glob.glob(audio_dir + '/Session[0-9]*/dialog/EmoEvaluation/*.txt'))
        for file_path in tqdm(evaluation_files):
            # Dialog name = evaluation file name without ".txt"; os.path handles
            # both Windows and POSIX separators.
            dialog = os.path.splitext(os.path.basename(file_path))[0]

            with open(file_path) as emo_eval_file:
                for line in emo_eval_file:
                    if not line.startswith('['):
                        continue

                    # Whitespace-split annotation line: "[start", "-", "end]",
                    # utterance name, emotion code, "[valence,", "activation,", "dominance]".
                    args = line.split()
                    duration = float(args[2][:-1]) - float(args[0][1:])
                    utterance, emotion = args[3], args[4]
                    valence, activation, dominance = args[5][1:-1], args[6][:-1], args[7][:-1]

                    # The fifth character of the utterance name selects the Session directory.
                    # Use the audio_dir argument (not the AUDIO_DIR global) so the
                    # function works for any dataset root.
                    audio_file = f'{audio_dir}/Session{utterance[4]}/sentences/wav/{dialog}/{utterance}.wav'

                    name_parts = utterance.split("_")
                    acting = "improvised" if name_parts[1][:5] == "impro" else "scripted"
                    gender = "Male" if name_parts[-1][0] == 'M' else "Female"

                    # Written as a list: the wav path may contain spaces, which the
                    # previous join-then-split() approach turned into extra columns.
                    writer.writerow([
                        audio_file, acting, gender, duration,
                        emotion_label[emotion], emotion_number[emotion],
                        valence, activation, dominance,
                    ])

In [None]:
# Metadata-only CSV (wav path plus labels, no acoustic features), written to RAW_AUDIO_FILES.
extract_raw_data(AUDIO_DIR, headers_raw, RAW_AUDIO_FILES)

Processing audio files from all dialogs:


100%|████████████████████████████████████████| 151/151 [00:00<00:00, 488.21it/s]


In [35]:
# Emotion ids used for the spectrogram images: 'exc' shares id 1 with 'hap'.
# NOTE: this rebinds the module-level `emotion_number`, replacing the 10-code
# mapping that extract_features / extract_all_features / extract_raw_data look
# up. Re-run the cell that defines the full mapping before calling them again.
emotion_number = {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'exc': 1}


def extract_spectograms(audio_dir):
    """Save a mel-spectrogram image for every utterance with a known emotion id.

    Every ``*.txt`` file matching ``<audio_dir>/Session[0-9]*/dialog/EmoEvaluation/``
    is scanned, and each line starting with ``[`` is treated as one utterance
    annotation. Utterances whose emotion code is not a key of ``emotion_number``
    are skipped. The others are loaded at 16 kHz, noise-reduced, converted to a
    dB-scaled mel spectrogram and saved without axes as
    ``<AUDIO_MEL_SPECTROGRAM_IMAGES_DIR_2><utterance>-<emotion id>.png``.

    Parameters
    ----------
    audio_dir : str
        Dataset root containing the ``Session*`` directories.
    """
    # savefig does not create missing directories.
    os.makedirs(AUDIO_MEL_SPECTROGRAM_IMAGES_DIR_2, exist_ok=True)

    for file_path in tqdm(glob.glob(audio_dir + '/Session[0-9]*/dialog/EmoEvaluation/*.txt')):
        # Dialog name = evaluation file name without ".txt"; os.path handles
        # both Windows and POSIX separators.
        dialog = os.path.splitext(os.path.basename(file_path))[0]

        with open(file_path) as emo_eval_file:
            for line in emo_eval_file:
                if not line.startswith('['):
                    continue

                args = line.split()
                utterance, emotion = args[3], args[4]
                if emotion not in emotion_number:
                    continue

                # The fifth character of the utterance name selects the Session directory.
                # Use the audio_dir argument (not the AUDIO_DIR global) so the
                # function works for any dataset root.
                audio_file = f'{audio_dir}/Session{utterance[4]}/sentences/wav/{dialog}/{utterance}.wav'

                # The whole utterance is used; no truncation or padding is applied.
                y, sr = librosa.load(audio_file, sr=16000)
                y = nr.reduce_noise(y=y, sr=sr)

                # -------Mel-Spectrogram Image-------
                mel_spec = librosa.feature.melspectrogram(y=y, sr=sr)
                fig = plt.figure()
                ax = fig.add_subplot()
                # Hide axes and frame so the saved image contains only the spectrogram.
                ax.get_xaxis().set_visible(False)
                ax.get_yaxis().set_visible(False)
                ax.set_frame_on(False)
                librosa.display.specshow(
                    librosa.power_to_db(mel_spec, ref=np.max), cmap="viridis", ax=ax)
                fig.savefig(
                    f'{AUDIO_MEL_SPECTROGRAM_IMAGES_DIR_2}{utterance}-{emotion_number[emotion]}.png',
                    dpi=400, bbox_inches='tight', pad_inches=0)
                # Close only this figure so memory does not grow across utterances
                # and unrelated open figures are left alone.
                plt.close(fig)

In [36]:
# Render the mel-spectrogram PNGs into AUDIO_MEL_SPECTROGRAM_IMAGES_DIR_2.
extract_spectograms(AUDIO_DIR)

100%|██████████| 151/151 [23:18<00:00,  9.26s/it]
