In [2]:
import sys
import os
# Add the directory three levels up to sys.path so the Audio_Sentiment_Analysis package can be imported
sys.path.append(os.path.abspath('./../../../'))
import csv
import glob
import librosa
import librosa.display
import numpy as np
import tensorflow as tf
import pickle
import noisereduce as nr
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt
from tqdm import tqdm
from Audio_Sentiment_Analysis.utils.Configuration import Configuration
from scipy.stats import skew, kurtosis
import warnings
warnings.filterwarnings('ignore')

# Alias for tf.data's AUTOTUNE sentinel.
AUTOTUNE = tf.data.AUTOTUNE
# Root of the IEMOCAP corpus, resolved three directories above the current working directory.
AUDIO_DIR = f"{os.path.abspath('./../../../')}/IEMOCAP_Dataset"
# Output CSVs: selected-feature subset vs. full feature set, each with and without audio preprocessing.
EXTRACTED_FEATURES_FILE = 'extracted_features_iemocap.csv'
PREPROCESSED_EXTRACTED_FEATURES_FILE = 'preprocessed_extracted_features_iemocap.csv'
PREPROCESSED_ALL_EXTRACTED_FEATURES_FILE = 'all_preprocessed_extracted_features_iemocap.csv'
ALL_EXTRACTED_FEATURES_FILE = 'all_extracted_features_iemocap.csv'
# Output CSV holding only per-utterance metadata (no audio features).
RAW_AUDIO_FILES = 'raw_audio_files.csv'
# Target directories for pickled arrays and rendered images; all are relative to the working directory.
AUDIO_SPECTROGRAMS_DIR = './../data/spectrograms/'
AUDIO_MFCCS_DIR = './../data/mfccs/'
AUDIO_MEL_SPECTROGRAMS_DIR = './../data/mel_spectrograms/'
AUDIO_SPECTROGRAM_IMAGES_DIR = './../data/spectrograms_images/'
AUDIO_MEL_SPECTROGRAM_IMAGES_DIR = './../data/mel_spectrograms_images/'
AUDIO_MFCCS_IMAGES_DIR = './../data/mfccs_images/'
AUDIO_ALL_DATA_DIR = './../data/all_data/'

# Project configuration loaded from JSON; the feature extraction below reads `config.n_mels` from it.
CONFIG_FILE = f"{os.path.abspath('./../../../')}/Audio_Sentiment_Analysis/iemocap/config.json"
config = Configuration.load_json(CONFIG_FILE)
# Render every matplotlib figure at 300 dpi unless a figure overrides it.
plt.rcParams['figure.dpi'] = 300

## Extracting the 43 features selected from the eNTERFACE05 dataset study

In [2]:
# Column names for the full-feature CSV: 9 metadata columns, then summary statistics per
# spectral feature. Rows written under this header must supply one value per name, in this order.
# Note that chroma_stft has no min/max columns and rms has no min column.
all_headers = [
    'File', 'Interaction', 'Gender', 'Duration', 'Emotion', 'Emotion_Id',
    'Valence', 'Activation', 'Dominance',
    'spikes_mel_spect', '25thpercentile_mel_spect', 'median_mel_spect', '75thpercentile_mel_spect', 'mean_mel_spect', 'min_mel_spect', 'max_mel_spect', 'var_mel_spect', 'std_mel_spect', 'sum_mel_spect',
    'spikes_chroma_stft', '25thpercentile_chroma_stft', 'median_chroma_stft', '75thpercentile_chroma_stft', 'mean_chroma_stft', 'var_chroma_stft', 'std_chroma_stft', 'sum_chroma_stft',
    'spikes_rms', '25thpercentile_rms', 'median_rms', '75thpercentile_rms', 'mean_rms', 'max_rms', 'var_rms', 'std_rms', 'sum_rms',
    'spikes_spec_cent', '25thpercentile_spec_cent', 'median_spec_cent', '75thpercentile_spec_cent', 'mean_spec_cent', 'min_spec_cent', 'max_spec_cent', 'var_spec_cent', 'std_spec_cent', 'sum_spec_cent',
    'spikes_spec_bw', '25thpercentile_spec_bw', 'median_spec_bw', '75thpercentile_spec_bw', 'mean_spec_bw', 'min_spec_bw', 'max_spec_bw', 'var_spec_bw', 'std_spec_bw', 'sum_spec_bw',
    'spikes_spec_rolloff', '25thpercentile_spec_rolloff', 'median_spec_rolloff', '75thpercentile_spec_rolloff', 'mean_spec_rolloff', 'min_spec_rolloff', 'max_spec_rolloff', 'var_spec_rolloff', 'std_spec_rolloff', 'sum_spec_rolloff',
    'spikes_spec_cont', '25thpercentile_spec_cont', 'median_spec_cont', '75thpercentile_spec_cont', 'mean_spec_cont', 'min_spec_cont', 'max_spec_cont', 'var_spec_cont', 'std_spec_cont', 'sum_spec_cont',
    'spikes_tonnetz', '25thpercentile_tonnetz', 'median_tonnetz', '75thpercentile_tonnetz', 'mean_tonnetz', 'min_tonnetz', 'max_tonnetz', 'var_tonnetz', 'std_tonnetz', 'sum_tonnetz',
    'spikes_zcr', '25thpercentile_zcr', 'median_zcr', '75thpercentile_zcr', 'mean_zcr', 'min_zcr', 'max_zcr', 'var_zcr', 'std_zcr', 'sum_zcr'
]

# Append 12 statistics for each of the 20 MFCC coefficients (numbered from 1).
for i in range(1, 21):
    all_headers.extend([f'kurtosis_mfcc{i}', f'skew_mfcc{i}', f'spikes_mfcc{i}', f'25thpercentile_mfcc{i}', f'median_mfcc{i}',
                       f'75thpercentile_mfcc{i}', f'mean_mfcc{i}', f'min_mfcc{i}', f'max_mfcc{i}', f'var_mfcc{i}', f'std_mfcc{i}', f'sum_mfcc{i}'])

# Column names for the selected-feature CSV: the same 9 metadata columns followed by 35 features.
# Rows written under this header must supply one value per name, in this order.
headers = [
    'File', 'Interaction', 'Gender', 'Duration', 'Emotion', 'Emotion_Id',
    'Valence', 'Activation', 'Dominance',
    'max_tonnetz', 'var_spec_bw', 'thpercentile75_spec_bw', 'mean_spec_cont', 'thpercentile75_spec_rolloff',
    'var_mel_spect', 'spikes_zcr', 'std_mfcc1', 'kurtosis_mfcc1', 'kurtosis_mfcc2', 'thpercentile75_mfcc2',
    'kurtosis_mfcc3', 'var_mfcc3', 'thpercentile75_mfcc4', 'var_mfcc4', 'max_mfcc4', 'kurtosis_mfcc4',
    'median_mfcc4', 'median_mfcc5', 'std_mfcc6', 'max_mfcc6', 'max_mfcc7', 'skew_mfcc7', 'skew_mfcc9',
    'max_mfcc9', 'kurtosis_mfcc9', 'var_mfcc10', 'skew_mfcc11', 'var_mfcc12', 'thpercentile75_mfcc13',
    'skew_mfcc13', 'max_mfcc14', 'thpercentile75_mfcc18', 'median_mfcc20', 'skew_mfcc20'
]

# Column names for the metadata-only CSV (no audio features).
headers_raw = [
    'File', 'Interaction', 'Gender', 'Duration', 'Emotion', 'Emotion_Id',
    'Valence', 'Activation', 'Dominance'
]

# Maps the short emotion code found in the evaluation files to a readable label.
# 'xxx' and 'oth' both map to "other".
emotion_label = {
    'ang': "angry", 'hap': "happy", 'sad': "sad", 'neu': "neutral",
    'fru': "frustrated", 'exc': "excited", 'fea': "fearful",
    'sur': "surprised", 'dis': "disgusted", 'xxx': "other", 'oth': "other"
}

# Maps the same short emotion codes to integer class ids; 'oth' and 'xxx' share id 9.
emotion_number = {
    'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'fru': 4,
    'exc': 5, 'fea': 6, 'sur': 7, 'dis': 8, 'oth': 9, 'xxx': 9
}


In [3]:
def spikes(data):
    """Count the values of ``data`` that reach the spike threshold.

    The threshold is ``mean + std * 2 / 100``, computed over every element of
    ``data``. An element counts as a spike when it is greater than or equal
    to that threshold.

    Args:
        data: Array-like of numbers with any number of dimensions. It is
            flattened before the statistics are computed, so 1-D vectors,
            2-D feature matrices and higher-rank arrays are all accepted.

    Returns:
        int: Number of elements at or above the threshold.
    """
    values = np.asarray(data).ravel()
    # NOTE(review): "* 2 / 100" puts the threshold only 0.02 standard deviations
    # above the mean — confirm this is intended rather than 2 standard deviations.
    threshold = np.mean(values) + np.std(values) * 2 / 100
    # Vectorised comparison instead of a per-element Python loop.
    return int(np.count_nonzero(values >= threshold))

In [4]:
def preprocess_audio(y, sr):
    """Denoise a waveform, then trim its quiet leading and trailing sections.

    Args:
        y: Audio time series handed to ``noisereduce.reduce_noise``.
        sr: Sampling rate of ``y``.

    Returns:
        The noise-reduced signal after ``librosa.effects.trim`` with ``top_db=30``.
    """
    noise_reduction_args = dict(n_fft=2048, hop_length=512, prop_decrease=.75, time_constant_s=1)
    denoised = nr.reduce_noise(y=y, sr=sr, **noise_reduction_args)
    # trim() returns (trimmed_signal, interval); only the signal is needed.
    trimmed = librosa.effects.trim(denoised, top_db=30)[0]
    return trimmed

In [8]:
def _summary_stats(values, with_min=True, with_max=True):
    """Return [spikes, p25, median, p75, mean, (min), (max), var, std, sum] of ``values``."""
    stats = [
        spikes(values),
        # np.nanpercentile takes q on a 0-100 scale: 25/75 are the quartiles.
        np.nanpercentile(values, 25),
        np.median(values),
        np.nanpercentile(values, 75),
        np.mean(values),
    ]
    if with_min:
        stats.append(np.min(values))
    if with_max:
        stats.append(np.max(values))
    stats += [np.var(values), np.std(values), np.sum(values)]
    return stats


def extract_all_features(audio_file, emotion, valence, activation, dominance, writer, preprocess=False):
    """Compute the full statistics set for one utterance and write it as a CSV row.

    The row layout is 9 metadata fields, then summary statistics of the
    mel spectrogram, chroma STFT (no min/max), RMS (no min), spectral
    centroid, bandwidth, rolloff, contrast, tonnetz and zero-crossing rate,
    then 12 statistics for each MFCC coefficient.

    Args:
        audio_file: Path of the wav file. The file stem is split on "_": a second
            token starting with "impro" marks the utterance as improvised, and a
            last token starting with "M" marks the speaker as male
            (presumably IEMOCAP names such as ``Ses01F_impro01_F000`` — verify).
        emotion: Short emotion code; must be a key of the module-level
            ``emotion_label`` and ``emotion_number`` mappings.
        valence, activation, dominance: Values copied verbatim into the row.
        writer: Object with a ``writerow`` method (e.g. ``csv.writer``).
        preprocess: When true, the waveform goes through ``preprocess_audio``
            before feature extraction. The reported duration is always that of
            the unprocessed audio.

    Raises:
        KeyError: If ``emotion`` is missing from the emotion mappings.
    """
    file = os.path.splitext(os.path.basename(audio_file))[0]
    name_parts = file.split("_")
    acting = "improvised" if name_parts[1][:5] == "impro" else "scripted"
    gender = "Male" if name_parts[-1][0] == 'M' else "Female"

    y, sr = librosa.load(audio_file, sr=16000)
    # Keyword arguments: positional y/sr are rejected by newer librosa releases.
    duration = librosa.get_duration(y=y, sr=sr)

    if preprocess:
        y = preprocess_audio(y, sr)

    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=config.n_mels)
    rms = librosa.feature.rms(y=y)
    chroma_stft = librosa.feature.chroma_stft(y=y, sr=sr)
    spec_cent = librosa.feature.spectral_centroid(y=y, sr=sr)
    spec_cont = librosa.feature.spectral_contrast(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    zcr = librosa.feature.zero_crossing_rate(y=y)

    row = [file, acting, gender, duration, emotion_label[emotion], emotion_number[emotion],
           valence, activation, dominance]
    row += _summary_stats(mel_spect)
    row += _summary_stats(chroma_stft, with_min=False, with_max=False)
    row += _summary_stats(rms, with_min=False)
    # These features contribute the complete statistic set, in header order.
    for feature in (spec_cent, spec_bw, spec_rolloff, spec_cont, tonnetz, zcr):
        row += _summary_stats(feature)
    # One group of 12 statistics per MFCC coefficient.
    for coefficient in mfcc:
        row += [kurtosis(coefficient), skew(coefficient)] + _summary_stats(coefficient)

    # Format exactly as an f-string would, so the CSV text matches earlier exports.
    writer.writerow([f'{value}' for value in row])

In [6]:
def extract_features(audio_file, emotion, valence, activation, dominance, writer, preprocess=False):
    """Compute the selected feature subset for one utterance and write it as a CSV row.

    The row holds 9 metadata fields (file, interaction type, gender, duration,
    emotion label, emotion id, valence, activation, dominance) followed by 35
    features, in the same order as the module-level ``headers`` list.

    Args:
        audio_file: Path of the wav file. The file stem is split on "_": a second
            token starting with "impro" marks the utterance as improvised, and a
            last token starting with "M" marks the speaker as male.
        emotion: Short emotion code; must be a key of the module-level
            ``emotion_label`` and ``emotion_number`` mappings.
        valence, activation, dominance: Values copied verbatim into the row.
        writer: Object with a ``writerow`` method (e.g. ``csv.writer``).
        preprocess: When true, noise reduction is applied before extraction.

    Raises:
        KeyError: If ``emotion`` is missing from the emotion mappings.
    """
    file = os.path.splitext(os.path.basename(audio_file))[0]
    name_parts = file.split("_")
    acting = "improvised" if name_parts[1][:5] == "impro" else "scripted"
    gender = "Male" if name_parts[-1][0] == 'M' else "Female"

    y, sr = librosa.load(audio_file, sr=16000)
    # A single duration is written so each row has exactly one value per header
    # column. Noise reduction does not trim the signal, so a second
    # "after preprocessing" duration would only repeat this value.
    duration = librosa.get_duration(y=y, sr=sr)

    if preprocess:
        # NOTE(review): only noise reduction is applied here, whereas
        # extract_all_features uses preprocess_audio (which also trims) — confirm
        # the difference is intended.
        y = nr.reduce_noise(y=y, sr=sr)

    # Keyword arguments throughout: positional y is rejected by newer librosa releases.
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    spec_bw = librosa.feature.spectral_bandwidth(y=y, sr=sr)
    tonnetz = librosa.feature.tonnetz(y=y, sr=sr)
    spec_cont = librosa.feature.spectral_contrast(y=y, sr=sr)
    spec_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)
    mel_spect = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=127)
    zcr = librosa.feature.zero_crossing_rate(y=y)

    def percentile75(values):
        # np.nanpercentile takes q on a 0-100 scale: 75 is the third quartile.
        return np.nanpercentile(values, 75)

    row = [file, acting, gender, duration, emotion_label[emotion], emotion_number[emotion],
           valence, activation, dominance,
           np.max(tonnetz), np.var(spec_bw), percentile75(spec_bw), np.mean(spec_cont),
           percentile75(spec_rolloff), np.var(mel_spect), spikes(zcr)]

    # (statistic, 1-based MFCC coefficient) pairs, in the order of the header columns.
    mfcc_stats = [
        (np.std, 1), (kurtosis, 1), (kurtosis, 2), (percentile75, 2),
        (kurtosis, 3), (np.var, 3), (percentile75, 4), (np.var, 4), (np.max, 4), (kurtosis, 4),
        (np.median, 4), (np.median, 5), (np.std, 6), (np.max, 6), (np.max, 7), (skew, 7), (skew, 9),
        (np.max, 9), (kurtosis, 9), (np.var, 10), (skew, 11), (np.var, 12), (percentile75, 13),
        (skew, 13), (np.max, 14), (percentile75, 18), (np.median, 20), (skew, 20),
    ]
    row += [statistic(mfcc[number - 1]) for statistic, number in mfcc_stats]

    # Format exactly as an f-string would, so the CSV text matches earlier exports.
    writer.writerow([f'{value}' for value in row])

In [7]:
def process_data(audio_dir, headers, proc_feat_dataset=EXTRACTED_FEATURES_FILE, all_features=False, preprocess=False):
    """Extract features for every labelled utterance and store them in a CSV file.

    Every ``*.txt`` file under ``<audio_dir>/Session*/dialog/EmoEvaluation`` is
    scanned. Lines starting with "[" are split on whitespace: field 3 is the
    utterance id, field 4 the emotion code and fields 5-7 the bracketed
    valence / activation / dominance values. Only utterances labelled
    'ang', 'neu', 'exc', 'hap' or 'sad' are processed.

    Args:
        audio_dir: Root directory of the dataset.
        headers: Column names written as the first CSV row.
        proc_feat_dataset: Path of the CSV file to create (overwritten if present).
        all_features: Use ``extract_all_features`` instead of ``extract_features``.
        preprocess: Forwarded to the extraction function.
    """
    extractor = extract_all_features if all_features else extract_features
    wanted_emotions = {'ang', 'neu', 'exc', 'hap', 'sad'}
    # Sorted so the row order is reproducible between runs.
    eval_files = sorted(glob.glob(audio_dir + '/Session[0-9]*/dialog/EmoEvaluation/*.txt'))

    # Context managers guarantee the CSV is flushed and every file handle is released.
    with open(proc_feat_dataset, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(headers)

        print("Processing audio files from all dialogs:")
        for file_path in tqdm(eval_files):
            # Dialog name = evaluation file name without extension; os.path keeps
            # this working on Windows as well as Linux/macOS.
            dialog = os.path.splitext(os.path.basename(file_path))[0]

            with open(file_path) as emo_eval_file:
                for line in emo_eval_file:
                    if not line.startswith('['):
                        continue
                    args = line.split()
                    if args[4] not in wanted_emotions:
                        continue
                    # args[3][4] is the session digit inside the utterance id.
                    audio_file = f'{audio_dir}/Session{args[3][4]}/sentences/wav/{dialog}/{args[3]}.wav'
                    extractor(audio_file, args[4], args[5][1:-1], args[6][:-1], args[7][:-1],
                              writer, preprocess)


In [41]:
# Selected-feature subset from the unprocessed audio, written to the default EXTRACTED_FEATURES_FILE.
process_data(AUDIO_DIR, headers)

Processing audio files from all dialogs:


100%|██████████| 151/151 [12:20<00:00,  4.90s/it]


In [9]:
# Selected-feature subset again, this time with preprocess=True, written to PREPROCESSED_EXTRACTED_FEATURES_FILE.
process_data(AUDIO_DIR, headers, PREPROCESSED_EXTRACTED_FEATURES_FILE, preprocess=True)

Processing audio files from all dialogs:


100%|██████████| 151/151 [13:59<00:00,  5.56s/it]


In [12]:
# Full feature set (all_features=True) from the unprocessed audio, written to ALL_EXTRACTED_FEATURES_FILE.
process_data(AUDIO_DIR, all_headers, ALL_EXTRACTED_FEATURES_FILE, all_features=True)

Processing audio files from all dialogs:


100%|██████████| 151/151 [18:07<00:00,  7.20s/it]


In [13]:
# Full feature set with preprocess=True, written to PREPROCESSED_ALL_EXTRACTED_FEATURES_FILE.
process_data(AUDIO_DIR, all_headers, PREPROCESSED_ALL_EXTRACTED_FEATURES_FILE, all_features=True, preprocess=True)

Processing audio files from all dialogs:


100%|██████████| 151/151 [19:41<00:00,  7.82s/it]


In [55]:
# WARNING: this rebinds the module-level emotion mappings that extract_features and
# extract_all_features look up at call time. After this cell runs, 'exc' is merged into
# 'hap' (id 1, label "happy") and every other emotion code is absent.
emotion_number = {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'exc': 1}
emotion_label = {'ang': "angry", 'hap': "happy", 'sad': "sad", 'neu': "neutral", 'exc': "happy"}

def extract_raw_data(audio_dir, headers, raw_audio_dataset):
    """Write a metadata-only CSV (no audio is loaded) for the labelled utterances.

    Every ``*.txt`` file under ``<audio_dir>/Session*/dialog/EmoEvaluation`` is
    scanned. Lines starting with "[" are split on whitespace: fields 0 and 2
    hold the bracketed start and end times, field 3 the utterance id, field 4
    the emotion code and fields 5-7 the valence / activation / dominance
    values. Utterances whose emotion is not a key of ``emotion_label`` are skipped.

    Args:
        audio_dir: Root directory of the dataset.
        headers: Column names written as the first CSV row.
        raw_audio_dataset: Path of the CSV file to create (overwritten if present).
    """
    # Sorted so the row order is reproducible between runs.
    eval_files = sorted(glob.glob(audio_dir + '/Session[0-9]*/dialog/EmoEvaluation/*.txt'))

    # Context managers guarantee the CSV is flushed and every file handle is released.
    with open(raw_audio_dataset, 'w', newline='') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(headers)

        print("Processing audio files from all dialogs:")
        for file_path in tqdm(eval_files):
            with open(file_path) as emo_eval_file:
                for line in emo_eval_file:
                    if not line.startswith('['):
                        continue
                    args = line.split()
                    emotion = args[4]
                    if emotion not in emotion_label:
                        continue

                    # "[start - end]" -> duration = end - start.
                    duration = float(args[2][:-1]) - float(args[0][1:])
                    valence, activation, dominance = args[5][1:-1], args[6][:-1], args[7][:-1]

                    utterance_id = args[3]
                    id_parts = utterance_id.split("_")
                    acting = "improvised" if id_parts[1][:5] == "impro" else "scripted"
                    gender = "Male" if id_parts[-1][0] == 'M' else "Female"
                    # The File column carries the utterance id suffixed with the class id.
                    audio_file = f'{utterance_id}-{emotion_number[emotion]}'

                    writer.writerow([audio_file, acting, gender, f'{duration}',
                                     emotion_label[emotion], f'{emotion_number[emotion]}',
                                     valence, activation, dominance])

In [56]:
# Metadata-only CSV (no audio is loaded), written to RAW_AUDIO_FILES.
extract_raw_data(AUDIO_DIR, headers_raw, RAW_AUDIO_FILES)

Processing audio files from all dialogs:


100%|██████████| 151/151 [00:00<00:00, 2251.87it/s]


In [21]:
# Four-class mapping used for the exported file names ('exc' shares id 1 with 'hap').
emotion_number = {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'exc': 1}

def extract_spectograms():
    """Compute a fixed-length magnitude spectrogram for every labelled utterance.

    Every ``*.txt`` file under ``AUDIO_DIR/Session*/dialog/EmoEvaluation`` is
    scanned; utterances whose emotion code is a key of ``emotion_number`` are
    loaded at 16 kHz, passed through ``preprocess_audio`` and padded or
    truncated to 6 seconds before the STFT.

    All export sections (pickles and images) are currently commented out, so
    as written the function computes ``spec`` and discards it. Re-enable the
    section for the representation that is needed.
    """
    for file_path in tqdm(glob.glob(AUDIO_DIR+'/Session[0-9]*/dialog/EmoEvaluation/*.txt')):
        # Dialog name = evaluation file name without extension; os.path keeps
        # this working on Windows as well as Linux/macOS.
        dialog = os.path.splitext(os.path.basename(file_path))[0]

        # The context manager releases the file handle after each dialog.
        with open(file_path) as emo_eval_file:
            for line in emo_eval_file:
                if not line.startswith('['):
                    continue
                args = line.split()
                emotion = args[4]
                if emotion not in emotion_number:
                    continue

                audio_file = f'{AUDIO_DIR}/Session{args[3][4]}/sentences/wav/{dialog}/{args[3]}.wav'
                # Only referenced by the (commented-out) export sections below.
                filename = os.path.splitext(os.path.basename(audio_file))[0]

                y, sr = librosa.load(audio_file, sr=16000)
                y = preprocess_audio(y, sr)
                # Pad or truncate the waveform to exactly 6 seconds.
                duration_seconds = 6
                # `size` passed by keyword: it is keyword-only in newer librosa releases.
                y = librosa.util.fix_length(y, size=int(sr * duration_seconds))

                # ------------Spectrogram------------
                spec = np.abs(librosa.stft(y, n_fft=2048))
                # with open(f'{AUDIO_SPECTROGRAMS_DIR}{filename}-{emotion_number[emotion]}-spec.pickle', 'wb') as f:
                #   pickle.dump(spec, f, protocol=pickle.HIGHEST_PROTOCOL)


                # # ----------Mel-Spectrogram----------
                # mel_spec = librosa.feature.melspectrogram(S=spec, n_mels=256)
                # with open(f'{AUDIO_MEL_SPECTROGRAMS_DIR}{filename}-{emotion_number[emotion]}-mel.pickle', 'wb') as f:
                #   pickle.dump(mel_spec, f, protocol=pickle.HIGHEST_PROTOCOL)

                # # ---------------MFCCs---------------
                # mfcc = librosa.feature.mfcc(S=mel_spec, n_mfcc=40)
                # with open(f'{AUDIO_MFCCS_DIR}{filename}-{emotion_number[emotion]}-mfcc.pickle', 'wb') as f:
                #   pickle.dump(mfcc, f, protocol=pickle.HIGHEST_PROTOCOL)

                # ----------Spectrogram Image----------
                # fig = plt.figure(dpi=100)
                # ax = fig.add_subplot()
                # ax.axes.get_xaxis().set_visible(False)
                # ax.axes.get_yaxis().set_visible(False)
                # ax.set_frame_on(False)
                # spec = librosa.amplitude_to_db(np.abs(librosa.stft(y,  n_fft=2048, hop_length=512)), ref=np.max)
                # librosa.display.specshow(spec, sr=sr, hop_length=512, ax=ax, cmap="viridis_r")
                # fig.savefig(f'{AUDIO_SPECTROGRAM_IMAGES_DIR}{filename}-{emotion_number[emotion]}-spec_img.png',
                #             bbox_inches='tight', pad_inches=0, dpi=100)
                # fig.clf()
                # plt.close(fig)

                # -------Mel-Spectrogram Image-------
                # fig = plt.figure(dpi=100)
                # ax = fig.add_subplot()
                # ax.axes.get_xaxis().set_visible(False)
                # ax.axes.get_yaxis().set_visible(False)
                # ax.set_frame_on(False)
                # mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=256, n_fft=2048, hop_length=512)
                # librosa.display.specshow(librosa.power_to_db(mel_spec, ref=np.max), cmap="viridis_r", ax=ax)
                # fig.savefig(f'{AUDIO_MEL_SPECTROGRAM_IMAGES_DIR}{filename}-{emotion_number[emotion]}-mel_img.png',
                #             bbox_inches='tight', pad_inches=0, dpi=100)
                # fig.clf()
                # plt.close(fig)

                # # ----------MFCCs Image----------
                # fig = plt.figure(dpi=100)
                # ax = fig.add_subplot()
                # ax.axes.get_xaxis().set_visible(False)
                # ax.axes.get_yaxis().set_visible(False)
                # ax.set_frame_on(False)
                # spec = np.abs(librosa.stft(y, n_fft=2048, hop_length=512))
                # mfcc = librosa.feature.mfcc(y=y, sr=sr, S=spec, n_mfcc=40)
                # librosa.display.specshow(mfcc, sr=sr, ax=ax, cmap="viridis_r")
                # fig.savefig(f'{AUDIO_MFCCS_IMAGES_DIR}{filename}-{emotion_number[emotion]}-mfcc_img.png',
                #             bbox_inches='tight', pad_inches=0, dpi=100)
                # fig.clf()
                # plt.close(fig)

                # --Spectrogram,MFCC,Mel-Spectrogram--
                # spec = np.abs(librosa.stft(y, n_fft=255, hop_length=755))
                # mfcc = librosa.feature.mfcc(y=y, sr=sr, S=spec, n_mfcc=128)
                # mel_spec = librosa.feature.melspectrogram(y=y, sr=sr, S=spec, n_mels=128)
                # all = np.stack((spec, mfcc, mel_spec), axis=-1)
                # with open(f'{AUDIO_ALL_DATA_DIR}{filename}-{emotion_number[emotion]}-all.pickle', 'wb') as f:
                #   pickle.dump(all, f, protocol=pickle.HIGHEST_PROTOCOL)

In [22]:
# Run the spectrogram pass over every labelled utterance (nothing is saved while the export sections are commented out).
extract_spectograms()

  0%|          | 0/151 [00:00<?, ?it/s]

(1025, 188)
(256, 188)
(40, 188)





In [30]:
emotion_number = {'ang': 0, 'hap': 1, 'sad': 2, 'neu': 3, 'exc': 1}
y = None

# Load and preprocess one sample utterance for the figure below: take the first
# labelled utterance of each evaluation file until a non-silent waveform is found.
for file_path in glob.glob(AUDIO_DIR+'/Session[0-9]*/dialog/EmoEvaluation/*.txt'):
    # Dialog name = evaluation file name without extension; os.path keeps this
    # working on Windows as well as Linux/macOS.
    dialog = os.path.splitext(os.path.basename(file_path))[0]

    with open(file_path) as emo_eval_file:
        for line in emo_eval_file:
            if not line.startswith('['):
                continue
            args = line.split()
            emotion = args[4]

            if emotion not in emotion_number:
                continue

            audio_file = f'{AUDIO_DIR}/Session{args[3][4]}/sentences/wav/{dialog}/{args[3]}.wav'
            filename = os.path.splitext(os.path.basename(audio_file))[0]

            y, sr = librosa.load(audio_file, sr=16000)
            y = preprocess_audio(y, sr)
            break

    # `y` is still None when no line of this file matched; check that before
    # calling .any(), which would otherwise raise AttributeError.
    if y is not None and y.any():
        break

if y is None:
    raise FileNotFoundError(
        f"No utterance labelled with one of {sorted(emotion_number)} was found under {AUDIO_DIR}")


In [31]:
# Side-by-side view of the three representations of the sample utterance `y` (sampled at `sr`).
# NOTE(review): the import cell selects the non-interactive "Agg" backend, so plt.show()
# may not render anything inline — confirm, or save the figure with fig.savefig(...).
fig, axes = plt.subplots(1, 3, figsize=(16, 12), constrained_layout=True)

# Magnitude STFT. It is an amplitude (not power) spectrogram, so the dB conversion
# must be amplitude_to_db.
spec = np.abs(librosa.stft(y, n_fft=2048))
librosa.display.specshow(librosa.amplitude_to_db(spec, ref=np.max), sr=sr, ax=axes[0],
                         cmap="viridis_r", y_axis='linear', x_axis='time')
axes[0].set_title('Spectrogram')

# melspectrogram(S=...) expects a power spectrogram, and needs the real sampling rate to
# build the mel filterbank (its default is 22050 Hz).
mel_spec = librosa.feature.melspectrogram(S=spec ** 2, sr=sr, n_mels=256)
librosa.display.specshow(librosa.power_to_db(mel_spec, ref=np.max), sr=sr, ax=axes[1],
                         cmap="viridis_r", y_axis='mel', x_axis='time')
axes[1].set_title('Mel Spectrogram')

# mfcc(S=...) expects a log-power mel spectrogram.
mfcc = librosa.feature.mfcc(S=librosa.power_to_db(mel_spec), n_mfcc=40)
librosa.display.specshow(mfcc, sr=sr, ax=axes[2], cmap="viridis_r", x_axis='time')
axes[2].set_title('MFCCs')
plt.show()