In [1]:
import sys
import os
import glob
import librosa
import numpy as np
import librosa.display
import csv
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# go to upper diretory
sys.path.append(os.path.abspath('./../../../'))
from Audio_Sentiment_Analysis.utils.Configuration import Configuration

In [4]:
AUDIO_DIR = f"{os.path.abspath('./../../../')}/IEMOCAP_Dataset"
EXTRACTED_FEATURES_FILE = 'extracted_features_iemocap.csv'
CONFIG_FILE = f"{os.path.abspath('./../../../')}/Audio_Sentiment_Analysis/iemocap/config.json"
config = Configuration.load_json(CONFIG_FILE)

## Extracting the 24 features selected from the eNTERFACE05 dataset study

In [8]:
chosen_features = {'var_mfcc15', 'min_mfcc17', 'std_chroma_stft', 'mean_zcr',
    'min_zcr', 'var_mfcc4', 'spikes_spec_cent', 'var_spec_bw', 'max_mfcc6',
    'min_mfcc19', 'max_mfcc13', 'max_mfcc5', 'var_mfcc9', 'min_mfcc7', 'var_mfcc1',
    'max_mfcc1', 'max_mfcc9', 'var_mel_spect', 'mean_spec_cont', 'var_mfcc2', 'max_mfcc10',
    'min_spec_cent', 'var_mfcc14', 'var_mfcc3'}

In [9]:
headers = [
    'File', 'Interaction', 'Gender', 'Duration', 'Emotion', 'Emotion_Id',
    'Valence', 'Activation', 'Dominance',
    'std_chroma_stft', 'mean_zcr', 'min_zcr',
    'var_mel_spect', 'spikes_spec_cent', 'min_spec_cent',
    'var_spec_bw', 'mean_spec_cont', 
    'var_mfcc1', 'max_mfcc1', 'var_mfcc2', 'var_mfcc3', 'var_mfcc4',
    'max_mfcc5', 'max_mfcc6', 'min_mfcc7', 'var_mfcc9', 'max_mfcc9',
    'max_mfcc10', 'max_mfcc13', 'var_mfcc14', 'var_mfcc15', 
    'min_mfcc17', 'min_mfcc19'
]

In [10]:
def spikes(data):
    if len(data.shape) != 1:
        data = np.concatenate(data)
    mean = np.mean(data)
    std = np.std(data)
    threshold = mean + std * 2 / 100
    num_spikes = 0
    for value in data:
        if value >= threshold:
            num_spikes += 1

    return num_spikes

In [11]:
emotion_label = {
    'ang': "angry",
    'hap': "happy",
    'sad': "sad",
    'neu': "neutral",
    'fru': "frustrated",
    'exc': "excited",
    'fea': "fearful",
    'sur': "surprised",
    'dis': "disgusted",
    'xxx': "other",
    'oth': "other"
}

emotion_number = {
    'ang': 0,
    'hap': 1,
    'sad': 2,
    'neu': 3,
    'fru': 4,
    'exc': 5,
    'fea': 6,
    'sur': 7,
    'dis': 8,
    'oth': 9,
    'xxx': 10,
}

In [13]:
def extract_features(audio_file, duration, emotion, valence, activation, dominance, writer):
    # file = audio_file.split(".")[-2].split("\\")[-1]
    file = audio_file.split(".")[-2].split("/")[-1]
    acting = "improvisation" if file.split("_")[1][:6] == "improv" else "scripted"
    gender = "Male" if file.split("_")[-1][0] == 'M' else "Female"

    y, sr = librosa.load(audio_file, res_type='kaiser_fast')

    std_chroma_stft = np.std(librosa.feature.chroma_stft(y, sr=sr))
    zcr = librosa.feature.zero_crossing_rate(y)
    mean_zcr = np.mean(zcr)
    min_zcr = np.min(zcr)
    var_mel_spect = np.var(librosa.feature.melspectrogram(y, sr=sr, n_mels=config.n_mels))
    spec_cent = librosa.feature.spectral_centroid(y, sr=sr)
    spikes_spec_cent = spikes(spec_cent)
    min_spec_cent = np.min(spec_cent)
    var_spec_bw = np.var(librosa.feature.spectral_bandwidth(y, sr=sr))
    mean_spec_cont = np.mean(librosa.feature.spectral_contrast(y, sr=sr))
    mfcc = librosa.feature.mfcc(y, sr=sr)
    var_mfcc1 = np.var(mfcc[0])
    max_mfcc1 = np.max(mfcc[0])
    var_mfcc2 = np.var(mfcc[1])
    var_mfcc3 = np.var(mfcc[2])
    var_mfcc4 = np.var(mfcc[3])
    max_mfcc5 = np.max(mfcc[4])
    max_mfcc6 = np.max(mfcc[5])
    min_mfcc7 = np.min(mfcc[6])
    var_mfcc9 = np.var(mfcc[8])
    max_mfcc9 = np.max(mfcc[8])
    max_mfcc10 = np.max(mfcc[9])
    max_mfcc13 = np.max(mfcc[12])
    var_mfcc14 = np.var(mfcc[13])
    var_mfcc15 = np.var(mfcc[14])
    min_mfcc17 = np.min(mfcc[16])
    min_mfcc19 = np.min(mfcc[18])

    features_str = f'{file} {acting} {gender} {duration} {emotion_label[emotion]} {emotion_number[emotion]}\
        {valence} {activation} {dominance}\
        {std_chroma_stft} {mean_zcr} {min_zcr} {var_mel_spect} {spikes_spec_cent} {min_spec_cent}\
        {var_spec_bw} {mean_spec_cont} {var_mfcc1} {max_mfcc1} {var_mfcc2} {var_mfcc3} {var_mfcc4}\
        {max_mfcc5} {max_mfcc6} {min_mfcc7} {var_mfcc9} {max_mfcc9} {max_mfcc10} {max_mfcc13}\
        {var_mfcc14} {var_mfcc15} {min_mfcc17} {min_mfcc19}'

    writer.writerow(features_str.split())

In [38]:
def process_data(audio_dir, headers, proc_feat_dataset):
    # Create a CSV for storing all processed features and write the header
    file = open(proc_feat_dataset, 'w', newline='')
    writer = csv.writer(file)
    writer.writerow(headers)

    print("Processing audio files from all dialogs:")
    for file_path in tqdm(glob.glob(audio_dir+'/Session[0-9]*/dialog/EmoEvaluation/*.txt')):
        emoEvalFile = open(file_path)

        for line in emoEvalFile:
            if line[0] == '[':
                args = line.split()
                duration = float(args[2][:-1]) - float(args[0][1:])
                '/Users/mario/Desktop/VADER/IEMOCAP_Dataset/Session4/sentences/wav/t/Ses04F_script02_1_F000.wav'
                audio_file = f'{AUDIO_DIR}/Session{args[3][4]}/sentences/wav/{file_path.split("/")[-1][:-4]}/{args[3]}.wav'
                extract_features(audio_file, duration, args[4], args[5][1:-1], args[6][:-1], args[7][:-1], writer)

In [39]:
process_data(AUDIO_DIR, headers, EXTRACTED_FEATURES_FILE)

Processing audio files from all dialogs:


100%|██████████| 151/151 [29:56<00:00, 11.89s/it]


In [8]:
df = pd.read_csv(EXTRACTED_FEATURES_FILE)
print(f"Number of Audio Files: {df.shape[0]}")
df = df.sort_values(['Emotion', 'Gender'], ascending = (True, True))
df = df.set_index('File')
df

Number of Audio Files: 10039


Unnamed: 0_level_0,Interaction,Gender,Duration,Emotion,Emotion_Id,Valence,Activation,Dominance,std_chroma_stft,mean_zcr,...,max_mfcc6,min_mfcc7,var_mfcc9,max_mfcc9,max_mfcc10,max_mfcc13,var_mfcc14,var_mfcc15,min_mfcc17,min_mfcc19
File,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Ses04F_script02_1_F007,scripted,Female,2.3027,angry,0,2.0,3.0000,2.5000,0.328248,0.061035,...,54.387356,-50.059189,51.791885,15.879789,11.448400,19.471115,33.886478,34.005157,-20.161358,-18.028835
Ses04F_script02_1_F008,scripted,Female,2.6200,angry,0,2.0,3.0000,3.0000,0.325717,0.051218,...,24.525238,-41.043949,141.870651,30.748220,19.321951,17.711790,73.043159,56.129311,-20.042635,-26.435205
Ses04F_script02_1_F012,scripted,Female,2.8100,angry,0,2.0,3.5000,3.5000,0.312264,0.080887,...,56.987560,-35.728943,57.412914,36.346241,7.725285,24.113718,86.904289,83.267319,-30.209728,-34.726349
Ses04F_script02_1_F013,scripted,Female,7.8696,angry,0,1.5,4.0000,3.5000,0.317440,0.089717,...,60.942680,-45.283386,107.373848,29.710926,12.648742,42.300812,117.452675,118.971069,-40.664703,-28.176968
Ses04F_script02_1_F014,scripted,Female,8.1000,angry,0,2.0,3.5000,3.5000,0.316980,0.106910,...,50.430084,-43.132835,119.257858,21.167702,22.152235,19.599295,115.348228,89.030464,-33.020531,-28.692318
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Ses01F_impro03_M018,scripted,Male,1.4375,surprised,7,3.0,2.6667,2.6667,0.297418,0.048434,...,45.260635,-13.307287,143.821762,28.158386,12.092248,15.586620,21.829449,34.880535,-14.911965,-18.645388
Ses01F_impro05_M032,scripted,Male,6.2475,surprised,7,3.0,2.5000,2.0000,0.301693,0.064050,...,46.169548,-26.762585,103.567039,23.435570,14.563293,19.879362,51.003719,29.242901,-25.568983,-31.159067
Ses01F_impro05_M033,scripted,Male,6.6700,surprised,7,2.5,3.0000,2.0000,0.315035,0.057646,...,49.789436,-22.771132,155.705765,18.928852,11.715882,23.202303,56.172832,46.299030,-27.097666,-20.788668
Ses01F_script02_2_M039,scripted,Male,1.6768,surprised,7,3.0,1.5000,1.5000,0.247377,0.049243,...,24.280342,-38.083015,29.814821,22.335604,7.294564,12.798818,13.436497,9.737317,-18.367386,-11.531448


In [17]:
df.groupby(['Emotion']).agg({'Emotion': ['count']})

Unnamed: 0_level_0,Emotion
Unnamed: 0_level_1,count
Emotion,Unnamed: 1_level_2
angry,1103
disgusted,2
excited,1041
fearful,40
frustrated,1849
happy,595
neutral,1708
other,2510
sad,1084
surprised,107


In [18]:
df.groupby(['Gender']).agg({'Emotion': ['count']})

Unnamed: 0_level_0,Emotion
Unnamed: 0_level_1,count
Gender,Unnamed: 1_level_2
Female,4800
Male,5239


In [24]:
df[["Duration"]].mean()

Duration    4.460109
dtype: float64