# Libraries

In [1]:
import os
import glob
import pandas as pd
from collections import defaultdict

In [3]:
# Root directory of the local IEMOCAP copy (a relative path)
folder_path = 'Data/IEMOCAP'

# Extract IEMOCAP values to a dictionary

Function to extract the emotions for each file

In [4]:
def get_emotions(root_folder:str):
    """
        Collect the emotion label of every utterance in the dataset.

        Every ``*.txt`` file located in an ``EmoEvaluation`` directory anywhere
        below ``root_folder`` is scanned. Only lines containing ``]`` are
        considered; each one is split on tabs and its second and third fields
        are taken as the utterance name and the emotion label.

        root_folder: The root folder of the IEMOCAP dataset.
        return: A dictionary mapping utterance name -> emotion label. If a name
                occurs more than once, the last occurrence read wins.
    """
    # get all the EmoEvaluation files
    emo_pattern = os.path.join(root_folder, "**", "EmoEvaluation", "*.txt")
    emo_files = glob.glob(emo_pattern, recursive=True)

    emotions = {}
    # read the EmoEvaluation files and get the emotions
    for filepath in emo_files:
        with open(filepath) as f:
            for line in f:
                if ']' not in line:
                    continue
                fields = line.split('\t')[1:3]
                # Both a name and a label are required. Lines with fewer tab
                # fields are skipped instead of raising IndexError later on.
                if len(fields) < 2:
                    continue
                name, label = fields
                # strip so a label that is the last field loses its newline
                emotions[name] = label.strip()

    return emotions

Function to get the transcriptions for each file

In [5]:
def get_transcriptions(root_folder:str):
    """
        Consolidate all the transcriptions from the IEMOCAP dataset.

        Every ``*.txt`` file located in a ``transcriptions`` directory anywhere
        below ``root_folder`` is read line by line. Each line is split at its
        first ``:``; the text before the first space of the left part becomes
        the key and the stripped right part becomes the transcription. Lines
        that contain no ``:`` (for example blank lines) are skipped.

        root_folder: path to the IEMOCAP dataset
        returns: a dictionary with the filename as key and the transcription as value.
                 If a filename occurs more than once, the last occurrence read wins.
    """
    transc_pattern = os.path.join(root_folder, "**", "transcriptions", "*.txt")
    transc_files = glob.glob(transc_pattern, recursive=True)

    transcriptions = {}
    # iterate over the transcription files and get the transcriptions
    for filepath in transc_files:
        with open(filepath) as f:
            for line in f:
                header, separator, text = line.strip().partition(':')
                # no ':' means there is nothing to split into name / text;
                # such lines previously raised ValueError on unpacking
                if not separator:
                    continue
                transcriptions[header.split(' ')[0]] = text.strip()

    return transcriptions

Function to get the wav file paths

In [6]:
def get_filepaths(root_folder:str):
    """
        Locate the wav files of the IEMOCAP dataset.

        Searches recursively for ``*.wav`` files that sit somewhere below a
        ``sentences/wav`` directory under ``root_folder``.

        root_folder: path to the IEMOCAP dataset
        returns: a dictionary with the filename (basename up to its first dot)
                 as key and the filepath as value
    """
    pattern = os.path.join(root_folder, "**", "sentences", "wav", "**", "*.wav")

    paths_by_name = {}
    for wav_path in glob.glob(pattern, recursive=True):
        # key = everything in the basename before the first '.'
        stem, _, _ = os.path.basename(wav_path).partition('.')
        paths_by_name[stem] = wav_path

    return paths_by_name


Function to merge the three dictionaries (file paths, emotions and transcriptions) on the filename, which serves as the key of the merged dictionary

In [7]:
def merge_dicts(files:dict, emotions:dict, transcriptions:dict):
    """
        Merge the dictionaries of the files, emotions and transcriptions.

        Each key of the result maps to a three-element list whose positions are
        fixed: ``[filepath, emotion, transcription]``. A position is ``None``
        when the key is absent from the corresponding input dictionary, so a
        missing entry can never shift another value into the wrong slot.

        files: dictionary of the files
        emotions: dictionary of the emotions
        transcriptions: dictionary of the transcriptions
        returns: a defaultdict(list) with the filename as key and the list
                 [filepath, emotion, transcription] as value. Keys appear in
                 first-seen order: files, then emotions, then transcriptions.
    """
    sources = (files, emotions, transcriptions)
    merged = defaultdict(list)
    # iterate over the dictionaries and merge them based on the filename
    for position, source in enumerate(sources):
        for key, value in source.items():
            # setdefault creates the padded slot list only on first sight of key
            slots = merged.setdefault(key, [None] * len(sources))
            slots[position] = value
    return merged

Create the dataframe that will be used in the project to look up file locations, transcriptions, emotions, etc.

In [8]:
def build_dataframe(merge_dicts:dict):
    """
        Build the IEMOCAP dataframe from the merged dictionary.

        merge_dicts: dictionary keyed by filename whose values are lists laid
                     out as [filepath, emotion, transcription]
        returns: a dataframe with the columns filename, filepath, emotion,
                 transcription, dataset, emotion_label, gender, method and
                 session. Rows whose emotion is null are dropped.
    """
    # Create dataframe from the argument and rename columns. The previous
    # version read the notebook-level variable `final` here, so the function
    # ignored its parameter and failed when no such global existed.
    df = pd.DataFrame.from_dict(merge_dicts, orient='index').reset_index()
    df.columns = ['filename','filepath', 'emotion', 'transcription']
    df['dataset'] = 'IEMOCAP'

    # keep only rows where emotion is not null; copy so the column assignments
    # below operate on an independent frame rather than on a filtered slice
    df = df[df['emotion'].notnull()].copy()

    # map the short emotion code to a readable emotion label
    mapping_val = {
                    'hap': 'happy',
                    'ang': 'angry',
                    'sad': 'sad',
                    'fru': 'frustrated',
                    'neu': 'neutral',
                    'exc': 'excited',
                    'fea': 'fearful',
                    'sur': 'surprised',
                    'dis': 'disgusted', 
                    'cal': 'calm',
                    'oth': 'other',
                    'xxx': 'xxx'
                }

    # Create a new column with the emotion label (codes not in the mapping become NaN)
    df['emotion_label'] = df['emotion'].map(mapping_val)

    # Gender is the character at index 5 of the filename (its sixth character).
    # NOTE(review): this is the letter attached to the session prefix
    # (e.g. 'Ses04M'), not the letter in the trailing utterance id
    # (e.g. '_F015') - confirm which of the two the project needs.
    df['gender'] = df['filename'].str[5]

    # Extract the method from filename.
    # NOTE(review): fixed six-character slice; a method token shorter than six
    # characters would pick up the characters that follow it - confirm.
    df['method'] = df['filename'].str[7:13]

    # Extract the session number from filename
    df['session'] = df['filename'].str[3:5]

    # Convert session to integer
    df['session'] = df['session'].astype(int)

    return df

Get the information necessary to build the dataframe and print a sample to see the results

In [20]:
# Build the three lookups from the dataset folder
files = get_filepaths(folder_path)
transcriptions = get_transcriptions(folder_path)
emotions = get_emotions(folder_path)

# Spot-check one utterance: its wav path, its transcription, then its emotion code
for lookup in (files, transcriptions, emotions):
    print(lookup['Ses01M_script01_1_F000'])

Data/IEMOCAP/Session1/sentences/wav/Ses01M_script01_1/Ses01M_script01_1_F000.wav
What he's going to say?
fru


In [21]:
# merge dictionaries
# Combine the file-path, emotion and transcription lookups into one mapping
# keyed by filename.
final = merge_dicts(files, emotions, transcriptions)

# Turn the merged mapping into the dataframe used in the rest of the notebook.
df = build_dataframe(final)

In [10]:
# Show the resulting dataframe as the cell output
df

Unnamed: 0,filename,filepath,emotion,transcription,dataset,emotion_label,gender,method,session
0,Ses04M_script02_2_M042,Data/IEMOCAP/Session4/sentences/wav/Ses04M_scr...,xxx,Or not.,IEMOCAP,xxx,M,script,4
1,Ses04M_script02_2_M040,Data/IEMOCAP/Session4/sentences/wav/Ses04M_scr...,hap,I don't know. It seemed like a good spot to m...,IEMOCAP,happy,M,script,4
2,Ses04M_script02_2_M041,Data/IEMOCAP/Session4/sentences/wav/Ses04M_scr...,hap,"Shh. If we're very quiet, the fish might come.",IEMOCAP,happy,M,script,4
3,Ses04M_script02_2_F015,Data/IEMOCAP/Session4/sentences/wav/Ses04M_scr...,xxx,Do you remember the first time we came to see it?,IEMOCAP,xxx,M,script,4
4,Ses04M_script02_2_F001,Data/IEMOCAP/Session4/sentences/wav/Ses04M_scr...,xxx,About what?,IEMOCAP,xxx,M,script,4
...,...,...,...,...,...,...,...,...,...
10034,Ses01F_script02_2_M010,Data/IEMOCAP/Session1/sentences/wav/Ses01F_scr...,fru,We've missed them twice.,IEMOCAP,frustrated,F,script,1
10035,Ses01F_script02_2_M004,Data/IEMOCAP/Session1/sentences/wav/Ses01F_scr...,fru,"Not just me, look at all of these people.",IEMOCAP,frustrated,F,script,1
10036,Ses01F_script02_2_F019,Data/IEMOCAP/Session1/sentences/wav/Ses01F_scr...,hap,"You whispered the sweetest, most intimate thin...",IEMOCAP,happy,F,script,1
10037,Ses01F_script02_2_F031,Data/IEMOCAP/Session1/sentences/wav/Ses01F_scr...,xxx,Sure. This is standing on the beach waiting. ...,IEMOCAP,xxx,F,script,1


## IEMOCAP Testing

To ensure that the data is in good shape, we run several unit tests to verify the results.

In [31]:
import unittest
import sys
from Test.TestIEMOCAP import TestIEMOCAP

In [32]:
# Hand the notebook objects to the test case as class attributes
fixtures = {
    'df': df,
    'folder': folder_path,
    'files': files,
    'emotions': emotions,
    'transcriptions': transcriptions,
}
for attribute, value in fixtures.items():
    setattr(TestIEMOCAP, attribute, value)

# Instantiate the test case
test_class = TestIEMOCAP()

In [34]:
# Load every test of TestIEMOCAP and run them, reporting verbosely on stderr
loader = unittest.TestLoader()
suite = loader.loadTestsFromTestCase(TestIEMOCAP)
runner = unittest.TextTestRunner(stream=sys.stderr, verbosity=4)
runner.run(suite)

test_columns_datatypes (Test.Data.TestIEMOCAP.TestIEMOCAP.test_columns_datatypes) ... ok
test_df_shape (Test.Data.TestIEMOCAP.TestIEMOCAP.test_df_shape) ... ok
test_emotion_length (Test.Data.TestIEMOCAP.TestIEMOCAP.test_emotion_length) ... ok
test_folder_path (Test.Data.TestIEMOCAP.TestIEMOCAP.test_folder_path) ... ok
test_gender_length (Test.Data.TestIEMOCAP.TestIEMOCAP.test_gender_length) ... ok
test_is_dataframe (Test.Data.TestIEMOCAP.TestIEMOCAP.test_is_dataframe) ... ok
test_length (Test.Data.TestIEMOCAP.TestIEMOCAP.test_length) ... ok
test_method_length (Test.Data.TestIEMOCAP.TestIEMOCAP.test_method_length) ... ok
test_null_values (Test.Data.TestIEMOCAP.TestIEMOCAP.test_null_values) ... ok
test_session_length (Test.Data.TestIEMOCAP.TestIEMOCAP.test_session_length) ... ok

----------------------------------------------------------------------
Ran 10 tests in 0.045s

OK


<unittest.runner.TextTestResult run=10 errors=0 failures=0>

### Export to CSV

The data passed all the unit tests, so we can export to create the IEMOCAP_DF

In [35]:
# Target file: IEMOCAP_DF.csv inside the dataset folder
export_filename = 'IEMOCAP_DF.csv'
export_path = os.path.join(folder_path, export_filename)

# Write the dataframe as CSV, leaving out the row index
df.to_csv(path_or_buf=export_path, index=False)