# Libraries

In [1]:
import os
import pandas as pd
import glob

In [2]:
# Folder path where the IEMOCAP is stored
folder_path = r'Data/MELD/'

# Get the MELD dataframes and analyse the values

In [14]:
from os import path

train_path = path.join(folder_path, 'csv/train_sent_emo.csv')
dev_path = path.join(folder_path, 'csv/dev_sent_emo.csv')
test_path = path.join(folder_path, 'csv/test_sent_emo.csv')

# read the csv files
train_df = pd.read_csv(train_path, encoding='utf-8')
dev_df = pd.read_csv(dev_path, encoding='utf-8')
test_df = pd.read_csv(test_path, encoding='utf-8')

In [15]:
# count rows in each dataset
print('Train: ', len(train_df))
print('Dev: ', len(dev_df))
print('Test: ', len(test_df))

Train:  9988
Dev:  1108
Test:  2610


In [16]:
def concat_dfs(train_df:pd.DataFrame, dev_df:pd.DataFrame, test_df:pd.DataFrame):
    """
        Concatenate the train, dev and test dataframes
        train_df: train dataframe
        dev_df: dev dataframe
        test_df: test dataframe
        return: concatenated dataframe
    """
    # Add a new column to differentiate between train, dev and test
    train_df['Data'] = 'train'
    dev_df['Data'] = 'dev'
    test_df['Data'] = 'test'

    # Concatenate the dataframes
    df = pd.concat([train_df, dev_df, test_df], ignore_index=True)

    return df

In [17]:
def process_df(df:pd.DataFrame):
    """
        Add the filepath to the video files to the dataframe
        df: dataframe
        return: dataframe with the filepath
    """
    # Dataset name
    df['dataset'] = 'MELD'

    # create a new column that contains the dialogue id and utterance id
    df['filename'] = df['Data'].astype(str) + '_dia' + df['Dialogue_ID'].astype(str) + '_utt' + df['Utterance_ID'].astype(str)

    # Add the filepath to the video files to the dataframe using df['Data'] + '/' + df['filename'] + '.mp4'
    df['filepath'] = folder_path + df['Data'] + '/mp4/' + df['filename'] + '.mp4'

    return df

In [18]:
# Concatenate the dataframes and add the filepath to the video files
df = concat_dfs(train_df, dev_df, test_df)
df = process_df(df)

In [19]:
# Print the number of emotions in each dataset
pd.pivot_table(df, index=['Emotion'], columns=['Data'], values=['Sentiment'], aggfunc='count', margins=True)

Unnamed: 0_level_0,Sentiment,Sentiment,Sentiment,Sentiment
Data,dev,test,train,All
Emotion,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
anger,153,345,1109,1607
disgust,22,68,271,361
fear,40,50,268,358
joy,163,402,1743,2308
neutral,469,1256,4709,6434
sadness,111,208,683,1002
surprise,150,281,1205,1636
All,1108,2610,9988,13706


In [20]:
# delete mp4 files that are not in the dataframe
def delete_mp4_files(df:pd.DataFrame):
    """
        Delete the mp4 files that are not in the dataframe
        df: dataframe
    """
    # convert the filepath column to a list
    videos_df = df['filepath'].tolist()

    # Create a list of all the video files in the folder
    all_video_files = glob.glob(folder_path + '/**/*.mp4', recursive=True)

    # Create a list of the video files that are not in the dataframe
    files_to_delete = [x for x in all_video_files if x not in videos_df]

    # Delete the video files that are not in the dataframe
    for file in files_to_delete:
        os.remove(file)

In [12]:
# delete mp4 files that are not in the dataframe
delete_mp4_files(df)

In [34]:
# rename all files in the folders to include dev, test, train in the filename
def rename_files(df:pd.DataFrame):
    """
        The dataset contain duplicate filenames. This function renames the files to include dev, test, train in the filename
        df: dataframe
    """
    # Loop through the dataframe
    for index, row in df.iterrows():
        # Check if the filename starts with train, dev or test if so, skip
        if not row['filename'].startswith('train') and not row['filename'].startswith('dev') and not row['filename'].startswith('test'):
            # Create the new filename
            new_filename = row['Data'] + '_dia' + str(row['Dialogue_ID']) + '_utt' + str(row['Utterance_ID']) + '.mp4'
            # Rename the file
            os.rename(row['filepath'], folder_path + row['Data'] + '/mp4/' + new_filename)

In [35]:
rename_files(df)

Function to get the wav file paths

In [26]:
def convert_mp4_to_wav(df:pd.DataFrame):
    """
        Convert the videos to audio files using ffmpeg
        df: dataframe
    """
    # Convert the videos to audio files using ffmpeg
    video_list = df['filepath'].tolist()

    # Create a list of the audio files
    audio_list = [x.replace('mp4', 'wav') for x in video_list]

    # Create a list of the commands to be executed
    commands = ['ffmpeg -i ' + video_list[i] + ' -ab 160k -ac 2 -ar 44100 -vn ' + audio_list[i] for i in range(len(video_list))]

    # Execute the commands
    for command in commands:
        os.system(command)
        # break

    # Write the filepath of the audio files to the dataframe
    df['wav_filepath'] = folder_path + df['Data'] + '/wav/' + df['filename'] + '.wav'

    return df


In [27]:
# Convert the videos to audio files using the function convert_mp4_to_wav
# df = convert_mp4_to_wav(df)

In [29]:
# Show duplicate filenames in the dataframe
df[df.duplicated(['filename'], keep=False)].sort_values(by=['filename'])

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Data,dataset,filename,filepath


## Test MELD Dataframe result


In [30]:
import unittest
import sys
from Test.Data.TestMELD import TestMELD

In [31]:
# Pass the variables to the test class
TestMELD.train_df = train_df
TestMELD.dev_df = dev_df
TestMELD.test_df = test_df
TestMELD.df = df
TestMELD.folder = folder_path

# Create an instance of the test class
test_class = TestMELD()

In [32]:
# Run the tests
suite = unittest.TestLoader().loadTestsFromTestCase(TestMELD)
unittest.TextTestRunner(verbosity=4,stream=sys.stderr).run(suite)

test_dev_df_columns (Test.Data.TestMELD.TestMELD.test_dev_df_columns) ... ok
test_dev_df_not_empty (Test.Data.TestMELD.TestMELD.test_dev_df_not_empty) ... ok
test_dev_df_rows (Test.Data.TestMELD.TestMELD.test_dev_df_rows) ... ok
test_df_data (Test.Data.TestMELD.TestMELD.test_df_data) ... ok
test_df_datatype (Test.Data.TestMELD.TestMELD.test_df_datatype) ... ok
test_df_emotions (Test.Data.TestMELD.TestMELD.test_df_emotions) ... ok
test_df_filename_duplicate (Test.Data.TestMELD.TestMELD.test_df_filename_duplicate) ... ok
test_df_filename_unique (Test.Data.TestMELD.TestMELD.test_df_filename_unique) ... ok
test_df_sentiments (Test.Data.TestMELD.TestMELD.test_df_sentiments) ... ok
test_df_wav_rows (Test.Data.TestMELD.TestMELD.test_df_wav_rows) ... ok
test_folder_path (Test.Data.TestMELD.TestMELD.test_folder_path) ... ok
test_test_df_columns (Test.Data.TestMELD.TestMELD.test_test_df_columns) ... ok
test_test_df_not_empty (Test.Data.TestMELD.TestMELD.test_test_df_not_empty) ... ok
test_test_d

<unittest.runner.TextTestResult run=17 errors=0 failures=0>

All tests have passed therefore we can export to CSV with the confidence that all the data is in good shape

In [33]:
export_filename = 'MELD_DF.csv'
export_path = path.join(folder_path, export_filename)

# Export to CSV
df.to_csv(export_path, index=False)