# Libraries

In [1]:
import os
import subprocess
import warnings
from os import path

import pandas as pd

In [2]:
# Folder path where the MELD data is stored.
# Keep the trailing slash: later cells build file paths by plain string concatenation with this value.
folder_path = r'Data/MELD/'

## Get the MELD dataframes and analyse the values

Below we will load the csv files that we have for MELD into DataFrames, analyse the values, and merge the datasets into a single dataset.

In [10]:
# Build the path of each split's annotation csv (order: train, dev, test)
train_path, dev_path, test_path = (
    path.join(folder_path, relative_csv)
    for relative_csv in (
        'csv/train_sent_emo.csv',
        'csv/dev_sent_emo.csv',
        'csv/test_sent_emo.csv',
    )
)

# Load every csv file into its own pandas dataframe
train_df, dev_df, test_df = (
    pd.read_csv(csv_file, encoding='utf-8')
    for csv_file in (train_path, dev_path, test_path)
)

In [20]:
# Report how many rows (utterances) each split contains
print('Train: ', train_df.shape[0])
print('Dev: ', dev_df.shape[0])
print('Test: ', test_df.shape[0])

Train:  9988
Dev:  1108
Test:  2610


In [21]:
def concat_dfs(train_df:pd.DataFrame, dev_df:pd.DataFrame, test_df:pd.DataFrame):
    """
        Merge the three MELD splits into one dataframe.

        Every input dataframe is tagged in place with a 'Data' column holding the
        name of its split ('train', 'dev' or 'test'), so the origin of each row
        stays identifiable after the merge.

        train_df: train dataframe
        dev_df: dev dataframe
        test_df: test dataframe
        return: one dataframe with the rows of all three splits (train, dev, test order) and a fresh 0..n-1 index
    """
    splits = (('train', train_df), ('dev', dev_df), ('test', test_df))

    # Label each frame with the split it came from (this modifies the inputs)
    for split_name, split_frame in splits:
        split_frame['Data'] = split_name

    # Stack the frames and renumber the rows
    return pd.concat([split_frame for _, split_frame in splits], ignore_index=True)

In [22]:
# Concatenate the train, dev and test dataframes; concat_dfs also tags every row with its split in the 'Data' column
df = concat_dfs(train_df, dev_df, test_df)

In [23]:
# Display the merged dataframe
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Data
0,1,also I was the point person on my company's tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731",train
1,2,You must've had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442",train
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389",train
3,4,So let's talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572",train
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917",train
...,...,...,...,...,...,...,...,...,...,...,...,...
13701,2760,"Yeah, I mean, come on Ross, no one will even n...",Rachel,neutral,neutral,279,11,6,4,"00:14:35,457","00:14:40,211",test
13702,2761,They're not listening too me?,Ross,surprise,negative,279,12,6,4,"00:14:42,256","00:14:43,840",test
13703,2762,Of course they're listening to you! Everybody ...,Rachel,neutral,neutral,279,13,6,4,"00:14:44,008","00:14:48,511",test
13704,2763,Monica you really think I should try this phas...,Ross,neutral,neutral,279,14,6,4,"00:14:48,138","00:14:52,390",test


## Add essential columns to the dataframe

Our dataframe is currently missing the dataset name, filename, and filepath columns, which are added by the function below. Furthermore, our dataset contains duplicate filenames, so we will rename the files to include the type of dataset (train, dev or test).

In [24]:
def process_df(df:pd.DataFrame, folder:str=None):
    """
        Add the dataset name, filename and video filepath columns to the dataframe.
        The columns are added in place and the same dataframe object is returned.

        df: dataframe with the columns 'Data', 'Dialogue_ID' and 'Utterance_ID'
        folder: root folder of the MELD data, ending with a path separator because
                the filepath is built by string concatenation. When omitted, the
                notebook-level folder_path is used
        return: dataframe with the added columns 'dataset', 'filename' and 'filepath'
    """
    # Resolve the root folder at call time; the notebook global is only read when no folder is passed
    base_folder = folder_path if folder is None else folder

    # Dataset name
    df['dataset'] = 'MELD'

    # filename: <split>_dia<Dialogue_ID>_utt<Utterance_ID>
    df['filename'] = df['Data'].astype(str) + '_dia' + df['Dialogue_ID'].astype(str) + '_utt' + df['Utterance_ID'].astype(str)

    # filepath: <base_folder><split>/mp4/<filename>.mp4
    df['filepath'] = base_folder + df['Data'] + '/mp4/' + df['filename'] + '.mp4'

    return df

In [25]:
# Add the 'dataset', 'filename' and 'filepath' columns
df = process_df(df)

In [26]:
# Count the utterances per emotion in every split; the 'All' margins hold the row and column totals
df.pivot_table(
    values=['Sentiment'],
    index=['Emotion'],
    columns=['Data'],
    aggfunc='count',
    margins=True,
)

Unnamed: 0_level_0,Sentiment,Sentiment,Sentiment,Sentiment
Data,dev,test,train,All
Emotion,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
anger,153,345,1109,1607
disgust,22,68,271,361
fear,40,50,268,358
joy,163,402,1743,2308
neutral,469,1256,4709,6434
sadness,111,208,683,1002
surprise,150,281,1205,1636
All,1108,2610,9988,13706


In [27]:
def rename_files(df:pd.DataFrame, folder:str=None):
    """
        The dataset contains duplicate filenames. This function renames the video files on disk
        to include dev, test or train in the filename:
            <folder><split>/mp4/dia<Dialogue_ID>_utt<Utterance_ID>.mp4
         -> <folder><split>/mp4/<split>_dia<Dialogue_ID>_utt<Utterance_ID>.mp4

        The source name is derived from the IDs rather than from the 'filename' and 'filepath'
        columns, because those columns already hold the prefixed (target) name.
        NOTE(review): assumes the un-renamed videos are called dia<Dialogue_ID>_utt<Utterance_ID>.mp4 - confirm against the data folder.

        The function can be run repeatedly: files that already carry the prefixed name are skipped.
        Files that exist under neither name are skipped and reported in a single warning.

        df: dataframe with the columns 'Data', 'Dialogue_ID' and 'Utterance_ID'
        folder: root folder of the MELD data, ending with a path separator.
                When omitted, the notebook-level folder_path is used
    """
    # Resolve the root folder at call time; the notebook global is only read when no folder is passed
    base_folder = folder_path if folder is None else folder

    # One rename attempt per distinct file, even if the dataframe lists an utterance more than once
    unique_files = df[['Data', 'Dialogue_ID', 'Utterance_ID']].drop_duplicates()

    missing_files = []
    for split, dialogue_id, utterance_id in unique_files.itertuples(index=False, name=None):
        original_name = 'dia' + str(dialogue_id) + '_utt' + str(utterance_id) + '.mp4'
        video_folder = base_folder + split + '/mp4/'
        old_path = video_folder + original_name
        new_path = video_folder + split + '_' + original_name

        # Already renamed by an earlier run
        if os.path.exists(new_path):
            continue

        if os.path.exists(old_path):
            os.rename(old_path, new_path)
        else:
            missing_files.append(old_path)

    # Report the missing videos once instead of once per file
    if missing_files:
        warnings.warn(
            str(len(missing_files)) + ' video file(s) listed in the dataframe were not found, e.g. ' + missing_files[0],
            stacklevel=2,
        )

In [28]:
# Rename the video files on disk; the function returns nothing, so df is not reassigned
rename_files(df)

In [29]:
# Display the dataframe with the added 'dataset', 'filename' and 'filepath' columns
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Data,dataset,filename,filepath
0,1,also I was the point person on my company's tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731",train,MELD,train_dia0_utt0,Data/MELD/train/mp4/train_dia0_utt0.mp4
1,2,You must've had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442",train,MELD,train_dia0_utt1,Data/MELD/train/mp4/train_dia0_utt1.mp4
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389",train,MELD,train_dia0_utt2,Data/MELD/train/mp4/train_dia0_utt2.mp4
3,4,So let's talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572",train,MELD,train_dia0_utt3,Data/MELD/train/mp4/train_dia0_utt3.mp4
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917",train,MELD,train_dia0_utt4,Data/MELD/train/mp4/train_dia0_utt4.mp4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13701,2760,"Yeah, I mean, come on Ross, no one will even n...",Rachel,neutral,neutral,279,11,6,4,"00:14:35,457","00:14:40,211",test,MELD,test_dia279_utt11,Data/MELD/test/mp4/test_dia279_utt11.mp4
13702,2761,They're not listening too me?,Ross,surprise,negative,279,12,6,4,"00:14:42,256","00:14:43,840",test,MELD,test_dia279_utt12,Data/MELD/test/mp4/test_dia279_utt12.mp4
13703,2762,Of course they're listening to you! Everybody ...,Rachel,neutral,neutral,279,13,6,4,"00:14:44,008","00:14:48,511",test,MELD,test_dia279_utt13,Data/MELD/test/mp4/test_dia279_utt13.mp4
13704,2763,Monica you really think I should try this phas...,Ross,neutral,neutral,279,14,6,4,"00:14:48,138","00:14:52,390",test,MELD,test_dia279_utt14,Data/MELD/test/mp4/test_dia279_utt14.mp4


## Convert mp4 files to wav using ffmpeg and write the wav filepath to the dataframe

The MELD dataset only contains video files; therefore, the mp4 files must be converted to wav files before they can be used in this project.

In [26]:
def convert_mp4_to_wav(df:pd.DataFrame, folder:str=None):
    """
        Convert the videos to audio files using ffmpeg and write the audio filepaths to the dataframe.

        For every row the video at df['filepath'] is converted to
            <folder><split>/wav/<filename>.wav
        which is the same path that is stored in the new 'wav_filepath' column.
        The column is added in place and the same dataframe object is returned.

        Audio files that already exist are not converted again, so the function can be re-run cheaply.
        Conversions for which ffmpeg returns a non-zero exit code are reported in a single warning.

        df: dataframe with the columns 'Data', 'filename' and 'filepath'
        folder: root folder of the MELD data, ending with a path separator.
                When omitted, the notebook-level folder_path is used
        return: dataframe with the added column 'wav_filepath'
        raises: FileNotFoundError when a conversion is needed and the ffmpeg executable cannot be found
    """
    # Resolve the root folder at call time; the notebook global is only read when no folder is passed
    base_folder = folder_path if folder is None else folder

    # Audio path per row; used both as the ffmpeg output and as the dataframe column,
    # so the two can never disagree
    wav_paths = base_folder + df['Data'] + '/wav/' + df['filename'] + '.wav'

    failed_videos = []
    for video_path, audio_path in zip(df['filepath'].tolist(), wav_paths.tolist()):
        # Without -y/-n ffmpeg stops to ask before overwriting, so existing audio is skipped up front
        if os.path.exists(audio_path):
            continue

        # ffmpeg does not create a missing output folder
        os.makedirs(os.path.dirname(audio_path), exist_ok=True)

        # An argument list (no shell) passes paths with spaces or shell characters through unchanged
        command = ['ffmpeg', '-i', video_path, '-ab', '160k', '-ac', '2', '-ar', '44100', '-vn', audio_path]
        if subprocess.run(command).returncode != 0:
            failed_videos.append(video_path)

    # Report the failed conversions once instead of once per file
    if failed_videos:
        warnings.warn(
            'ffmpeg failed for ' + str(len(failed_videos)) + ' video file(s), e.g. ' + failed_videos[0],
            stacklevel=2,
        )

    # Write the filepath of the audio files to the dataframe
    df['wav_filepath'] = wav_paths

    return df


In [27]:
# Convert the videos to audio files using the function convert_mp4_to_wav,
# which also adds the 'wav_filepath' column to the dataframe
df = convert_mp4_to_wav(df)

In [31]:
# Display the dataframe with the added 'wav_filepath' column
df

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime,Data,dataset,filename,filepath,wav_filepath
0,1,also I was the point person on my company's tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731",train,MELD,train_dia0_utt0,Data/MELD/train/mp4/train_dia0_utt0.mp4,Data/MELD/train/wav/train_dia0_utt0.wav
1,2,You must've had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442",train,MELD,train_dia0_utt1,Data/MELD/train/mp4/train_dia0_utt1.mp4,Data/MELD/train/wav/train_dia0_utt1.wav
2,3,That I did. That I did.,Chandler,neutral,neutral,0,2,8,21,"00:16:23,442","00:16:26,389",train,MELD,train_dia0_utt2,Data/MELD/train/mp4/train_dia0_utt2.mp4,Data/MELD/train/wav/train_dia0_utt2.wav
3,4,So let's talk a little bit about your duties.,The Interviewer,neutral,neutral,0,3,8,21,"00:16:26,820","00:16:29,572",train,MELD,train_dia0_utt3,Data/MELD/train/mp4/train_dia0_utt3.mp4,Data/MELD/train/wav/train_dia0_utt3.wav
4,5,My duties? All right.,Chandler,surprise,positive,0,4,8,21,"00:16:34,452","00:16:40,917",train,MELD,train_dia0_utt4,Data/MELD/train/mp4/train_dia0_utt4.mp4,Data/MELD/train/wav/train_dia0_utt4.wav
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13701,2760,"Yeah, I mean, come on Ross, no one will even n...",Rachel,neutral,neutral,279,11,6,4,"00:14:35,457","00:14:40,211",test,MELD,test_dia279_utt11,Data/MELD/test/mp4/test_dia279_utt11.mp4,Data/MELD/test/wav/test_dia279_utt11.wav
13702,2761,They're not listening too me?,Ross,surprise,negative,279,12,6,4,"00:14:42,256","00:14:43,840",test,MELD,test_dia279_utt12,Data/MELD/test/mp4/test_dia279_utt12.mp4,Data/MELD/test/wav/test_dia279_utt12.wav
13703,2762,Of course they're listening to you! Everybody ...,Rachel,neutral,neutral,279,13,6,4,"00:14:44,008","00:14:48,511",test,MELD,test_dia279_utt13,Data/MELD/test/mp4/test_dia279_utt13.mp4,Data/MELD/test/wav/test_dia279_utt13.wav
13704,2763,Monica you really think I should try this phas...,Ross,neutral,neutral,279,14,6,4,"00:14:48,138","00:14:52,390",test,MELD,test_dia279_utt14,Data/MELD/test/mp4/test_dia279_utt14.mp4,Data/MELD/test/wav/test_dia279_utt14.wav


## Test MELD Dataframe result

To ensure that the data is in good shape and that nothing has been missed, we will execute a series of unit tests to validate the results.

In [None]:
import unittest
import sys
from Test.TestMELD import TestMELD

In [31]:
# Pass the variables to the test class
# They are set as class attributes (presumably read by the test methods - verify against Test.TestMELD)
TestMELD.train_df = train_df
TestMELD.dev_df = dev_df
TestMELD.test_df = test_df
TestMELD.df = df
TestMELD.folder = folder_path

# Create an instance of the test class
# NOTE(review): test_class is not used by the cells shown here; the test run loads the tests from the TestMELD class itself - confirm it is still needed
test_class = TestMELD()

In [32]:
# Run the tests
# Collect the tests of the TestMELD class into a suite
suite = unittest.TestLoader().loadTestsFromTestCase(TestMELD)
# Run the suite with verbose reporting written to stderr; the returned result object is shown as the cell output
unittest.TextTestRunner(verbosity=4,stream=sys.stderr).run(suite)

test_dev_df_columns (Test.Data.TestMELD.TestMELD.test_dev_df_columns) ... ok
test_dev_df_not_empty (Test.Data.TestMELD.TestMELD.test_dev_df_not_empty) ... ok
test_dev_df_rows (Test.Data.TestMELD.TestMELD.test_dev_df_rows) ... ok
test_df_data (Test.Data.TestMELD.TestMELD.test_df_data) ... ok
test_df_datatype (Test.Data.TestMELD.TestMELD.test_df_datatype) ... ok
test_df_emotions (Test.Data.TestMELD.TestMELD.test_df_emotions) ... ok
test_df_filename_duplicate (Test.Data.TestMELD.TestMELD.test_df_filename_duplicate) ... ok
test_df_filename_unique (Test.Data.TestMELD.TestMELD.test_df_filename_unique) ... ok
test_df_sentiments (Test.Data.TestMELD.TestMELD.test_df_sentiments) ... ok
test_df_wav_rows (Test.Data.TestMELD.TestMELD.test_df_wav_rows) ... ok
test_folder_path (Test.Data.TestMELD.TestMELD.test_folder_path) ... ok
test_test_df_columns (Test.Data.TestMELD.TestMELD.test_test_df_columns) ... ok
test_test_df_not_empty (Test.Data.TestMELD.TestMELD.test_test_df_not_empty) ... ok
test_test_d

<unittest.runner.TextTestResult run=17 errors=0 failures=0>

## Export to CSV

With all tests passing, we can export to CSV with confidence that all the data is as expected.

In [14]:
# The CSV file is written into the MELD data folder
export_filename = 'MELD_DF.csv'
export_path = path.join(folder_path, export_filename)

# Export to CSV
# index=False leaves the dataframe's row index out of the file
df.to_csv(export_path, index=False)