In [4]:
import pandas as pd
import os
import ast

# Lift pandas' display limits so wide columns and long frames are shown in full.
for display_option in ('display.max_colwidth', 'display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# Load the annotation table from the CSV_files directory one level above the
# current working directory.
annos_csv = os.path.join(os.getcwd(), '..', 'CSV_files', 'annotations_with_path_matches.csv')
annos = pd.read_csv(annos_csv)

In [5]:
# check whether all filenames in csv are present in audio folder
audio_paths = os.listdir(os.path.join(os.getcwd(), '..', 'audios'))
audio_names = set(audio_paths)  # set membership keeps the check linear in the number of rows

# Known mismatch: this one csv entry contains ':' where the corrected name uses '_'.
annos.loc[annos['file_path'] == 'jokerdefeatedclown:callmejokerhildurgunadottirsample2mp3.mp3', 'file_path'] = 'jokerdefeatedclown_callmejokerhildurgunadottirsample2mp3.mp3'
# Rows without a file path cannot be linked to an audio file, so they are dropped.
annos = annos.dropna(subset=['file_path'])

# Run the check AFTER the repairs so it reports what is still unresolved,
# rather than computing a list that is never looked at.
missing_files = [file for file in annos['file_path'].unique() if file not in audio_names]
if missing_files:
    print(f'WARNING: {len(missing_files)} file name(s) in csv not found in audio folder: {missing_files}')

print('One file name mismatch between csv and audio folder fixed. Errors also dropped')

One file name mismatch between csv and audio folder fixed. Errors also dropped


In [6]:
# check whether film metadata matches with file_path in csv

# One row per audio file is enough for this check; drop_duplicates already
# returns a new frame, so no prior copy is needed.
annos_unique = annos.drop_duplicates(subset=['file_path'], ignore_index=True)

# Index labels (in annos_unique) of files whose path contains none of the
# composer's name tokens.
mismatch_path_artist = []

for index, row in annos_unique[['file_path', 'Composer/Artist_meta']].iterrows():
    composer = row['Composer/Artist_meta']
    # A missing composer (NaN is a float, not a str) has no tokens and is
    # therefore flagged as a mismatch instead of raising on .lower().
    composer_tokens = composer.lower().split() if isinstance(composer, str) else []
    # Compare case-insensitively on both sides.
    file_path = row['file_path'].lower()

    # Flag the file only when no token at all occurs in its path.
    if not any(token in file_path for token in composer_tokens):
        mismatch_path_artist.append(index)

print('File names and film metadata match, only special character discrepancy (letter accents)')

File names and film metadata match, only special character discrepancy (letter accents)


In [7]:
# Columns that are not needed for the valence/arousal table.
drop_cols = [
    'participant_id', 'audio_file_path', 'num_annotations', 'all_timestamps',
    'file_name_clean', 'fuzzy_match', 'fuzzy_match_meta', '00:28:00_meta',
    'Size (MB)_meta', 'Sample Rate (kHz)_meta',
    'Stereo = 0/Mono =1_meta', 'ISRC_meta', 'Start Time Sample 1 (seconds)_meta',
    'End Time Sample 1_meta', 'Sample 1 length (seconds)_meta',
    'Sample 2 filename:_meta', 'emotion_sentence',
    'Start Time Sample 2 (seconds)_meta', 'End Time Sample 2_meta',
    'Sample 2 length (seconds)_meta', 'sample2_clean_meta', 'sample_number',
    'Score = 0, Soundtrack = 1_meta',
]

# Original column name -> short name. The key order is also the final column order.
column_names = {
    'file_path': 'file',
    'Track Title_meta': 'title',
    'Composer/Artist_meta': 'composer',
    'Film_meta': 'film',
    'Year_meta': 'year',
    'Film Genre - IMDb_meta': 'genre',
    'Director_meta': 'director',
    'all_valence_values': 'valence',
    'all_arousal_values': 'arousal',
    'familiarity_rating': 'familiarity',
}

annos = annos.drop(columns=drop_cols)
annos = annos.reindex(columns=list(column_names))
annos = annos.rename(columns=column_names)

print('Irrelevant columns dropped, relevant columns reordered and renamed')

Irrelevant columns dropped, relevant columns reordered and renamed


In [8]:
valence_index = 8
arousal_index = 9

# Turn each row into one record per (valence, arousal) pair; every other
# column is repeated unchanged on each of those records.
data = []
print(f'Annos shape before extracting individual V/A values: {annos.shape}')
for _, row in annos.iterrows():
    # Both cells hold Python literals stored as text (presumably lists), so parse them first.
    valence = ast.literal_eval(row['valence'])
    arousal = ast.literal_eval(row['arousal'])

    # Leading fields shared by every record produced from this row.
    shared = (row['file'], row['title'], row['composer'], row['film'], row['year'], row['genre'], row['director'])
    # zip stops at the shorter of the two sequences.
    data.extend(shared + (v, a, row['familiarity']) for v, a in zip(valence, arousal))
annos = pd.DataFrame(data, columns=annos.columns)
print(f'Annos shape after extracting individual V/A values: {annos.shape}')


Annos shape before extracting individual V/A values: (660, 10)
Annos shape after extracting individual V/A values: (1678, 10)


In [9]:
# Write the table to the CSV_files directory, without the index column.
arousal_valence_csv = os.path.join(os.getcwd(), '..', 'CSV_files', 'arousal_valence.csv')
annos.to_csv(arousal_valence_csv, index=False)