### The code below is used to clean the original dataset

In [None]:
import pandas as pd
import numpy as np
# Show every column and every row when a frame is displayed.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

# load dataset (semicolon-separated CSV)
path = '/Users/maed/Documents/Projects/SSClaVis/Dataset/FinalDataset/02 Dataset.csv'
df = pd.read_csv(path, sep=';')

# create unique audio id: "<ID>_<Trigger_counter>", built on a copy so that
# the frame loaded above (df) stays untouched
dataset = df.copy()
id_part = dataset['ID'].astype(str)
trigger_part = dataset['Trigger_counter'].astype(str)
audio_id = id_part + '_' + trigger_part

# place the id as the third column (position 2)
dataset.insert(2, 'file_name', audio_id)
dataset.head()

In [None]:
# keep only specific columns
to_keep = ['file_name', 'Soundscape_eventfulness', 'Soundscape_pleasantness', 'BGpleasant', 'BGchaotic', 'BGvibrant', 'BGuneventful', 'BGcalm', 'BGannoying', 
           'BGeventful', 'BGmonotonous', 'SC_Nature', 'SC_Human', 'SC_Household', 'SC_Installation', 'SC_Signals', 'SC_Traffic', 'SC_Speech', 
           'SC_Music', 'FGsource', 'Activity',  'Location8']

# Take an explicit copy: the following cells modify `dataset` in place
# (rename(inplace=True), insert, column assignment). Without the copy pandas
# treats the selection as derived from the wider frame and can emit
# SettingWithCopyWarning on those modifications.
dataset = dataset[to_keep].copy()
dataset.head()

In [None]:
# rename columns
# The two ISO coordinate columns get an "ISO_" name ...
iso_names = {
    'Soundscape_eventfulness': 'ISO_Eventfulness',
    'Soundscape_pleasantness': 'ISO_Pleasantness',
}
# ... and the "BG" prefix is dropped from the eight attribute columns.
attribute_names = {
    'BGpleasant': 'pleasant',
    'BGchaotic': 'chaotic',
    'BGvibrant': 'vibrant',
    'BGuneventful': 'uneventful',
    'BGcalm': 'calm',
    'BGannoying': 'annoying',
    'BGeventful': 'eventful',
    'BGmonotonous': 'monotonous',
}
to_rename = {**iso_names, **attribute_names}

dataset.rename(columns=to_rename, inplace=True)
dataset.head()

In [None]:
# add two constant columns right after the first column:
# the duration in seconds (15) and the file suffix ('.wav')
dataset.insert(loc=1, column='duration_s', value=15)
dataset.insert(loc=2, column='suffix', value='.wav')
dataset.head()

In [None]:
# create new value ranges of soundscape items
def range_zero_to_four(x):
    """Linearly rescale *x* so that +/-(4 + sqrt(32)) maps onto 0..4.

    An input of -(4 + sqrt(32)) yields 0, an input of 0 yields 2 and an
    input of 4 + sqrt(32) yields 4. Values outside that interval are not
    clipped.
    """
    half_range = 4 + np.sqrt(32)
    normalised = x / half_range  # -1 .. 1 for inputs inside the interval
    return 2 * (normalised + 1)

def sc_range(x):
    """Scale *x* by 0.4, round to one decimal place and return it as float.

    *x* must offer ``.round`` and ``.astype`` (e.g. a pandas Series, which is
    what ``DataFrame.apply`` passes in for each column).
    NOTE(review): the factor 0.4 presumably maps a 0-10 rating onto 0-4;
    confirm against the questionnaire scale.
    """
    scaled = x * 0.4
    return scaled.round(1).astype(float)

# rescale both ISO coordinate columns element by element
for iso_column in ('ISO_Eventfulness', 'ISO_Pleasantness'):
    dataset[iso_column] = dataset[iso_column].apply(range_zero_to_four)

# rescale the sound-category columns; DataFrame.apply hands each column
# to sc_range as a whole Series
col = ['SC_Nature', 'SC_Human', 'SC_Household', 'SC_Installation', 'SC_Signals', 'SC_Traffic', 'SC_Speech', 'SC_Music']
rescaled_categories = dataset[col].apply(sc_range)
dataset[col] = rescaled_categories

dataset.head()

In [None]:
# clean acoustic dataset
path = '/Users/maed/Documents/Projects/SSClaVis/Dataset/FinalDataset/AcousticFeatures_SingleValues.csv'
acoustic_dataset = pd.read_csv(path, sep=';')

# items to keep from acoustic dataset
columns_to_select = ['Key', 'Channel', 'LAeq_default', 'N5_default', 'FavgArith_default', 'RAavgArith', 'SavgArith_default', 'R_default', 'T_default']
acoustic_dataset = acoustic_dataset[columns_to_select]

# one row per Key holding the maximum of every feature over all rows sharing
# that Key; the Channel column is not needed afterwards
acoustic_dataset_max_values = (
    acoustic_dataset
    .groupby('Key')
    .max()
    .reset_index()
    .drop(columns=['Channel'])
)
acoustic_dataset_max_values.head()

# Disabled alternative: calculate the mean of both audio channels instead
# of the maximum
#acoustic_dataset = acoustic_dataset.groupby('Key').mean().reset_index()
# acoustic_dataset = acoustic_dataset.drop(columns=['Channel'])
# acoustic_dataset.head()

In [None]:
# merge dataset and acoustic_dataset
# Left join: every row of `dataset` is kept; acoustic features are attached
# where file_name equals Key.
merged = dataset.merge(
    acoustic_dataset_max_values,
    how='left',
    left_on='file_name',
    right_on='Key',
)
# Key only served as the join column
final_dataset = merged.drop(columns=['Key'])
final_dataset.head()


In [None]:
# get min max ranges of acoustic features
# --> used for Sliders in WebApp

col = ['LAeq_default', 'N5_default', 'FavgArith_default', 'RAavgArith', 'SavgArith_default', 'R_default', 'T_default']

# Pass the aggregations as a list, not a set: a set has no defined iteration
# order, so the 'min' and 'max' rows could come out in either order from one
# run to the next. With a list the result always has 'min' first, then 'max'.
min_max_values = final_dataset[col].agg(['min', 'max'])
min_max_values

In [110]:
# store final dataset as semicolon-separated CSV without the index column
output_path = '/Users/maed/Documents/Projects/SSClaVis/Dataset/FinalDataset/FinalDataset.csv'
final_dataset.to_csv(output_path, sep=';', index=False)