# Data preparation
This notebook is used to analyze the used dataset and transform it to a format usable in the models.

The Beethoven Piano Sonata with Function Harmony (BPS-FH) dataset will be used. It is available here: https://github.com/Tsung-Ping/functional-harmony/tree/master.

## Imports

In [1]:
import pandas as pd
import numpy as np

## Data analysis

In [30]:
df_notes = pd.read_csv('data/functional-harmony/BPS_FH_Dataset/1/notes.csv', header=None)
df_notes

Unnamed: 0,0,1,2,3,4,5
0,-1.0,60,60,1.0,0,0
1,0.0,65,63,1.0,0,1
2,1.0,68,65,1.0,0,1
3,2.0,72,67,1.0,0,1
4,3.0,77,70,1.0,0,1
...,...,...,...,...,...,...
2184,796.0,53,56,1.0,1,152
2185,796.0,65,63,1.0,0,152
2186,796.0,68,65,1.0,0,152
2187,796.0,72,67,1.0,0,152


In [31]:
df_chords = pd.read_excel('data/functional-harmony/BPS_FH_Dataset/1/chords.xlsx', header=None)
df_chords

Unnamed: 0,0,1,2,3,4,5,6
0,-1,8,f,1,m,0,i
1,8,16,f,5,D7,1,V65
2,16,20,f,1,m,0,i
3,20,24,f,5,D7,2,V43
4,24,26,f,1,m,1,i6
...,...,...,...,...,...,...,...
244,790,791,f,1,m,0,i
245,791,792,f,6,M,0,VI
246,792,794,f,2,h7,1,ii=65
247,794,796,f,5,D7,0,V7


The dataset's columns should be given descriptive names. The information about the columns is written on the dataset's Github README.md.

In [32]:
df_notes = df_notes.rename(columns={
    0: 'onset_time',
    1: 'midi_note',
    2: 'morphetic_pitch_number',
    3: 'duration',
    4: 'staff_number',
    5: 'measure'
})
df_notes

Unnamed: 0,onset_time,midi_note,morphetic_pitch_number,duration,staff_number,measure
0,-1.0,60,60,1.0,0,0
1,0.0,65,63,1.0,0,1
2,1.0,68,65,1.0,0,1
3,2.0,72,67,1.0,0,1
4,3.0,77,70,1.0,0,1
...,...,...,...,...,...,...
2184,796.0,53,56,1.0,1,152
2185,796.0,65,63,1.0,0,152
2186,796.0,68,65,1.0,0,152
2187,796.0,72,67,1.0,0,152


In [33]:
df_chords = df_chords.rename(columns={
    0: 'onset_time',
    1: 'offset_time',
    2: 'key',
    3: 'degree',
    4: 'quality',
    5: 'inversion',
    6: 'roman_numeral_notation'
})
df_chords

Unnamed: 0,onset_time,offset_time,key,degree,quality,inversion,roman_numeral_notation
0,-1,8,f,1,m,0,i
1,8,16,f,5,D7,1,V65
2,16,20,f,1,m,0,i
3,20,24,f,5,D7,2,V43
4,24,26,f,1,m,1,i6
...,...,...,...,...,...,...,...
244,790,791,f,1,m,0,i
245,791,792,f,6,M,0,VI
246,792,794,f,2,h7,1,ii=65
247,794,796,f,5,D7,0,V7


In [40]:
# Sort the notes in the dataset first by their onset time (this is already sorted in the inital dataset) ascending, and then by the note pitch descending.
df_notes = df_notes.sort_values(by=['onset_time', 'midi_note'], ascending=[True, False])

monophonic_melody = []
current_onset = None

for _, row in df_notes.iterrows():
    onset_time = row['onset_time']
    midi_note = row['midi_note']
    
    # Since the dataset is sorted, if there are multiple notes played at the same time, we only keep the first one (the one with the highest pitch).
    if onset_time != current_onset:
        monophonic_melody.append(row)
        current_onset = onset_time

df_notes_clean = pd.DataFrame(monophonic_melody).reset_index(drop=True)
df_notes_clean

Unnamed: 0,onset_time,midi_note,morphetic_pitch_number,duration,staff_number,measure
0,-1.0,60.0,60.0,1.0,0.0,0.0
1,0.0,65.0,63.0,1.0,0.0,1.0
2,1.0,68.0,65.0,1.0,0.0,1.0
3,2.0,72.0,67.0,1.0,0.0,1.0
4,3.0,77.0,70.0,1.0,0.0,1.0
...,...,...,...,...,...,...
1278,790.0,72.0,67.0,1.0,0.0,150.0
1279,791.0,77.0,70.0,1.0,0.0,150.0
1280,792.0,77.0,70.0,1.0,0.0,151.0
1281,794.0,76.0,69.0,1.0,0.0,151.0


## Data processing