In [None]:
# Extract the features of every fingering file and export them to a single CSV
import os
import csv
import KeyPos as KP
from music21 import note

# Input/output locations.
# The previous literals contained path segments ending in '.' ('./Data./',
# 'PianoFingeringDataset_v1.2./', 'FingeringFiles./'). Windows drops such trailing
# dots during path normalisation, but on POSIX they name different (missing)
# directories. The directory that was listed and the one that was opened were also
# spelled differently. One spelling is now used for both.
data_dir = './Data'
fingering_dir = './Statistical_Learning_and_Estimation_of_Piano_Fingering/PianoFingeringDataset_v1.2/FingeringFiles'

os.makedirs(data_dir, exist_ok = True)
# os.listdir() returns entries in arbitrary order; sorting makes the row order of
# the CSV (and any positional split computed from it later) reproducible.
files = sorted(os.listdir(fingering_dir))
output_csv_path = f'{data_dir}/features.csv'
filenames = []
fieldnames = [
    'Note_number',
    'Onset_time_in_sec',
    'Offset_time_in_sec',
    'Duration',
    'Pitch',
    'KeyPos_x',
    'KeyPos_y',
    'Onset_velocity',
    'Offset_velocity',
    'Begin_hand',
    'Begin_fingering',
    'Has_fingering_substitution',
    'End_hand',
    'End_fingering'
]

with open(output_csv_path, 'w', newline='') as csv_file:
    csv_writer = csv.DictWriter(csv_file, fieldnames = fieldnames)
    csv_writer.writeheader()  # write the header row

    for file in files:
        filenames.append(file)
        path = os.path.join(fingering_dir, file)
        with open(path, 'r') as f:
            f.readline()  # the first line of each file carries no note data
            for line in f:
                # Whitespace-separated fields, as consumed below:
                # 0 note number, 1 onset, 2 offset, 3 pitch name,
                # 4 onset velocity, 5 offset velocity, 6 hand flag, 7 fingering
                temp = line.split()
                if not temp:
                    continue  # tolerate blank lines instead of raising IndexError

                pitch = note.Note(temp[3]).pitch.midi
                keypos = KP.PitchToKeyPos(pitch)

                # A fingering such as '4_1' means the finger changes while the note
                # is held: the first entry is the begin finger, the last the end finger.
                # The dataset also contains malformed values like '4_'
                # (ex. 028-4_fingering.txt); the empty tail then falls back to the
                # begin finger.
                fingerings = temp[7].split('_')
                begin_fingering = int(fingerings[0])
                if fingerings[-1] != '':
                    end_fingering = int(fingerings[-1])
                else:
                    end_fingering = begin_fingering

                features = {
                    'Note_number': temp[0],
                    'Onset_time_in_sec': temp[1],
                    'Offset_time_in_sec': temp[2],
                    'Duration': float(temp[2]) - float(temp[1]),
                    'Pitch': pitch,
                    'KeyPos_x': keypos.x,
                    'KeyPos_y': keypos.y,
                    'Onset_velocity': temp[4],
                    'Offset_velocity': temp[5],
                    # convert hand representation: left 1->0, right 0->1
                    'Begin_hand': 1 if temp[6] == '0' else 0,
                    'Begin_fingering': begin_fingering,
                    'Has_fingering_substitution': 0 if begin_fingering == end_fingering else 1,
                    # hand of the ending finger: positive finger number -> right (1), otherwise left (0)
                    'End_hand': 1 if end_fingering > 0 else 0,
                    'End_fingering': end_fingering,
                }
                # every column is stored as a float
                features = {key: float(value) for key, value in features.items()}
                csv_writer.writerow(features)

In [1]:
# Data preprocessing: every feature is scaled with Min-Max normalization

import pandas as pd
import numpy as np
import math

# Load the extracted features. The path is spelled './Data/...' rather than
# './Data./...': the trailing dot only resolves on Windows.
df = pd.read_csv('./Data/features.csv')
x_to_scale = ['Note_number', 'Onset_time_in_sec', 'Offset_time_in_sec', 'Onset_velocity', 'Offset_velocity']
y_to_scale = ['Begin_fingering', 'End_fingering']
length = len(df)
index = 0

# Split the frame into one sub-frame per piece: a new group starts at every row
# whose Note_number is 0 (assumes each source file starts numbering at 0 — TODO confirm).
# Each sub-frame keeps the row labels of `df`.
sub_dfs = [group for _, group in df.groupby((df['Note_number'] == 0).cumsum())]

In [2]:
# skip this block

from sklearn.preprocessing import MinMaxScaler

# Finger labels in ascending order (there is no label 0). Each label is mapped to
# its 1-based rank divided by 10, i.e. -5.0 -> 0.1, ..., -1.0 -> 0.5, 1.0 -> 0.6, ..., 5.0 -> 1.0.
_finger_labels = [-5.0, -4.0, -3.0, -2.0, -1.0, 1.0, 2.0, 3.0, 4.0, 5.0]
fingering_mapping = {
    label: rank / 10
    for rank, label in enumerate(_finger_labels, start = 1)
}

def column_scaling(arr, min, max):
    """Min-max scale `arr` with a fixed range: (arr - min) / (max - min).

    Parameters
    ----------
    arr : pandas Series (or any array supporting elementwise arithmetic)
        Values to scale.
    min, max : scalar
        Lower and upper bound of the range. The names shadow the builtins
        inside this function; they are kept so keyword callers keep working.

    Returns
    -------
    A new object of the same kind as `arr` holding the scaled values. The input is
    left untouched. The previous implementation wrote element by element into
    `arr.values`, which truncated results for integer columns and fails when
    the underlying array is read-only.

    When `max == min` the range is empty; zeros are returned instead of
    dividing by zero.
    """
    span = max - min
    if span == 0:
        # Degenerate range: map everything to 0 (NaN entries stay NaN).
        return (arr - min) * 0.0
    return (arr - min) / span

def get_duration(sub_df):
    """Recompute the 'Duration' column as offset time minus onset time.

    Parameters
    ----------
    sub_df : pandas DataFrame
        Must contain 'Offset_time_in_sec' and 'Onset_time_in_sec'.

    Returns
    -------
    The same DataFrame object, with 'Duration' replaced (or created).

    The whole column is assigned at once. The previous per-element write through
    `.values[i]` truncated results when 'Duration' had an integer dtype and
    fails when the underlying array is read-only.
    """
    sub_df['Duration'] = sub_df['Offset_time_in_sec'] - sub_df['Onset_time_in_sec']
    return sub_df

def Scale(sub_df, begin_index, end_index):
    """Scale the feature columns of one piece in place and return it.

    Parameters
    ----------
    sub_df : pandas DataFrame
        Rows of a single piece. Its columns are overwritten in place.
    begin_index, end_index : int
        Kept for backward compatibility; no longer read. The time range used
        to be looked up by row label as `Onset[begin_index]` and
        `Offset[end_index-1]`. Because the caller already passes the label of
        the last row, that picked the second-to-last note (and a label outside
        the piece for a one-row piece). The range is now taken from the data.

    Returns
    -------
    The same DataFrame object, with columns handled as follows:
      - 'Note_number': MinMaxScaler fitted on this piece
      - 'Onset_time_in_sec' / 'Offset_time_in_sec': scaled with the piece's
        earliest onset and latest offset
      - 'Duration': recomputed from the onset/offset columns as they are at
        that point of the column iteration
      - 'Pitch': fixed range 21..108
      - 'KeyPos_x': fixed range -23..28
      - 'Onset_velocity' / 'Offset_velocity': fixed range 0..127
      - 'Begin_fingering' / 'End_fingering': looked up in `fingering_mapping`
    Any other column is left unchanged.
    """
    if len(sub_df) == 0:
        # Nothing to scale; MinMaxScaler would raise on zero samples.
        return sub_df

    # Taken once, before any column is modified, so both time columns share one range.
    min_time = sub_df['Onset_time_in_sec'].min()
    max_time = sub_df['Offset_time_in_sec'].max()
    scaler = MinMaxScaler()
    for column in sub_df.columns:
        if column == 'Note_number':
            # fit_transform wants a 2-D input and returns (n, 1); flatten it back to a column.
            scaled = scaler.fit_transform(sub_df[column].values.reshape(-1, 1))
            sub_df[column] = scaled.ravel()
        elif column in ['Onset_time_in_sec', 'Offset_time_in_sec']:
            sub_df[column] = column_scaling(sub_df[column], min_time, max_time)
        elif column == 'Duration':
            # Columns are visited in frame order. With the column layout written by
            # this notebook the time columns come first, so Duration ends up in
            # scaled time units.
            sub_df = get_duration(sub_df)
        elif column == 'Pitch':
            sub_df[column] = column_scaling(sub_df[column], 21, 108)
        elif column == 'KeyPos_x':
            sub_df[column] = column_scaling(sub_df[column], -23, 28)
        elif column in ['Onset_velocity', 'Offset_velocity']:
            sub_df[column] = column_scaling(sub_df[column], 0, 127)
        elif column in ['Begin_fingering', 'End_fingering']:
            # Values missing from fingering_mapping become NaN (Series.map semantics).
            sub_df[column] = sub_df[column].map(fingering_mapping)
    return sub_df

# scaling

# Scale every piece. begin_index / end_index are the first and last (inclusive)
# row positions of the current piece, counted across all pieces in order.
begin_index = 0
end_index = 0
for position, sub_df in enumerate(sub_dfs):
    end_index = begin_index + len(sub_df) - 1
    # Store the result back into the list. Rebinding the loop variable alone would
    # discard it and only work as long as Scale() mutates its argument in place.
    sub_dfs[position] = Scale(sub_df, begin_index, end_index)
    begin_index = end_index + 1

In [2]:
# split hands and train-test
import csv
# Column order shared by every CSV written below
fieldnames = [
    'Note_number',
    'Onset_time_in_sec',
    'Offset_time_in_sec',
    'Duration',
    'Pitch',
    'KeyPos_x',
    'KeyPos_y',
    'Onset_velocity',
    'Offset_velocity',
    'Begin_hand',
    'Begin_fingering',
    'Has_fingering_substitution',
    'End_hand',
    'End_fingering'
]

import math

# Set train-validation-test ratio
train_ratio = 0.7
val_ratio = 0.15
# The test share is whatever remains (was `train_ratio - val_ratio`, i.e. 0.55).
test_ratio = 1 - train_ratio - val_ratio

# Calculate the number of pieces for each set. Train and validation are rounded
# down; the test set takes the remainder so that every piece is used exactly once.
total_files = len(sub_dfs)
train_files_num = math.floor(total_files * train_ratio)
val_files_num = math.floor(total_files * val_ratio)
test_files_num = total_files - train_files_num - val_files_num


def _concat_pieces(pieces):
    """Concatenate per-piece frames; an empty list gives an empty frame with the expected columns."""
    if not pieces:
        # pd.concat raises ValueError on an empty list (possible with very few pieces).
        return pd.DataFrame(columns = fieldnames)
    return pd.concat(pieces, ignore_index = True)


# Split the data into train-validation-test sets (positional split over sub_dfs)
val_end = train_files_num + val_files_num
temp_train_df = _concat_pieces(sub_dfs[:train_files_num])
temp_val_df = _concat_pieces(sub_dfs[train_files_num:val_end])
temp_test_df = _concat_pieces(sub_dfs[val_end:])

# Set output path ('./Data/...' instead of './Data./...': the trailing dot only
# resolves on Windows)
train_left_path = './Data/train_left.csv'
train_right_path = './Data/train_right.csv'
val_left_path = './Data/val_left.csv'
val_right_path = './Data/val_right.csv'
test_left_path = './Data/test_left.csv'
test_right_path = './Data/test_right.csv'

In [3]:
# write train
with open(train_left_path, 'w', newline = '') as left, open(train_right_path, 'w', newline = '') as right:
    # one writer per hand, both with the same header
    left_writer = csv.DictWriter(left, fieldnames = fieldnames)
    right_writer = csv.DictWriter(right, fieldnames = fieldnames)
    for writer in (left_writer, right_writer):
        writer.writeheader()
    # route each training row by its Begin_hand value: 0.0 -> left file, anything else -> right file
    for _, row in temp_train_df.iterrows():
        writer = left_writer if row['Begin_hand'] == 0.0 else right_writer
        writer.writerow(row.to_dict())

In [4]:
# write validation
with open(val_left_path, 'w', newline = '') as left, open(val_right_path, 'w', newline = '') as right:
    # one writer per hand, both with the same header
    left_writer = csv.DictWriter(left, fieldnames = fieldnames)
    right_writer = csv.DictWriter(right, fieldnames = fieldnames)
    for writer in (left_writer, right_writer):
        writer.writeheader()
    # route each validation row by its Begin_hand value: 0.0 -> left file, anything else -> right file
    for _, row in temp_val_df.iterrows():
        writer = left_writer if row['Begin_hand'] == 0.0 else right_writer
        writer.writerow(row.to_dict())

In [5]:
# write test
with open(test_left_path, 'w', newline = '') as left, open(test_right_path, 'w', newline = '') as right:
    # one writer per hand, both with the same header
    left_writer = csv.DictWriter(left, fieldnames = fieldnames)
    right_writer = csv.DictWriter(right, fieldnames = fieldnames)
    for writer in (left_writer, right_writer):
        writer.writeheader()
    # route each test row by its Begin_hand value: 0.0 -> left file, anything else -> right file
    for _, row in temp_test_df.iterrows():
        writer = left_writer if row['Begin_hand'] == 0.0 else right_writer
        writer.writerow(row.to_dict())