In [2]:
from collections import Counter, OrderedDict
import numpy as np
import functools
import json
import math
import os 

%run midi_utils.ipynb

## Additional preprocessing

In this notebook, some additional fixes are applied to improve the encoding produced by midi_utils.
For example, since Music21 often deals with fractions, the rounding and subtraction done in midi_utils caused some problems, such as values that should have been 0 coming out as 0.001, or 0.333 coming out as 0.334. These are fixed here.

In addition, start-of-piece and end-of-piece tokens are added, and rare wait values (seen fewer than 100 times in the entire dataset) are replaced with their closest often-occurring counterpart.

Finally, the resulting tokens are converted to integers. The token-to-integer mapping is saved to dictionary.json, and a .txt file (training_data_preprocessed.txt) is generated in which each line holds an input sequence of 100 integers followed by a single output integer; this file is the training data for the RNN.

<strong>Note</strong> that the generated training data file has about 2M lines (~600MB), so it is not uploaded to GitHub; to use it for your own training you have to run this entire notebook.

In [89]:
# Load the tokenised corpus: the file is read line by line, one token sequence per line
filename = './training_data.txt'
with open(filename) as corpus_file:
    corpus_lines = corpus_file.read().splitlines()

# Rebuild every line's token list (tokens are separated by single spaces) and
# wrap it in the beginning/ending marker tokens in the same step
training_data = [["<BOS>"] + line.split(" ") + ["<EOS>"] for line in corpus_lines]
    
# Flatten the per-piece token lists into one long token stream
flat_tokens = []
for piece_tokens in training_data:
    flat_tokens.extend(piece_tokens)
training_data = flat_tokens

# Rewrite every wait token so that its duration is a float rounded to three decimals.
# NOTE(review): `Fraction` is not imported in this notebook; presumably it is brought
# into scope by `%run midi_utils.ipynb` - confirm.
for position, token in enumerate(training_data):
    if not token.startswith("wait"):
        continue
    duration = round(float(Fraction(token.split(":")[1])), 3)
    training_data[position] = "wait:" + str(duration)

# Count every token so that rarely used wait durations can be told apart from common ones
token_occurrences = Counter(training_data)
occurrence_threshold = 100

# Wait durations (as floats, ascending) whose token is seen fewer than
# `occurrence_threshold` times, and those seen at least that often
need_to_be_replaced = sorted({
    float(token.split(":")[1])
    for token, count in token_occurrences.items()
    if token.startswith("wait") and count < occurrence_threshold
})
occurring_often = sorted({
    float(token.split(":")[1])
    for token, count in token_occurrences.items()
    if token.startswith("wait") and count >= occurrence_threshold
})

# Without any frequent wait value there is nothing to map the rare ones onto;
# fail with a clear message instead of min()'s "empty sequence" error
if need_to_be_replaced and not occurring_often:
    raise ValueError(
        "No wait value occurs at least " + str(occurrence_threshold)
        + " times, so rare wait values cannot be replaced"
    )

# Find closest often-occurring value to each rarely occurring value
# (on a tie, min() keeps the first candidate, i.e. the smaller duration)
replacements = [
    [str(wait_time), str(min(occurring_often, key=lambda x: abs(x - wait_time)))]
    for wait_time in need_to_be_replaced
]

# Apply all replacements in a single pass over the token stream. A replacement target
# is always an often-occurring value and therefore never itself a key of the mapping,
# so one lookup per token gives the same result as applying the pairs one after another.
wait_token_map = {"wait:" + old: "wait:" + new for old, new in replacements}
training_data = [wait_token_map.get(token, token) for token in training_data]

In [90]:
# Summarise the cleaned token stream: how many distinct tokens there are and how often
# each one occurs, listed from the rarest to the most common
token_occurrences = Counter(training_data)
sorted_token_occurrences = dict(sorted(token_occurrences.items(), key=lambda pair: pair[1]))

print(f"Number of unique tokens: {len(sorted_token_occurrences)}")

print("Occurrences:")
for token in sorted_token_occurrences:
    print(token, "\t", sorted_token_occurrences[token])

Number of unique tokens: 208
Occurrences:
note:106 	 1
stop_note:106 	 1
note:107 	 1
stop_note:107 	 1
note:21 	 3
stop_note:21 	 3
note:23 	 4
stop_note:23 	 4
note:105 	 8
stop_note:105 	 8
note:104 	 16
stop_note:104 	 16
note:22 	 17
stop_note:22 	 17
note:24 	 65
stop_note:24 	 65
note:103 	 82
stop_note:103 	 82
note:102 	 101
stop_note:102 	 101
wait:1.667 	 114
note:25 	 117
stop_note:25 	 117
note:26 	 150
stop_note:26 	 150
tempo:40 	 239
wait:4.0 	 245
wait:2.5 	 248
wait:1.75 	 257
note:27 	 273
stop_note:27 	 273
<BOS> 	 295
<EOS> 	 295
wait:1.25 	 303
note:28 	 310
stop_note:28 	 310
note:101 	 341
stop_note:101 	 341
wait:3.0 	 387
wait:0.667 	 633
note:100 	 671
stop_note:100 	 671
wait:0.417 	 686
note:98 	 702
stop_note:98 	 702
note:30 	 729
stop_note:30 	 729
note:99 	 799
stop_note:99 	 799
note:96 	 1029
stop_note:96 	 1029
note:29 	 1049
stop_note:29 	 1049
tempo:40.0 	 1127
note:97 	 1140
stop_note:97 	 1140
note:95 	 1168
stop_note:95 	 1168
wait:1.5 	 1522
no

In [91]:
# Sanity check: show the first 100 tokens of the token stream
first_tokens = training_data[:100]
print(first_tokens)

['<BOS>', 'tempo:90.0', 'wait:0.5', 'note:81', 'note:57', 'wait:0.5', 'stop_note:81', 'stop_note:57', 'note:88', 'note:64', 'tempo:100.0', 'wait:3.0', 'stop_note:88', 'stop_note:64', 'note:86', 'velocity:50', 'note:62', 'wait:0.083', 'velocity:70', 'note:88', 'velocity:50', 'note:64', 'wait:0.167', 'stop_note:86', 'stop_note:62', 'tempo:110.0', 'velocity:70', 'note:86', 'note:62', 'wait:0.083', 'stop_note:88', 'stop_note:64', 'wait:0.417', 'stop_note:86', 'stop_note:62', 'note:84', 'note:60', 'wait:0.5', 'stop_note:84', 'stop_note:60', 'tempo:100.0', 'note:86', 'note:62', 'wait:0.5', 'stop_note:86', 'stop_note:62', 'tempo:110.0', 'note:88', 'note:64', 'wait:0.5', 'stop_note:88', 'stop_note:64', 'tempo:120.0', 'note:89', 'note:65', 'wait:0.5', 'stop_note:89', 'stop_note:65', 'note:91', 'note:67', 'wait:0.5', 'stop_note:91', 'stop_note:67', 'tempo:110.0', 'note:88', 'note:64', 'wait:0.5', 'stop_note:88', 'stop_note:64', 'note:89', 'note:65', 'wait:0.5', 'stop_note:89', 'stop_note:65', 't

In [92]:
# Turn all tokens to ints.
# The unique tokens are sorted so that the token -> int pairing is identical on every
# run; iterating a plain set of strings can give a different order per kernel start.
unique_tokens = sorted(set(training_data))
token_int_pairings = {token: index for index, token in enumerate(unique_tokens)}

training_data_ints = [token_int_pairings[token] for token in training_data]

# Save dictionary (the with-block makes sure the file is flushed and closed)
with open("./dictionary.json", 'w') as dictionary_file:
    json.dump(token_int_pairings, dictionary_file)

In [93]:
# Length of the input window, in tokens, of every training example
seq_length = 100
ints_as_str = list(map(str, training_data_ints))

# One training example per line: `seq_length` space-separated input ints, then ", ",
# then the int that directly follows the window (the prediction target).
# The window slides forward one token at a time over the whole stream.
with open("./training_data_preprocessed.txt", "w") as f:
    for target_idx in range(seq_length, len(ints_as_str)):
        window = ints_as_str[target_idx - seq_length:target_idx]
        f.write(' '.join(window) + ", " + ints_as_str[target_idx] + '\n')