In [1]:
from music21 import *  # kept first so the explicit imports below win any name clash
import os
import pickle
from math import ceil, floor

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


### Feature Extraction

In [2]:
# Extract the notes of the first part of every piece in the Bach corpus and one-hot
# encode them. Results accumulate in pitch_train / duration_train, which each hold
# one list of one-hot vectors per piece.

bach_corpus = corpus.getComposer('bach')
pitch_train = []  # training data for the sequence of pitches (MIDI number)
duration_train = []  # training data for the sequence of note durations

pitches = [50, 53, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
           75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88] # all MIDI pitches found in the Bach corpus
# MIDI number -> position of the 1 in the one-hot pitch vector
pitch_one_hot_indices = {midi_number: index for index, midi_number in enumerate(pitches)}
durations = [0.125, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 8.0]  # all durations found in the Bach corpus
# quarter length -> position of the 1 in the one-hot duration vector
duration_one_hot_indices = {quarter_length: index for index, quarter_length in enumerate(durations)}

for piece in bach_corpus:
    pitch_sequence = []  # the sequence of pitches for this piece
    duration_sequence = []  # the sequence of note durations for this piece

    piece_score = corpus.parse(piece)  # stream.Score object
    melody_part = piece_score.getElementsByClass('Part')[0]  # the melody is taken to be the first part in the score
    melody_notes = melody_part.flat.getElementsByClass(['Note', 'Rest'])

    # DON'T CHANGE n to Note or note (namespace conflict, presumably with the music21 star import)
    # builds the feature vectors by one-hot encoding MIDI numbers and durations
    for n in melody_notes.recurse():  # iterates through all notes and rests in the piece
        note_duration = n.duration.quarterLength
        if note_duration == 0.0:
            continue  # this is a grace note, toss it and move to next note
        if not n.isNote:
            # Rests carry no pitch and are never stored, so skip them before any
            # table lookup: a rest whose duration is missing from `durations`
            # must not abort the whole extraction with a KeyError.
            continue

        midi_pitch = n.pitch.midi
        this_pitch = np.zeros(len(pitches))
        this_pitch[pitch_one_hot_indices[midi_pitch]] = 1

        this_duration = np.zeros(len(durations))
        this_duration[duration_one_hot_indices[note_duration]] = 1

        pitch_sequence.append(this_pitch)
        duration_sequence.append(this_duration)

    # Pieces that yielded no notes still contribute an (empty) sequence here.
    pitch_train.append(pitch_sequence)
    duration_train.append(duration_sequence)

In [3]:
# Lengths of all non-empty pitch sequences, in corpus order.
nonempty_lengths = [len(seq) for seq in pitch_train if len(seq) != 0]

# Histogram: sequence length -> number of pieces with that many notes.
sequence_length_counts = {}
for seq_len in nonempty_lengths:
    sequence_length_counts[seq_len] = sequence_length_counts.get(seq_len, 0) + 1

all_sequence_lengths = np.array(nonempty_lengths)
mean_len = np.mean(all_sequence_lengths)
std_len = np.std(all_sequence_lengths)

# Keep only lengths within one standard deviation of the mean (integer bounds, inclusive).
lower_seq_len = ceil(mean_len - std_len)
upper_seq_len = floor(mean_len + std_len)
num_samples_in_range = sum(
    [count for seq_len, count in sequence_length_counts.items()
     if lower_seq_len <= seq_len <= upper_seq_len]
)
print(f"Number of samples with {lower_seq_len} ≤ len ≤ {upper_seq_len} = {num_samples_in_range}")
# Cell output: (length, count) pairs ordered by length.
sorted(sequence_length_counts.items())

Number of samples with 17 ≤ len ≤ 94 = 388


[(7, 1),
 (23, 2),
 (25, 1),
 (26, 2),
 (28, 1),
 (29, 2),
 (30, 2),
 (31, 4),
 (32, 9),
 (33, 8),
 (34, 8),
 (35, 9),
 (36, 6),
 (37, 11),
 (38, 8),
 (39, 10),
 (40, 11),
 (41, 17),
 (42, 8),
 (43, 20),
 (44, 15),
 (45, 13),
 (46, 24),
 (47, 12),
 (48, 11),
 (49, 10),
 (50, 9),
 (51, 17),
 (52, 12),
 (53, 10),
 (54, 6),
 (55, 9),
 (56, 3),
 (57, 7),
 (58, 4),
 (59, 10),
 (60, 5),
 (61, 5),
 (62, 4),
 (63, 6),
 (64, 8),
 (65, 5),
 (66, 4),
 (67, 4),
 (68, 4),
 (69, 2),
 (70, 6),
 (71, 4),
 (72, 1),
 (73, 1),
 (74, 1),
 (76, 2),
 (77, 1),
 (78, 1),
 (79, 1),
 (80, 1),
 (81, 2),
 (82, 2),
 (86, 1),
 (87, 3),
 (88, 1),
 (89, 2),
 (97, 2),
 (98, 1),
 (99, 2),
 (101, 1),
 (103, 1),
 (104, 2),
 (105, 1),
 (109, 2),
 (114, 1),
 (116, 1),
 (132, 1),
 (135, 1),
 (153, 1),
 (155, 1),
 (168, 1),
 (186, 1),
 (358, 1),
 (400, 1),
 (558, 1)]

In [4]:
# Prune the data set: discard every piece whose note count lies outside
# the inclusive range [lower_seq_len, upper_seq_len].
print(f"len before {len(pitch_train)}")

# Positions (ascending) of the sequences that are too short or too long.
drop_indices = [
    position
    for position, seq in enumerate(pitch_train)
    if len(seq) < lower_seq_len or len(seq) > upper_seq_len
]

# Delete from the back so the positions still to be removed stay valid;
# the pitch and duration lists are kept aligned by removing the same index from both.
for position in reversed(drop_indices):
    del pitch_train[position]
    del duration_train[position]

print(f"len after {len(pitch_train)}")

len before 433
len after 388


In [5]:
# Pad both data sets with the same settings: padding is appended after each
# sequence ("post") and the result is returned as float32.
pitch_train, duration_train = (
    pad_sequences(sequences, padding="post", dtype='float32')
    for sequences in (pitch_train, duration_train)
)

In [6]:
# Generate the label sequences.
# The labels are bound to the very same padded arrays as the inputs (no copy is
# made), so every label tensor matches its training tensor exactly.
# NOTE(review): no one-step shift is applied here; presumably the training script
# derives next-step targets itself — confirm. If a shift is ever added, it should
# be along the time axis; slicing with [1:] would drop a sample, not a time step.
pitch_labels = pitch_train
duration_labels = duration_train

In [7]:
# Serialize training data and labels for use in another script.
# Make sure the output directory exists first, so the cell also works on a fresh
# checkout instead of failing with FileNotFoundError.
os.makedirs('pickles', exist_ok=True)

# Each (destination file, object) pair is written as its own pickle.
for pickle_path, payload in (
    ('pickles/pitch_X_train.pickle', pitch_train),
    ('pickles/duration_X_train.pickle', duration_train),
    ('pickles/pitch_Y_train.pickle', pitch_labels),
    ('pickles/duration_Y_train.pickle', duration_labels),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(payload, f)

In [8]:
# Serialize the relevant pitches/durations so we can get the respective values from the prediction output vectors.
# Make sure the output directory exists, so this cell can be run on its own
# without failing with FileNotFoundError.
os.makedirs('pickles', exist_ok=True)

# Position i in each pickled list corresponds to index i of the one-hot vectors.
for pickle_path, vocabulary in (
    ('pickles/pitches.pickle', pitches),
    ('pickles/durations.pickle', durations),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(vocabulary, f)