In [1]:
# The star import stays first so that the explicit imports below win any name clash.
from music21 import *

import os
import pickle
from math import ceil, floor

import numpy as np
import pandas as pd
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


### Feature Extraction

In [2]:
# Extract the melody notes from each piece in the corpus and one-hot encode them.
# After this cell, pitch_train[i] and duration_train[i] hold, for piece i, one one-hot
# vector per melody note; rests and zero-length (grace) notes are left out.

bach_corpus = corpus.getComposer('bach')
pitch_train = []  # training data for the sequence of pitches (MIDI number)
duration_train = []  # training data for the sequence of note durations

pitches = [50, 53, 55, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
           75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 88] # all MIDI pitches found in the Bach corpus
pitch_one_hot_indices = {pitches[i]: i for i in range(len(pitches))}  # for the one-hot vector encoding
durations = [0.125, 0.25, 0.5, 0.75, 1.0, 1.5, 2.0, 3.0, 4.0, 6.0, 8.0]  # all durations found in the Bach corpus
duration_one_hot_indices = {durations[i]: i for i in range(len(durations))}  # for the one-hot vector encoding

for piece in bach_corpus:
    pitch_sequence = []  # the sequence of one-hot pitch vectors for this piece
    duration_sequence = []  # the sequence of one-hot duration vectors for this piece

    piece_score = corpus.parse(piece)  # stream.Score object
    melody_part = piece_score.getElementsByClass('Part')[0]  # melody parts are always the first part in the score
    melody_notes = melody_part.flat.getElementsByClass(['Note', 'Rest'])

    # DON'T CHANGE n to Note or note (namespace conflict)
    # builds the feature vectors by One-Hot encoding MIDI numbers and durations
    for n in melody_notes.recurse():  # iterates through all notes and rests in the piece
        if not n.isNote:
            # Rests have no pitch and are never emitted, so skip them before any table
            # lookup: a rest with a length missing from `durations` must not raise KeyError.
            continue

        note_duration = n.duration.quarterLength
        if note_duration == 0.0:
            continue  # this is a grace note, toss it and move to next note

        # A KeyError below means the corpus contains a duration/pitch that is missing
        # from the hard-coded `durations`/`pitches` tables above.
        this_duration = np.zeros(len(durations))
        this_duration[duration_one_hot_indices[note_duration]] = 1

        midi_pitch = n.pitch.midi
        this_pitch = np.zeros(len(pitches))
        this_pitch[pitch_one_hot_indices[midi_pitch]] = 1

        pitch_sequence.append(this_pitch)
        duration_sequence.append(this_duration)

    # One entry per piece, even when the piece yielded no notes (empty list).
    pitch_train.append(pitch_sequence)
    duration_train.append(duration_sequence)

In [3]:
# Histogram of melody lengths: maps "number of notes in a piece" -> "how many pieces
# have that many notes". Pieces that produced no notes are ignored.
all_sequence_lengths = []
length_tally = {}
for sequence in pitch_train:
    note_count = len(sequence)
    if note_count:
        length_tally[note_count] = length_tally.get(note_count, 0) + 1

# Rebuild the dict with its keys in ascending order so it reads naturally when displayed.
sequence_length_counts = dict(sorted(length_tally.items()))
#sequence_length_counts

In [4]:
# Arbitrarily chosen floor/ceiling for the sequence lengths considered for training.
# The window [min_length, max_length) is cut into three buckets (short / medium / long);
# the cut points are derived from the bounds so the cell stays as general as possible
# if the corpus ever changes drastically.
min_length, max_length = 40, 70
range_size = ceil((max_length - min_length) / 3)

short_upper = min_length + range_size  # exclusive end of the short bucket
long_lower = max_length - range_size   # inclusive start of the long bucket

short_range = range(min_length, short_upper)
medium_range = range(short_upper, long_lower)
long_range = range(long_lower, max_length)

short_samples = []
med_samples = []
long_samples = []

In [6]:
# Prune the data set: keep only the pieces whose note count falls inside one of the
# three length buckets, filing each kept piece's pitch and duration sequences together.
print(f"Number of sequences before pruning: {len(pitch_train)}")
short_sequences_duration, short_sequences_pitch = [], []
medium_sequences_duration, medium_sequences_pitch = [], []
long_sequences_duration, long_sequences_pitch = [], []

for seq_index, pitch_seq in enumerate(pitch_train):
    seq_len = len(pitch_seq)

    # Pick the pair of output lists that matches this piece's length.
    if seq_len in short_range:
        duration_bucket, pitch_bucket = short_sequences_duration, short_sequences_pitch
    elif seq_len in medium_range:
        duration_bucket, pitch_bucket = medium_sequences_duration, medium_sequences_pitch
    elif seq_len in long_range:
        duration_bucket, pitch_bucket = long_sequences_duration, long_sequences_pitch
    else:
        continue  # length is outside every bucket: the sample is not used

    duration_bucket.append(duration_train[seq_index])
    pitch_bucket.append(pitch_seq)

print(f"Number of sequences (short, medium, long) after pruning: {len(short_sequences_duration), len(medium_sequences_duration), len(long_sequences_duration)}")

Number of sequences before pruning: 433
Number of sequences (short, medium, long) after pruning: (141, 87, 47)


In [7]:
# Pad sequences: every bucket is padded independently, at the end ("post"), and the
# padded float32 result replaces the bucket's list of per-piece sequences.
def _pad_post(sequences):
    """Return `sequences` post-padded by Keras' pad_sequences as float32."""
    return pad_sequences(sequences, padding="post", dtype='float32')


short_sequences_duration = _pad_post(short_sequences_duration)
short_sequences_pitch = _pad_post(short_sequences_pitch)
medium_sequences_duration = _pad_post(medium_sequences_duration)
medium_sequences_pitch = _pad_post(medium_sequences_pitch)
long_sequences_duration = _pad_post(long_sequences_duration)
long_sequences_pitch = _pad_post(long_sequences_pitch)

In [8]:
# Serialize training data and labels for use in another script.
# The output directory is created first so this cell also succeeds on a fresh checkout
# (opening a file for writing inside a missing directory raises FileNotFoundError).
os.makedirs('pickles', exist_ok=True)

# One pickle file per padded bucket; paths are spelled out so they stay greppable.
for pickle_path, padded_sequences in (
    ('pickles/short_sequences_duration.pickle', short_sequences_duration),
    ('pickles/short_sequences_pitch.pickle', short_sequences_pitch),
    ('pickles/medium_sequences_duration.pickle', medium_sequences_duration),
    ('pickles/medium_sequences_pitch.pickle', medium_sequences_pitch),
    ('pickles/long_sequences_duration.pickle', long_sequences_duration),
    ('pickles/long_sequences_pitch.pickle', long_sequences_pitch),
):
    with open(pickle_path, 'wb') as f:
        pickle.dump(padded_sequences, f)

In [9]:
# Serialize the pitch and duration vocabularies so that the position of a value in a
# prediction output vector can be mapped back to the pitch/duration it stands for.
for vocab_path, vocab in (
    ('pickles/pitches.pickle', pitches),
    ('pickles/durations.pickle', durations),
):
    with open(vocab_path, 'wb') as f:
        pickle.dump(vocab, f)