In [None]:
from music21 import *
import pandas as pd
import numpy as np
from math import ceil, floor
import os
import pickle

### Feature Extraction

In [None]:
# Extract the melody of every Bach piece in the music21 corpus, in 12 transpositions, and collect:
#   * pitch_train / duration_train  - one sequence per (piece, transposition)
#   * pitch_freq / duration_freq    - raw occurrence counts (normalized in a later cell)
#   * starting_*_likelihood         - raw counts of the first kept pitch/duration of every sequence

bach_corpus = corpus.getComposer('bach')
# pitch_train/duration_train are NOT one-hot encoded, they will instead be encoded by an embedding layer in the network
pitch_train = []  # training data for the sequence of pitches (MIDI number, or -1 for a rest)
duration_train = []  # training data for the sequence of note durations (quarter lengths)
pitch_freq = {}  # how often a given pitch occurs (will be normalized after all pieces have been analyzed)
duration_freq = {}  # how often a given duration occurs (will be normalized after all pieces have been analyzed)
starting_pitch_likelihood = {}  # how often a sequence starts with a given pitch (-1 = starts with a rest)
starting_duration_likelihood = {}  # how often a sequence starts with a given duration

for piece in bach_corpus:
    piece_score = corpus.parse(piece)  # stream.Score object
    # the original author assumes the melody is always the first part in the score
    melody_part = piece_score.getElementsByClass('Part')[0]
    # NOTE(review): newer music21 releases deprecate `.flat` in favour of `.flatten()` - confirm installed version
    melody_notes = melody_part.flat.getElementsByClass(['Note', 'Rest'])

    # transpose all pitches within the range of +6 to -5 half steps (0 = the unaltered version)
    for i in range(6, -6, -1):
        melody_transposed = melody_notes.transpose(i)
        pitch_sequence = []  # the sequence of pitch tokens for this piece/transposition
        duration_sequence = []  # the sequence of quarter length durations for this piece/transposition

        is_first_note = True  # used to build the distribution of starting pitch/duration
        for n in melody_transposed.recurse():  # iterates through all notes and rests
            note_duration = n.duration.quarterLength
            if note_duration < 0.25 or note_duration >= 6.0:
                # discard anything shorter than a 16th note or lasting 6 quarter lengths or more;
                # a discarded element never counts as the "first note" of the sequence
                continue

            # One token per kept element: the MIDI number for a note, -1 for a rest.
            # Computing it up front guarantees the starting-pitch tally below always sees the
            # current element (previously a leading rest read an undefined or stale `midi_pitch`).
            pitch_token = n.pitch.midi if n.isNote else -1

            duration_sequence.append(note_duration)
            pitch_sequence.append(pitch_token)
            duration_freq[note_duration] = duration_freq.get(note_duration, 0) + 1
            pitch_freq[pitch_token] = pitch_freq.get(pitch_token, 0) + 1

            if is_first_note:  # keep track of how often each pitch/duration starts a sequence
                starting_pitch_likelihood[pitch_token] = starting_pitch_likelihood.get(pitch_token, 0) + 1
                starting_duration_likelihood[note_duration] = starting_duration_likelihood.get(note_duration, 0) + 1
                is_first_note = False

        pitch_train.append(pitch_sequence)
        duration_train.append(duration_sequence)

In [None]:
def _normalize_counts(counts):
    """Return a new dict, ordered by key, mapping every key to its share of the total count.

    The total is computed once up front instead of once per key.
    An empty input yields an empty dict (no division takes place).
    """
    total = sum(counts.values())
    return {key: float(counts[key] / total) for key in sorted(counts)}


# the vocabularies are the sorted distinct pitches / durations seen in the corpus
pitch_vocab, duration_vocab = sorted(pitch_freq), sorted(duration_freq)
pitch_vocab_size, duration_vocab_size = len(pitch_vocab), len(duration_vocab)

# normalize the frequency of ALL pitches/durations
pitch_freq = _normalize_counts(pitch_freq)
duration_freq = _normalize_counts(duration_freq)
# normalize the occurrences of STARTING pitches/durations
starting_pitch_likelihood = _normalize_counts(starting_pitch_likelihood)
starting_duration_likelihood = _normalize_counts(starting_duration_likelihood)

In [None]:
# Histogram of sequence lengths (empty sequences are ignored), ordered by length
all_sequence_lengths = []
length_tally = {}
for sequence in pitch_train:
    n_events = len(sequence)
    if n_events:
        length_tally[n_events] = length_tally.get(n_events, 0) + 1

sequence_length_counts = dict(sorted(length_tally.items()))

# sequence_length_counts

In [None]:
# Hand-picked bounds on the sequence lengths used for training: [min_length, max_length).
# The bucket boundaries are derived from the bounds so they follow along if the bounds change.
min_length = 40
max_length = 70
range_size = ceil((max_length - min_length) / 3)  # width of the short and long buckets

short_stop = min_length + range_size   # first length that is no longer "short"
long_start = max_length - range_size   # first length that counts as "long"

short_range = range(min_length, short_stop)
medium_range = range(short_stop, long_start)
long_range = range(long_start, max_length)

In [None]:
# Sort every sequence into the short / medium / long bucket by its length;
# sequences whose length falls outside all three ranges are dropped.
print(f"Number of sequences before pruning: {len(pitch_train)}")
short_seqs_duration, short_seqs_pitch = [], []
medium_seqs_duration, medium_seqs_pitch = [], []
long_seqs_duration, long_seqs_pitch = [], []

# (length range, duration bucket, pitch bucket), checked in this order
length_buckets = (
    (short_range, short_seqs_duration, short_seqs_pitch),
    (medium_range, medium_seqs_duration, medium_seqs_pitch),
    (long_range, long_seqs_duration, long_seqs_pitch),
)

for seq_index, pitches in enumerate(pitch_train):
    seq_len = len(pitches)
    for bucket_range, duration_bucket, pitch_bucket in length_buckets:
        if seq_len in bucket_range:
            duration_bucket.append(duration_train[seq_index])
            pitch_bucket.append(pitches)
            break  # a sequence lands in at most one bucket

print(f"Number of sequences (short, medium, long) after pruning: {len(short_seqs_duration), len(medium_seqs_duration), len(long_seqs_duration)}")

In [None]:
# Sanity check: each bucket's duration list and pitch list should be the same size
for bucket_durations, bucket_pitches in (
    (short_seqs_duration, short_seqs_pitch),
    (medium_seqs_duration, medium_seqs_pitch),
    (long_seqs_duration, long_seqs_pitch),
):
    print(len(bucket_durations), len(bucket_pitches))

In [None]:
# serialize training data and labels for use in another script
# open(..., 'wb') does not create missing directories, so make sure the target exists first
os.makedirs('pickles', exist_ok=True)

for pickle_name, payload in (
    ('short_seqs_duration', short_seqs_duration),
    ('short_seqs_pitch', short_seqs_pitch),
    ('medium_seqs_duration', medium_seqs_duration),
    ('medium_seqs_pitch', medium_seqs_pitch),
    ('long_seqs_duration', long_seqs_duration),
    ('long_seqs_pitch', long_seqs_pitch),
):
    with open(f'pickles/{pickle_name}.pickle', 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# serialize the relevant pitches/durations so we can get the respective values from the prediction output vectors,
# together with the starting pitch/duration distributions
# open(..., 'wb') does not create missing directories, so make sure the target exists first
os.makedirs('pickles', exist_ok=True)

for pickle_name, payload in (
    ('pitch_vocab', pitch_vocab),
    ('duration_vocab', duration_vocab),
    ('starting_pitch_likelihood', starting_pitch_likelihood),
    ('starting_duration_likelihood', starting_duration_likelihood),
):
    with open(f'pickles/{pickle_name}.pickle', 'wb') as f:
        pickle.dump(payload, f)