In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import LSTM, Dense, TimeDistributed, Bidirectional, Lambda
import numpy as np
from music21 import *
from copy import deepcopy
import random
import pickle
import os
import time

# Working directories used throughout the notebook:
#   MODELS_DIRECTORY       - saved Keras models (.h5)
#   PICKLES_DIRECTORY      - pickled data sets and vocabularies
#   COMPOSITIONS_DIRECTORY - generated compositions
MODELS_DIRECTORY = 'models'
PICKLES_DIRECTORY = 'pickles'
COMPOSITIONS_DIRECTORY = 'compositions'

# exist_ok=True makes this cell idempotent and avoids the check-then-create race
# (the original also called the non-existent os.path.exist for the last directory).
for _directory in (MODELS_DIRECTORY, PICKLES_DIRECTORY, COMPOSITIONS_DIRECTORY):
    os.makedirs(_directory, exist_ok=True)

# Retrieving data

In [2]:
# Load the duration and pitch sequence data sets for each sequence-length bucket.
# NOTE: pickle.load can execute arbitrary code; only load pickles produced by this project.

# short sequences
with open(f"{PICKLES_DIRECTORY}/short_seqs_duration.pickle", 'rb') as short_duration:
    short_seqs_duration = pickle.load(short_duration)
with open(f"{PICKLES_DIRECTORY}/short_seqs_pitch.pickle", 'rb') as short_pitch:
    short_seqs_pitch = pickle.load(short_pitch)

# medium sequences
with open(f"{PICKLES_DIRECTORY}/medium_seqs_duration.pickle", 'rb') as medium_duration:
    medium_seqs_duration = pickle.load(medium_duration)
with open(f"{PICKLES_DIRECTORY}/medium_seqs_pitch.pickle", 'rb') as medium_pitch:
    medium_seqs_pitch = pickle.load(medium_pitch)

# long sequences
with open(f"{PICKLES_DIRECTORY}/long_seqs_duration.pickle", 'rb') as long_duration:
    long_seqs_duration = pickle.load(long_duration)
with open(f"{PICKLES_DIRECTORY}/long_seqs_pitch.pickle", 'rb') as long_pitch:
    long_seqs_pitch = pickle.load(long_pitch)

In [3]:
# Vocabulary of duration values that was used to build the data set
with open(f'{PICKLES_DIRECTORY}/durations.pickle', 'rb') as durations_file:
    durations = pickle.load(durations_file)

# Vocabulary of pitch values that was used to build the data set
with open(f'{PICKLES_DIRECTORY}/pitches.pickle', 'rb') as pitches_file:
    pitches = pickle.load(pitches_file)

# Mappings from duration/pitch values to their one-hot vector indices
with open(f'{PICKLES_DIRECTORY}/duration_indices.pickle', 'rb') as duration_indices_file:
    duration_indices = pickle.load(duration_indices_file)
with open(f'{PICKLES_DIRECTORY}/pitch_indices.pickle', 'rb') as pitch_indices_file:
    pitch_indices = pickle.load(pitch_indices_file)

# Vocabulary sizes (these become the one-hot vector widths)
num_pitches = len(pitches)
num_durations = len(durations)

# Number of time steps per sequence in each bucket (axis 1 of the data arrays)
long_seq_len = long_seqs_duration.shape[1]
medium_seq_len = medium_seqs_duration.shape[1]
short_seq_len = short_seqs_duration.shape[1]

# Experimenting with how temperature affects a softmax distribution

In [4]:
def softmax(x):
    """Return the softmax of `x`, normalised along axis 0.

    The maximum along axis 0 is subtracted before exponentiating. This leaves
    the result mathematically unchanged but keeps np.exp from overflowing to
    inf (and the quotient from becoming nan) when the logits are large, e.g.
    after dividing by a very small temperature.

    x: array-like of logits
    returns: numpy array with the same shape as `x`
    """
    x = np.asarray(x)
    if x.size == 0:
        # nothing to normalise; np.max would raise on an empty array
        return np.exp(x)
    exps = np.exp(x - np.max(x, axis=0, keepdims=True))
    return exps / np.sum(exps, axis=0, keepdims=True)

logits = np.array([1, 2, 3, 4])

# The same logits divided by a range of temperatures: small temperatures sharpen
# the resulting softmax distribution, large ones flatten it towards uniform.
logits_temp_p0 = logits / 0.01   # shown under the "TEMP 0.0" label below
logits_temp_p2 = logits / 0.2
logits_temp_p4 = logits / 0.4
logits_temp_p6 = logits / 0.6
logits_temp_p8 = logits / 0.8
logits_temp_p10 = logits / 1.0
logits_temp_p15 = logits / 1.5
logits_temp_p150 = logits / 150

# Print one labelled softmax distribution per temperature, in increasing order
for label, scaled_logits in (
    ("RAW", logits),
    ("TEMP 0.0", logits_temp_p0),
    ("TEMP 0.2", logits_temp_p2),
    ("TEMP 0.4", logits_temp_p4),
    ("TEMP 0.6", logits_temp_p6),
    ("TEMP 0.8", logits_temp_p8),
    ("TEMP 1.0", logits_temp_p10),
    ("TEMP 1.5", logits_temp_p15),
    ("TEMP 150", logits_temp_p150),
):
    print(f"{label}: {softmax(scaled_logits)}")

RAW: [0.0320586  0.08714432 0.23688282 0.64391426]
TEMP 0.0: [5.14820022e-131 1.38389653e-087 3.72007598e-044 1.00000000e+000]
TEMP 0.2: [3.03841168e-07 4.50940275e-05 6.69254708e-03 9.93262055e-01]
TEMP 0.4: [5.07707490e-04 6.18514343e-03 7.53504725e-02 9.17956677e-01]
TEMP 0.6: [0.00547228 0.02897292 0.15339683 0.81215798]
TEMP 0.8: [0.01689363 0.05896455 0.20580651 0.71833531]
TEMP 1.0: [0.0320586  0.08714432 0.23688282 0.64391426]
TEMP 1.5: [0.07076911 0.13783941 0.26847452 0.52291696]
TEMP 150: [0.24750558 0.24916113 0.25082776 0.25250553]


# Model

### Helper functions

In [5]:
# takes a 1D vector (a single sample) and converts it to 3D for input to the network
def timestep_to3d(x):
    """Reshape a 1D feature vector into an array of shape (1, 1, len(x))."""
    feature_count = x.shape[0]
    target_shape = (1, 1, feature_count)
    return np.reshape(x, target_shape)

# create one-hot vector representation for a time step given the index position of the encoded value
def vectorize(index, vec_size):
    """Return a float32 vector of length `vec_size` that is 1.0 at `index` and 0.0 elsewhere.

    `index` is coerced with int(), so integer-valued floats/strings are accepted.
    """
    hot_position = int(index)
    one_hot = np.zeros(vec_size, dtype=np.float32)
    one_hot[hot_position] = 1.0
    return one_hot

# returns the index of the one-hot encoded value
def unvectorize(x):
    """Return the position of the largest entry of `x` (the hot index of a one-hot vector)."""
    hot_index = np.argmax(x)
    return hot_index

### Model building and training

In [6]:
# single layer Unidirectional or Bidirectional LSTM; will easily allow us to test various configurations
def get_model(num_features, lstm_cells=500, bidirectional=True, temperature=1.0, optimizer="adam"):
    """Build and compile a single-hidden-layer LSTM that emits one softmax distribution per time step.

    num_features: width of the one-hot input vectors and of the output distribution
    lstm_cells: number of units in the LSTM layer
    bidirectional: wrap the LSTM in a Bidirectional layer when True
    temperature: value the per-step logits are divided by before the softmax; must be > 0
    optimizer: optimizer passed to model.compile

    returns the compiled Sequential model
    raises ValueError if temperature is not strictly positive
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0; received {temperature}")

    # input_shape's first value is None so the number of time steps is variable;
    # return_sequences=True yields an output for every time step
    recurrent = LSTM(lstm_cells, input_shape=(None, num_features), return_sequences=True)

    model = Sequential()
    # the only difference between the bi- and unidirectional variants is the Bidirectional wrapper
    model.add(Bidirectional(recurrent) if bidirectional else recurrent)

    # TimeDistributed applies the Dense layer independently at each time step.
    # No activation here: these are the raw logits.
    model.add(TimeDistributed(Dense(num_features)))

    # Divide the logits by the temperature, then normalise. The original divided the
    # LSTM activations *before* the Dense layer, so the temperature never acted on
    # the values actually fed to the softmax.
    model.add(Lambda(lambda logits: logits / temperature))
    model.add(keras.layers.Softmax())  # normalises over the last (feature) axis

    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy', 'categorical_crossentropy'])
    return model

In [7]:
# train LSTM
def train_model(model, X, bidirectional=True, epochs=15, batch_size=32, verbose=1):
    """Fit `model` on the sequences in `X`.

    model: object exposing a Keras-style fit(x, y, epochs=..., batch_size=..., verbose=...)
    X: array indexed as (samples, time steps, features)
    bidirectional: when True the labels are the inputs themselves; when False the
        model is trained to predict the next time step
    epochs, batch_size, verbose: forwarded to model.fit

    returns whatever model.fit returns
    """
    X = np.asarray(X)
    if bidirectional:
        inputs, targets = X, X
    else:
        # Shift along the TIME axis (axis 1): the input drops each sequence's final
        # step and the labels drop its first step. The original sliced axis 0, which
        # paired every sequence with the following *sample* instead.
        inputs, targets = X[:, :-1], X[:, 1:]
    return model.fit(inputs, targets, epochs=epochs, batch_size=batch_size, verbose=verbose)

# Composing with the model

In [19]:
# return a single index from a sample of a softmax distribution vector
def sample_distribution(dist_vec, num_categories):
    """Draw one category index at random according to the probabilities in `dist_vec`.

    dist_vec: numpy array holding `num_categories` probabilities (any shape that
        can be reshaped to (1, num_categories))
    num_categories: number of categories in the distribution

    returns the sampled index as a numpy integer
    """
    probabilities = dist_vec.reshape(1, num_categories)
    # tf.random.categorical treats its input as unnormalised LOG-probabilities, so
    # the softmax output must be log-transformed first; passing raw probabilities
    # samples from softmax(probabilities), which is close to uniform.
    # log(0) = -inf simply gives that category zero probability.
    log_probabilities = tf.math.log(probabilities)
    return tf.random.categorical(log_probabilities, 1).numpy().flatten()[0]

# use the trained model to compose new music by feeding in a single input and desired length
def compose(pitch_model, duration_model, pitch_prompt, duration_prompt, length=50):
    '''Generate a melody by repeatedly sampling the next pitch and duration.

       pitch_model: the trained model for pitch predictions (must expose predict)
       duration_model: trained model for duration predictions (must expose predict)
       pitch_prompt: the first pitch of the piece (index of the one-hot encoded pitch vector)
       duration_prompt: the first duration of the piece (index of the one-hot encoded duration vector)
       length: how many time steps to generate after the prompt

       returns a music21.stream.Stream object representing the composition
       (the prompt note followed by `length` generated notes)
    '''
    # index histories into the pitches/durations lists; the prompt is the first entry
    pitch_history = [pitch_prompt]
    duration_history = [duration_prompt]

    for _ in range(length):
        # the most recent step is the only input; the models only accept 3D inputs
        pitch_input = timestep_to3d(vectorize(pitch_history[-1], num_pitches))
        duration_input = timestep_to3d(vectorize(duration_history[-1], num_durations))

        # output distributions for the next step
        pitch_dist = pitch_model.predict(pitch_input)
        duration_dist = duration_model.predict(duration_input)

        # sample an index from each distribution and report it
        sampled_pitch = sample_distribution(pitch_dist, num_pitches)
        print(f'Sampled pitch index {sampled_pitch}: MIDI no. {pitches[sampled_pitch]}')
        sampled_duration = sample_distribution(duration_dist, num_durations)
        print(f'Sampled duration index {sampled_duration}: quarter length {durations[sampled_duration]}')

        pitch_history.append(sampled_pitch)
        duration_history.append(sampled_duration)

    # turn the (pitch index, duration index) pairs into music21 notes
    composed_stream = stream.Stream()
    for pitch_idx, duration_idx in zip(pitch_history, duration_history):
        note_pitch = pitch.Pitch(midi=pitches[pitch_idx])
        note_duration = duration.Duration(durations[duration_idx])
        new_note = note.Note()
        new_note.pitch = note_pitch
        new_note.duration = note_duration
        composed_stream.append(new_note)

    return composed_stream

# Generation infrastructure

In [9]:
# Hyperparameter grid explored by the test harness
bidirectional_vals = [True, False]
lstm_cell_vals = [500, 1000, 2000]
temperature_vals = [0.2, 0.5, 1.0, 10.0]
epoch_batch_vals = [(1, 1), (5, 8), (10, 16), (30, 32)] # [(epochs, batch_size), ...]

# generate sets of parameters on which to build/train models for testing purposes
def generate_param_sets(start=None, end=None):
    """Return the hyperparameter grid as a list of dicts.

    Each dict has the keys "bidirectional", "lstm_cells", "temperature",
    "epochs" and "batch_size". Sets are ordered with bidirectional varying
    slowest and the (epochs, batch_size) pair varying fastest.

    start, end: optional indices selecting the slice sets[start:end+1]
        (`end` is inclusive). Pass both or neither.

    raises ValueError if exactly one of start/end is given
    """
    # Compare against None explicitly: the original `start ^ end` raised TypeError
    # for the default (None, None) and rejected most valid integer pairs.
    if (start is None) != (end is None):
        raise ValueError('Must pass no optional arguments or both optional arguments; received one')

    sets = [
        {
            "bidirectional": bidirectional,
            "lstm_cells": lstm_cells,
            "temperature": temperature,
            "epochs": epochs,
            "batch_size": batch_size
        }
        for bidirectional in bidirectional_vals
        for lstm_cells in lstm_cell_vals
        for temperature in temperature_vals
        for epochs, batch_size in epoch_batch_vals
    ]

    if start is None:
        return sets
    return sets[start:end+1] # make end index inclusive

In [10]:
class MetadataModel:
    """A model built by get_model, bundled with the hyperparameters used to build and train it.

    The hyperparameters are exposed as read-only properties; only `verbose`
    is meant to be changed after construction. Time spent in train() is
    accumulated and reported by `training_duration`.
    """

    def __init__(self, num_features, lstm_cells, bidirectional, temperature, epochs,\
                 batch_size, verbose=0, id=None):
        """Build the underlying model and record its configuration.

        num_features, lstm_cells, bidirectional, temperature: forwarded to get_model
        epochs, batch_size: used by train()
        verbose: verbosity forwarded to train_model by train()
        id: optional suffix appended to the generated name
        """
        self._model = get_model(num_features, lstm_cells, bidirectional, temperature)
        self._num_features = num_features
        self._lstm_cells = lstm_cells
        self._bidirectional = bidirectional
        self._temperature = temperature
        self._epochs = epochs
        self._batch_size = batch_size
        self.verbose = verbose # no underscore because this should be mutable
        self._name = '_'.join([str(bidirectional), str(lstm_cells), str(temperature), str(epochs), str(batch_size)])
        if id is not None:
            # write the backing field: `name` is a read-only property, so the
            # original `self.name += ...` raised AttributeError
            self._name += f"_{id}"
        self._total_training_time = 0.0 # in seconds

    # ---- setting properties so that these attributes are immutable ----
    @property
    def name(self):
        """Identifier '<bidirectional>_<lstm_cells>_<temperature>_<epochs>_<batch_size>[_<id>]'."""
        return self._name

    @property
    def model(self):
        """The model returned by get_model."""
        return self._model

    @property
    def num_features(self):
        return self._num_features

    @property
    def lstm_cells(self):
        return self._lstm_cells

    @property
    def bidirectional(self):
        return self._bidirectional

    @property
    def temperature(self):
        return self._temperature

    @property
    def epochs(self):
        return self._epochs

    @property
    def batch_size(self):
        return self._batch_size

    # returns a string version of training time
    @property
    def training_duration(self):
        """Accumulated training time formatted as '<hours>h <minutes>m <seconds>s'."""
        total_time = self._total_training_time
        hours = int(total_time // 3600)  # 3600 seconds/hour
        minutes = int((total_time - (3600 * hours)) // 60) # subtract the hours from remaining time; 60 sec/min
        seconds = (total_time - (3600 * hours)) - (60 * minutes)
        return f"{hours}h {minutes}m {seconds}s"

    def train(self, X):
        """Train the model on `X` with the stored settings and add the elapsed time to the total."""
        start_time = time.time()
        train_model(self.model, X, self.bidirectional, self.epochs, self.batch_size, self.verbose)
        end_time = time.time()
        self._total_training_time += end_time - start_time

In [13]:
def _format_elapsed(total_seconds):
    """Format a duration in seconds as '<hours>h <minutes>m <seconds>s'."""
    hours = int(total_seconds // 3600)  # 3600 seconds/hour
    minutes = int((total_seconds - (3600 * hours)) // 60)  # 60 sec/min
    seconds = (total_seconds - (3600 * hours)) - (60 * minutes)
    return f"{hours}h {minutes}m {seconds}s"


def _train_and_save(meta_model, kind, datasets, model_no, verbose):
    """Train `meta_model` on each (label, data) pair in order, save it, and return the save path."""
    for label, data in datasets:
        if verbose > 0:
            print(f'\t\tOn {label} data set:')
        meta_model.train(data)
    save_path = f"{MODELS_DIRECTORY}/{model_no}_{kind}_{meta_model.name}.h5"
    meta_model.model.save(save_path)
    return save_path


def run_tests(pitch_short, pitch_medium, pitch_long, duration_short, duration_medium, duration_long,\
              num_compositions=5, verbose=0):
    """Train, save and sample from one pitch model and one duration model per parameter set.

    For every parameter set from generate_param_sets() this:
      1. builds a pitch and a duration MetadataModel,
      2. trains each on its short, medium and long data sets (in that order),
      3. saves each model under MODELS_DIRECTORY,
      4. writes `num_compositions` compositions (MusicXML and MIDI) under
         COMPOSITIONS_DIRECTORY/model_no_<n>/.

    pitch_* / duration_*: training data for the respective model
    num_compositions: number of compositions generated per parameter set
    verbose: values > 0 print progress and are forwarded to training
    """
    os.makedirs(MODELS_DIRECTORY, exist_ok=True)

    test_start_time = time.time()
    # model_no is the 1-based position of the parameter set being tested
    for model_no, params in enumerate(generate_param_sets(), start=1):
        bidirectional = params["bidirectional"]
        lstm_cells = params["lstm_cells"]
        temperature = params["temperature"]
        epochs = params["epochs"]
        batch_size = params["batch_size"]

        if verbose > 0:
            print(f"Model {model_no}:\n\tbidirectional – {bidirectional}\n\tlstm cells – {lstm_cells}"
                  f"\n\ttemperature – {temperature}\n\tepochs – {epochs}\n\tbatch size – {batch_size}")

        pitch_model = MetadataModel(num_pitches, lstm_cells, bidirectional, temperature, epochs, batch_size, verbose)
        duration_model = MetadataModel(num_durations, lstm_cells, bidirectional, temperature, epochs, batch_size, verbose)

        # train the pitch network
        if verbose > 0:
            print('\n\tTraining pitch generation network...')
        pitch_path = _train_and_save(
            pitch_model, "pitch",
            [("short", pitch_short), ("medium", pitch_medium), ("long", pitch_long)],
            model_no, verbose)
        if verbose > 0:
            print(f"\t\tPitch model training complete: saved at {pitch_path}")
            print(f"\tTotal training time: {pitch_model.training_duration}")

        # train the duration network
        if verbose > 0:
            print('\n\tTraining rhythm (duration) generation network...')
        duration_path = _train_and_save(
            duration_model, "duration",
            [("short", duration_short), ("medium", duration_medium), ("long", duration_long)],
            model_no, verbose)
        if verbose > 0:
            print(f"\t\tDuration model training complete: saved at {duration_path}")
            print(f"\tTotal training time: {duration_model.training_duration}")
            print(f'\tModel {model_no} training complete')

        # compose outputs and save them in this model's own folder
        composition_dir = f"{COMPOSITIONS_DIRECTORY}/model_no_{model_no}"
        os.makedirs(composition_dir, exist_ok=True)
        for comp_count in range(1, num_compositions + 1):
            # NOTE(review): the original left both prompts as TODO. A uniformly random
            # vocabulary index is used as the seed here; confirm this is the intended
            # prompting strategy.
            pitch_prompt = random.randrange(num_pitches)
            duration_prompt = random.randrange(num_durations)
            # compose() calls .predict, so hand it the underlying models, not the wrappers
            composition = compose(pitch_model.model, duration_model.model,
                                  pitch_prompt, duration_prompt, length=100)
            composition.write('musicxml', f'{composition_dir}/composition_{comp_count}.mxl')
            composition.write('midi', f'{composition_dir}/composition_{comp_count}.mid')

        print('–' * 50 + "\n\n")

    total_time = time.time() - test_start_time
    print(f"TRAINING COMPLETE – elapsed time: {_format_elapsed(total_time)}")
    
    
        

In [14]:
# Run the full parameter sweep on all six data sets with progress output enabled
run_tests(
    short_seqs_pitch,
    medium_seqs_pitch,
    long_seqs_pitch,
    short_seqs_duration,
    medium_seqs_duration,
    long_seqs_duration,
    verbose=1,
)

Model 1:
	bidirectional – True
	lstm cells – 500            
	temperature – 0.2
	epochs – 1
	batch size – 1

	Training pitch generation network...
		On short data set:
Train on 1668 samples
		On medium data set:
Train on 972 samples
		On long data set:
Train on 612 samples
		Pitch model training complete: saved at models/1_pitch_True_500_0.2_1_1.h5
'tTotal training time'

	Training rhythm (duration) generation network...
		On short data set:
Train on 1668 samples

KeyboardInterrupt: 