# ECBM 4040 Fall 2020 FINAL PROJECT 

### Author:

Wenjun Yang (wy2347)   

Qihang Yang (qy2231)

In [1]:
import numpy as np
import tensorflow as tf

In [5]:
from prep import *
# from model import *

### Step 1: Extract the midi data from zip file.

* the data we use in this project comes from [Classical Piano Midi Page](http://www.piano-midi.de/) 
* all of them are midi file containing two tracks of piano for left hand and right hand


In [None]:
## you don't have to run this block

# import zipfile
# with zipfile.ZipFile('data/midifile.zip', 'r') as zip_ref:
#     zip_ref.extractall('data/')

### Step 2: Prepare your data for training

#### A little insight about the input data structure
The input data proposed by the original paper is a little bit complex and requires some knowledge about music. \
For more information on background knowledge of music, you can refer to the following two links:
* [MIDI_events](https://www.mixagesoftware.com/en/midikit/help/HTML/midi_events.html#:~:text=The%20Note%20Off%20Event%20is,hard%20the%20key%20was%20released.) 
* [MIDI Turtorial](http://www.music-software-development.com/midi-tutorial.html) 

We spend quite a lot time trying to figure out the whole logic behind this and provide the following glossary table for your information.\
And for consistency, we will stick to the name convention stated in the original paper.

* stateMatrix: matrix of state, for state definition see below
* note: 0-77 lower_bound=24; upper_bound=102 
* part_position(1) = note
* pitchclass = 1 of 12 half steps CDEFGAB b#
* part_pitchclass(12): one-hot pitchclass 
* state: (1,0) (1,1) (0,0) -> denoting holding or repeating a note
* context: the count of each pitchclass played in last timestep 
* part_context(12): rearranged context
* part_prev_vicinity(50):

#### Note:
* input for model: part_position + part_pitchclass + part_prev_vicinity + part_context + beat + [0] 
* total number of arguments: 1 + 12 + 50 + 12 + 4 + 1 = 80
* for each of the 78 note you have 80 arguments in above structure
* and we only use sequences of 128 timesteps for training
* so the input data form will be 128 X 78 X 80

Please refer to prep.py for the details of implementation of data preprocessing part.

The general data preparation process is as the following flowchart:
<img src="image/Data Prep.png" />


In [6]:
training_data = load_data('data/music')

load liz_et2
load scn16_3
load haydn_43_1
load mendel_op30_3
load beethoven_les_adieux_1
load br_rhap
load grieg_zwerge
load haydn_7_1
load muss_8
load schu_143_2
load muss_4
load scn68_12
load clementi_opus36_1_1
load waldstein_3
load liz_et_trans5
load ty_maerz
load ty_juli
load rac_op3_2
load chpn-p3
load gra_esp_4
load scn15_5
load beethoven_opus10_2
load grieg_wedding
load mz_330_1
load haydn_35_2
load chpn_op27_1
load schubert_D935_3
load mendel_op19_4
load schub_d960_1
load haydn_9_1
load mz_545_1
load rac_op33_6
load liz_rhap15
load alb_esp2
load grieg_berceuse
load rac_op23_2
load bach_850
load beethoven_opus22_1
load burg_quelle
load debussy_cc_4
load schubert_D850_4
load chpn-p4
load mendel_op30_1
load pathetique_3
load clementi_opus36_6_1
load mendel_op62_3
load schu_143_1
load chpn_op25_e11
load scn15_4
load schub_d960_4
load mz_570_3
load clementi_opus36_5_1
load scn15_13
load grieg_waechter
load chpn-p8
load brahms_opus1_2
load rac_op32_1
load liz_et_trans4
load haydn_8_

### (Optional) Step 3: Train the theano-based model

In [4]:
from Original.ori_prep import *
import Original.ori_multi_training
import Original.ori_model



In [5]:
pcs = multi_training.loadPieces("music")

m = model.Model([300,300],[100,50], dropout=0.5)

multi_training.trainPiece(m, pcs, 10000)

pickle.dump( m.learned_config, open( "output/final_learned_config.p", "wb" ))

NameError: name 'multi_training' is not defined

### Step 4: Train our model

description

* flowchart

Try #2

In [46]:
# Unsolved Problem
############################################################
# 1. Is this generally the right architecture
# 2. Probably need change all the 128 into None? But this raises error
# 3. 'mask' function in original code, does Keras take care of that for us?
# 4. What is the target to validate model? The whole state sequence [batch,128,78,2] 
#                                          or the state of only the next timestep
# 5. The function to predict next note to play?
# 6. Way too many parameters in the last dense layer! Not quite sure it is what it should be

inputs = tf.keras.Input(shape=(127,78,80))

# For why use permute dimensions and use time distributed layers 
# please refer to https://keras.io/api/layers/recurrent_layers/time_distributed/

inputs_rotate= tf.keras.backend.permute_dimensions(inputs,(0,2,1,3)) #(batch,78,128,80)

time_lstm1 = tf.keras.layers.LSTM(300,return_sequences=True,dropout=0.5)
time_lstm2 = tf.keras.layers.LSTM(300,return_sequences=True,dropout=0.5)

inter1 = tf.keras.layers.TimeDistributed(time_lstm1)(inputs_rotate) #(batch,78,128,300)
inter2 = tf.keras.layers.TimeDistributed(time_lstm2)(inter1) #(batch,78,128,300)

note_lstm1 = tf.keras.layers.LSTM(100,return_sequences=True,dropout=0.5)
note_lstm2 = tf.keras.layers.LSTM(50,return_sequences=True,dropout=0.5)

inter2_rotate= tf.keras.backend.permute_dimensions(inter2,(0,2,1,3)) #(batch,128,78,300)

# the input of note-axis part of model will be 
# 1) the note-state vector from previous LSTM stack (batch,128,78,300)
# 2) where the previous note was chosen to be played (batch,128,78,1)
# 3) where the previous note was chosen to be articulated (batch,128,78,1)
# that's why we are using padding here and concatenate the 3 together 
# please see https://www.tensorflow.org/api_docs/python/tf/pad 
# https://www.tensorflow.org/api_docs/python/tf/concat
# for reference

state_inputs = tf.keras.Input(shape=(127,78,2))

paddings=[[0,0],[0,0],[1,0],[0,0]]

prev_note_state=tf.pad(state_inputs[:,:,:-1,:], paddings, 'CONSTANT', constant_values=0)   # (batch,128,78,2)

inter_input1=tf.concat((inter2_rotate,prev_note_state),axis=-1) # (batch,128,78,302)

inter3 = tf.keras.layers.TimeDistributed(note_lstm1)(inter_input1) #(batch,128,78,100)

inter_input2=tf.concat((inter3,prev_note_state),axis=-1) #(batch,128,78,102)

inter4 = tf.keras.layers.TimeDistributed(note_lstm2)(inter_input2) ##(batch,128,78,50)

outputs = tf.keras.layers.Flatten()(inter4)

outputs = tf.keras.layers.Dropout(.5)(outputs)

outputs = tf.keras.layers.Dense(156,activation='sigmoid')(outputs) #（batch,128,78,2）

# output the final result, i.e., probability of playing or articulating certain notes
outputs = tf.keras.layers.Reshape((78,2))(outputs) #（batch,78,2）

model=tf.keras.Model(inputs=[inputs,state_inputs],outputs=outputs)

In [90]:
model.summary()

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 127, 78, 80) 0                                            
__________________________________________________________________________________________________
tf_op_layer_Transpose_6 (Tensor [(None, 78, 127, 80) 0           input_7[0][0]                    
__________________________________________________________________________________________________
time_distributed_12 (TimeDistri (None, 78, 127, 300) 457200      tf_op_layer_Transpose_6[0][0]    
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 127, 78, 2)] 0                                            
____________________________________________________________________________________________

In [35]:
# custom loss function
# the output of model is the same shape with the sample's state matrix
# that is (time,note(78),state(2))
# the 2 for each time and note denote the probability of the note being played or articulated repectively in the last step
# we use the negative log likelihood to denote the loss, the log function can avoid the numbers being too small

def my_loss(y_true, y_pred):
#     y_pred=np.asarray(y_pred)
#     y_true=np.asarray(y_true)
    loss=-tf.keras.backend.sum(tf.math.log(y_pred*y_true+(1-y_pred)*(1-y_true)+np.spacing(np.float32(1.0)))) # numeric stablity
    return loss


In [47]:
model.compile(optimizer=tf.keras.optimizers.Adam(3e-5),loss= my_loss)

In [10]:
### new generator
def update_input_batch_generator(statemat_dict):
    
    # training data generator
    
    while True:
        batch=build_input_batch(statemat_dict)
        yield ([batch[0][:,:-1],batch[1][:,:-1]],batch[1][:,-1])
    

In [43]:
gen=update_input_batch_generator(training_data)
num=0
for i in gen:
    print(i[0][0].shape)
    print(i[0][1].shape)
    print(i[1].shape)
    num=num+1
    if num>=2:
        break

(10, 127, 78, 80)
(10, 127, 78, 2)
(10, 78, 2)
(10, 127, 78, 80)
(10, 127, 78, 2)
(10, 78, 2)


In [76]:
print(next(gen)[0][0].shape)

(10, 127, 78, 80)


In [34]:
np.spacing(np.float32(1.0))

1.1920929e-07

In [11]:
data_gen=update_input_batch_generator(training_data)
model.fit_generator(data_gen,epochs=20,steps_per_epoch=25)

Instructions for updating:
Please use Model.fit, which supports generators.
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7fb728eb6e50>

In [26]:
model.save('model/model_adam_3e5',save_format='h5')

In [36]:
model=tf.keras.models.load_model("model/model_adam_3e5")



ValueError: Unknown loss function:my_loss

In [39]:
model.save_weights('model/model_adam_3e5_wt/model_adam_3e5')

In [50]:
load_status = model.load_weights("model/model_adam_3e5_wt/model_adam_3e5")

In [48]:
p = tf.constant([0.7])
r = tf.random.uniform(shape=(1,127,78,80), maxval=1)
b = tf.math.greater(p, r)
test_data3 = tf.cast(b, dtype=tf.float32)

p = tf.constant([0.7])
r = tf.random.uniform(shape=(1,127,78,2), maxval=1)
b = tf.math.greater(p, r)
test_data4 = tf.cast(b, dtype=tf.float32)


In [51]:
test_out=model.predict([test_data3,test_data4])
test_out

array([[[0.9999969 , 0.999554  ],
        [0.99999917, 0.9999236 ],
        [0.9999962 , 0.99991536],
        [0.9960603 , 0.99979776],
        [0.9999871 , 0.99988556],
        [0.9988757 , 0.999962  ],
        [0.99994206, 0.9997838 ],
        [0.9980246 , 0.99979335],
        [0.9970086 , 0.9996977 ],
        [0.99999976, 0.9999151 ],
        [0.9999621 , 0.9937192 ],
        [0.98498386, 0.9788271 ],
        [0.99999976, 0.99959475],
        [0.28327635, 0.03772808],
        [0.9996704 , 0.9981192 ],
        [0.98224556, 0.999483  ],
        [0.9999658 , 0.9999943 ],
        [0.99996114, 0.99991655],
        [0.89228654, 0.9581778 ],
        [0.99880886, 0.9964418 ],
        [0.9995591 , 0.9986577 ],
        [0.9999927 , 0.9996051 ],
        [0.9972894 , 0.88581795],
        [0.9997012 , 0.9992974 ],
        [0.9999989 , 0.99973005],
        [0.9757715 , 0.9984345 ],
        [0.9966445 , 0.9971347 ],
        [0.9596553 , 0.09248953],
        [0.9999974 , 0.8396393 ],
        [0.999

In [52]:
test_dt=build_single_input(training_data)
test_dt = [np.asarray(test_dt[0][:-1]).reshape(1, 127, 78, 80), \
                  np.asarray(test_dt[1][:-1]).reshape(1, 127, 78, 2)]
test_pred = model.predict(test_dt)

In [53]:
test_pred

array([[[2.31379119e-04, 5.34797509e-05],
        [8.12519924e-04, 4.29042957e-05],
        [3.93849361e-04, 4.70474515e-05],
        [3.69199057e-04, 1.19843105e-04],
        [1.94216482e-04, 4.42676865e-05],
        [9.02631902e-04, 2.31614176e-04],
        [1.67449849e-04, 8.45202303e-05],
        [2.36275885e-03, 4.70423547e-04],
        [9.54611693e-04, 7.18006777e-05],
        [2.71783373e-03, 4.88245787e-05],
        [3.80599638e-03, 2.23870986e-04],
        [2.85926531e-03, 1.37052033e-04],
        [1.14651490e-02, 5.03182819e-04],
        [1.54680456e-03, 6.64710766e-04],
        [1.05249584e-02, 3.83364881e-04],
        [2.67630536e-03, 3.47686844e-04],
        [1.78490439e-03, 4.23549092e-04],
        [1.18279364e-02, 1.15591788e-03],
        [3.67357838e-03, 9.25846514e-04],
        [8.57416913e-03, 4.39623371e-03],
        [1.61695909e-02, 1.45369244e-03],
        [2.87704146e-03, 7.96780456e-04],
        [1.03692524e-02, 2.39031482e-03],
        [3.50671448e-03, 5.6606688

* Tensorboard

### Step 5: Difference in outcome

### Step 6: Compare between the models (Discussion: The advantage of biaxial LSTM)

Time-axis Model

In [34]:
t_inputs = tf.keras.Input(shape=(128,78,80))

t_inputs_rotate= tf.keras.backend.permute_dimensions(t_inputs,(0,2,1,3)) #(78,128,80)

t_time_lstm1 = tf.keras.layers.LSTM(300,return_sequences=True)
t_time_lstm2 = tf.keras.layers.LSTM(300,return_sequences=True)

t_inter1 = tf.keras.layers.TimeDistributed(t_time_lstm1)(t_inputs_rotate) #(78,128,80)
t_inter2 = tf.keras.layers.TimeDistributed(t_time_lstm2)(t_inter1) #(78,128,80)

t_inter2_rotate= tf.keras.backend.permute_dimensions(t_inter2,(0,2,1,3)) #(128,78,80)
t_outputs = tf.keras.layers.Dense(2,activation='sigmoid')(t_inter2_rotate) #(128,78,2)

time_model=tf.keras.Model(inputs=t_inputs,outputs=t_outputs)

In [35]:
time_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 128, 78, 80)]     0         
_________________________________________________________________
tf_op_layer_Transpose_5 (Ten [(None, 78, 128, 80)]     0         
_________________________________________________________________
time_distributed_9 (TimeDist (None, 78, 128, 300)      457200    
_________________________________________________________________
time_distributed_10 (TimeDis (None, 78, 128, 300)      721200    
_________________________________________________________________
tf_op_layer_Transpose_6 (Ten [(None, 128, 78, 300)]    0         
_________________________________________________________________
dense_3 (Dense)              (None, 128, 78, 2)        602       
Total params: 1,179,002
Trainable params: 1,179,002
Non-trainable params: 0
_________________________________________________

In [36]:
time_model.compile(optimizer=tf.keras.optimizers.Adam(),loss=my_loss)

In [37]:
data_gen=input_batch_generator(training_data)
time_model.fit_generator(data_gen,epochs=10,steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7ca4673850>

Note_axis Model

In [46]:
n_inputs = tf.keras.Input(shape=(128,78,80))

n_note_lstm1 = tf.keras.layers.LSTM(100,return_sequences=True)
n_note_lstm2 = tf.keras.layers.LSTM(50,return_sequences=True)

n_inter3 = tf.keras.layers.TimeDistributed(n_note_lstm1)(n_inputs)
n_inter4 = tf.keras.layers.TimeDistributed(n_note_lstm2)(n_inter3)

n_outputs = tf.keras.layers.Dense(2,activation='sigmoid')(n_inter4)

note_model=tf.keras.Model(inputs=n_inputs,outputs=n_outputs)

In [47]:
note_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 128, 78, 80)]     0         
_________________________________________________________________
time_distributed_13 (TimeDis (None, 128, 78, 100)      72400     
_________________________________________________________________
time_distributed_14 (TimeDis (None, 128, 78, 50)       30200     
_________________________________________________________________
dense_5 (Dense)              (None, 128, 78, 2)        102       
Total params: 102,702
Trainable params: 102,702
Non-trainable params: 0
_________________________________________________________________


In [48]:
note_model.compile(optimizer=tf.keras.optimizers.Adam(),loss=my_loss)

In [49]:
data_gen=input_batch_generator(training_data)
note_model.fit_generator(data_gen,epochs=10,steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7ca50d6fd0>

### Step 7: Improvement

### Step 8: Unsolved problems and Drawbacks

1. The model doesn't include factors like velocity and tempo, which makes the generated music somewhat plain and lacking in style.
2. The model requires a lot of hand-picked arguments based on empirical knowledge about music.
3. The model only deals with single instrument. If we simply run the model on different instruments and then combine each track together, this sure won't give us a good melody. How to make these intruments sound good together could be an interesting task.