# ECBM 4040 Fall 2020 FINAL PROJECT 

### Author:

Wenjun Yang (wy2347)   

Qihang Yang (qy2231)

In [1]:
import numpy as np
import tensorflow as tf

In [3]:
from prep import *
# from model import *

### Step 1: Extract the midi data from zip file.

* the data we use in this project comes from [Classical Piano Midi Page](http://www.piano-midi.de/) 
* all of them are midi file containing two tracks of piano for left hand and right hand


In [None]:
## you don't have to run this block

# import zipfile
# with zipfile.ZipFile('data/midifile.zip', 'r') as zip_ref:
#     zip_ref.extractall('data/')

### Step 2: Prepare your data for training

#### A little insight about the input data structure
The input data proposed by the original paper is a little bit complex and requires some knowledge about music. \
For more information on background knowledge of music, you can refer to the following two links:
* [MIDI_events](https://www.mixagesoftware.com/en/midikit/help/HTML/midi_events.html#:~:text=The%20Note%20Off%20Event%20is,hard%20the%20key%20was%20released.) 
* [MIDI Turtorial](http://www.music-software-development.com/midi-tutorial.html) 

We spend quite a lot time trying to figure out the whole logic behind this and provide the following glossary table for your information.\
And for consistency, we will stick to the name convention stated in the original paper.

* stateMatrix: matrix of state, for state definition see below
* note: 0-77 lower_bound=24; upper_bound=102 
* part_position(1) = note
* pitchclass = 1 of 12 half steps CDEFGAB b#
* part_pitchclass(12): one-hot pitchclass 
* state: (1,0) (1,1) (0,0) -> denoting holding or repeating a note
* context: the count of each pitchclass played in last timestep 
* part_context(12): rearranged context
* part_prev_vicinity(50):

#### Note:
* input for model: part_position + part_pitchclass + part_prev_vicinity + part_context + beat + [0] 
* total number of arguments: 1 + 12 + 50 + 12 + 4 + 1 = 80
* for each of the 78 note you have 80 arguments in above structure
* and we only use sequences of 128 timesteps for training
* so the input data form will be 128 X 78 X 80

Please refer to prep.py for the details of implementation of data preprocessing part.

The general data preparation process is as the following flowchart:
<img src="image/Data Prep.png" />


In [4]:
training_data = load_data('data/music')

load liz_et2
load scn16_3
load haydn_43_1
load mendel_op30_3
load beethoven_les_adieux_1
load br_rhap
load grieg_zwerge
load haydn_7_1
load muss_8
load schu_143_2
load muss_4
load scn68_12
load clementi_opus36_1_1
load waldstein_3
load liz_et_trans5
load ty_maerz
load ty_juli
load rac_op3_2
load chpn-p3
load gra_esp_4
load scn15_5
load beethoven_opus10_2
load grieg_wedding
load mz_330_1
load haydn_35_2
load chpn_op27_1
load schubert_D935_3
load mendel_op19_4
load schub_d960_1
load haydn_9_1
load mz_545_1
load rac_op33_6
load liz_rhap15
load alb_esp2
load grieg_berceuse
load rac_op23_2
load bach_850
load beethoven_opus22_1
load burg_quelle
load debussy_cc_4
load schubert_D850_4
load chpn-p4
load mendel_op30_1
load pathetique_3
load clementi_opus36_6_1
load mendel_op62_3
load schu_143_1
load chpn_op25_e11
load scn15_4
load schub_d960_4
load mz_570_3
load clementi_opus36_5_1
load scn15_13
load grieg_waechter
load chpn-p8
load brahms_opus1_2
load rac_op32_1
load liz_et_trans4
load haydn_8_

### (Optional) Step 3: Train the theano-based model

In [None]:
import 

In [None]:
pcs = multi_training.loadPieces("music")

m = model.Model([300,300],[100,50], dropout=0.5)

multi_training.trainPiece(m, pcs, 10000)

pickle.dump( m.learned_config, open( "output/final_learned_config.p", "wb" ))

### Step 4: Train our model

description

* flowchart

In [50]:
inputs = tf.keras.Input(shape=(128,78,80))

inputs_rotate= tf.keras.backend.permute_dimensions(inputs,(0,2,1,3)) #(batch,78,128,80)

time_lstm1 = tf.keras.layers.LSTM(300,return_sequences=True,dropout=0.5)
time_lstm2 = tf.keras.layers.LSTM(300,return_sequences=True,dropout=0.5)

inter1 = tf.keras.layers.TimeDistributed(time_lstm1)(inputs_rotate) #(batch,78,128,300)
inter2 = tf.keras.layers.TimeDistributed(time_lstm2)(inter1) #(batch,78,128,300)

note_lstm1 = tf.keras.layers.LSTM(100,return_sequences=True,dropout=0.5)
note_lstm2 = tf.keras.layers.LSTM(50,return_sequences=True,dropout=0.5)

inter2_rotate= tf.keras.backend.permute_dimensions(inter2,(0,2,1,3)) #(batch,128,78,50)

inter3 = tf.keras.layers.TimeDistributed(note_lstm1)(inter2_rotate)
inter4 = tf.keras.layers.TimeDistributed(note_lstm2)(inter3)

outputs = tf.keras.layers.Dense(2,activation='sigmoid')(inter4) #（batch,128,78,2）

model=tf.keras.Model(inputs=inputs,outputs=outputs)

In [6]:
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_2 (InputLayer)         [(None, 128, 78, 80)]     0         
_________________________________________________________________
tf_op_layer_Transpose_1 (Ten [(None, 78, 128, 80)]     0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 78, 128, 300)      457200    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 78, 128, 300)      721200    
_________________________________________________________________
tf_op_layer_Transpose_2 (Ten [(None, 128, 78, 300)]    0         
_________________________________________________________________
time_distributed_3 (TimeDist (None, 128, 78, 100)      160400    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 128, 78, 50)       30200 

In [51]:
# custom loss function
# the output of model is the same shape with the sample's state matrix
# that is (time,note(78),state(2))
# the 2 for each time and note denote the probability of the note being played or articulated repectively in the last step
# we use the negative log likelihood to denote the loss, the log function can avoid the numbers being too small

def my_loss(y_true, y_pred):
#     y_pred=np.asarray(y_pred)
#     y_true=np.asarray(y_true)
    loss=-tf.keras.backend.mean(tf.math.log(y_pred*y_true+(1-y_pred)*(1-y_true)))
    return loss


In [54]:
model.compile(optimizer=tf.keras.optimizers.RMSprop(),loss=my_loss)

In [55]:
data_gen=input_batch_generator(training_data)
model.fit_generator(data_gen,epochs=10,steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7ca416a7d0>

In [10]:
test_dt=build_single_input(training_data)

In [15]:
np.asarray(test_dt[0]).shape

(128, 78, 80)

In [17]:
test_pred=model.predict(np.asarray(test_dt[0]).reshape(1,128,78,80))

In [21]:
np.sum(np.square(test_pred.reshape(128,78,2)-np.asarray(test_dt[1])))

418.4551392232534

In [22]:
out=statematrix_to_midi(test_pred.reshape(128,78,2))

* Tensorboard

### Step 5: Difference in outcome

### Step 6: Compare between the models (Discussion: The advantage of biaxial LSTM)

Time-axis Model

In [34]:
t_inputs = tf.keras.Input(shape=(128,78,80))

t_inputs_rotate= tf.keras.backend.permute_dimensions(t_inputs,(0,2,1,3)) #(78,128,80)

t_time_lstm1 = tf.keras.layers.LSTM(300,return_sequences=True)
t_time_lstm2 = tf.keras.layers.LSTM(300,return_sequences=True)

t_inter1 = tf.keras.layers.TimeDistributed(t_time_lstm1)(t_inputs_rotate) #(78,128,80)
t_inter2 = tf.keras.layers.TimeDistributed(t_time_lstm2)(t_inter1) #(78,128,80)

t_inter2_rotate= tf.keras.backend.permute_dimensions(t_inter2,(0,2,1,3)) #(128,78,80)
t_outputs = tf.keras.layers.Dense(2,activation='sigmoid')(t_inter2_rotate) #(128,78,2)

time_model=tf.keras.Model(inputs=t_inputs,outputs=t_outputs)

In [35]:
time_model.summary()

Model: "model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, 128, 78, 80)]     0         
_________________________________________________________________
tf_op_layer_Transpose_5 (Ten [(None, 78, 128, 80)]     0         
_________________________________________________________________
time_distributed_9 (TimeDist (None, 78, 128, 300)      457200    
_________________________________________________________________
time_distributed_10 (TimeDis (None, 78, 128, 300)      721200    
_________________________________________________________________
tf_op_layer_Transpose_6 (Ten [(None, 128, 78, 300)]    0         
_________________________________________________________________
dense_3 (Dense)              (None, 128, 78, 2)        602       
Total params: 1,179,002
Trainable params: 1,179,002
Non-trainable params: 0
_________________________________________________

In [36]:
time_model.compile(optimizer=tf.keras.optimizers.Adam(),loss=my_loss)

In [37]:
data_gen=input_batch_generator(training_data)
time_model.fit_generator(data_gen,epochs=10,steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7ca4673850>

Note_axis Model

In [46]:
n_inputs = tf.keras.Input(shape=(128,78,80))

n_note_lstm1 = tf.keras.layers.LSTM(100,return_sequences=True)
n_note_lstm2 = tf.keras.layers.LSTM(50,return_sequences=True)

n_inter3 = tf.keras.layers.TimeDistributed(n_note_lstm1)(n_inputs)
n_inter4 = tf.keras.layers.TimeDistributed(n_note_lstm2)(n_inter3)

n_outputs = tf.keras.layers.Dense(2,activation='sigmoid')(n_inter4)

note_model=tf.keras.Model(inputs=n_inputs,outputs=n_outputs)

In [47]:
note_model.summary()

Model: "model_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         [(None, 128, 78, 80)]     0         
_________________________________________________________________
time_distributed_13 (TimeDis (None, 128, 78, 100)      72400     
_________________________________________________________________
time_distributed_14 (TimeDis (None, 128, 78, 50)       30200     
_________________________________________________________________
dense_5 (Dense)              (None, 128, 78, 2)        102       
Total params: 102,702
Trainable params: 102,702
Non-trainable params: 0
_________________________________________________________________


In [48]:
note_model.compile(optimizer=tf.keras.optimizers.Adam(),loss=my_loss)

In [49]:
data_gen=input_batch_generator(training_data)
note_model.fit_generator(data_gen,epochs=10,steps_per_epoch=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7ca50d6fd0>

### Step 7: Improvement

### Step 8: Unsolved problems and Drawbacks

1. The model doesn't include factors like velocity and tempo, which makes the generated music somewhat plain and lacking in style.
2. The model requires a lot of hand-picked arguments based on empirical knowledge about music.
3. The model only deals with single instrument. If we simply run the model on different instruments and then combine each track together, this sure won't give us a good melody. How to make these intruments sound good together could be an interesting task.