In [1]:
import numpy as np
import tensorflow as tf

from MusicRnnData import MusicRnnData

# input parameters
x_len = 50
y_len = 1
batch_size = 64
# LSTM parameters
num_layers = 2
lstm_size = 128
hidden_size = 128
# training parameters
learning_rate = 1e-1
num_steps = 50000
verbose = True
display_interval = 500
moving_avg_length = 100

# fix random seed for reproducibility
np.random.seed(7)

## Helper functions

In [2]:
from collections import deque

class MovingAverager(object):
    def __init__(self, filter_length):
        self.filter = deque([0 for _ in range(filter_length)])
        
    def insert(self, num):
        self.filter.popleft()
        self.filter.append(num)
        
    def average(self):
        return sum(self.filter)/float(len(self.filter))
    
def build_lstm_stack(num_layers, lstm_size):
    lstm_stack = [tf.contrib.rnn.BasicLSTMCell(lstm_size) for _ in range(num_layers)]
    return tf.contrib.rnn.MultiRNNCell(lstm_stack)

## Load the data

In [3]:
filelist = ['a2002011001-e02.wav']
music_data = MusicRnnData(filelist)

## Construct model

In [4]:
# input
x = tf.placeholder(tf.float32, [batch_size, x_len])
y = tf.placeholder(tf.float32, [batch_size, y_len])

# RNN Cell
lstm = build_lstm_stack(num_layers, lstm_size)
# add dropout here

# output layer
output, states = tf.contrib.rnn.static_rnn(lstm, [x], dtype=tf.float32)
fc_weights = tf.Variable(tf.random_normal([hidden_size, y_len]))
fc_bias = tf.Variable(tf.random_normal([y_len]))

y_ = tf.matmul(output[-1], fc_weights) + fc_bias

## Set cost function and optimizer

In [5]:
cost = tf.nn.l2_loss(y_ - y)
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

## Run training session

In [6]:
moving_avg = MovingAverager(moving_avg_length)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for i in range(num_steps):
        x_batch, y_batch = music_data.batch(x_len, y_len, batch_size)
        _, loss = sess.run([optimizer, cost], feed_dict={x: x_batch, y: y_batch})

        # print progress
        if verbose and (i % display_interval == 0):
            print('step: %d, training loss: %g' % (i, loss))
            
            # moving averager
            moving_avg.insert(loss)
            if i > moving_avg_length:
                print('moving average: %g' % (moving_avg.average()))

step: 0, training loss: 0.568274
step: 500, training loss: 0.164503
moving average: 0.00732778
step: 1000, training loss: 0.0714082
moving average: 0.00804186
step: 1500, training loss: 0.0640131
moving average: 0.00868199
step: 2000, training loss: 0.0973515
moving average: 0.0096555
step: 2500, training loss: 0.0471473
moving average: 0.010127
step: 3000, training loss: 0.119781
moving average: 0.0113248
step: 3500, training loss: 0.0698413
moving average: 0.0120232
step: 4000, training loss: 0.0580176
moving average: 0.0126034
step: 4500, training loss: 0.0680143
moving average: 0.0132835
step: 5000, training loss: 0.102312
moving average: 0.0143066
step: 5500, training loss: 0.0932612
moving average: 0.0152392
step: 6000, training loss: 0.0718006
moving average: 0.0159573
step: 6500, training loss: 0.0571865
moving average: 0.0165291
step: 7000, training loss: 0.12782
moving average: 0.0178073
step: 7500, training loss: 0.108801
moving average: 0.0188953
step: 8000, training loss: 