In [1]:
import numpy as np
import tensorflow as tf
import os, shutil

from MusicRnnData import MusicRnnData

# input parameters
x_len = 10        # width of the input placeholder / length of each input window
y_len = 1         # width of the target placeholder / number of predicted values
batch_size = 32
# LSTM parameters
num_layers = 1
lstm_size = 8
hidden_size = 8
# training parameters
# NOTE: despite its name, dropout_prob is handed to DropoutWrapper as
# output_keep_prob, i.e. it is the probability of KEEPING an output unit.
dropout_prob = 0.5
learning_rate = 1e-1
num_steps = 50000
verbose = True
display_interval = 500      # print progress every this many training steps
moving_avg_length = 100     # window length of the loss moving average

# fix random seeds for reproducibility
# Seeding NumPy alone does not affect TensorFlow's random ops (weight
# initialisation, dropout), so the graph-level TensorFlow seed is set as well.
# It has to be set before any op is added to the graph.
np.random.seed(7)
tf.set_random_seed(7)

## Helper functions

In [2]:
from collections import deque

class MovingAverager(object):
    """Moving average over the most recent ``filter_length`` inserted values.

    Until ``filter_length`` values have been inserted, the average is taken
    over the values seen so far, so early readings are not dragged towards
    zero by empty window slots.
    """

    def __init__(self, filter_length):
        """Create an averager with a window of ``filter_length`` values.

        Raises:
            ValueError: if ``filter_length`` is smaller than 1.
        """
        if filter_length < 1:
            raise ValueError('filter_length must be at least 1, got %r' % (filter_length,))
        # a bounded deque drops its oldest entry automatically once it is full
        self.filter = deque(maxlen=filter_length)

    def insert(self, num):
        """Add ``num`` to the window, evicting the oldest value if it is full."""
        self.filter.append(num)

    def average(self):
        """Return the mean of the stored values, or 0.0 if nothing was inserted."""
        if not self.filter:
            return 0.0
        return sum(self.filter)/float(len(self.filter))
    
def build_lstm_stack(num_layers, lstm_size, dropout_prob=1.0):
    """Build a stack of ``num_layers`` dropout-wrapped LSTM cells.

    Every layer is a ``BasicLSTMCell`` with ``lstm_size`` units and
    ``forget_bias=0.0``, wrapped in a ``DropoutWrapper`` that receives
    ``dropout_prob`` as its ``output_keep_prob``.  The layers are returned
    combined into a single ``MultiRNNCell``.
    """
    rnn = tf.contrib.rnn
    cells = []
    for _ in range(num_layers):
        cell = rnn.BasicLSTMCell(lstm_size, forget_bias=0.0)
        cells.append(rnn.DropoutWrapper(cell, output_keep_prob=dropout_prob))
    return rnn.MultiRNNCell(cells)

## Load the data

In [3]:
# Audio files to train on; MusicRnnData is constructed from this list.
# Alternative input kept for reference:
# filelist = ['a2002011001-e02.wav']
filelist = ['sine.wav']
music_data = MusicRnnData(filelist)

## Construct model

In [4]:
# input
# x carries batch_size input windows of width x_len, y the matching targets of width y_len
x = tf.placeholder(tf.float32, [batch_size, x_len])
y = tf.placeholder(tf.float32, [batch_size, y_len])

# RNN Cell
# Dropout is already applied inside build_lstm_stack (a DropoutWrapper on every layer).
# NOTE(review): the keep probability is baked into the graph as a constant, so
# dropout also stays active whenever y_ is evaluated for prediction -- consider
# feeding it through a placeholder so it can be set to 1.0 at inference time.
lstm = build_lstm_stack(num_layers, lstm_size, dropout_prob=dropout_prob)

# output layer
# NOTE(review): [x] is a one-element sequence, so the RNN is unrolled for a
# single time step that receives the whole x_len window as its feature vector.
output, states = tf.contrib.rnn.static_rnn(lstm, [x], dtype=tf.float32)
# The RNN output has lstm_size features, so the projection is sized by
# lstm_size (sizing it by hidden_size only works while the two are equal).
fc_weights = tf.Variable(tf.random_normal([lstm_size, y_len]))
fc_bias = tf.Variable(tf.random_normal([y_len]))

# linear read-out of the last (here: only) RNN output
y_ = tf.matmul(output[-1], fc_weights) + fc_bias

## Set cost function and optimizer

In [5]:
# Mean squared error between the prediction y_ and the target y.
# Alternative formulation kept for reference:
# cost = tf.divide(tf.nn.l2_loss(y_ - y), y_len)
cost = tf.reduce_mean(tf.squared_difference(y_, y))
# NOTE: `optimizer` is bound to the result of minimize(), i.e. the op that is
# run once per training step, not to the RMSPropOptimizer object itself.
optimizer = tf.train.RMSPropOptimizer(learning_rate=learning_rate).minimize(cost)

## Set summary variables

In [6]:
dir_save = './Logs'

# register the TensorBoard summaries: the scalar loss first, then one
# histogram per tensor of interest, and finally merge them into a single op
tf.summary.scalar('loss', cost)
for summary_tag, summary_tensor in [
        ('fc_weights', fc_weights),
        ('fc_bias', fc_bias),
        ('lstm_output', output),
        ('prediction', y_),
]:
    tf.summary.histogram(summary_tag, summary_tensor)
summary_merged = tf.summary.merge_all()

## Run training session

In [7]:
# initialize moving averager for loss
if verbose:
    moving_avg = MovingAverager(moving_avg_length)

# create session
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# start from an empty log directory so it only contains events from this run
if os.path.exists(dir_save):
    shutil.rmtree(dir_save)
train_writer = tf.summary.FileWriter(dir_save, sess.graph)
# define saver
# NOTE(review): the saver is created but not used anywhere in the visible cells
saver = tf.train.Saver()

# and begin
try:
    for i in range(num_steps):
        x_batch, y_batch = music_data.batch(x_len, y_len, batch_size)
        # one optimisation step; also fetch the loss and the merged summaries
        _, loss, summary = sess.run([optimizer, cost, summary_merged], feed_dict={x: x_batch, y: y_batch})

        train_writer.add_summary(summary, i)

        if verbose:
            moving_avg.insert(loss)
            # print progress
            if (i % display_interval == 0):
                print('step: %d, training loss: %g' % (i, loss))

                # i + 1 losses have been inserted so far; only report the
                # moving average once a full window of them is available
                if i + 1 >= moving_avg_length:
                    print('moving average loss: %g' % (moving_avg.average()))
finally:
    # FileWriter buffers events; flush so the summaries reach disk even when
    # training is interrupted.  The session is deliberately left open because
    # the prediction cell below reuses it.
    train_writer.flush()

step: 0, training loss: 0.176657
step: 500, training loss: 0.00144034
moving average loss: 0.00494849
step: 1000, training loss: 0.00134713
moving average loss: 0.00485505
step: 1500, training loss: 0.00494837
moving average loss: 0.00422645
step: 2000, training loss: 0.00782667
moving average loss: 0.00469519
step: 2500, training loss: 0.0039129
moving average loss: 0.0045455
step: 3000, training loss: 0.00519761
moving average loss: 0.00430846
step: 3500, training loss: 0.000193236
moving average loss: 0.00414276
step: 4000, training loss: 0.00527414
moving average loss: 0.00442357
step: 4500, training loss: 0.01435
moving average loss: 0.00395536
step: 5000, training loss: 0.00221595
moving average loss: 0.00389955
step: 5500, training loss: 0.00169103
moving average loss: 0.00457257
step: 6000, training loss: 0.00474518
moving average loss: 0.00445166
step: 6500, training loss: 0.00187249
moving average loss: 0.0040916
step: 7000, training loss: 0.00316414
moving average loss: 0.00

## Predict on sequence

In [8]:
from __future__ import division

import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

display_interval = 1000

original = music_data.convert_to_wav(music_data.tracks[0])
orig_len = original.shape[0]

prediction = original[0:x_len]
num_predictions = int((orig_len-x_len)/y_len)
x_batch = prediction
for i in range(num_predictions):
    feed_pred = np.expand_dims(x_batch, axis=0)
    feed_pred = np.repeat(feed_pred, batch_size, axis=0)
    new_y = sess.run([y_], feed_dict={x: feed_pred, y: y_batch})[0][0,:]
    prediction = np.append(prediction, new_y, axis=0)
    x_batch = np.append(x_batch[y_len:], new_y, axis=0)
    
    if (i % display_interval == 0):
        print('Iteration: %d, len(prediction) = %g' % (i, len(prediction)))
    

Iteration: 0, len(prediction) = 11
Iteration: 1000, len(prediction) = 1011
Iteration: 2000, len(prediction) = 2011
Iteration: 3000, len(prediction) = 3011
Iteration: 4000, len(prediction) = 4011
Iteration: 5000, len(prediction) = 5011
Iteration: 6000, len(prediction) = 6011
Iteration: 7000, len(prediction) = 7011
Iteration: 8000, len(prediction) = 8011
Iteration: 9000, len(prediction) = 9011
Iteration: 10000, len(prediction) = 10011
Iteration: 11000, len(prediction) = 11011
Iteration: 12000, len(prediction) = 12011
Iteration: 13000, len(prediction) = 13011
Iteration: 14000, len(prediction) = 14011
Iteration: 15000, len(prediction) = 15011
Iteration: 16000, len(prediction) = 16011
Iteration: 17000, len(prediction) = 17011
Iteration: 18000, len(prediction) = 18011
Iteration: 19000, len(prediction) = 19011
Iteration: 20000, len(prediction) = 20011
Iteration: 21000, len(prediction) = 21011
Iteration: 22000, len(prediction) = 22011
Iteration: 23000, len(prediction) = 23011
Iteration: 24000,

## Plot predicted output

In [None]:
converted = music_data.convert_to_wav(prediction)
pred_len = converted.shape[0]
# Time stamp of sample k is k / sample_rate (assuming sample_rate is given in
# samples per second -- TODO confirm).  float() keeps this a true division
# even when `from __future__ import division` has not been executed.
x_orig = np.arange(orig_len) / float(music_data.sample_rate)
x_conv = np.arange(pred_len) / float(music_data.sample_rate)

# compare the beginning of the source signal with the generated one
num_shown = 300
plt.subplot(211)
plt.plot(x_orig[0:num_shown], original[0:num_shown])
plt.title('Original')
plt.ylabel('amplitude')
plt.subplot(212)
plt.plot(x_conv[0:num_shown], converted[0:num_shown])
plt.title('Prediction')
plt.xlabel('time [s]')
plt.ylabel('amplitude')
plt.tight_layout();