**SOW-MKI49: Neural Information Processing Systems**  
*Weeks 4 and 5: Assignment (225 points + 30 bonus points)*  
Author: Umut

Group number: 25  
Felicity Reddel, s4830717  
Max Reddel, s4830709  
Johan van den Heuvel, s4770528

In [1]:
from chainer import ChainList, optimizers, serializers
import chainer
import chainer.functions as F
import chainer.links as L
import numpy as np

  from ._conv import register_converters as _register_converters


**WaveNet component (75 points)**

* Implement missing parts of the call method (y and z). **25 points**
* Implement residual block class. **50 points**

---
Reminder:

* One convolution layer that has 61 kernels of size 2 with no nonlinearities.

![alt text](http://i67.tinypic.com/21mgi2w.png)
![alt text](http://i67.tinypic.com/292n04y.png)
---



In [2]:
class _WaveNet(ChainList):
    def __init__(self):
        links = (L.Convolution2D(61, 61, (1, 2)),) 
        links += tuple(_ResidualBlock((1, 2 ** (i % 6))) for i in range(6)) # T6 layers
        links += (L.Convolution2D(512, 512, 1), L.Convolution2D(512, 3843, 1)) # 1 and 2 layers

        super(_WaveNet, self).__init__(*links)
        

    def __call__(self, x):
#         print('WaveNet call')
        y = self[0](F.pad(x, ((0, 0), (0, 0), (0, 0), (1, 0)), 'constant')) # The output of this layer becomes the input of the next layer
        z = 0

        for i in range(1, len(self) - 2):
        #TODO    
            skip_x, res_x = self[i](y)
            y = res_x
            z += skip_x
            
        
        h = F.relu(z)
        h = self[len(self)-2](h)
        h = F.relu(h)
        h = self[len(self)-1](h)
    
        
        z, y = F.split_axis(h, (2*61,), 1)
        
#         print('WaveNet done')

        return F.reshape(y, (y.shape[0], 61, 61, y.shape[3])), \
               F.reshape(z, (z.shape[0], 2, 61, z.shape[3]))

class _ResidualBlock(ChainList):    
    def __init__(self, dilationFactor): # what parameter is this

        links = (L.DilatedConvolution2D(None, 122, (1,1), dilate=dilationFactor),)
        links += (L.Convolution2D(None, 512 + 61,(1,1)),)
        
        super(_ResidualBlock, self).__init__(*links)
        
    def __call__(self, x):
#         print('Residual call')
        saved_x = x
        
        h = self[0](x)
        h = F.split_axis(h, 2, 1)       
        x = F.tanh(h[0]) * F.sigmoid(h[1])
        
        res_x, skip_x = F.split_axis(self[1](x), (61,), 1)
        
        res_x += saved_x
        
#         print('Residual done')
        
        return skip_x, res_x

**CRF-RNN component (50 points)**

* Implement missing parts of the call method (z). **25 points**
* Why is z not normalized in the last iteration? **25 points**

---

Reminder:

![alt text](http://i68.tinypic.com/sy6mix.png)

---

In [3]:
class _CRF(ChainList):
    def __init__(self):
        super(_CRF, self).__init__(L.ConvolutionND(1, 2, 2, 1, nobias = True))

    def __call__(self, x, y):
        #TODO
#         print('CRF call')
        z = y
#         print(x.shape)
#         print(y.shape)

        for i in range(5):
#             print(i)
            # multiply N*61*61 by N*2*61 and result is N*2*61
            # so should be 2*61 X 61*61 or 61*61 X (2*61)T
#             print(type(z), ' ', type(x))
            x = F.reshape(x, (-1, 61, 61)) # copied from forum
            z = F.reshape(z, (-1, 2, 61))
            z = F.matmul(z,x) # message passing layer 
            
            
#             print(z.shape)
            z = self[0](z) #comp
            z = F.reshape(z, (30, 2, 61, 79)) # fit the y shape again
            z = - y - z # local update and norm

            if i < 4:
                z = F.softmax(z)

#         print('CRF done')        
                
        return z

TODO why is z not normalized

**WaveCRF model (50 points)**

1. Implement missing parts of the call method (k, psi_u and Q_hat). **20 points**
2. Implement missing parts of the save and load methods (save and load model). **10 points**
3. Implement missing parts of the test and train methods (forward and/or backward propagate). **20 points**

In [4]:
class WaveCRF(object):
    def __init__(self):
        self.log = {('test', 'accuracy'): (), ('test', 'loss'): (), ('training', 'accuracy'): (),
                    ('training', 'loss'): ()}
        self.model = ChainList(_WaveNet(), _CRF())
        self.optimizer = optimizers.Adam(0.0002, 0.5)

        self.optimizer.setup(self.model)

    def __call__(self, x):
        #TODO
        k, psi_u = self.model[0](x) # _WaveNet outputs k and psi
        Q_hat = self.model[1](k, psi_u) # _CRF outputs Q?

        return F.transpose(F.reshape(Q_hat, (x.shape[0], x.shape[3], 2, 61)), (0, 2, 3, 1))

    @classmethod
    def load(cls, directory):
        self = cls()
        self.log = np.load('{}/log.npy'.format(directory))

        #TODO
        load_npz('{:s}/model.npz'.format(directory), model) # Load model
        serializers.load_npz('{}/optimizer.npz'.format(directory), self.optimizer)

        return self

    def save(self, directory):
        np.save('{}/log.npy'.format(directory), self.log)
        
        #TODO
        save_npz('{:s}/model.npz'.format(directory), model) # Save model
        serializers.save_npz('{}/optimizer.npz'.format(directory), self.optimizer)

    def test(self, Q, x):
        with chainer.using_config('train', False):
            #TODO
            a,b = self.model.__getitem__(0)(x) # Forward prop
            Q_hat = self.model.__getitem__(1)(a,b)
            loss = F.softmax_cross_entropy(Q_hat, Q)# Forward prop

            self.log['test', 'accuracy'] += (float(F.accuracy(Q_hat, Q).data),)
            self.log['test', 'loss'] += (float(loss.data),)

    def train(self, Q, x):
        #TODO
        # fix wacky hack
#         print(self.model.__getitem__(0).__call__(x)) # This line removes error for call
#         print(help(ChainList))

        print(x.shape)
        print(Q.shape)

        a,b = self.model.__getitem__(0)(x) # Forward prop
        Q_hat = self.model.__getitem__(1)(a,b)
        loss = F.softmax_cross_entropy(Q_hat, Q)# Forward prop
        print("loss: ", loss)

        #TODO
        self.model.cleargrads() # Backprop
        loss.backward() # Backprop
        self.optimizer.update() # Backprop

        self.log['training', 'accuracy'] += (float(F.accuracy(Q_hat, Q).data),)
        self.log['training', 'loss'] += (float(loss.data),)

In [5]:
%matplotlib inline

import IPython
import chainer
import matplotlib
import numpy
import os
import pickle
import random
import tqdm

In [6]:
batch_size = 30
epochs = 100
root = '..'

In [7]:
with open('{}/Assignment2/piano_rolls.p'.format(root), 'rb') as f:
    piano_rolls = pickle.load(f)

keys = sorted(piano_rolls.keys())

random.seed(6)
random.shuffle(keys)

test_set = dict((key, piano_rolls[key]) for key in keys[:int(0.1 * len(keys))])
training_set = dict((key, piano_rolls[key]) for key in keys[int(0.1 * len(keys)):])
training_set_keys = list(training_set.keys())

In [8]:
waveCRF = WaveCRF()

waveCRF.model.to_gpu()

In [9]:
for epoch in tqdm.tnrange(epochs):
    random.shuffle(training_set_keys)

    batch = ()

    for key in tqdm.tqdm_notebook(training_set_keys, leave = False):
        i = random.randint(0, training_set[key].shape[1] - 80)
        batch += (training_set[key][32 : 93, i : i + 80],)

        if len(batch) == batch_size:
            batch = waveCRF.model.xp.array(batch)

#             print(batch[0][:].T[0].shape)
#             print(np.sum(batch[0][:].T[0]))
            
            waveCRF.train(batch[:, :, 1:].astype('i'), batch[:, :, None, :-1].astype('f'))
            
            

            batch = ()

    for key in tqdm.tqdm_notebook(test_set, leave = False):
        batch = waveCRF.model.xp.array((test_set[key][32 : 93],))

        waveCRF.test(batch[:, :, 1:].astype('i'), batch[:, :, None, :-1].astype('f'))

    IPython.display.clear_output()

    for i, key in enumerate(waveCRF.log):
        matplotlib.pyplot.subplot(221 + i)
        matplotlib.pyplot.plot(numpy.array(waveCRF.log[key]).reshape(epoch + 1, -1).mean(1))
        matplotlib.pyplot.xlabel('iteration')
        matplotlib.pyplot.ylabel(key)

    matplotlib.pyplot.tight_layout()
    matplotlib.pyplot.show()
    os.makedirs('{}/Models/WaveCRF/{}'.format(root, epoch))
    waveCRF.save('{}/Models/WaveCRF/{}'.format(root, epoch))

HBox(children=(IntProgress(value=0), HTML(value='')))

HBox(children=(IntProgress(value=0, max=3273), HTML(value='')))

(30, 61, 1, 79)
(30, 61, 79)
loss:  variable(0.87544316)


  param.data -= hp.eta * (self.lr * m / (numpy.sqrt(vhat) + hp.eps) +


(30, 61, 1, 79)
(30, 61, 79)
loss:  variable(0.55182976)
(30, 61, 1, 79)
(30, 61, 79)
loss:  variable(0.40689203)
(30, 61, 1, 79)
(30, 61, 79)
loss:  variable(0.3287646)
(30, 61, 1, 79)
(30, 61, 79)
loss:  variable(0.28370318)
(30, 61, 1, 79)
(30, 61, 79)
loss:  variable(0.26847228)



KeyboardInterrupt: 

**Test (50 points)**  

* Generate a number of samples, pick the best one and play it in the notebook. **50 points**

In [None]:
#TODO
# Test
x = np.random(,(61, 79))
model(x)


**Bonus question (30 points)**

* Discuss how you can improve the model (you can talk about different architectures or different ways to encode the inputs, etc.) **10 points**
* Discuss the assumptions behind the meanfield approximation and its shortcomings. **10 points**
* Prove that the iterative update equation (CRF-RNN component) is differentiable so that we can backpropagate through them. **10 points**

TODO discussion