In [1]:
# Configure Theano through the environment: GPU device 1, float32 as floatX,
# and CNMeM pre-allocating 95% of GPU memory.  This cell runs before the
# `import theano` in the next cell.
import os 
os.environ['THEANO_FLAGS'] = 'device=gpu1, floatX=float32, lib.cnmem=0.95'

In [2]:
import os, sys
sys.path.append(os.getcwd())

import numpy
numpy.random.seed(123)
import random
random.seed(123)

import theano
import theano.tensor as T
import lib
import lasagne
import scipy.misc

import time
import functools
import itertools

from theano.sandbox.rng_mrg import MRG_RandomStreams as RandomStreams
srng = RandomStreams(seed=4884)
%matplotlib inline

from __future__ import division
from collections import OrderedDict
from sklearn.utils import shuffle
from sklearn.metrics import f1_score
from sklearn.cross_validation import train_test_split
from theano.tensor.nnet import conv2d
from theano.tensor.signal import pool
from theano.tensor.shared_randomstreams import RandomStreams

import cPickle
import numpy as np
import theano
import theano.tensor as T
import matplotlib.pyplot as plt

# NumPy RNG with a fixed seed; the layer classes below draw their initial weights from it.
rng = np.random.RandomState(1234)

Using gpu device 1: Tesla K80 (CNMeM is enabled with initial size: 95.0% of memory, cuDNN 4007)


In [3]:
# Symbolic 4-D network input; the layers below treat it as
# (batch size, input_channel, length, 1).
x  = T.tensor4('x')
#x.tag.test_value = train_data.reshape(20,1,96000,1)
#x.tag.test_value = np.random.rand(1, 3, 96000, 1).astype('float32')

In [6]:
#theano.config.compute_test_value = 'warn'

In [4]:
class DilatedConv1D:
    """Dilated 1-D convolution over 4-D tensors, implemented with ``conv2d``.

    Inputs are laid out as (batch size, input_channel, length, 1); the filter
    has shape (output_dim, input_dim, filter_size, 1) and is dilated along the
    length axis only.

    Attributes:
        W: shared filter weights, uniformly initialised in
            +/- sqrt(6 / (fan_in + fan_out)).
        b: shared per-output-channel bias (only created when bias is enabled).
        W_mask: ``W`` multiplied by the mask (only created when ``mask_type``
            is not None).
        params: list of trainable shared variables (``[W]`` or ``[W, b]``).
    """

    def __init__(self, output_dim, input_dim, filter_size, dilation, activation = lambda x : x, mask_type=None, bias_apply = False):
        """Create the layer's shared variables.

        Args:
            output_dim: number of output channels.
            input_dim: number of input channels.
            filter_size: number of taps along the length axis.  With a mask the
                number of non-zero taps is ``filter_size // 2 + 1``.
            dilation: dilation factor along the length axis.
            activation: callable applied to the convolution output.
            mask_type: any value other than None enables the mask; the value
                itself is never inspected.
            bias_apply: anything other than the literal ``False`` adds a bias
                (the check is ``is not False``, kept for compatibility).
        """
        self.input_dim = input_dim
        self.output_dim = output_dim
        self.filter_size = filter_size
        self.dilation = dilation
        self.mask_type = mask_type
        self.filter_shape = (output_dim, input_dim, filter_size, 1)
        self.bias_apply = bias_apply
        self.activation = activation

        fan_in = np.prod(self.filter_shape[1:])
        fan_out = (self.filter_shape[0] * np.prod(self.filter_shape[2:]))
        bound = np.sqrt(6. / (fan_in + fan_out))

        # Use floatX (not a hard-coded "float32") so W has the same dtype as
        # the bias and the mask below.
        self.W = theano.shared(
            rng.uniform(low=-bound, high=bound, size=self.filter_shape).astype(theano.config.floatX),
            name="W")

        if bias_apply is not False:
            self.b = theano.shared(np.zeros(output_dim, dtype=theano.config.floatX), name = 'b')

        if mask_type is not None:
            # Zero every tap after the centre one.  With filter_flip=False and
            # 'half' padding these are the taps reading later positions, so the
            # masked filter only sees the current and earlier positions.
            # Note the centre (current position) tap is kept.
            mask = np.ones(self.filter_shape, dtype=theano.config.floatX)
            mask[:, :, filter_size // 2 + 1:, :] = 0.
            self.W_mask = self.W * mask

        if bias_apply is not False:
            self.params = [self.W, self.b]
        else:
            self.params = [self.W]

    def f_prop(self, x):
        """Apply the convolution, optional bias and activation to ``x``.

        Args:
            x: symbolic tensor of shape (batch size, input_channel, length, 1).

        Returns:
            The activated output; 'half' padding is used on the length axis.
        """
        kernel = self.W_mask if self.mask_type is not None else self.W
        result = T.nnet.conv2d(x, kernel, border_mode='half', filter_flip=False, filter_dilation=(self.dilation, 1))
        if self.bias_apply is not False:
            # Broadcast the per-channel bias over batch, length and the trailing axis.
            result = result + self.b[np.newaxis, :, np.newaxis, np.newaxis]
        return self.activation(result)

In [87]:
#x.tag.test_value = train_data.reshape(1,1,96000,1)

In [283]:
# Re-declares the symbolic input `x` (same definition as the earlier cell).
# When cells run top to bottom, this is the `x` the graph below is built from.
x  = T.tensor4('x')
#x.tag.test_value = np.random.rand(1, 1, 96000, 1).astype('float32')

In [7]:
class residual_block:
    """Gated residual unit built from two masked dilated convolutions.

    A tanh branch and a sigmoid branch (both ``DilatedConv1D`` with filter
    size 5 and a mask) are multiplied element-wise.  Two 1x1 convolutions then
    map the gated signal to a skip output and to a residual output, and the
    block input is added to the residual output.
    """

    def __init__(self, DIM, dilation):
        """Build the block for ``DIM`` channels and the given dilation."""
        self.tanh_out = DilatedConv1D(DIM, DIM, 5, dilation, activation=T.tanh, mask_type='a')
        self.sig_out = DilatedConv1D(DIM, DIM, 5, dilation, activation=T.nnet.sigmoid, mask_type='a')

        # Both 1x1 convolutions share the same shape and initialisation range.
        shape_1x1 = (DIM, DIM, 1, 1)
        n_in = np.prod(shape_1x1[1:])
        n_out = shape_1x1[0] * np.prod(shape_1x1[2:])
        limit = np.sqrt(6. / (n_in + n_out))

        def draw(name):
            # One uniform draw in [-limit, limit] wrapped as a shared variable.
            values = rng.uniform(low=-limit, high=limit, size=shape_1x1)
            return theano.shared(values.astype("float32"), name=name)

        self.conv1_1_W_skip = draw("W_skip")
        self.conv1_1_W_out = draw("W_out")

        # Order matters to callers: the residual 1x1 weights come last.
        self.params = (self.tanh_out.params
                       + self.sig_out.params
                       + [self.conv1_1_W_skip, self.conv1_1_W_out])

    def f_prop(self, x):
        """Return ``(residual_output, skip_output)`` for input ``x``."""
        gated = self.tanh_out.f_prop(x) * self.sig_out.f_prop(x)
        skip_out = T.nnet.conv2d(gated, self.conv1_1_W_skip)
        residual = T.nnet.conv2d(gated, self.conv1_1_W_out) + x
        return residual, skip_out

In [8]:
# Number of residual blocks in the stack; block i is built with dilation 2**i.
num_stack = 9

In [9]:
# List holding the residual blocks in network order.
model = []

In [11]:
# Build the whole stack in one assignment so that re-running this cell does
# not append a second set of blocks; block i uses dilation 2**i.
model = [residual_block(256, 2**i) for i in range(num_stack)]

In [13]:
# First layer: masked convolution (filter size 5, dilation 1, identity
# activation) mapping the 1-channel input to 256 channels.
causal_layer = DilatedConv1D(256, 1, 5, 1, activation = lambda x:x, mask_type = 'a')

In [14]:
# Output of the first layer; this is what the residual stack consumes.
out = causal_layer.f_prop(x)

In [15]:
# Thread the first layer's output through every residual block and sum the
# skip outputs.  The running activation lives in its own name (`hidden`) so
# that `out` is left untouched and this cell can be re-run without stacking
# the blocks on top of an already-stacked graph.
params = []
skip_out = 0
hidden = out
for layer in model:
    params += layer.params
    hidden, skip = layer.f_prop(hidden)
    skip_out += skip

# The last entry is the final block's conv1_1_W_out.  Only the summed skip
# outputs feed the output layer, so that weight presumably has no path to the
# cost and is left out of the trainable parameters.
params = params[:-1]
params += causal_layer.params

In [17]:
class output_layer:
    """Two stacked 1x1 convolutions (256 -> 256 -> 256 channels), each preceded by a ReLU."""

    def __init__(self):
        """Create the two (256, 256, 1, 1) weight tensors."""
        shape = (256, 256, 1, 1)
        n_in = np.prod(shape[1:])
        n_out = shape[0] * np.prod(shape[2:])
        limit = np.sqrt(6. / (n_in + n_out))

        # Draw W_1 first, then W_2, each uniformly in [-limit, limit].
        weights = []
        for name in ("W_1", "W_2"):
            init = rng.uniform(low=-limit, high=limit, size=shape).astype("float32")
            weights.append(theano.shared(init, name=name))
        self.W_con1, self.W_con2 = weights
        self.params = [self.W_con1, self.W_con2]

    def f_prop(self, x):
        """Return conv(relu(conv(relu(x), W_con1)), W_con2)."""
        hidden = T.nnet.conv2d(T.nnet.relu(x), self.W_con1)
        return T.nnet.conv2d(T.nnet.relu(hidden), self.W_con2)

In [18]:
# Map the summed skip outputs to the network output and register the output
# layer's weights as trainable.
# NOTE(review): `params +=` extends the list in place, so re-running this cell
# adds the same weights a second time.
output_Layer = output_layer()
result = output_Layer.f_prop(skip_out)
params += output_Layer.params

In [274]:
# Debug: shape of the output's test value.  Needs test values to be set; the
# lines that set them are commented out in this notebook.
result.tag.test_value.shape

(20, 8, 96000, 1)

In [19]:
# `result` is (batch, channels, length, 1): drop the trailing singleton axis,
# move channels last, then flatten batch and length so that each row of `y`
# holds the channel scores for one position.  Softmax normalises each row.
tmp_1 = result.reshape((result.shape[0], result.shape[1], result.shape[2]))
tmp_2 = tmp_1.dimshuffle(0,2,1)
y = tmp_2.reshape((-1, tmp_2.shape[2]))
y = T.nnet.softmax(y)
#y.tag.test_value

In [20]:
# int64 vector of target class indices, declared directly as a symbolic input
# (T.lvector) instead of as a cast applied to a float vector.
raw_inputs = T.lvector('raw_inputs')
#tt= np.zeros(1*96000).astype('int64')
#raw_inputs.tag.test_value = tt

In [199]:
# Debug: shape of the targets' test value (only available when a test value has been set).
raw_inputs.tag.test_value.shape

(96000,)

In [212]:
# Debug: cost evaluated on the test values.  `cost` itself is defined in the
# next cell, so this only works after that cell has run with test values set.
cost.tag.test_value

array(2.0906708240509033, dtype=float32)

In [21]:
# Training objective: mean categorical cross-entropy between the softmax rows
# `y` and the integer targets `raw_inputs`.
cost = T.mean(T.nnet.categorical_crossentropy(y, raw_inputs))

In [311]:
def sgd(cost, params, eps=np.float32(0.1)):
    """Plain gradient-descent updates.

    Args:
        cost: symbolic scalar to minimise.
        params: list of shared variables to update.
        eps: step size.

    Returns:
        OrderedDict mapping each parameter to ``param - eps * gradient``.
    """
    gradients = T.grad(cost, params)
    return OrderedDict(
        (param, param - eps*gradient)
        for param, gradient in zip(params, gradients)
    )

In [22]:
def TNorm(x):
    """Return the Euclidean norm of ``x``: the square root of the sum of its squared entries."""
    squared_total = T.sum(T.sqr(x))
    return T.sqrt(squared_total)

def Adam(cost, params, lr=0.0002, b1=0.1, b2=0.001, e=1e-8):
    """Build Adam-style update pairs with per-parameter gradient-norm clipping.

    ``b1`` and ``b2`` weight the *new* gradient terms here: the first moment is
    ``b1 * g + (1 - b1) * m`` and the second is ``b2 * g**2 + (1 - b2) * v``.

    Args:
        cost: symbolic scalar to minimise.
        params: list of shared variables to update.
        lr: base learning rate.
        b1: weight of the current gradient in the first-moment average.
        b2: weight of the current squared gradient in the second-moment average.
        e: constant added to the denominator.

    Returns:
        List of ``(shared_variable, new_value)`` pairs: for every parameter its
        two moment accumulators and the parameter itself, followed by the step
        counter.
    """
    grads = T.grad(cost, params)

    # Step counter and the bias-corrected learning rate derived from it.
    step = theano.shared(np.float32(0.))
    step_next = step + 1.
    correction1 = 1. - (1. - b1)**step_next
    correction2 = 1. - (1. - b2)**step_next
    lr_t = lr * (T.sqrt(correction2) / correction1)

    updates = []
    for p, grad in zip(params, grads):
        # Rescale to unit norm whenever the gradient norm exceeds 1.
        grad_norm = TNorm(grad)
        grad = T.switch(T.lt(1, grad_norm), grad/grad_norm, grad)

        # Moment accumulators start as zeros shaped like the parameter.
        m = theano.shared(p.get_value() * 0.)
        v = theano.shared(p.get_value() * 0.)
        m_t = (b1 * grad) + ((1. - b1) * m)
        v_t = (b2 * T.sqr(grad)) + ((1. - b2) * v)

        direction = m_t / (T.sqrt(v_t) + e)
        p_t = p - (lr_t * direction)

        updates.extend([(m, m_t), (v, v_t), (p, p_t)])

    updates.append((step, step_next))
    return updates

In [23]:
#g_params = T.grad(cost, params)
updates = Adam(cost, params)
train = theano.function(inputs=[x, raw_inputs], outputs=cost, updates=updates, allow_input_downcast=True, name='train')
#valid = theano.function(inputs=[x, t], outputs=[cost, T.argmax(y, axis=1)], allow_input_downcast=True, name='valid')
#test  = theano.function(inputs=[x], outputs=T.argmax(y, axis=1), name='test')

In [None]:
# NOTE: `train_data` / `raw_data` come from lib.wav.generate_input_data(),
# which is called in a cell further down this notebook; run that cell first.
TRAIN = train_data.reshape(20,1,96000,1)   # (sequences, channel, length, 1)
tt= raw_data.reshape(20*96000)             # flattened targets, one per position

# Train on the first BATCH sequences only.
BATCH = 1
TRAIN = TRAIN[:BATCH,:,:,:]

# Keep the per-step loss in its own name: assigning it to `cost` would replace
# the symbolic cost expression that the training function was built from.
for step in range(10000):
    loss = train(TRAIN, tt[:BATCH*96000])
    print(loss)

5.54426670074
5.51434135437
5.46966981888
5.4186258316
5.37723255157
5.32424402237
5.27624464035
5.23311090469
5.18844032288
5.14377260208
5.0978064537
5.05268764496
5.006254673
4.95842266083
4.9093747139
4.85773849487
4.80337429047
4.74753713608
4.69028139114
4.63083934784
4.56991386414
4.50863695145
4.44789075851
4.38890838623
4.33556032181
4.30851840973
4.242582798
4.18516254425
4.14671039581
4.08756065369
4.0549864769
3.9983549118
3.95950770378
3.91976881027
3.87222862244
3.84672760963
3.79559087753
3.7495303154
3.71174812317
3.69148135185
3.66725420952
3.61098265648
3.57097458839
3.55317568779
3.50346684456
3.47826313972
3.51102375984
3.45533394814
3.40420794487
3.3628411293
3.37108540535
3.34343075752
3.27691674232
3.3089056015
3.22954821587
3.23218297958
3.17977666855
3.16685223579
3.14751458168
3.09267902374
3.09465289116
3.06350445747
3.02563929558
3.05072593689
3.06226372719
3.06058979034
2.94576215744
2.99403715134
3.01944923401
2.89299798012
3.04314947128
2.89086580276
2.95

In [276]:
# Flatten the targets of all 20 sequences (96000 positions each) into one vector.
tt= raw_data.reshape(20*96000)

In [240]:
# Reshape the inputs to the 4-D layout used for `x`: (20, 1, 96000, 1).
TRAIN = train_data.reshape(20,1,96000,1)

In [306]:
# Debug: shape of the first 20*2000 flattened targets.
tt[:20*2000].shape

(40000,)

In [309]:
# Debug: displays the whole input array; `train_data.shape` or a small slice
# would be a lighter check.
train_data

array([[[-0.1328125],
        [-0.140625 ],
        [-0.140625 ],
        ..., 
        [ 0.71875  ],
        [ 0.484375 ],
        [-0.421875 ]],

       [[ 0.984375 ],
        [ 0.984375 ],
        [ 0.9765625],
        ..., 
        [ 0.40625  ],
        [ 0.390625 ],
        [ 0.359375 ]],

       [[ 0.9296875],
        [ 0.9140625],
        [ 0.9140625],
        ..., 
        [ 0.9453125],
        [-0.328125 ],
        [ 0.6328125]],

       ..., 
       [[ 0.9609375],
        [ 0.9375   ],
        [ 0.9375   ],
        ..., 
        [ 0.671875 ],
        [ 0.6796875],
        [ 0.6875   ]],

       [[-0.140625 ],
        [-0.1640625],
        [-0.1640625],
        ..., 
        [-0.2421875],
        [-0.2578125],
        [-0.265625 ]],

       [[ 0.9765625],
        [ 0.9765625],
        [ 0.984375 ],
        ..., 
        [ 0.7421875],
        [ 0.8125   ],
        [ 0.8828125]]], dtype=float32)

In [304]:
# First 2000 positions (axis 2) of every sequence in TRAIN.
# The stray trailing "." that made this line a syntax error has been removed.
gg = TRAIN[:,:,:2000,:]

In [292]:
# Debug: shape of the summed skip output's test value.  Raises AttributeError
# when no test value is attached, as the recorded output below shows.
skip_out.tag.test_value.shape

AttributeError: 'scratchpad' object has no attribute 'test_value'

In [25]:
# Load the data through the project helper and show both array shapes.
# NOTE(review): the training cells above already use `raw_data` / `train_data`,
# so on a fresh kernel this cell has to run before them.
raw_data, train_data = lib.wav.generate_input_data()
print raw_data.shape
print train_data.shape

=         GENERATING INPUT DATA       =
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 98473
Duration: 2.05s
Raw Data Size: 196946
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 374855
Duration: 7.81s
Raw Data Size: 749710
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 313600
Duration: 6.53s
Raw Data Size: 627200
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 297153
Duration: 6.19s
Raw Data Size: 594306
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 387179
Duration: 8.07s
Raw Data Size: 774358
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 170092
Duration: 3.54s
Raw Data Size: 340184
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate 48000
Number of Samples: 448636
Duration: 9.35s
Raw Data Size: 897272
Sample Width: 2 (16-bit)
Number of Channels: 1
Sample Rate

In [280]:
# Debug: smallest value in the flattened target vector.
tt.min()

0

In [301]:
# Debug: shape of the raw target array.
raw_data.shape

(20, 96000, 1)