In [177]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np
import theano
import theano.tensor as T
from theano import shared 
from theano.ifelse import ifelse
from collections import OrderedDict
import matplotlib.pyplot as plt
import cPickle
import sys
import re
from reber import ReberGrammar, EmbeddedReberGrammar
from NNutils import share_or_init_weight

# Float type taken from Theano's floatX setting; the Rnn class below uses it to
# cast optimizer hyperparameters before wrapping them in shared variables.
dtype=T.config.floatX
# Select Theano's 'fast_compile' graph optimizer.
# NOTE(review): presumably chosen to shorten compilation at the cost of slower
# compiled functions -- confirm against the Theano configuration docs.
theano.config.optimizer='fast_compile'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [90]:
# Grammar instance shared by the cells below: it generates the train/test
# examples, provides one-hot symbol encodings and checks sampled words.
grammar = ReberGrammar()

In [137]:
class Rnn(object):
    """Single-layer recurrent network compiled with Theano.

    The hidden state is ``tanh(x_t.W_in + h_{t-1}.W_rec + b_h)`` and the output
    is ``sigmoid(h_t.W_out + b_out)``.  The training cost is the element-wise
    binary cross-entropy between the outputs and the targets, averaged over
    every entry of the sequence.

    Attributes set by the constructor:
        train:        compiled function ``(X, Y)`` for 'sgd' or
                      ``(X, Y, epoch)`` for 'momentum'; applies one update
                      and returns the cost.
        predictions:  compiled function ``(X)`` returning the output at every
                      step of the sequence.
        get_lr:       compiled function returning the learning rate.
    """

    # Optimisation methods the constructor knows how to compile.
    _SUPPORTED_METHODS = ('sgd', 'momentum')

    # save() writes six parameter arrays followed by the optim dict.
    _N_PICKLED_OBJECTS = 7

    def __init__(self, optim=None, path=None, n_in=None, n_hid=None, n_out=None, 
                 W_in=None, W_out=None, W_rec=None, b_out=None, b_h=None, h0=None):
        """Build the parameters and compile the Theano functions.

        Args:
            optim: dict with key 'method' ('sgd' or 'momentum') and
                'learning_rate'; 'momentum' additionally needs 'mom_start',
                'mom_end' and 'mom_epoch_interval'.
            path: optional file written by ``save``; when given, the weights
                and ``optim`` are read from it and override the arguments.
            n_in, n_hid, n_out: layer sizes forwarded as shapes to
                ``share_or_init_weight``.
            W_in, W_out, W_rec, b_out, b_h, h0: optional initial values
                forwarded to ``share_or_init_weight``.

        Raises:
            ValueError: if ``optim['method']`` is not supported, or if the
                file at ``path`` does not hold the expected number of objects.
        """
        if path is not None:
            W_in, W_out, W_rec, b_out, b_h, h0, optim = self._load_pickled(path)

        self.optim = optim
        method = self.optim['method']
        # Fail early with a clear message: otherwise neither `train` nor `lr`
        # would be defined and construction would end in a NameError.
        if method not in self._SUPPORTED_METHODS:
            raise ValueError("Unknown optimisation method %r, expected one of %r"
                             % (method, self._SUPPORTED_METHODS))

        self.W_in = share_or_init_weight(weights=W_in, shape=(n_in, n_hid),name='W_in', sample='svd')
        self.W_out = share_or_init_weight(weights=W_out, shape=(n_hid, n_out),name='W_out', sample='svd')
        self.W_rec = share_or_init_weight(weights=W_rec, shape=(n_hid, n_hid), name='W_rec', sample='svd')
        # NOTE(review): (n_out) and (n_hid) are plain ints, not 1-tuples --
        # confirm share_or_init_weight accepts both forms.
        self.b_out = share_or_init_weight(weights=b_out, shape=(n_out), name='b_out',sample='zero')
        self.b_h = share_or_init_weight(weights=b_h, shape=(n_hid), name='b_h',sample='zero')

        self.h0 = share_or_init_weight(weights=h0, shape=(n_hid), name="h0", sample='zero') # initial hidden state 

        # h0 is deliberately not in this list: it is saved but never updated.
        self.params = [self.W_in,self.W_out,self.W_rec, self.b_out, self.b_h]

        def step(x_t, h_tm1):
            # One recurrence step: new hidden state, then the output read from it.
            h_t = T.tanh(T.dot(x_t, self.W_in) + T.dot(h_tm1, self.W_rec) + self.b_h)
            y_t = T.nnet.sigmoid((T.dot(h_t, self.W_out) + self.b_out))            
            return [h_t, y_t]

        X = T.matrix('X') # X is the sequence of vector
        Y = T.matrix('Y') # Y is the output of vector

        # Only the hidden state is fed back (y_t has no outputs_info entry).
        [h_vals, y_vals], _ = theano.scan(fn=step,                                  
                                          sequences=X,
                                          outputs_info=[self.h0, None])

        cost = -T.mean(Y * T.log(y_vals)+ (1.- Y) * T.log(1. - y_vals))
        gparams = T.grad(cost, self.params)
        updates = OrderedDict()

        lr = shared(np.cast[dtype](self.optim['learning_rate']), name='lr')

        if method == 'sgd':
            for param, gparam in zip(self.params, gparams):
                updates[param] = param - gparam * lr
            self.train = theano.function(inputs = [X, Y], outputs = cost, updates=updates)

        else:  # 'momentum'
            mom_start = shared(np.cast[dtype](self.optim['mom_start']), name='mom_start')
            mom_end = shared(np.cast[dtype](self.optim['mom_end']), name='mom_end')
            mom_epoch_interval = shared(np.cast[dtype](self.optim['mom_epoch_interval']), name='mom_epoch_interval')
            epoch = T.scalar('epoch')

            # One running average of the gradient per parameter, starting at zero.
            gparam_momentum = [share_or_init_weight(shape=param.get_value(borrow=True).shape, 
                                                    name="gparam_mom", sample="zero") for param in self.params]
            # Momentum goes linearly from mom_start to mom_end over the first
            # mom_epoch_interval epochs, then stays at mom_end.
            mom = ifelse(epoch < mom_epoch_interval,
                         (mom_start*(1-epoch/mom_epoch_interval) + mom_end*(epoch/mom_epoch_interval)).astype(dtype),
                         mom_end)
            for gparam_mom, gparam in zip(gparam_momentum, gparams):
                updates[gparam_mom] = mom*gparam_mom + (1-mom)*gparam

            for param, gparam_mom in zip(self.params, gparam_momentum):
                updates[param] = param - lr*gparam_mom
            self.train = theano.function(inputs = [X, Y, epoch], outputs = cost, updates=updates)

        self.predictions = theano.function(inputs = [X], outputs = y_vals)    
        self.get_lr = theano.function(inputs = [], outputs = lr)

    @staticmethod
    def _load_pickled(path):
        """Return the list of objects stored by ``save`` in the file at ``path``.

        Raises:
            ValueError: if the file does not contain exactly the seven objects
                that ``save`` writes.
        """
        loaded = []
        # NOTE(security): unpickling can execute arbitrary code -- only load
        # files that come from a trusted source.
        with open(path, 'rb') as f:
            while True:
                try:
                    loaded.append(cPickle.load(f))
                except EOFError:
                    # End of file is the only expected way out of the loop;
                    # any other error (corrupt file, ...) is left to propagate.
                    break
        if len(loaded) != Rnn._N_PICKLED_OBJECTS:
            raise ValueError("Expected %d pickled objects in %r, found %d"
                             % (Rnn._N_PICKLED_OBJECTS, path, len(loaded)))
        return loaded

    def sample_words(self, grammar, n_words, max_len=1000):
        """Sample ``n_words`` words from the network, one symbol at a time.

        Every word starts with the encoding of 'B'.  At each step the whole
        prefix is passed to ``self.predictions``; the last output row is
        normalised to sum to one and used as the distribution of the next
        symbol.  A word ends when the encoding of '.' is drawn or after
        ``max_len`` sampled symbols.

        Args:
            grammar: object providing ``get_char_one_hot`` and
                ``sequenceToWord`` (presumably a ReberGrammar -- verify
                against caller).
            n_words: number of words to sample.
            max_len: maximum number of symbols sampled per word.

        Returns:
            list with the ``grammar.sequenceToWord`` result for each word.
        """
        init = grammar.get_char_one_hot('B')[0]
        end = grammar.get_char_one_hot('.')[0]
        words = []
        for _ in range(n_words):
            word = [init]
            for _ in range(max_len):
                # Normalise in float64: float32 probabilities can sum to
                # slightly more than 1 and make multinomial raise ValueError.
                probas = np.asarray(self.predictions(word)[-1], dtype=np.float64)
                probas = probas / np.sum(probas)
                letter = np.random.multinomial(n=1, pvals=probas)
                word.append(letter)
                if np.equal(letter, end).all():
                    break

            words.append(grammar.sequenceToWord(word))
        return words

    def save(self, path):
        """Pickle the parameter values, ``h0`` and ``optim`` to ``path``.

        Objects are dumped one after the other, in the order the constructor
        expects when it is given ``path``.
        """
        objects = [p.get_value() for p in self.params + [self.h0]]
        objects.append(self.optim)
        with open(path, 'wb') as f:
            for obj in objects:
                cPickle.dump(obj, f, protocol=cPickle.HIGHEST_PROTOCOL)

In [93]:
train_data = grammar.get_n_examples(10000)
test_data = grammar.get_n_examples(500)
# Drop from the test set every example that also occurred in the training set.
# An example is identified by the string made of the argmax index of each row
# of its first element.
ltrain = set(["".join([str(row.argmax()) for row in example[0]])
              for example in train_data])
ltest = ["".join([str(row.argmax()) for row in example[0]])
         for example in test_data]
test_data = [example for example, key in zip(test_data, ltest)
             if key not in ltrain]

In [196]:
# Later we shall optimize hyperparameters with Spearmint
optim = {"method":"momentum", "learning_rate":0.05, "mom_start":0.5, "mom_end": 0.9, "mom_epoch_interval":50}
model = Rnn(optim=optim, n_in=len(grammar.chars), n_hid=60, n_out=len(grammar.chars))

In [None]:
# Stochastic training with patience-based early stopping.
# After every epoch we record the summed training cost, a likelihood score on
# the held-out words and the fraction of sampled words accepted by the grammar;
# every 10 epochs these three curves are printed and plotted.
epoch = 0
training_errors = []
test_likelihood = []
sampling_precision = []
early_stopping = False
patience = 4*len(train_data)
max_test_likelihood = -np.inf
# Rnn.train only takes the epoch argument when the momentum optimizer is used.
train_needs_epoch = model.optim['method'] == 'momentum'
while epoch<1000 and not early_stopping:
    epoch += 1
    errors = []
    # One "epoch" = len(train_data) examples drawn uniformly with replacement.
    for _ in range(len(train_data)):
        idx = np.random.randint(0, len(train_data))
        x, y = train_data[idx]
        if train_needs_epoch:
            errors.append(model.train(x, y, epoch))
        else:
            errors.append(model.train(x, y))
    # Number of training examples seen so far (named n_iter so that the
    # builtin iter() is not shadowed for the rest of the notebook).
    n_iter = epoch*len(train_data)
    training_errors.append(np.sum(errors))
    likelihood = []
    for t in test_data:
        # Predicted outputs masked by the targets, summed per step and
        # averaged over the sequence.
        table = np.array(model.predictions(t[0])*t[1])
        likelihood.append(np.mean(np.sum(table, axis=1)))
    test_likelihood.append(np.mean(likelihood))
    if test_likelihood[-1]>max_test_likelihood:
        # Only a sufficiently large improvement extends the patience.
        if 0.9999*test_likelihood[-1]>max_test_likelihood:
            patience = max(patience, 4*n_iter)
        max_test_likelihood = test_likelihood[-1]
#         model.save('RNN_reber.pickle')
    samples = model.sample_words(grammar, 100)
    is_in_grammar = [grammar.in_grammar(w) for w in samples]
    sampling_precision.append(np.mean(is_in_grammar))
    if not epoch%10:
        print "Epoch ",epoch, ", \n\tTraining error :", training_errors[-1], "\n\tTest likelihood :", test_likelihood[-1]
        print "\tSampling precision :",sampling_precision[-1]
        print "\tPatience :", patience, ", Iter :", n_iter
        fig, axes = plt.subplots(1, 3, figsize=(16., 4.))
        curves = [(training_errors, "Training error", 'g'),
                  (test_likelihood, "Test likelihood", 'r'),
                  (sampling_precision, "Sampling precision", 'b')]
        for ax, (values, label, color) in zip(axes, curves):
            ax.plot(range(epoch), values, label=label, color=color)
            ax.legend()
        plt.show()
        # Release the figure so repeated reports do not accumulate in memory.
        plt.close(fig)
    if patience<=n_iter:
        early_stopping = True