# Implementation of a vanilla RNN

Using a regular backprop algorithm with softmax and tanh, single layer. AdaGrad is used for the SGD.
Learning is done on the book "Goblet of Fire".

Text is generated by giving a starting letter, the rest comes from the network. There is no vocabulary of words to take from, thus the outputs are somewhat strange.

This could further be improved upon by searching for optimal parameters, this has not been done at all at the moment. Also adding a learning rate decay would probably improve the results slightly, but I think a vanilla RNN might be to simple to provide good results here.

In [1]:
# Imports
import numpy as np
import pandas as pd
import os
import tqdm
import pylab as pb

## Classes and functions

In [2]:
class RNN:
    # Class containing the trainable variables
    def __init__(self, h_states, num_chars, sigma = 0.01):
        # Initialize from normal distribution with std sigma
        self.K = num_chars
        self.m = h_states
        self.U = np.random.randn(h_states,num_chars)*sigma
        self.W = np.random.randn(h_states,h_states)*sigma
        self.V = np.random.randn(num_chars,h_states)*sigma
        self.b = np.zeros([h_states,1])
        self.c = np.zeros([num_chars,1])
        
class gradients:
    # Class containing the gradients for the trainable variables
    def __init__(self, h_states, num_chars):
        # Initialize from normal distribution with std sigma
        self.U = np.zeros([h_states,num_chars])
        self.W = np.zeros([h_states,h_states])
        self.V = np.zeros([num_chars,h_states])
        self.b = np.zeros([h_states,1])
        self.c = np.zeros([num_chars,1])
        
class ada_G:
    # Class containing the AdaGrads for the trainable variables
    def __init__(self, h_states, num_chars):
        # Initialize from normal distribution with std sigma
        self.U = np.zeros([h_states,num_chars])
        self.W = np.zeros([h_states,h_states])
        self.V = np.zeros([num_chars,h_states])
        self.b = np.zeros([h_states,1])
        self.c = np.zeros([num_chars,1])
        
def to_one_hot(chars, mapping):
    values = [];
    for c in chars:
        values.append(mapping[c])
    values = np.array(values)
    # Remove }, only occurs ones and causes a bug atm
    values[values==81] = 80
    n_values = mapping[max(mapping, key=mapping.get)]
    return (np.eye(n_values)[values]).T

def softmax(x):
    # Apply softmax on x
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)
        
def synt_seq(RNN, h0, x0, seq_length, char_to_ind, ind_to_char):
    """Synthesize a sequence of characters.
    
    Takes an RNN model (RNN), an initial hidden state (h0), a dummy input (x0) and a sequence length.
    Returns a sequence of characters that is seq_length long.
    """ 
    
    # One hot of dummy input
    h = h0
    x = to_one_hot(x0,char_to_ind)
    Y = np.zeros([RNN.K, seq_length])
    synt_text = ""

    # Compute char sequence
    for t in range(seq_length):
        a = RNN.W.dot(h) + RNN.U.dot(x)+RNN.b
        h = np.tanh(a)
        o = RNN.V.dot(h) + RNN.c
        p = softmax(o)
        ind = np.random.choice(RNN.K, 1, p=p.ravel())
        x[:] = 0; x[ind] = 1;
        Y[:,t] = x.ravel()
        synt_text += str(ind_to_char[ind[0]])
        
    return synt_text

def forward_pass(RNN, h0, X, Y, seq_length, char_to_ind):
    """Does a forward pass over the data
    
    Takes an RNN model (RNN), an initial hidden state (h0), inputs (X) and their "labels" (Y).
    Returns hidden states before activation (A), hidden states (H), probability (P) and loss (L)
    """ 
    
    # Create variables and containers
    h = h0
    A = np.zeros([RNN.m,seq_length])
    H = np.zeros([len(h),seq_length])
    P = np.zeros([RNN.K,seq_length])
    L = 0;
    
    # One hot of input
    X_hot = to_one_hot(X,char_to_ind)
    Y_hot = to_one_hot(Y,char_to_ind)

    # Compute char sequence
    for t in range(seq_length):
        a = RNN.W.dot(h) + RNN.U.dot(X_hot[:,t].reshape(RNN.K,-1)) + RNN.b
        h = np.tanh(a)
        o = RNN.V.dot(h) + RNN.c
        p = softmax(o)
        L -= np.log(Y_hot[:,t].dot(p)+1e-10)

        A[:,t] = a.ravel();
        H[:,t] = h.ravel();
        P[:,t] = p.ravel();
        
    # Return matrices used for back-prop
    return A,H,P,L
    
    
def backward_pass(RNN, A, H, P, X, Y, h0, gradients, seq_length):
    """Does a backward pass over the data
    
    Takes an RNN model (RNN), hidden states before activation (A), hidden states (H) and probability (P).
    return nothing, but updates the gradients in the gradients object
    """ 
    
    # One hot of input
    X_hot = to_one_hot(X,char_to_ind)
    Y_hot = to_one_hot(Y,char_to_ind)
    
    # Compute gradients w.r.t outputs
    dL_O = -(Y_hot-P).T

    # Compute gradient w.r.t RNN.V
    dL_V = dL_O.T.dot(H.T)

    # Compute gradient w.r.t all hidden states before activation
    dL_A = np.zeros([seq_length,RNN.m])

    dL_h = dL_O[-1,:].reshape(1,RNN.K).dot(RNN.V)
    dL_A[-1,:] = dL_h.dot(np.diag(1-np.tanh(A[:,-1])**2))

    for t in reversed(range(seq_length-1)):
        dL_h = dL_O[t,:].reshape(1,RNN.K).dot(RNN.V) + dL_A[t+1,:].dot(RNN.W)
        dL_A[t,:] = dL_h.dot(np.diag(1-np.tanh(A[:,t])**2))
        
    # Compute gradient w.r.t RNN.W, create modified H with prior
    H_mod = np.hstack((h0,H[:,:-1]))
    dL_W = dL_A.T.dot(H_mod.T)
    
    # Compute gradient w.r.t RNN.U 
    dL_U = dL_A.T.dot(X_hot.T)

    # Compute gradients w.r.t biases (RNN.b, RNN.c)
    dL_b = np.sum(dL_A.T,axis=1).reshape(RNN.m,1)
    dL_c = np.sum(dL_O.T,axis=1).reshape(RNN.K,1)
    
    # Update gradients
    gradients.V = dL_V
    gradients.W = dL_W
    gradients.U = dL_U
    gradients.b = dL_b
    gradients.c = dL_c


## Load and clean the data

In [3]:
# Load Book
book_path = os.getcwd() + "\data\goblet_book.txt"
book = np.loadtxt(book_path,delimiter="%c",dtype="str")
book_data = ''.join(book)

# Get unique characters and create mappings from characters to numbers
char_to_ind = {c: i for i, c in enumerate(reversed(book_data))}
book_chars = np.array(list(char_to_ind))
ind_to_char = {}

# Switch key in the mappings to number between 0-len(book_chars)
for i in range(0,len(book_chars)):
    char_to_ind[book_chars[i]]=i;
    ind_to_char[i] = book_chars[i];
    

## Set up hyper-parameters

In [4]:
# Set RNN hyper-parameters and initialize RNN and gradients

h_states   = 100
l_rate     = 0.1
seq_length = 25
epochs = 1
display_step = 100000

my_RNN = RNN(h_states,len(book_chars)-1)
my_gradients = gradients(h_states,len(book_chars)-1)
my_adaGrad = gradients(h_states,len(book_chars)-1)

## Start training

In [5]:
# Start training after initialization of parameters

h_init = np.zeros([h_states,1])
trainable_variables = ['U', 'W', 'V', 'b', 'c']

smoothLoss = [];
    
print
for epoch in range(epochs):
    
    h_init = np.zeros([h_states,1])
    
    # Loop over the book a sequence length at a time
    for i in tqdm_notebook(range(len(book_data)-seq_length),desc="BookLoop"):
        X = book_data[i:(i+seq_length)]
        Y = book_data[(i+1):(i+seq_length+1)]
        
        # Compute gradients
        A,H,P,L = forward_pass(my_RNN, h_init, X, Y, seq_length, char_to_ind)
        backward_pass(my_RNN, A, H, P, X, Y, h_init, my_gradients, seq_length)

        # Apply update on weights and AdaGrad
        for attr in trainable_variables:
            grad = getattr(my_gradients,attr)
            grad = np.clip(grad, -5, +5)
            adaG = getattr(my_adaGrad,attr)
            adaG = adaG + grad**2;
            setattr(my_adaGrad,attr,adaG)

            setattr(my_RNN, attr, getattr(my_RNN,attr) - l_rate/(np.sqrt(adaG+1e-10)) * grad)
        
        # Compute SmoothLoss
        if(epoch==0 and i == 0):
            smoothLoss.append(L[0])
        smoothLoss.append(.999* smoothLoss[i] + .001 *L[0]);
        
        # Print every display_step
        if(i%display_step == 0):
            print("\nCurrently at iteration {}.".format(i))
            print("\nSmooth Loss: " + repr(smoothLoss[i+1]) + "\n")
            print("Generated text at this point: \n")
            print(synt_seq(my_RNN, h_init, X[0], 200, char_to_ind, ind_to_char))
            print("\n\n")
        
        # Set h_init to the last hidden
        h_init = H[:,-1].reshape([h_states,1])
        
        
        
        
        




HBox(children=(IntProgress(value=0, description=u'BookLoop', max=1101143), HTML(value=u'')))


Currently at iteration 0.

Smooth Loss: 109.85620681793182

Generated text at this point: 

"0j!b6KfAp6�z:6Hl6FMsnm4ef;i1aQgDhaP/rdlmgAkUmwi�i-,XSjgD7-"wTApDu	TY3-'eC�FL�HS"�DMurV-_G:PsOQ^Qg�/�4"HDenga2lH�uyOaZZ1G!nC01QmdC�Y;R,Q4_TJ,QIZOXe"iFnwb;nf/7SxWDf�Eql�.�JA6�OHfkkGI?!Yg(TOV:pu6DRm/o6




Currently at iteration 100000.

Smooth Loss: 40.28380495702906

Generated text at this point: 

lt."Yowhing, Fremebend tark saise," starem,"bey'ver, then't hitheamse he pancre fom givel bigh?""Yscron tore,."Yom sfiven his ming, sming, dane gotmingench, Ands thimed bok?" saign, gite.?""Youssifech




Currently at iteration 200000.

Smooth Loss: 39.19067700579037

Generated text at this point: 

re sneareef, and said you, snerding fios, around ind, Winkvery, with ring, enong, were sare goitey roin, Went Winky!  upereeeredw reenth, Whoked atend and hinky roped to asmunned to he site saidend fi




Currently at iteration 300000.

Smooth Loss: 41.660134441843304

Generated text at this point: 

!" s

In [38]:
print("A bit longer synthesized text piece: \n")
print(synt_seq(my_RNN, np.zeros([h_states,1]), ' ', 500, char_to_ind, ind_to_char))

A bit longer synthesized text piece: 

beteming him," said Harry," he saidits, he cur our land out here see olly him to fall no weel.  He pletle wotto Goyt, worrie Gand wor was sthill what have the come!"Heaving to clasted at Harry tight.  Malfoy next . he had ears. "Harry were waid yeche, the bours, not was wast it, in don't of their was samently seever said have withe horrie. Wheenf pirking Go.  Skepted on him."He con't curs whepeorge.    say at to, whost, take would had deent it on his have gold wasn'the dound at ssleek."Harry alm
