In [1]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
def softmax(s):
    """
    Implementation of the softmax activation function

    Args:
        s: an 1xd vector of a classifier's outputs

    Returns:
        An 1xd vector with the results of softmax given the input
        vector s.
    """
    exponents = np.exp(s - np.max(s, axis = 0)) # Max subtraction for numerical stability
    output_exp_sum = np.sum(exponents, axis = 0)
    p = exponents / output_exp_sum
    return p

In [3]:
class RNN:
    """
    Implementation of a simple RNN.
    
    Attributes:
        k: dimensionality of input
        m: Hidden state dimensionality
        eta: learning rate initial value
        seq_length: Length of input sequences used during training
    """
    
    
    def __init__(self, k, m, seq_length = 25,  sig = 0.01):
        '''
        Args:
            k: dimensionality of input
            m: Hidden state dimensionality
            seq_length: Length of input sequences used during training
            sig: standard deviation of normal distribution used to init-
                ialize the weights
        '''
        # Initialize hyperparameters
        self.m = m
        self.seq_length = seq_length
        
        # Initialize bias vectors
        self.b = np.zeros((m, 1))
        self.c = np.zeros((k, 1))
        # Initialize weight matrices
        
        self.U = np.random.randn(m, k) * sig
        self.W = np.random.randn(m, m) * sig
        self.V = np.random.randn(k, m) * sig
        
        
        # Initialize epsilon value
        self.epsilon = 1e-10
        
    def synthesize_seq(self, h0, x0, n):
        """
        Synthesizes a sequence of characters
        
        Args:
         h0: Hidden state at time 0.
         x0: First dummy input to RNN.
         n: Length of sequence to generate.
         
        """
        synthesized_seq = []
        h_t = h0
        x_t = x0
        
        for i in range(n):
            a_t = self.W.dot(h_t) + self.U.dot(x_t) + self.b
            h_t = np.tanh(a_t)

            o_t = self.V.dot(h_t) + self.c
            p_t = softmax(o_t)
            
            #sample character based on softmax output and store it
            sampled_char = np.random.choice(list(range(self.V.shape[0])), p = p_t.flatten())
            synthesized_seq.append(sampled_char)
        
        return synthesized_seq
    
    def cross_entropy_loss(self, h0, X, Y):
        """
        Calculates the cross entropy loss
        """
        log_X = np.multiply(Y , self.forwardPass(h0, X)[0]).sum(axis=0)
        log_X[log_X == 0] = np.finfo(float).eps
        return -np.log(log_X)

    def computeLoss(self, h0, X, Y):
        """
        Computes the loss of the network given a batch of data.
        
        Args:
            h0: Initial hidden state
            X_batch: NxD matrix with N data sample inputs
            Y_batch: NxD matrix with N data sample outputs
        
        Returns:
            A scalar float value corresponding to the loss.
        """        
        return np.sum(self.cross_entropy_loss(h0, X, Y))

    
    def forwardPass(self, h0, X):
        """
        Performs the forward pass for each timestep and returns
        the probability of each word in each timestep

        Args:
            h0: Initial hidden state
            X: Input matrix

        Returns:
            A matrix with the probability of each word in each timestep.
        """
        T = X.shape[1]
        P = np.zeros((X.shape[0], T))
        O = np.zeros((X.shape[0], T))
        H = np.zeros((self.m, T))
        A = np.zeros((self.m, T))
        h_t = h0
        for i in range(T):
            A[:,i] = (self.W.dot(h_t) + self.U.dot(X[:,i].reshape(-1, 1)) + self.b).flatten()
            h_t = np.tanh(A[:,i]).reshape(-1, 1)
            H[:,i] = h_t.flatten()
            O[:,i] = self.V.dot(h_t).flatten() + self.c.flatten()
            P[:,i] = softmax(O[:,i].reshape(-1, 1))[:,0]
        return P, O, H, A
    
    def backwardPass(self, X, Y, P, O, H, A, clipping = True):


        # Initialize gradients to zero matrices
        grad_U = np.zeros(self.U.shape)
        grad_W = np.zeros(self.W.shape)
        grad_V = np.zeros(self.V.shape)
        grad_b = np.zeros(self.b.shape)
        grad_c = np.zeros(self.c.shape)
        grad_h_next = np.zeros((self.m, 1))
        
        # Get total number of timesteps
        T = Y.shape[1]

        # For each timestep
        for t in reversed(range(T)):
            g = P[:,t] - Y[:,t] # Derivative with respect to o
            
            # Update gradients
            grad_c[:, 0] += g
            grad_V += np.outer(g, H[:,t])
            
            # Calculate x gradient with respect to A_t + 1
            
            if not (t == T - 1):
                grad_h = g.dot(self.V) + grad_a.dot(self.W)
            else:
                grad_h = g.dot(self.V) # Derivative of last hidden state 
            grad_a = grad_h.dot(np.diag(1 - np.tanh(A[:, t]) ** 2))
            
            grad_U += np.outer(grad_a, X[:,t])
            grad_W += np.outer(grad_a, H[:,t - 1])
            grad_b[:,0] += grad_a
            
        if clipping is True:
            grad_U[grad_U > 5] = 5
            grad_U[grad_U < -5] = -5
            grad_W[grad_W > 5] = 5
            grad_W[grad_W < -5] = -5
            grad_V[grad_V > 5] = 5
            grad_V[grad_V < -5] = -5
            grad_b[grad_b > 5] = 5
            grad_b[grad_b < -5] = -5
            #grad_c[grad_c > 5] = 5
            grad_c[grad_c < -5] = -5
       
       
        return grad_W, grad_U, grad_V, grad_b, grad_c 

    def compute_grad_num_slow(self, X_batch, Y_batch, h0,  h = 1e-4):
        '''Centered difference gradient'''
        # Initialize all gradients to zero
        grad_W = np.zeros(self.W.shape)
        grad_U = np.zeros(self.U.shape) 
        grad_V = np.zeros(self.V.shape) 
        grad_b = np.zeros(self.b.shape)
        grad_c = np.zeros(self.c.shape)
 
        # Gradient w.r.t W
        for j in tqdm(range(self.W.shape[0])):
            for k in range(self.W.shape[1]):
                self.W[j, k] -= h
                c1 = self.computeLoss(h0, X_batch, Y_batch)
                self.W[j, k] += 2 * h
                c2 = self.computeLoss(h0, X_batch, Y_batch)
                self.W[j, k] -= h
                grad_W[j, k] = (c2-c1) / (2 * h)
       
        
         # Gradient w.r.t U
        for j in tqdm(range(self.U.shape[0])):
            for k in range(self.U.shape[1]):
                self.U[j, k] -= h
                c1 = self.computeLoss(h0, X_batch, Y_batch)
                self.U[j, k] += 2 * h
                c2 = self.computeLoss(h0, X_batch, Y_batch)
                self.U[j, k] -= h
                grad_U[j, k] = (c2-c1) / (2 * h)
       
         # Gradient w.r.t V
        for j in tqdm(range(self.V.shape[0])):
            for k in range(self.V.shape[1]):
                self.V[j, k] -= h
                c1 = self.computeLoss(h0, X_batch, Y_batch)
                self.V[j, k] += 2 * h
                c2 = self.computeLoss(h0, X_batch, Y_batch)
                self.V[j, k] -= h
                grad_V[j, k] = (c2-c1) / (2 * h)
       
        # Gradient w.r.t b
        for j in tqdm(range(self.b.shape[0])):
            self.b[j] -= h
            c1 = self.computeLoss(h0, X_batch, Y_batch)
            self.b[j] += 2 * h
            c2 = self.computeLoss(h0, X_batch, Y_batch)
            self.b[j] -= h
            grad_b[j] = (c2-c1) / (2 * h)
       
        # Gradient w.r.t c
        for j in tqdm(range(self.c.shape[0])):
            self.c[j] -= h
            c1 = self.computeLoss(h0, X_batch, Y_batch)
            self.c[j] += 2 * h
            c2 = self.computeLoss(h0, X_batch, Y_batch)
            self.c[j] -= h
            grad_c[j] = (c2-c1) / (2 * h)
       
    
        return grad_W, grad_U, grad_V, grad_b, grad_c
    

    def train(self, X, Y, h0, max_epochs = 10, eta = 0.01, synth_len = 200, n_loss_steps = 100,\
              n_synth_steps = 500, inv_char_dictionary = None):
        """
        Performs training with AdaGrad
        
        Args:
            X:
            Y:
            h0:
            eta: learning rate initial value
            n_loss_steps:
            n_synth_steps:
        """
        training_data_len = X.shape[1]
        tr_sequence_no = training_data_len - self.seq_length + 1 # Number of available sequences in the training data
        synthesized_text_len = 200
        smooth_loss = self.computeLoss(h0, X[:,:self.seq_length], Y[:,:self.seq_length])
        smooth_loss_list = []
        
        # Initialize AdaGrad matrices
        ada_grad_V = np.zeros(self.V.shape)
        ada_grad_W = np.zeros(self.W.shape)
        ada_grad_U = np.zeros(self.U.shape)
        ada_grad_b = np.zeros(self.b.shape)
        ada_grad_c = np.zeros(self.c.shape)

        for epoch in tqdm(range(max_epochs)):
            
            print("Epoch: " + str(epoch))
            e = 0 # Initialize position in text
            h_prev = np.copy(h0) # Initialize hidden state to zero vector
            
            for s in range(tr_sequence_no):
                curr_iter = epoch * tr_sequence_no + s
                
                X_batch = X[:,s:s + self.seq_length]
                Y_batch = Y[:,s:s + self.seq_length]
                
                # Run forward pass
                P, O, H, A = rnn_model.forwardPass(h_prev, X_batch)

                # Run backward pass
                grad_W, grad_U, grad_V, grad_b, grad_c  = rnn_model.backwardPass(X_batch, Y_batch, P, O, H, A)
                
                # Update AdaGrad matrices
                ada_grad_V += grad_V ** 2
                ada_grad_W += grad_W ** 2
                ada_grad_U += grad_U ** 2
                ada_grad_b += grad_b ** 2
                ada_grad_c += grad_c ** 2
                # Update weight matrices
                self.V += -eta * grad_V / np.sqrt(ada_grad_V + self.epsilon)
                self.W += -eta * grad_W / np.sqrt(ada_grad_W + self.epsilon)
                self.U += -eta * grad_U / np.sqrt(ada_grad_U + self.epsilon)
                self.b += -eta * grad_b / np.sqrt(ada_grad_b + self.epsilon)
                self.c += -eta * grad_c / np.sqrt(ada_grad_c + self.epsilon)
                
                # Compute smoothened loss
                loss = self.computeLoss(h_prev, X_batch, Y_batch)
                smooth_loss = .999 * smooth_loss + .001 * loss;
                smooth_loss_list.append(smooth_loss)
                if curr_iter % n_loss_steps == 0:
                    print("Global step: " + str(curr_iter) + " Smoothened loss: " + str(smooth_loss))
                
                # Check iteration number and print loss if verbose
                h_prev = H[:, 0].reshape(-1, 1)
                
                # Synthesize text
                if char_dictionary is not None and curr_iter % n_synth_steps == 0:
                    print(''.join(indicesToText(self.synthesize_seq(h0, X_batch[:,0].reshape(-1, 1), synth_len),\
                                                inv_char_dictionary)))
                    
                

In [4]:
def onehot_encode(chars, char_dictionary):
    """
    Encodes a string of characters to a matrix with one hot encoding.
    
    Args:
        chars: The input string
        char_dictionary: A dictionary that maps each possible character
            of the vocabulary being used to a unique index.
        
    Returns: 
        A NxM matrix where N is the number of distinct characters in the
        vocabulary and M is the number of characters in the string.
    """
    N = len(char_dictionary.keys())
    M = len(chars)
    encoded_string = np.zeros((N, M))
    for i, char in enumerate(chars):
        unique_index = char_dictionary[char]
        encoded_string[unique_index, i] = 1
    return encoded_string        

In [5]:
def getRelativeErrors(grad1, grad2):
    """
    Computes the relative errors of grad_1 and grad_2 gradients
    """
    abs_diff = np.absolute(grad1 - grad2) 
    abs_sum = np.absolute(grad1) + np.absolute(grad2)
    max_elems = np.where(abs_sum > np.finfo(float).eps, abs_sum, np.finfo(float).eps)
    relativeErrors = abs_diff / max_elems
    return relativeErrors

In [6]:
def indicesToText(indices, dictionary):
    """
    Takes the indices of each character as an input and
    returns a string according to a given dictionary.
    """
    return [dictionary[index] for index in indices]

###  Read in the data

In [7]:
with open('goblet_book.txt', 'r') as fileobj:
    data = fileobj.read()

In [8]:
# Get dictionary of unique characters in the book
characters = set(data)
char_dictionary = dict([ (elem, i) for i, elem in enumerate(characters) ])
inv_char_dictionary = {v: k for k, v in char_dictionary.items()}
voc_size = len(char_dictionary)

### Extract input and output data using one-hot encoding

In [9]:
seq_length = 25
X_chars = data[:seq_length]
Y_chars = data[1:seq_length + 1]
X = onehot_encode(X_chars, char_dictionary)
Y = onehot_encode(Y_chars, char_dictionary) 

In [10]:
# Initialize dimensionality of hidden state
m = 5
# Initialize the initial hidden state to a zero vector
h0 = np.zeros((m, 1))

In [11]:
rnn_model = RNN(k = voc_size, m = m)

In [12]:
rnn_model.synthesize_seq(h0, X[:,0].reshape(-1, 1), 10)

[7, 23, 7, 10, 35, 82, 0, 28, 17, 52]

In [13]:
P, O, H, A = rnn_model.forwardPass(h0, X)

In [14]:
grad_W, grad_U, grad_V, grad_b, grad_c  = rnn_model.backwardPass(X, Y, P, O, H, A)

In [15]:
approx_grad_W, approx_grad_U, approx_grad_V, approx_grad_b, approx_grad_c = rnn_model.compute_grad_num_slow(X, Y, h0)

100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 149.26it/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00,  9.85it/s]
100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 162.73it/s]
100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 666.61it/s]
100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 825.88it/s]


In [16]:
errorsW = getRelativeErrors(grad_W, approx_grad_W)
errorsU = getRelativeErrors(grad_U, approx_grad_U)
errorsV = getRelativeErrors(grad_V, approx_grad_V)
errorsb = getRelativeErrors(grad_b, approx_grad_b)
errorsc = getRelativeErrors(grad_c, approx_grad_c)
print(np.max(errorsW))
print(np.max(errorsU))
print(np.max(errorsV))
print(np.max(errorsb))
print(np.max(errorsc))

1.0
7.182186223727895e-08
1.0007862356348196e-05
3.2961050049577092e-09
9.463155187121914e-10


### Train the RNN using AdaGrad

In [None]:
# Encode the whole dataset using one-hot encoding
X_chars = data[:len(data) - 2]
Y_chars = data[1:len(data) - 1]
X = onehot_encode(X_chars, char_dictionary)
Y = onehot_encode(Y_chars, char_dictionary) 

In [None]:
m = 5
seq_length = 25
rnn_model = RNN(k = voc_size, m = m, seq_length = seq_length)
max_epochs = 5
h0 = np.zeros((m, 1))
rnn_model.train(X, Y, h0, max_epochs = max_epochs, inv_char_dictionary = inv_char_dictionary)

  0%|                                                    | 0/5 [00:00<?, ?it/s]

Epoch: 0
Global step: 0 Smoothened loss: 110.47228229939819
)KΆM0RuLJ"d}E YO;',6	7::jIZF
Ympa)jN;lMu_1OI-9T7M0:;A	dEΌ('7HNUdSn	vDBIbC
pMa)^rPj_-LaS^K}4/?KcqD_t44V grFΆ(LsΆ"Of!TZn(	x(yI^MMbZgJEAΓQ09nlnLe ΌM9Γ
J0NXXΆyi2NFU€B}2nΌOoΌecUN4S'C:eUsy1iS2(.wWoB3p"vvGsΆ
Global step: 100 Smoothened loss: 109.56628050051826
Global step: 200 Smoothened loss: 108.24492820066811
Global step: 300 Smoothened loss: 106.3533763562884
Global step: 400 Smoothened loss: 104.35306632506315
Global step: 500 Smoothened loss: 102.51802564017974
xyftlFsgyrrLnlThi βt; eTrdi( lEetkect €eHsns eaaLadrHvy   es	 if eGRdrrRDOfIQeBSornb} hf_k"idRst9HhEneueXtutdl  AnE
sgtGdmRfnyr:w eeHTeo eOGfOaJVvISHattvri  lou elgcCC
nvhTiHSltcFVebeEem elsUnFfee0sik
Global step: 600 Smoothened loss: 100.4909856709797
Global step: 700 Smoothened loss: 98.27820340562073
Global step: 800 Smoothened loss: 96.46557829448713
Global step: 900 Smoothened loss: 94.78085027380502
Global step: 1000 Smoothened loss: 93.11429722915433
H lkrEdemtee 