In [2]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [3]:
def softmax(s):
    """
    Implementation of the softmax activation function

    Args:
        s: an 1xd vector of a classifier's outputs

    Returns:
        An 1xd vector with the results of softmax given the input
        vector s.
    """
    exponents = np.exp(s - np.max(s, axis = 0)) # Max subtraction for numerical stability
    output_exp_sum = np.sum(exponents, axis = 0)
    p = exponents / output_exp_sum
    return p

In [373]:
class RNN:
    """
    Implementation of a simple RNN.
    
    Attributes:
        k: dimensionality of input
        m: Hidden state dimensionality
        eta: learning rate initial value
        seq_length: Length of input sequences used during training
    """
    
    
    def __init__(self, k, m, seq_length = 25,  sig = 0.01):
        '''
        Args:
            k: dimensionality of input
            m: Hidden state dimensionality
            seq_length: Length of input sequences used during training
            sig: standard deviation of normal distribution used to init-
                ialize the weights
        '''
        # Initialize hyperparameters
        self.m = m
        self.seq_length = seq_length
        
        # Initialize bias vectors
        self.b = np.zeros((m, 1))
        self.c = np.zeros((k, 1))
        # Initialize weight matrices
        
        self.U = np.random.randn(m, k) * sig
        self.W = np.random.randn(m, m) * sig
        self.V = np.random.randn(k, m) * sig
        
        
        # Initialize epsilon value
        self.epsilon = 1e-10
        
    def synthesize_seq(self, h0, x0, n):
        """
        Synthesizes a sequence of characters
        
        Args:
         h0: Hidden state at time 0.
         x0: First dummy input to RNN.
         n: Length of sequence to generate.
         
        """
        synthesized_seq = []
        h_t = h0
        x_t = x0
        
        for i in range(n):
            a_t = self.W.dot(h_t) + self.U.dot(x_t) + self.b
            h_t = np.tanh(a_t)

            o_t = self.V.dot(h_t) + self.c
            p_t = softmax(o_t)
            
            #sample character based on softmax output and store it
            sampled_char = np.random.choice(list(range(self.V.shape[0])), p = p_t.flatten())
            synthesized_seq.append(sampled_char)
        
        return synthesized_seq
    
    def cross_entropy_loss(self, h0, X, Y):
        """
        Calculates the cross entropy loss
        """
        log_X = np.multiply(Y , self.forwardPass(h0, X)[0]).sum(axis=0)
        log_X[log_X == 0] = np.finfo(float).eps
        return -np.log(log_X)

    def computeLoss(self, h0, X, Y):
        """
        Computes the loss of the network given a batch of data.
        
        Args:
            h0: Initial hidden state
            X_batch: NxD matrix with N data sample inputs
            Y_batch: NxD matrix with N data sample outputs
        
        Returns:
            A scalar float value corresponding to the loss.
        """        
        return np.sum(self.cross_entropy_loss(h0, X, Y))

    
    def forwardPass(self, h0, X):
        """
        Performs the forward pass for each timestep and returns
        the probability of each word in each timestep

        Args:
            h0: Initial hidden state
            X: Input matrix

        Returns:
            A matrix with the probability of each word in each timestep.
        """
        T = X.shape[1]
        P = np.zeros((X.shape[0], T))
        O = np.zeros((X.shape[0], T))
        H = np.zeros((self.m, T))
        A = np.zeros((self.m, T))
        h_t = h0
        for i in range(T):
            A[:,i] = (self.W.dot(h_t) + self.U.dot(X[:,i].reshape(-1, 1)) + self.b).flatten()
            h_t = np.tanh(A[:,i]).reshape(-1, 1)
            H[:,i] = h_t.flatten()
            O[:,i] = self.V.dot(h_t).flatten() + self.c.flatten()
            P[:,i] = softmax(O[:,i].reshape(-1, 1))[:,0]
        return P, O, H, A
    
    def backwardPass(self, X, Y, P, O, H, A, clipping = True):


        # Initialize gradients to zero matrices
        grad_U = np.zeros(self.U.shape)
        grad_W = np.zeros(self.W.shape)
        grad_V = np.zeros(self.V.shape)
        grad_b = np.zeros(self.b.shape)
        grad_c = np.zeros(self.c.shape)
        grad_h_next = np.zeros((self.m, 1))
        
        # Get total number of timesteps
        T = Y.shape[1]

        # For each timestep
        for t in reversed(range(T)):
            g = P[:,t] - Y[:,t] # Derivative with respect to o
            
            # Update gradients
            grad_c[:, 0] += g
            grad_V += np.outer(g, H[:,t])
            
            # Calculate x gradient with respect to A_t + 1
            
            if not (t == T - 1):
                grad_h = g.dot(self.V) + grad_a.dot(self.W)
            else:
                grad_h = g.dot(self.V) # Derivative of last hidden state 
            grad_a = grad_h.dot(np.diag(1 - np.tanh(A[:, t]) ** 2))
            
            grad_U += np.outer(grad_a, X[:,t])
            grad_W += np.outer(grad_a, H[:,t - 1])
            grad_b[:,0] += grad_a
            
        #if clipping is True:
            #grad_U[grad_U > 5] = 5
            #grad_U[grad_U < -5] = -5
            #grad_W[grad_W > 5] = 5
            #grad_W[grad_W < -5] = -5
            #grad_V[grad_V > 5] = 5
            #grad_V[grad_V < -5] = -5
            #grad_b[grad_b > 5] = 5
            #grad_b[grad_b < -5] = -5
            ##grad_c[grad_c > 5] = 5
            #grad_c[grad_c < -5] = -5
       
       
        return grad_W, grad_U, grad_V, grad_b, grad_c 

    def compute_grad_num_slow(self, X_batch, Y_batch, h0,  h = 1e-4):
        '''Centered difference gradient'''
        # Initialize all gradients to zero
        grad_W = np.zeros(self.W.shape)
        grad_U = np.zeros(self.U.shape) 
        grad_V = np.zeros(self.V.shape) 
        grad_b = np.zeros(self.b.shape)
        grad_c = np.zeros(self.c.shape)
 
        # Gradient w.r.t W
        for j in tqdm(range(self.W.shape[0])):
            for k in range(self.W.shape[1]):
                self.W[j, k] -= h
                c1 = self.computeLoss(h0, X_batch, Y_batch)
                self.W[j, k] += 2 * h
                c2 = self.computeLoss(h0, X_batch, Y_batch)
                self.W[j, k] -= h
                grad_W[j, k] = (c2-c1) / (2 * h)
       
        
         # Gradient w.r.t U
        for j in tqdm(range(self.U.shape[0])):
            for k in range(self.U.shape[1]):
                self.U[j, k] -= h
                c1 = self.computeLoss(h0, X_batch, Y_batch)
                self.U[j, k] += 2 * h
                c2 = self.computeLoss(h0, X_batch, Y_batch)
                self.U[j, k] -= h
                grad_U[j, k] = (c2-c1) / (2 * h)
       
         # Gradient w.r.t V
        for j in tqdm(range(self.V.shape[0])):
            for k in range(self.V.shape[1]):
                self.V[j, k] -= h
                c1 = self.computeLoss(h0, X_batch, Y_batch)
                self.V[j, k] += 2 * h
                c2 = self.computeLoss(h0, X_batch, Y_batch)
                self.V[j, k] -= h
                grad_V[j, k] = (c2-c1) / (2 * h)
       
        # Gradient w.r.t b
        for j in tqdm(range(self.b.shape[0])):
            self.b[j] -= h
            c1 = self.computeLoss(h0, X_batch, Y_batch)
            self.b[j] += 2 * h
            c2 = self.computeLoss(h0, X_batch, Y_batch)
            self.b[j] -= h
            grad_b[j] = (c2-c1) / (2 * h)
       
        # Gradient w.r.t c
        for j in tqdm(range(self.c.shape[0])):
            self.c[j] -= h
            c1 = self.computeLoss(h0, X_batch, Y_batch)
            self.c[j] += 2 * h
            c2 = self.computeLoss(h0, X_batch, Y_batch)
            self.c[j] -= h
            grad_c[j] = (c2-c1) / (2 * h)
       
    
        return grad_W, grad_U, grad_V, grad_b, grad_c
    

    def train(self, X, Y, h0, max_epochs = 10, eta = 0.01, synth_len = 200, n_loss_steps = 100, n_synth_steps = 500):
        """
        Performs training with AdaGrad
        
        Args:
            X:
            Y:
            h0:
            eta: learning rate initial value
            n_loss_steps:
            n_synth_steps:
        """
        training_data_len = X.shape[1]
        tr_sequence_no = training_data_len - self.seq_length + 1 # Number of available sequences in the training data
        synthesized_text_len = 200
        smooth_loss = self.computeLoss(h0, X[:,:self.seq_length], Y[:,:self.seq_length])
        smooth_loss_list = []
        
        # Initialize AdaGrad matrices
        ada_grad_V = np.zeros(self.V.shape)
        ada_grad_W = np.zeros(self.W.shape)
        ada_grad_U = np.zeros(self.U.shape)
        ada_grad_b = np.zeros(self.b.shape)
        ada_grad_c = np.zeros(self.c.shape)

        for epoch in tqdm(range(max_epochs)):
            
            print("Epoch: " + str(epoch))
            e = 0 # Initialize position in text
            h_prev = np.copy(h0) # Initialize hidden state to zero vector
            smooth_loss = 0.0 # Initialize smoothened loss
            
            for s in range(tr_sequence_no):
                curr_iter = epoch * tr_sequence_no + s
                
                X_batch = X[:,s:s + self.seq_length]
                Y_batch = Y[:,s:s + self.seq_length]
                
                # Run forward pass
                P, O, H = rnn_model.forwardPass(h_prev, X_batch)

                # Run backward pass
                grad_W, grad_U, grad_V, grad_b, grad_c  = rnn_model.backwardPass(X_batch, Y_batch, P, O, H)
                
                # Update AdaGrad matrices
                ada_grad_V += grad_V ** 2
                ada_grad_W += grad_W ** 2
                ada_grad_U += grad_U ** 2
                ada_grad_b += grad_b ** 2
                ada_grad_c += grad_c ** 2
                # Update weight matrices
                self.V += -eta * grad_V / np.sqrt(ada_grad_V + self.epsilon)
                self.W += -eta * grad_W / np.sqrt(ada_grad_W + self.epsilon)
                self.U += -eta * grad_U / np.sqrt(ada_grad_U + self.epsilon)
                self.b += -eta * grad_b / np.sqrt(ada_grad_b + self.epsilon)
                self.c += -eta * grad_c / np.sqrt(ada_grad_c + self.epsilon)
                
                # Compute smoothened loss
                loss = self.computeLoss(h_prev, X_batch, Y_batch)
                smooth_loss = .999 * smooth_loss + .001 * loss;
                smooth_loss_list.append(smooth_loss)
                if curr_iter % n_loss_steps == 0:
                    print("Global step: " + str(curr_iter) + " Smoothened loss: " + str(smooth_loss))
                
                # Check iteration number and print loss if verbose
                h_prev = H[:, 0].reshape(-1, 1)
                
                # Synthesize text
                if curr_iter % n_synth_steps == 0:
                    print(self.synthesize_seq(h0, X_batch[:,0].reshape(-1, 1), synth_len))
                    
                

In [374]:
def onehot_encode(chars, char_dictionary):
    """
    Encodes a string of characters to a matrix with one hot encoding.
    
    Args:
        chars: The input string
        char_dictionary: A dictionary that maps each possible character
            of the vocabulary being used to a unique index.
        
    Returns: 
        A NxM matrix where N is the number of distinct characters in the
        vocabulary and M is the number of characters in the string.
    """
    N = len(char_dictionary.keys())
    M = len(chars)
    encoded_string = np.zeros((N, M))
    for i, char in enumerate(chars):
        unique_index = char_dictionary[char]
        encoded_string[unique_index, i] = 1
    return encoded_string        

In [375]:
def getRelativeErrors(grad1, grad2):
    """
    Computes the relative errors of grad_1 and grad_2 gradients
    """
    abs_diff = np.absolute(grad1 - grad2) 
    abs_sum = np.absolute(grad1) + np.absolute(grad2)
    max_elems = np.where(abs_sum > np.finfo(float).eps, abs_sum, np.finfo(float).eps)
    relativeErrors = abs_diff / max_elems
    return relativeErrors

###  Read in the data

In [376]:
with open('goblet_book.txt', 'r') as fileobj:
    data = fileobj.read()

In [377]:
# Get dictionary of unique characters in the book
characters = set(data)
char_dictionary = dict([ (elem, i) for i, elem in enumerate(characters) ])
inv_char_dictionary = {v: k for k, v in char_dictionary.items()}
voc_size = len(char_dictionary)

### Extract input and output data using one-hot encoding

In [378]:
seq_length = 25
X_chars = data[:seq_length]
Y_chars = data[1:seq_length + 1]
X = onehot_encode(X_chars, char_dictionary)
Y = onehot_encode(Y_chars, char_dictionary) 

In [379]:
# Initialize dimensionality of hidden state
m = 5
# Initialize the initial hidden state to a zero vector
h0 = np.zeros((m, 1))

In [380]:
rnn_model = RNN(k = voc_size, m = m)

In [381]:
rnn_model.synthesize_seq(h0, X[:,0].reshape(-1, 1), 10)

[56, 42, 59, 73, 33, 76, 7, 33, 25, 26]

In [382]:
P, O, H, A = rnn_model.forwardPass(h0, X)

In [383]:
grad_W, grad_U, grad_V, grad_b, grad_c  = rnn_model.backwardPass(X, Y, P, O, H, A)

In [384]:
approx_grad_W, approx_grad_U, approx_grad_V, approx_grad_b, approx_grad_c = rnn_model.compute_grad_num_slow(X, Y, h0)

100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 133.31it/s]
100%|████████████████████████████████████████████| 5/5 [00:00<00:00,  8.35it/s]
100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 138.91it/s]
100%|███████████████████████████████████████████| 5/5 [00:00<00:00, 526.76it/s]
100%|█████████████████████████████████████████| 83/83 [00:00<00:00, 680.32it/s]


In [385]:
errorsW = getRelativeErrors(grad_W, approx_grad_W)
errorsU = getRelativeErrors(grad_U, approx_grad_U)
errorsV = getRelativeErrors(grad_V, approx_grad_V)
errorsb = getRelativeErrors(grad_b, approx_grad_b)
errorsc = getRelativeErrors(grad_c, approx_grad_c)
print(np.max(errorsW))
print(np.max(errorsU))
print(np.max(errorsV))
print(np.max(errorsb))
print(np.max(errorsc))

0.2937897670478581
2.9185449166592416e-08
2.3203236896118015e-07
3.3496117930260445e-09
9.730917174015548e-10


### Train the RNN using AdaGrad

In [331]:
# Encode the whole dataset using one-hot encoding
X_chars = data[:len(data) - 2]
Y_chars = data[1:len(data) - 1]
X = onehot_encode(X_chars, char_dictionary)
Y = onehot_encode(Y_chars, char_dictionary) 

In [409]:
m = 5
seq_length = 25
rnn_model = RNN(k = voc_size, m = m, seq_length = seq_length)
max_epochs = 5
h0 = np.zeros((m, 1))
rnn_model.train(X, Y, h0, max_epochs = max_epochs)



  0%|                                                    | 0/5 [00:00<?, ?it/s]

Epoch: 0
Global step: 0 Smoothened loss: 0.11004831408585307
[44, 24, 3, 69, 53, 17, 69, 1, 24, 38, 65, 56, 27, 81, 47, 76, 71, 7, 68, 60, 20, 66, 81, 11, 2, 58, 21, 25, 48, 52, 61, 27, 43, 63, 68, 80, 74, 36, 20, 9, 79, 11, 63, 70, 40, 45, 3, 20, 68, 21, 16, 4, 27, 21, 12, 81, 28, 76, 35, 56, 24, 80, 46, 71, 57, 55, 4, 73, 78, 67, 8, 38, 57, 44, 68, 74, 19, 59, 11, 22, 66, 71, 79, 77, 19, 9, 44, 76, 61, 76, 43, 64, 18, 25, 49, 17, 19, 14, 21, 24, 56, 14, 11, 58, 50, 57, 21, 52, 22, 10, 28, 64, 11, 36, 22, 15, 64, 6, 0, 40, 56, 64, 55, 25, 55, 18, 0, 63, 34, 25, 47, 20, 40, 36, 11, 12, 18, 24, 45, 54, 53, 58, 58, 61, 28, 65, 25, 74, 52, 20, 41, 9, 10, 59, 16, 59, 66, 78, 12, 24, 38, 61, 58, 52, 36, 78, 8, 52, 37, 22, 82, 61, 3, 75, 62, 53, 52, 41, 57, 44, 64, 2, 50, 53, 80, 1, 57, 45, 76, 70, 41, 44, 73, 10, 7, 47, 58, 62, 46, 26]
Global step: 100 Smoothened loss: 9.731144000064951
Global step: 200 Smoothened loss: 17.87449751813197
Global step: 300 Smoothened loss: 24.551589963826434




Global step: 600 Smoothened loss: 39.83416287234339
Global step: 700 Smoothened loss: 43.37866518320324
Global step: 800 Smoothened loss: 46.77866788132413
Global step: 900 Smoothened loss: 49.811501346459245
Global step: 1000 Smoothened loss: 52.41707961721767
[57, 47, 53, 81, 60, 28, 44, 77, 43, 42, 19, 19, 28, 43, 78, 19, 7, 53, 32, 67, 81, 28, 10, 19, 19, 43, 28, 20, 67, 19, 55, 20, 44, 19, 25, 49, 19, 67, 53, 24, 19, 19, 19, 19, 28, 52, 80, 1, 28, 67, 19, 42, 20, 20, 28, 67, 19, 47, 19, 7, 28, 81, 19, 19, 51, 28, 19, 53, 19, 28, 53, 28, 67, 56, 77, 19, 28, 67, 28, 25, 32, 20, 24, 67, 67, 24, 72, 19, 19, 77, 81, 19, 77, 25, 47, 25, 7, 25, 28, 77, 25, 9, 19, 10, 67, 49, 19, 60, 30, 19, 19, 53, 67, 66, 28, 31, 55, 66, 28, 67, 51, 19, 53, 19, 7, 28, 78, 22, 78, 19, 24, 32, 53, 53, 40, 67, 32, 19, 19, 49, 19, 7, 33, 53, 81, 65, 28, 16, 7, 81, 67, 28, 19, 44, 81, 19, 66, 67, 43, 19, 33, 19, 28, 44, 67, 28, 81, 28, 44, 78, 68, 66, 31, 19, 81, 66, 28, 67, 42, 81, 53, 45, 75, 39, 63, 19, 6

[13, 38, 14, 49, 82, 18, 4, 48, 3, 35, 42, 21, 38, 70, 25, 46, 29, 26, 63, 0, 14, 23, 3, 48, 12, 68, 27, 71, 21, 3, 56, 46, 3, 5, 41, 22, 15, 0, 82, 82, 79, 8, 56, 70, 27, 23, 21, 58, 3, 14, 31, 23, 46, 72, 27, 23, 59, 7, 40, 3, 41, 49, 50, 54, 30, 13, 71, 78, 38, 50, 11, 54, 12, 42, 68, 68, 37, 69, 65, 71, 64, 56, 45, 1, 3, 8, 69, 0, 82, 27, 82, 23, 71, 22, 48, 22, 29, 54, 27, 46, 41, 41, 35, 80, 36, 61, 65, 36, 34, 45, 82, 38, 15, 16, 36, 15, 60, 34, 45, 48, 15, 17, 34, 46, 80, 68, 34, 45, 26, 12, 27, 34, 34, 69, 46, 68, 39, 11, 35, 34, 75, 36, 72, 45, 6, 12, 31, 17, 54, 12, 69, 34, 29, 48, 38, 70, 65, 34, 14, 68, 37, 41, 54, 27, 41, 61, 46, 23, 51, 62, 72, 0, 12, 38, 0, 15, 26, 74, 65, 21, 35, 69, 0, 71, 68, 58, 5, 54, 43, 22, 45, 72, 48, 76, 22, 23, 76, 46, 16, 57]
Global step: 4600 Smoothened loss: 76.03606808861257
Global step: 4700 Smoothened loss: 75.70532852254439
Global step: 4800 Smoothened loss: 75.45692289373115
Global step: 4900 Smoothened loss: 75.34219110255007
Global s

Global step: 8200 Smoothened loss: 73.33245807427305
Global step: 8300 Smoothened loss: 73.15962200438625
Global step: 8400 Smoothened loss: 73.23006541424748
Global step: 8500 Smoothened loss: 73.2396209547218
[80, 28, 81, 43, 47, 75, 74, 47, 19, 10, 67, 44, 10, 19, 7, 44, 25, 81, 19, 74, 19, 19, 28, 19, 44, 19, 77, 7, 19, 28, 19, 25, 19, 28, 74, 19, 81, 7, 7, 25, 28, 44, 19, 19, 4, 44, 74, 33, 77, 44, 28, 81, 1, 7, 28, 60, 19, 47, 19, 19, 42, 28, 43, 19, 28, 81, 28, 81, 28, 19, 19, 53, 19, 81, 19, 32, 19, 19, 19, 25, 19, 28, 10, 32, 31, 19, 19, 67, 43, 19, 10, 7, 44, 20, 19, 28, 28, 81, 60, 47, 44, 19, 19, 28, 19, 19, 19, 19, 32, 19, 19, 81, 77, 28, 19, 31, 28, 25, 44, 25, 28, 28, 28, 53, 28, 20, 44, 19, 19, 19, 77, 25, 19, 44, 19, 19, 67, 19, 31, 19, 73, 53, 19, 25, 4, 81, 28, 53, 10, 55, 44, 74, 10, 19, 44, 25, 7, 67, 19, 19, 28, 20, 53, 77, 81, 81, 10, 44, 43, 31, 20, 19, 53, 19, 24, 28, 7, 28, 28, 7, 19, 44, 75, 19, 28, 13, 28, 28, 19, 44, 19, 28, 19, 19, 74, 19, 10, 7, 25, 28]
G

[11, 82, 9, 46, 48, 46, 35, 65, 29, 41, 16, 78, 14, 79, 29, 59, 3, 3, 58, 48, 72, 26, 48, 72, 66, 38, 41, 71, 70, 68, 72, 38, 26, 34, 2, 16, 3, 65, 69, 48, 41, 64, 11, 45, 82, 69, 71, 27, 27, 54, 52, 70, 38, 16, 14, 14, 58, 27, 54, 69, 69, 75, 29, 80, 12, 15, 82, 82, 21, 11, 80, 65, 14, 23, 11, 68, 16, 66, 71, 48, 29, 71, 39, 21, 48, 0, 22, 11, 14, 70, 29, 35, 14, 51, 14, 21, 27, 17, 80, 46, 23, 39, 64, 11, 61, 58, 23, 12, 35, 3, 29, 57, 69, 17, 62, 23, 34, 79, 69, 5, 80, 21, 15, 54, 15, 16, 82, 65, 46, 69, 48, 66, 72, 3, 22, 3, 22, 46, 14, 54, 34, 14, 21, 35, 3, 71, 79, 2, 21, 34, 16, 70, 21, 75, 45, 59, 39, 34, 11, 65, 65, 15, 5, 38, 82, 16, 64, 2, 69, 34, 11, 71, 12, 34, 0, 35, 29, 65, 82, 22, 27, 23, 26, 21, 37, 38, 80, 15, 26, 79, 64, 14, 41, 48, 65, 54, 3, 35, 70, 48]
Global step: 12100 Smoothened loss: 76.76815564187255
Global step: 12200 Smoothened loss: 76.63485206106571
Global step: 12300 Smoothened loss: 76.24810393238175
Global step: 12400 Smoothened loss: 76.63644067659084

KeyboardInterrupt: 