# Using a NumPy Implementation of a Recurrent Neural Network to Generate New Trump Speeches

In [None]:
import numpy as np

In [None]:
def sigmoid(x, deriv=False):
    """Evaluate the logistic sigmoid, or its derivative, element-wise.

    Both branches are written in terms of ``exp(-|x|)``, which always lies in
    (0, 1], so neither can overflow or produce ``nan`` for large ``|x|``.

    Args:
        x: Scalar or array of pre-activations.
        deriv: If False (default) return ``1 / (1 + exp(-x))``; if True
            return the derivative of the sigmoid evaluated at ``x``.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like ``x``.
    """
    x = np.asarray(x)

    # exp(-|x|) <= 1 for every x, so no intermediate value can become inf.
    decay = np.exp(-np.abs(x))

    if deriv:
        # sigma'(x) = e^{-x} / (1 + e^{-x})^2 is an even function of x, so the
        # same expression in terms of e^{-|x|} covers both signs.
        result = decay / (1 + decay) ** 2
    else:
        # x >= 0: 1 / (1 + e^{-x});  x < 0: e^{x} / (1 + e^{x}).
        result = np.where(x >= 0, 1 / (1 + decay), decay / (1 + decay))

    # Indexing with an empty tuple turns a 0-d result back into a scalar and
    # leaves n-d arrays unchanged.
    return result[()]
    
# Softmax function for output probabilities
def softmax(x):
    """Convert each row of a 2-D array of logits into a probability distribution.

    The row maximum is subtracted before exponentiating. This does not change
    the result mathematically, but keeps every exponent <= 0 so ``np.exp``
    cannot overflow and produce ``nan`` probabilities.

    Args:
        x: 2-D array; normalisation is performed along axis 1.

    Returns:
        Array shaped like ``x`` whose rows each sum to 1.
    """
    shifted = x - np.max(x, axis=1, keepdims=True)
    exps = np.exp(shifted)
    return exps / np.sum(exps, axis=1, keepdims=True)

class RecurrentNeuralNet:
    """Single-hidden-layer recurrent network trained with Adagrad.

    With row-vector inputs, one timestep of the forward pass is:

        h_t = sigmoid(x_t . l1weights + h_{t-1} . umatrix + l1biases)
        p_t = softmax(h_t . l2weights + l2biases)

    Relies on the module-level ``sigmoid`` and ``softmax`` functions.
    """

    def __init__(self, learning_rate, input_dim, hidden_dim, output_dim):
        """Create small random parameters and zeroed Adagrad accumulators.

        Args:
            learning_rate: Base step size used by ``apply_adagrad``.
            input_dim: Width of each (one-hot) input row.
            hidden_dim: Number of hidden units.
            output_dim: Width of each output probability row.
        """
        self.hidden_dim = hidden_dim
        self.input_dim = input_dim
        self.learning_rate = learning_rate

        # Input-to-hidden, hidden-to-hidden (recurrent) and hidden-to-output
        # parameters, all initialised with small Gaussian noise.
        self.l1weights = np.random.randn(input_dim, hidden_dim) * 0.01
        self.l1biases = np.random.randn(1, hidden_dim) * 0.01
        self.umatrix = np.random.randn(hidden_dim, hidden_dim) * 0.01
        self.l2weights = np.random.randn(hidden_dim, output_dim) * 0.01
        self.l2biases = np.random.randn(1, output_dim) * 0.01

        # Adagrad memory: running sum of squared gradients, one accumulator
        # per parameter, in the same order that compute_gradients returns
        # its gradients.
        self.adagrad_memory = (
            np.zeros_like(self.l1weights),
            np.zeros_like(self.l1biases),
            np.zeros_like(self.umatrix),
            np.zeros_like(self.l2weights),
            np.zeros_like(self.l2biases),
        )

    def sample(self, prev_hidden, startchar, length=200):
        """Generate a sequence by repeatedly sampling the next index.

        Each sampled index is fed back in as the next one-hot input.

        Args:
            prev_hidden: Hidden state of shape (1, hidden_dim) to start from.
                ``None`` means an all-zero state.
            startchar: Index of the first input; it is also the first element
                of the returned sequence.
            length: Number of indices to generate after ``startchar``.

        Returns:
            List of ``length + 1`` indices, beginning with ``startchar``.
        """
        Xs = [startchar]
        prev_char = startchar

        # Start from the caller's state instead of discarding it; fall back
        # to zeros only when no state is supplied.
        if prev_hidden is None:
            prev_hidden = np.zeros([1, self.hidden_dim])

        for _ in range(length):
            # One-hot encode the previous index
            X = np.zeros([1, self.input_dim])
            X[:, prev_char] = 1

            # Forward pass
            hidden = sigmoid(np.dot(X, self.l1weights) + np.dot(prev_hidden, self.umatrix) + self.l1biases)
            prev_hidden = hidden
            prob = softmax(np.dot(hidden, self.l2weights) + self.l2biases)

            # Sample from output probability distribution
            new_x = np.random.choice(range(self.input_dim), p=prob.ravel())

            Xs.append(new_x)
            prev_char = new_x

        return Xs

    def compute_gradients(self, X, Y, prev_h):
        """Run forward and backward passes (backpropagation through time).

        Args:
            X: Array of shape (T, input_dim); row ``i`` is the input at
                timestep ``i``.
            Y: Targets indexed by timestep; ``Y[i]`` is subtracted from the
                softmax output at step ``i`` (softmax cross-entropy gradient).
            prev_h: Hidden state of shape (1, hidden_dim) carried in from the
                previous sequence. It is copied, not modified.

        Returns:
            ``(gradients, last_hidden)`` where ``gradients`` is the tuple
            ``(l1weights, l1biases, umatrix, l2weights, l2biases)`` of
            gradients averaged over the T timesteps, and ``last_hidden`` is
            the hidden state after the final timestep. For an empty sequence
            the gradients are all zero and ``last_hidden`` is a copy of
            ``prev_h``.
        """
        params = (self.l1weights, self.l1biases, self.umatrix, self.l2weights, self.l2biases)

        # Starts with memory from the previous sequence, since a training
        # sequence may begin in the middle of the text
        initial_hidden = np.copy(prev_h)
        prev_hidden = initial_hidden

        steps = len(X)
        if steps == 0:
            # Nothing to learn from: zero gradients instead of the nan that
            # averaging over zero timesteps would give.
            return tuple(np.zeros_like(p) for p in params), prev_hidden

        # Buffers for the activations at every timestep
        H = []
        Yhat = []

        # Forward pass
        for i in range(steps):
            # Combine the current input with the previous hidden state
            pre_hidden = np.dot(X[i:i+1], self.l1weights) + np.dot(prev_hidden, self.umatrix) + self.l1biases
            hidden = sigmoid(pre_hidden)

            # Storing the current hidden state to be used in the next timestep
            prev_hidden = hidden

            pre_output = np.dot(hidden, self.l2weights) + self.l2biases
            output = softmax(pre_output)

            H.append(hidden)
            Yhat.append(output)

        # Backward pass

        # Running sums of the per-timestep gradients
        l1weights_sum, l1biases_sum, umatrix_sum, l2weights_sum, l2biases_sum = (
            np.zeros_like(p) for p in params
        )

        # Since the loss is not computed beyond the last timestep, the
        # gradient flowing back from the "next" timestep starts at 0
        next_Pre_H_grad = np.zeros([1, self.hidden_dim])

        # Gradients are calculated going backwards in time, since the hidden
        # gradient of the next timestep is needed to compute the hidden
        # gradient of the current one
        for i in reversed(range(steps)):

            # Gradient of softmax cross-entropy w.r.t. the logits
            Pre_Yhat_grad = Yhat[i] - Y[i]

            # Gradient of the hidden activations: contribution from this
            # step's output plus the one flowing back through the recurrent
            # connection from the next timestep
            H_grad = np.dot(Pre_Yhat_grad, self.l2weights.T) + np.dot(next_Pre_H_grad, self.umatrix.T)

            # Gradient of the hidden pre-activations. For the logistic
            # sigmoid, sigma'(z) = sigma(z) * (1 - sigma(z)), so the stored
            # activation is reused instead of re-evaluating exponentials.
            Pre_H_grad = H_grad * H[i] * (1 - H[i])

            # Storing this gradient to be used in the previous timestep
            next_Pre_H_grad = Pre_H_grad

            # Gradients for the weights and biases in the second layer
            l2weights_sum += np.dot(H[i].T, Pre_Yhat_grad)
            l2biases_sum += Pre_Yhat_grad

            # Gradient for the umatrix, which connects the hidden state of the
            # previous timestep to the current one. At the first timestep the
            # "previous" state is the one carried in from the caller.
            H_prev = H[i - 1] if i > 0 else initial_hidden
            umatrix_sum += np.dot(H_prev.T, Pre_H_grad)

            # Gradients for the weights and biases in the first layer
            l1weights_sum += np.dot(X[i:i+1].T, Pre_H_grad)
            l1biases_sum += Pre_H_grad

        # Average the gradients for all parameters over all timesteps
        gradients = (
            l1weights_sum / steps,
            l1biases_sum / steps,
            umatrix_sum / steps,
            l2weights_sum / steps,
            l2biases_sum / steps,
        )

        return gradients, prev_hidden

    def apply_adagrad(self, gradients):
        """Update every parameter in place with one Adagrad step.

        Args:
            gradients: Iterable of gradients ordered as
                ``(l1weights, l1biases, umatrix, l2weights, l2biases)``,
                each shaped like the corresponding parameter.
        """
        parameters = (self.l1weights, self.l1biases, self.umatrix, self.l2weights, self.l2biases)

        for theta, mem, grad in zip(parameters, self.adagrad_memory, gradients):

            # Accumulate squared gradients element-wise; the in-place += keeps
            # the arrays stored in self.adagrad_memory up to date
            mem += grad * grad

            # Effective learning rate shrinks as the accumulated memory
            # grows; 1e-8 guards against division by zero
            theta += -self.learning_rate * grad / np.sqrt(mem + 1e-8)
            

In [None]:
# Preprocessing: load the corpus and build the character <-> index mappings.

# The context manager closes the file as soon as it has been read.
# NOTE(review): assumes speeches.txt is UTF-8 encoded — adjust if it is not.
with open("speeches.txt", "r", encoding="utf-8") as speech_file:
    text = speech_file.read()

# Every unique character, sorted so that the index assigned to each character
# is the same on every run (iteration order of a set of strings is not).
in_to_char = sorted(set(text))

# Total unique characters
unique_chars = len(in_to_char)

# Dictionary that maps characters to their respective indices
char_to_in = {char: index for index, char in enumerate(in_to_char)}
print(len(text))

In [None]:
# Training hyper-parameters. Note that each "epoch" here is one gradient
# update on a single window of seq_length characters.
epochs = 150000
learning_rate = 1e-1
seq_length = 25
hidden_dim = 100
display_step = 1000

# Every training window needs seq_length input characters plus one more so
# that the targets (the inputs shifted by one) are complete.
if len(text) < seq_length + 1:
    raise ValueError(
        "training text has %d characters; at least %d are required"
        % (len(text), seq_length + 1)
    )

# Initializing the network to input and output
# one-hot encoded indices of characters, and
# training it to predict the next character in a
# sequence.
net = RecurrentNeuralNet(learning_rate, unique_chars, hidden_dim, unique_chars)

# Initializing 'bookmark' in text file and setting
# hidden state to 0
text_index = 0
prev_hidden = np.zeros((1, hidden_dim))

# Row indices 0..seq_length-1, reused to fill the one-hot matrices
timestep_rows = np.arange(seq_length)

for epoch in range(epochs):
    if text_index + seq_length + 1 >= len(text):
        # Resetting 'bookmark' in text file and setting
        # hidden state to 0, resetting memory
        text_index = 0
        prev_hidden = np.zeros((1, hidden_dim))

    # Getting input as the next seq_length characters in the text, and setting
    # output to the input offset by one. ie input: [0,1,2], output: [1,2,3]
    window = [char_to_in[x] for x in text[text_index:text_index + seq_length + 1]]
    input_text = window[:-1]
    output_text = window[1:]

    # Converting indexes into one-hot: row i gets a 1 in column index[i]
    input_1hot = np.zeros((seq_length, unique_chars))
    output_1hot = np.zeros((seq_length, unique_chars))
    input_1hot[timestep_rows, input_text] = 1
    output_1hot[timestep_rows, output_text] = 1

    # Updating sequence start index in text file; the check at the top of the
    # loop wraps it back to 0 when the next window would run past the end
    text_index += seq_length

    # Performing adaptive gradient descent
    gradients, prev_hidden = net.compute_gradients(input_1hot, output_1hot, prev_hidden)
    net.apply_adagrad(gradients)

    # Displaying samples every display_step-th iteration
    if epoch % display_step == 0:
        # prev_hidden is now the state after the whole window, so seed the
        # sample with the character that follows the window rather than
        # with the window's first character.
        raw = net.sample(prev_hidden, output_text[-1])
        print("Epoch", epoch)
        print("\n\n", "".join(in_to_char[i] for i in raw), "\n\n")

In [None]:
# Generate a 3000-character test sample, seeded with the letter 'I' and an
# all-zero hidden state, then print it as text.

initial_state = np.zeros((1, hidden_dim))
seed_index = char_to_in['I']
sample = net.sample(initial_state, seed_index, length=3000)

generated_text = "".join([in_to_char[index] for index in sample])
print("\n\n", generated_text, "\n\n")