# Recurrrent Neural Network 

In [1]:
# Loading the training data 
data = open('kafka.txt', 'r').read()
chars = list(set(data))
data_size, vocab_size = len(data), len(chars)
print('Data has %d chars, %d unique' % (data_size, vocab_size))

Data has 137628 chars, 80 unique



## Encode/Decode Char/Vector 

Neural Networks operate on vectors (a vector is an array of float). So we need a way to 
encode and decode a char as a vector.

We will count the number of unique chars (vocab_size). That will be the size of the vector.
The vector contains only zero except for the position of the char whereas the value is 1.


In [2]:
# Calculating the vocab_size .......
char_to_ix = {ch:i for i,ch in enumerate(chars)}
ix_to_char = {i:ch for i,ch in enumerate(chars)}
print (char_to_ix)
print (ix_to_char)

{'d': 0, 'e': 1, 'q': 59, ')': 70, 'b': 3, 'L': 17, 'z': 5, '.': 49, '8': 6, 'j': 8, '/': 44, 'P': 9, 'H': 10, 'x': 58, 'N': 11, '4': 46, 'p': 12, 'y': 13, 'B': 14, '9': 19, 'k': 16, 'c': 2, 'U': 18, 'X': 22, 'o': 20, 'K': 21, 'f': 23, '$': 25, 'g': 26, ',': 27, 'l': 28, 's': 30, '7': 74, 'v': 60, '"': 31, 'E': 4, 'h': 32, '!': 34, 'M': 39, '0': 41, ' ': 37, '3': 42, 'R': 61, 'O': 43, 'r': 7, ';': 45, '?': 47, 'Y': 38, 'I': 51, '%': 24, '@': 52, 'T': 33, 'C': 54, "'": 50, '6': 56, '\n': 57, 'n': 15, 'm': 36, 'W': 62, '2': 63, 'i': 64, 'ç': 40, 'V': 65, 'a': 66, 'A': 53, ':': 68, 't': 69, 'G': 48, 'F': 67, 'w': 71, 'J': 55, 'S': 35, 'u': 72, '*': 73, '5': 75, 'D': 29, '-': 76, 'Q': 77, '1': 78, '(': 79}
{0: 'd', 1: 'e', 2: 'c', 3: 'b', 4: 'E', 5: 'z', 6: '8', 7: 'r', 8: 'j', 9: 'P', 10: 'H', 11: 'N', 12: 'p', 13: 'y', 14: 'B', 15: 'n', 16: 'k', 17: 'L', 18: 'U', 19: '9', 20: 'o', 21: 'K', 22: 'X', 23: 'f', 24: '%', 25: '$', 26: 'g', 27: ',', 28: 'l', 29: 'D', 30: 's', 31: '"', 32: 'h', 

In [3]:
# Creating 2 dictionary to encode and decode a char to an int 
# We create a vector from a char like this :
# the dictionary defined above allows us to create a vector of size 61 instead of 256
# Here an example of char 'a'
# The vector contains only zeros, except at position char_to_ix['a'] where we put 1
import numpy as np
vector_for_char_a = np.zeros((vocab_size, 1))
vector_for_char_a[char_to_ix['a']] = 1
print(vector_for_char_a.ravel())


[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.]


## Network Architecture 
The neural network is made of 3 layers :
1. an input layer 
2. an hidden layer 
3. output layer 

All layers are fully connected to one another : each node of a layer are connected to all nodes of the next layer. The hidden layer is connected to the output and to itself : the values from an iteration are used for the next one.

To centralise values that matter for the training (hyperparametes) we also define the sequence length and learning rate.

In [4]:
# Hyperparameters
hidden_size = 100
seq_length = 25
learning_rate = 1e-1

Wxh = np.random.randn(hidden_size, vocab_size)* 0.01 # input to hidden
Whh = np.random.randn(hidden_size, hidden_size)* 0.01 # hidden to hidden 
Why = np.random.randn(vocab_size, hidden_size)* 0.001 # hidden to output
bh = np.zeros((hidden_size,1))
by = np.zeros((hidden_size,1))

## Loss Function 
It is a value that describe how giid is our model. The smaller the loss, better our model is. During training phase we want to minimize the loss.

The loss function calculates the loss but also the gradients.

* It performs a forward pass, calculate the next char given a char from the training set.
* It calculate the loss by computing the predicted char to the target char.
* It calculate the backward pass to calculate the gradients.

The function take as input :
 * a list of input char 
 * a list of target char 
 * and the previous hidden state 
 
This function outputs : 
 * The loss 
 * The gradient for each parameters between layers 
 * The last hidden state 
 
## Forward Pass : 

The forward pass use the parameters of the model (Wxh, Whh, Why, bh, by) to calculate the 
next char given a char from the training set.
xs[t] is the vector that encode the char at position t ps[t] is the probabilities for next char

hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1] +bh)
ys[t] = np.dot(Why, hs[t]) + by 
ps[t] = np.exp(ys[t])/np.sum(np.exp(ys[t]))

## Backward Pass 

The naive way to calculate all gradients would be to recalculate a loss for small variations for each parameters. This is possible but would be time consuming. There is a technics to calculate all gradients for all the parameters at once : the backdrop propagation.
Gradients are calculated in the oposite order of the forward pass, using simple technics.
#### Goal is to calculate gradients for the forward formula.
hs = input*Wxh + last_value_of_hidden_state*Whh + bh 
ys = hs*Why + by 

In [8]:
 def lossFun(inputs, targets, hprev):
        """
        inputs, targets are both list of integers.
        hprev is Hx1 array of initial hidden state 
        returns the loss, gradients on model parameters, and last hidden state 
        """
        # store out inputs, hidden states, outputs, and probability values 
        xs, hs, ys, ps, = {}, {}, {}, {} # Empty dicts 
        
        hs[-1] = np.copy(hprev)
        # initial loss as 0 
        loss = 0
        
        # forward pass 
        for t in range(len(inputs)):
            xs[t] = np.zeros((vocab_size, 1)) # encode on 1-of-k representation (we place a 0 vector as t-th input)
            xs[t][inputs[t]] = 1 # Inside that t-th input we use the integer in "inputs" list to set the correct 
            hs[t] = np.tanh(np.dot(Wxh, xs[t]) + np.dot(Whh, hs[t-1]) + bh) # hidden state 
            ys[t] = np.dot(Why, hs[t]) + by # unnormalized log probabilities for next chars 
            ps[t] = np.exp(ys[t]) / np.sum(np.exp(ys[t]))  # probabilities for next chars 
            loss += -np.log(ps[t][targets[t], 0])   # Softmax (cross-entropy loss)
        # Backward pass : compute gradients going backwards 
        # initialize vectors for gradient values for each set of weights 
        dwxh, dwhh, dwhy = np.zeros_like(Wxh), np.zeros_like(Whh), np.zeros_like(Why)
        dbh, dby = np.zeros_like(bh), np.zeros_like(by)
        dhnext = np.zeros_like(hs[0])
        for t in reversed(range(len(inputs))):
            # ouput probabilities 
            dy = np.copy(ps[t])
            # derive our first gradient 
            dy[targets[t]] -= 1 # backprop into y 
            # compute output gradient - output times hidden states transpose 
            # When we apply the transpose wight matrix 
            # We can think intuitively of this as moving the error backward 
            # through the network, giving us some sort of measure of the error 
            # at the output of the lth layer 
            # output gradient 
            dWhy += np.dot(dy, hs[t].T)
            # Derivative of output bias 
            dby += dy 
            # Backpropogate 
            dh = np.dot(Why.T, dy) + dhnext 
            dhraw = (1 - hs[t] * hs[t]) * dh 
            dbh += dhraw 
            dWxh += np.dot(dhraw, xs[t].T)
            dWhh += np.dot(dhraw, hs[t-1].T)
            dhnext = np.dot(Whh.T, dhraw)
        for dparam in [dWxh, dWhh, dWhy, dbh, dby]:
            np.clip(dparam, -5, 5, out = dparam)
        return loss, dWxh, dWhh, dWhy, dbh, dby, hs[len(inputs)-1]

             

## Create a sentence from the model 

In [12]:
# prediction, one full forward pass 

def sample(h, seed_ix, n):
    """
    sample a sequence of integers from the model 
    h is memory state, seed_ix is seed letter for first time step 
    n is how many characters to predict 
    """
    
    # create vector 
    x = np.zeros((vocab_size, 1))
    # customiize it for our seed char 
    x[seed_ix] = 1
    # list to store generated chars 
    ixes = []
    # for as many characters as we want to generate 
    for t in range(n):
        h =  np.tanh(np.dot(Wxh, x) + np.dot(Whh, h) + bh)
        # compute output (unormalised)
        y = np.dot(Why, h) + by 
        ## probabilities for next chars 
        p = np.exp(y) / np.sum(np.exp(y))
        # pick one with the highest probabilities
        ix = np.random.choice(range(vocab_size), p = p.ravel())
        # create a vector 
        x = np.zeros((vocab_size, 1))
        # customize it for the predicted char 
        x[ix] = 1
        # add it to the list 
        ixes.append(ix)
        
        txt = ''.join(ix_to_char[ix] for ix in ixes)
        print ('-----\n %s \n------' % (txt, ))
        
    hprev = np.zeros((hidden_size, 1))
    # predict the 200 characters given 'a'
    sample(hprev, char_to_ix['a'], 200)
                     