# RNN Author   
In this Notebook I trained a LSTM RNN model on the book Call Me By Your Name written by André Aciman.   
![](https://upload.wikimedia.org/wikipedia/en/7/77/Call_Me_By_Your_Name%2C_2007_book_cover.jpg)

In [1]:
import numpy as np
import pickle
import tensorflow as tf

## Data Preparation

### First, take a look at the data

In [2]:
file_path = './text/Call_Me_By_Your_Name.txt'
with open(file_path, encoding='utf-8', errors='ignore') as f:
    text = f.read()

In [3]:
text[:100]

'PART 1\n\nIf Not Later, When?\n\n\n"Later!" The word, the voice, the attitude.\nI\'d never heard anyone use'

In [4]:
print('number of unique words (roughly): {}'.format(len({word: None for word in text.split()})))
sentences = [sentence for sentence in text.split('\n') if len(sentence) > 0]
print('number of sentences: {}'.format(len(sentences)))

number of unique words (roughly): 11833
number of sentences: 1813


### Prepare the data   
In this section, I'll make a dictionary of vocabulary and it's corresponding index, and then convert all the words in the book to integers for the RNN model to take in. Punctuations should also be converted to symbols and put into training data.

In [5]:
# create the punctuation token look up dictionary
punc_token = {'.': '||Period||',
             ',': '||Comma||',
             '"': '||Quotation_Mark||',
             ';': '||Semicolon||',
             '!': '||Exclamation_Mark||',
             '?': '||Question_Mark||',
             '(': '||Left_Parentheses||',
             ')': '||Right_Parentheses||',
             '--': '||Dash||',
             '\n': '||Return||',}

# Replace the punctuation in the text with the token
for key, token in punc_token.items():
        text = text.replace(key, ' {} '.format(token))

text = text.lower()
text = text.split()

In [6]:
word_vocab = sorted(set(text))

vocab_to_int = {word: index for (index, word) in enumerate(set(word_vocab))}
int_to_vocab = {index: word for (word, index) in vocab_to_int.items()}

In [7]:
# encode the text, replace the word with index
encoded_text = np.array([vocab_to_int[c] for c in text], dtype=np.int32)
assert len(text) == len(encoded_text)

### Save the previous work for later use

In [8]:
pickle.dump((encoded_text, vocab_to_int, int_to_vocab, punc_token), open('preprocess.p', 'wb'))

# This is the check point   
load the data saved before

In [9]:
encoded_text, vocab_to_int, int_to_vocab, punc_token = pickle.load(open('preprocess.p', mode='rb'))
vocab_size = len(vocab_to_int)

## Build the Neural Network
I'll split the network to different parts first and then call them as functions

### Input placeholder
create the tensorflow placeholders for the input, target and learning rate

In [10]:
def get_input_placeholder():
    inputs = tf.placeholder(tf.int32, [None, None], name='input')
    target = tf.placeholder(tf.int32, [None, None], name='target')
    lr = tf.placeholder(tf.float32, name='learning_rate')
    return inputs, target, lr

### Embedding layer
For every input, it's like one_hot encodding, with only one value valid in the whole word vocabulary vector   
The matrix multiplication going into the first hidden layer will have almost all of the resulting values be zero. This a huge waste of computation.    
So I use the embedding layer for a look up table with the function    
      
`tf.nn.embedding_lookup()`      
the embed dim means the number of embedding dimentions
![](./img/embedding.png)

In [11]:
def get_embedding_layer(input_data, vocab_size, embed_dim):
    embedding = tf.Variable(tf.random_uniform([vocab_size, embed_dim], -1, 1))
    embed = tf.nn.embedding_lookup(embedding, input_data)
    return embed

### LSTM Cell
use `tf.contrib.rnn.BasicLSTMCell` and `tf.contrib.rnn.MultiRNNCell` to build the LSTM cell
![](./img/lstm.png) 

In [12]:
def get_lstm_cell(lstm_size, num_layers, batch_size, keep_prob = 1.0):
    def build_cell(lstm_size, keep_prob):
        lstm = tf.contrib.rnn.BasicLSTMCell(lstm_size)
       
        # Add dropout to the cell
        drop = tf.contrib.rnn.DropoutWrapper(lstm, output_keep_prob=keep_prob)
        return drop
    
    # Stack up multiple LSTM layers
    cell = tf.contrib.rnn.MultiRNNCell([build_cell(lstm_size, keep_prob) for _ in range(num_layers)])
    initial_state = tf.identity(cell.zero_state(batch_size, tf.float32), name='initial_state')
    
    return cell, initial_state

### RNN layer
use `tf.nn.dynamic_rnn()`  to create a recurrent neural network

In [13]:
def build_rnn(cell, inputs):
    output, state = tf.nn.dynamic_rnn(cell, inputs, dtype=tf.float32)
    final_state = tf.identity(state, name='final_state')
    return output, final_state

### Build the network with one fully connected layer

In [14]:
def get_logits(embedding, cell, vocab_size):
    output, final_state = build_rnn(cell, embedding)
    logits = tf.contrib.layers.fully_connected(output, vocab_size)
    return logits, final_state

## Batches of Data
To feed the network, the input data and taget should be formed to ![](./img/format.png)    
And the way to do this is ![](./img/method.png)

In [15]:
def get_batches(int_text, batch_size, seq_length):
    # remove the last batch which cannot be filled
    n_batch = len(int_text) // (batch_size*seq_length)
    int_text = int_text[:n_batch*batch_size*seq_length]
    text_shift = np.zeros_like(int_text)
    text_shift[:-1], text_shift[-1] = int_text[1:], int_text[0]

    text_reshape = np.array(int_text).reshape(batch_size, -1)
    text_shift_reshape = text_shift.reshape(batch_size, -1)

    final = []
    for i in range(n_batch):
        batch_input = text_reshape[:, i*seq_length:(i+1)*seq_length].reshape(batch_size, seq_length)
        batch_target = text_shift_reshape[:, i*seq_length:(i+1)*seq_length].reshape(batch_size, seq_length)
        combine = np.array([batch_input, batch_target])
        final.append(combine)
    return np.array(final)

# Train the network

### Hyperparameters

In [67]:
# Number of Epochs
num_epochs = 100
# Batch Size
batch_size = 128
# Size of the hidden layers in the LSTM cells
lstm_size = 256
# Number of lstm cells in one multiple RNN cell
num_layers = 1
# Embedding Dimension Size
embed_dim = 200
# Sequence Length
seq_length = 20
# Learning Rate
learning_rate = 0.01
#Keep prob in hidden lstm cell
keep_prob = 0.6

# Show stats for every n number of batches
show_every_n_batches = 100

save_dir = './save'

### Build the Graph

In [68]:
from tensorflow.contrib import seq2seq

train_graph = tf.Graph()
with train_graph.as_default():
    input_text, targets, lr = get_input_placeholder()
    input_data_shape = tf.shape(input_text)
    embedding = get_embedding_layer(input_text, vocab_size, embed_dim)
    cell, initial_state = get_lstm_cell(lstm_size, num_layers, input_data_shape[0], keep_prob)
    logits, final_state = get_logits(embedding, cell, vocab_size)

    # Probabilities for generating words
    probs = tf.nn.softmax(logits, name='probs')

    # Loss function
    cost = seq2seq.sequence_loss(
        logits,
        targets,
        tf.ones([input_data_shape[0], input_data_shape[1]]))

    # Optimizer
    optimizer = tf.train.AdamOptimizer(lr)

    # Gradient Clipping
    gradients = optimizer.compute_gradients(cost)
    capped_gradients = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gradients if grad is not None]
    train_op = optimizer.apply_gradients(capped_gradients)

### Train the network

In [69]:
batches = get_batches(encoded_text, batch_size, seq_length)

with tf.Session(graph=train_graph) as sess:
    sess.run(tf.global_variables_initializer())

    for epoch_i in range(num_epochs):
        state = sess.run(initial_state, {input_text: batches[0][0]})

        for batch_i, (x, y) in enumerate(batches):
            feed = {
                input_text: x,
                targets: y,
                initial_state: state,
                lr: learning_rate}
            train_loss, state, _ = sess.run([cost, final_state, train_op], feed)

            # Show every <show_every_n_batches> batches
            if (epoch_i * len(batches) + batch_i) % show_every_n_batches == 0:
                print('Epoch {:>3} Batch {:>4}/{}   train_loss = {:.3f}'.format(
                    epoch_i,
                    batch_i,
                    len(batches),
                    train_loss))

    # Save Model
    saver = tf.train.Saver()
    saver.save(sess, save_dir)
    print('Model Trained and Saved')

Epoch   0 Batch    0/36   train_loss = 8.944
Epoch   2 Batch   28/36   train_loss = 5.977
Epoch   5 Batch   20/36   train_loss = 5.775
Epoch   8 Batch   12/36   train_loss = 5.612
Epoch  11 Batch    4/36   train_loss = 5.597
Epoch  13 Batch   32/36   train_loss = 5.433
Epoch  16 Batch   24/36   train_loss = 5.364
Epoch  19 Batch   16/36   train_loss = 5.261
Epoch  22 Batch    8/36   train_loss = 5.021
Epoch  25 Batch    0/36   train_loss = 4.805
Epoch  27 Batch   28/36   train_loss = 4.821
Epoch  30 Batch   20/36   train_loss = 4.672
Epoch  33 Batch   12/36   train_loss = 4.559
Epoch  36 Batch    4/36   train_loss = 4.599
Epoch  38 Batch   32/36   train_loss = 4.424
Epoch  41 Batch   24/36   train_loss = 4.400
Epoch  44 Batch   16/36   train_loss = 4.379
Epoch  47 Batch    8/36   train_loss = 4.134
Epoch  50 Batch    0/36   train_loss = 4.039
Epoch  52 Batch   28/36   train_loss = 4.208
Epoch  55 Batch   20/36   train_loss = 4.144
Epoch  58 Batch   12/36   train_loss = 4.090
Epoch  61 

### Save the parameter

In [70]:
pickle.dump((seq_length, save_dir), open('params.p', 'wb'))

# Time to Write

### need a function to get the tensor
use `get_tensor_by_name()`

In [2]:
def get_tensors(loaded_graph):
    input_tensor = loaded_graph.get_tensor_by_name("input:0")
    initial_state = loaded_graph.get_tensor_by_name("initial_state:0")
    final_state = loaded_graph.get_tensor_by_name("final_state:0")
    prob_tensor = loaded_graph.get_tensor_by_name("probs:0")
    return input_tensor, initial_state, final_state, prob_tensor

### and a function to choose a word from the probabilities

In [3]:
def pick_word(probabilities, int_to_vocab):
    index = probabilities.argmax()
    return int_to_vocab[index]

### load the previous trained model and parameters

In [4]:
_, vocab_to_int, int_to_vocab, punc_token = pickle.load(open('preprocess.p', mode='rb'))
seq_length, load_dir = pickle.load(open('params.p', mode='rb'))

### ready to generate text

In [6]:
gen_length = 200

prime_word = 'oliver'

loaded_graph = tf.Graph()
with tf.Session(graph=loaded_graph) as sess:
    # Load saved model
    loader = tf.train.import_meta_graph(load_dir + '.meta')
    loader.restore(sess, load_dir)

    # Get Tensors from loaded model
    input_text, initial_state, final_state, probs = get_tensors(loaded_graph)

    # Sentences generation setup
    gen_sentences = [prime_word]
    prev_state = sess.run(initial_state, {input_text: np.array([[1]])})

    # Generate sentences
    for n in range(gen_length):
        # Dynamic Input
        dyn_input = [[vocab_to_int[word] for word in gen_sentences[-seq_length:]]]
        dyn_seq_length = len(dyn_input[0])

        # Get Prediction
        probabilities, prev_state = sess.run(
            [probs, final_state],
            {input_text: dyn_input, initial_state: prev_state})
        
        pred_word = pick_word(probabilities[dyn_seq_length-1], int_to_vocab)

        gen_sentences.append(pred_word)
    
    # Remove tokens
    text = ' '.join(gen_sentences)
    for key, token in punc_token.items():
        ending = ' ' if key in ['\n', '(', '"'] else ''
        text = text.replace(' ' + token.lower(), key)
    text = text.replace('\n ', '\n')
    text = text.replace('( ', '(')
        
    print(text)

oliver and, he were right?" i asked.
" i don't know what i wanted."
i didn't know what i was too much.
" and i know it," i said, to say, oliver, who had come with a hand his smile on the words that both didn't want to know how to last, i'd always had say this what i was happy, but that afternoon suddenly finally, i couldn't have been too long to get away him on the table, he saw his body would be the other. this was the first thing about san clemente."
" i was headed to sleep, you don't like to, he is said," he want," i know he was his shirt."
" i think how long i am when you don't know what things you were me," he said. the book? i asked.
" i may not know you."
" that's a way of looking over and i was going to say," i
