# Language Modeling with CharRNN

Adapted by Sungwon Kim from the work of [Andrej Karpathy](http://karpathy.github.io/2015/05/21/rnn-effectiveness/), [Sherjil Ozair](https://github.com/sherjilozair/char-rnn-tensorflow/) and [SOLARIS](https://github.com/solaris33/char-rnn-tensorflow)

## Char-RNN

This code implements a multi-layer Recurrent Neural Network (RNN, LSTM, and GRU) for training/sampling from character-level language models. In other words, the model takes one text file as input and trains a Recurrent Neural Network that learns to predict the next character in a sequence. The RNN can then be used to generate text character by character that will look like the original training data. The context of this code base is described in detail in [Karpathy's blog post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/). *Explanation from Andrej's code*

Original code (written in Torch/Lua) : https://github.com/karpathy/char-rnn


<img src="http://karpathy.github.io/assets/rnn/charseq.jpeg" alt="nn" style="width: 400px;"/>


$ \log{p(x_{1:T})} = \sum_{t=1}^{T} \log{p(x_t \mid x_{<t})} $



## Character-level language modeling

### Mount Google Drive (ONLY for students using Google Colab)

In [0]:
# Mount Google Drive at /content/gdrive.
# Only meaningful inside Google Colab; skip this cell when running elsewhere.
from google.colab import drive
drive.mount('/content/gdrive')

In [0]:
# Make the notebook folder on the mounted drive the working directory so the
# relative paths used later (data_dir, the local utils.py) resolve from there.
import os
os.chdir("/content/gdrive/My Drive/Colab Notebooks/")

In [0]:
# IPython shell escape: list the contents of the current working directory.
!ls

In [0]:
# load a bunch of libraries
from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np
import time
import os
import sys

# this module is from the utils.py file of this folder
# it handles loading texts to digits (aka. tokens) which are recognizable for the model
from utils import TextLoader

## Hyperparameters

In [0]:
# Configuration shared by the data loader, the graph and the training loop.
data_dir = 'data/tinyshakespeare'  # corpus directory handed to TextLoader
batch_size = 50                    # sequences per batch (N)
seq_length = 50                    # tokens per sequence (T)
num_hidden = 256                   # embedding width and LSTM units per layer
learning_rate = 0.002              # step size passed to the Adam optimizer
num_epochs = 2                     # iterations of the outer training loop
num_layers = 2                     # number of LSTM layers (1 -> a single cell)
grad_clip = 1.                     # intended global-norm cap for gradient clipping

## Load Data

In [0]:
# Tip: look at the raw data yourself before modelling it -- poking around
# the tokens usually tells you a lot.
data_loader = TextLoader(data_dir, batch_size, seq_length)
chars, vocab, vocab_size = (data_loader.chars,
                            data_loader.vocab,
                            data_loader.vocab_size)
print(vocab_size)

# Rewind the loader and pull one batch to inspect.
data_loader.reset_batch_pointer()
x, y = data_loader.next_batch()

# Each array has shape (N, T), where N=batch_size and T=seq_length.
for batch in (x, y):
    print(batch)
    print(batch.shape)

In [0]:
# Look at the first sequence of the batch: the input tokens first ...
print(x[0])
# ... then its targets. y is x shifted to the left by one position, so the
# network is trained to predict the next token y given x.
print(y[0])

## TF Graph

### Placeholders

In [0]:
# Graph inputs. Dimensions are left unspecified so the same graph can be fed
# during both training and sampling.
#   X                : [batch_size, seq_length]
#   Y                : [batch_size * seq_length, vocab_size]
#   state_batch_size : scalar -- batch_size (Training) / 1 (Sampling)
X = tf.placeholder(dtype=tf.int32, shape=(None, None))
Y = tf.placeholder(dtype=tf.int32, shape=(None, None))
state_batch_size = tf.placeholder(dtype=tf.int32, shape=())

### Embedding

In [0]:
# Trainable embedding matrix with one num_hidden-wide row per vocabulary
# entry, initialised from a standard normal distribution.
# embedding_lookup : [batch_size, seq_length] --> [batch_size, seq_length, num_hidden]
embedding_init = tf.random_normal([vocab_size, num_hidden])
embedding = tf.Variable(embedding_init, dtype=tf.float32)
inputs = tf.nn.embedding_lookup(params=embedding, ids=X)

### RNN

In [0]:
# Build the recurrent core: one LSTMBlockCell per layer, wrapped in a
# MultiRNNCell only when more than one layer is requested.
assert num_layers > 0
cells = [rnn.LSTMBlockCell(num_hidden, forget_bias=1.0) for _ in range(num_layers)]
lstm_cell = cells[0] if num_layers == 1 else rnn.MultiRNNCell(cells)

# All-zero starting state; its batch dimension is supplied at run time
# through the state_batch_size placeholder.
initial_state = lstm_cell.zero_state(state_batch_size, tf.float32)

In [0]:
# Run the LSTM over the time axis of `inputs`, starting from `initial_state`.
# time_major=True  --> inputs shape : [timesteps, batch_size, num_hidden]
# time_major=False --> inputs shape : [batch_size, timesteps, num_hidden]  (used here)
# outputs      : [batch_size, timesteps, num_hidden]
# final_states : recurrent state returned by dynamic_rnn; the training and
#                sampling loops feed it back in place of initial_state
outputs, final_states = tf.nn.dynamic_rnn(cell=lstm_cell, inputs=inputs, initial_state=initial_state,
                                    time_major=False, dtype=tf.float32)

### Fully Connected Layer

In [0]:
# Fold the time axis into the batch axis so a single dense layer scores
# every position at once.
#   outputs : [batch_size, timesteps, num_hidden] --> [batch_size * timesteps, num_hidden]
#   logits  : [batch_size * timesteps, vocab_size]
#   probs   : softmax of logits, same shape
outputs = tf.reshape(outputs, shape=(-1, num_hidden))
logits = tf.layers.dense(inputs=outputs, units=vocab_size)
probs = tf.nn.softmax(logits)

### Define Loss, Optimizer, and Evaluation Metrics

In [0]:
with tf.variable_scope('optimizer'):
    # Loss: mean softmax cross-entropy between the logits and the targets Y,
    # averaged over every position of the batch.
    loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=logits, labels=Y))
    # Gradient clipping to avoid exploding gradients: rescale all gradients
    # together so their global norm does not exceed the `grad_clip`
    # hyperparameter (previously the threshold was hard-coded here and the
    # hyperparameter was silently ignored).
    tvars = tf.trainable_variables()
    grads, _ = tf.clip_by_global_norm(tf.gradients(loss_op, tvars), grad_clip)
    # Adam update applied to the clipped gradients.
    optimizer = tf.train.AdamOptimizer(learning_rate)
    train_op = optimizer.apply_gradients(zip(grads, tvars))

## Train

In [0]:
# Start training
sess = tf.Session()

# Run the initializer
sess.run(tf.global_variables_initializer())

# One-hot lookup table built once in NumPy: row k is the one-hot vector of
# token id k. Encoding the targets with tf.one_hot / tf.reshape inside the
# batch loop would add new ops to the graph on every iteration, so the graph
# (and the time per step) would keep growing for the whole run.
one_hot_table = np.eye(vocab_size, dtype=np.int32)

# `step` counts epochs (1-based); `b` is the batch index within the epoch.
for step in range(1, num_epochs+1):
    data_loader.reset_batch_pointer()

    # Each epoch starts from an all-zero recurrent state; within the epoch
    # the state returned by one batch is fed into the next one.
    state = sess.run(initial_state, feed_dict={state_batch_size : batch_size})

    for b in range(data_loader.num_batches):
        # batch_x, batch_y : [batch_size, seq_length]
        batch_x, batch_y = data_loader.next_batch()

        # batch_y : [batch_size, seq_length] --> [batch_size * seq_length, vocab_size]
        batch_y = one_hot_table[np.asarray(batch_y).reshape(-1)]

        # The recurrent state is fed as a nested structure keyed by
        # `initial_state` (same as the sampling cell). Unlike unpacking
        # (c, h) pairs by hand, this also works when num_layers == 1 and the
        # state is a single LSTM state tuple rather than a tuple of them.
        feed_dict = {X : batch_x, Y: batch_y, state_batch_size : batch_size,
                     initial_state: state}

        _, loss_print, state = sess.run([train_op, loss_op, final_states], feed_dict=feed_dict)

        # Report the loss every 50 batches, counted across epochs.
        global_step = (step-1) * data_loader.num_batches + b
        if global_step % 50 == 0:
            print("{}/{}, Epoch: {}, Loss: {:.3f}".format(
                          global_step,
                          num_epochs * data_loader.num_batches,
                          step,
                          loss_print))

## Sampling

In [0]:
print("Sampling")
num_sampling = 4000       # number of characters to generate
sampling_batch_size = 1   # sequences generated at once
prime = u' '         # start token : ' '
sampling_type = 2    # 0 : argmax, 1 : random, 2 : random after ' ' / argmax otherwise
# Zero states for initializing the RNN state. The existing `initial_state`
# op is evaluated with the sampling batch size (as the training cell does)
# instead of adding another zero_state subgraph to the graph.
state = sess.run(initial_state, feed_dict={state_batch_size: sampling_batch_size})

# function for random sampling
def weighted_pick(weights):
    """Draw a random index with probability proportional to ``weights``.

    Args:
        weights: sequence of non-negative numbers; they do not have to sum
            to 1. Multi-dimensional input is treated as flattened.

    Returns:
        int: an index ``i`` with ``0 <= i < len(weights)``, drawn with
        probability ``weights[i] / sum(weights)``. Entries whose weight is
        zero are never returned.

    Raises:
        ValueError: if ``weights`` is empty or its total is not positive.
    """
    cumulative = np.cumsum(weights)
    if cumulative.size == 0 or not cumulative[-1] > 0:
        raise ValueError("weights must be non-empty and have a positive sum")

    # Scale by the last cumulative value rather than a separately computed
    # np.sum: the two can differ by rounding, which could push the threshold
    # past the end of `cumulative` and yield an out-of-range index.
    threshold = np.random.rand() * cumulative[-1]

    # side='right' selects the first bucket whose cumulative weight is
    # strictly greater than the threshold, so zero-weight buckets are skipped
    # even when the threshold is exactly 0.
    index = int(np.searchsorted(cumulative, threshold, side='right'))

    # Guard against the product rounding up to the total itself.
    return min(index, cumulative.size - 1)

# Warm up the recurrent state on every prime character except the last one,
# so a multi-character prime actually conditions the generated text.
# (With a one-character prime this loop does nothing.)
for warm_char in prime[:-1]:
    warm_ids = np.full((sampling_batch_size, 1), vocab[warm_char], dtype=np.int32)
    state = sess.run(final_states,
                     feed_dict={X: warm_ids,
                                state_batch_size: sampling_batch_size,
                                initial_state: state})

pieces = [prime]  # generated text, joined once at the end
char = prime[-1]  # current input
for n in range(num_sampling):
    # Input for one step: the current character's token id, as int32 to
    # match the X placeholder, repeated for every row of the sampling batch.
    token_ids = np.full((sampling_batch_size, 1), vocab[char], dtype=np.int32)

    # One RNN step computation
    feed_dict = {X: token_ids, state_batch_size: sampling_batch_size, initial_state: state}
    probs_result, state = sess.run([probs, final_states], feed_dict=feed_dict)

    # probs_result : (sampling_batch_size, vocab_size) -> p : (vocab_size)
    # Every row received the same input and state, so row 0 is representative.
    p = probs_result[0]

    # Sampling Type
    # 0 : argmax sampling
    # 1 : random sampling
    # 2 : argmax sampling (previous token != ' '), random sampling (previous token == ' ')
    use_argmax = sampling_type == 0 or (sampling_type == 2 and char != ' ')
    sample = int(np.argmax(p)) if use_argmax else weighted_pick(p)

    pred = chars[sample]
    pieces.append(pred)  # Update sampling result
    char = pred          # Update current input

ret = ''.join(pieces)    # sampling result
print("Sampling result : {}".format(ret))

In [0]:
# Close the session opened in the training cell once sampling is finished.
sess.close()