In this notebook I will show how to generate text using a Recurrent Neural Network. I will use Shakespeare's Sonnets for this exercise.

### Import

In [1]:
!pip install -q keras
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.optimizers import Adam

!pip install tqdm
from tqdm import tqdm
from urllib import urlretrieve

import tensorflow as tf

import matplotlib.pyplot as plt
%matplotlib inline

import os
import re
import random as rn
import numpy as np

# Randomness control: seed the Python, NumPy and TensorFlow random generators
# with one shared constant.
# NOTE(review): PYTHONHASHSEED is assigned after the interpreter has already
# started; presumably it does not change hashing in this running kernel — confirm.
os.environ['PYTHONHASHSEED'] = '0'
RANDOM_SEED = 3939
np.random.seed(RANDOM_SEED)  # NumPy global generator
rn.seed(RANDOM_SEED)  # Python's built-in `random` module (imported as `rn`)
# Restrict TensorFlow to a single thread for both intra-op and inter-op work.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, 
                              inter_op_parallelism_threads=1)

tf.set_random_seed(RANDOM_SEED)  # TensorFlow seed
# Build a session on the default graph with the single-threaded config and
# register it as the session used by the Keras backend.
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
keras.backend.set_session(sess)

Using TensorFlow backend.


Collecting tqdm
  Downloading https://files.pythonhosted.org/packages/78/bc/de067ab2d700b91717dc5459d86a1877e2df31abfb90ab01a5a5a5ce30b4/tqdm-4.23.0-py2.py3-none-any.whl (42kB)
[K    100% |████████████████████████████████| 51kB 1.8MB/s 
[?25hInstalling collected packages: tqdm
Successfully installed tqdm-4.23.0


### Constants

In [0]:
# Remote location of the raw text and the local file name it is saved under.
DATA_URL = "http://www.gutenberg.org/cache/epub/1041/pg1041.txt"
DATA_FILENAME = "sonnets.txt"

# Number of characters in one input window; the character right after the
# window is used as its label.
SEQ_LENGTH = 100
# Size of the last axis of the model input (one value per character position).
FEATURE_NUM = 1

### Fetch Data

In [3]:
class DLProgress(tqdm):
  """tqdm progress bar that can be advanced through a download report hook."""

  # Block number received on the previous `hook` call.
  last_block = 0

  def hook(self, block_num=1, block_size=1, total_size=None):
    """Advance the bar by the amount transferred since the previous call.

    block_num:  number of blocks transferred so far.
    block_size: size of one block.
    total_size: total size of the download; stored as the bar's total.
    """
    self.total = total_size
    blocks_since_last_call = block_num - self.last_block
    self.update(blocks_since_last_call * block_size)
    self.last_block = block_num

# Download the text to DATA_FILENAME, reporting progress through the bar's hook.
with DLProgress(desc="Shakespeare's Sonnets",
                unit="B",
                unit_scale=True,
                miniters=1) as pbar:
  urlretrieve(DATA_URL, filename=DATA_FILENAME, reporthook=pbar.hook)

Shakespeare's Sonnets: 123kB [00:00, 587kB/s]                             


### Load Data

In [0]:
# Read the whole downloaded file into memory.
# The handle is deliberately not called `file`: at notebook scope that name
# would shadow Python 2's built-in `file` type for every later cell.
with open(DATA_FILENAME, "r") as sonnets_file:
  data = sonnets_file.read()

### Cleaning Data
As the file is downloaded from the `gutenberg` website, the publisher has added some readme notes for the reader. In order to keep only Shakespeare's Sonnets, I will cut off those parts.

In [0]:
# Hard-coded offset of the first character to keep; it is specific to this
# particular download (everything before it is the publisher's preamble).
start_index = 740

# The text to keep ends with this line. The line is escaped so that its final
# '.' matches only a literal period instead of any character.
last_line = "Love's fire heats water, water cools not love."
end_match = re.search(re.escape(last_line), data)
if end_match is None:
  # Fail with a clear message instead of an AttributeError on `None.end()`.
  raise ValueError("Could not find the closing line of the sonnets in the downloaded text")
end_index = end_match.end()

In [0]:
data_cleaned = data[start_index:end_index]
# Keeps only the span between the two indices computed above; everything
# outside it is dropped from the working text.

Remove characters that cannot be decoded

In [0]:
# Byte values to blank out. NOTE(review): together 0xEF 0xBB 0xBF form the
# UTF-8 byte-order mark, which is presumably what this targets — confirm.
# Each value is replaced wherever it occurs, not only as a three-byte run.
removal_list = ["\xbb", "\xbf", "\xef"]
for char_to_remove in removal_list:
  # Substitute a space rather than deleting, so the text length is unchanged.
  data_cleaned = data_cleaned.replace(char_to_remove, " ")

To lower case

In [0]:
# Fold everything to lower case so upper/lower variants share one symbol.
data_cleaned = data_cleaned.lower()

Number of characters

In [9]:
# Length of the cleaned text.
print(len(data_cleaned))

102890


Keep only the first 50% of the data to speed up the process

In [0]:
# Midpoint of the cleaned text (rounded down); only the first half is kept.
split_index = len(data_cleaned) // 2
data_cleaned = data_cleaned[:split_index]

### Preprocessing Data

Mapping every unique character to integer id

In [0]:
# Sorted list of the distinct characters; a character's position is its id.
characters = sorted(set(data_cleaned))
# Forward lookup (id -> character) and its inverse (character -> id).
id_to_character = dict(enumerate(characters))
character_to_id = {symbol: index for index, symbol in id_to_character.items()}

In [12]:
# Display the character -> id mapping as the cell output.
character_to_id

{'\n': 0,
 '\r': 1,
 ' ': 2,
 '!': 3,
 "'": 4,
 ',': 5,
 '-': 6,
 '.': 7,
 ':': 8,
 ';': 9,
 '?': 10,
 'a': 11,
 'b': 12,
 'c': 13,
 'd': 14,
 'e': 15,
 'f': 16,
 'g': 17,
 'h': 18,
 'i': 19,
 'j': 20,
 'k': 21,
 'l': 22,
 'm': 23,
 'n': 24,
 'o': 25,
 'p': 26,
 'q': 27,
 'r': 28,
 's': 29,
 't': 30,
 'u': 31,
 'v': 32,
 'w': 33,
 'x': 34,
 'y': 35,
 'z': 36}

Create input/output sequences

In [13]:
def data_to_sequence(data, data_to_id_dict, seq_length=None):
  """Turn a sequence of symbols into sliding input windows and next-symbol labels.

  data:            indexable sequence of symbols (e.g. a string).
  data_to_id_dict: mapping from each symbol to its integer id.
  seq_length:      number of symbols per window; defaults to the module-level
                   SEQ_LENGTH when omitted.

  Returns a pair (windows, labels): windows[i] holds the ids of
  data[i:i + seq_length] and labels[i] is the id of data[i + seq_length].
  Both lists are empty when data is not longer than seq_length.

  Raises ValueError if seq_length is smaller than 1, and KeyError if a symbol
  that has to be encoded is missing from data_to_id_dict.
  """
  if seq_length is None:
    seq_length = SEQ_LENGTH
  if seq_length < 1:
    raise ValueError("seq_length must be a positive integer")

  window_count = len(data) - seq_length
  if window_count <= 0:
    return list(), list()

  # Encode the text once and slice the encoded list, instead of looking up
  # every symbol again for each overlapping window.
  ids = [data_to_id_dict[char] for char in data]

  seq_Xs = [ids[i:i + seq_length] for i in range(window_count)]
  # The label of window i is the symbol right after it, i.e. ids[i + seq_length].
  seq_Ys = ids[seq_length:]

  return seq_Xs, seq_Ys

# Encode the cleaned text into (window, next character) training pairs.
seq_Xs, seq_ys = data_to_sequence(data_cleaned, character_to_id)

# Preview the first two pairs.
preview_pairs = zip(seq_Xs[:2], seq_ys[:2])
for sample_window, sample_label in preview_pairs:
  print(sample_window, sample_label)

([2, 29, 25, 24, 24, 15, 30, 29, 1, 0, 1, 0, 12, 35, 2, 33, 19, 22, 22, 19, 11, 23, 2, 29, 18, 11, 21, 15, 29, 26, 15, 11, 28, 15, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 2, 19, 1, 0, 1, 0, 2, 2, 16, 28, 25, 23, 2, 16, 11, 19, 28, 15, 29, 30, 2, 13, 28, 15, 11, 30, 31, 28, 15, 29, 2, 33, 15, 2, 14, 15, 29, 19, 28, 15, 2, 19, 24, 13, 28, 15, 11, 29, 15, 5, 1, 0, 2, 2, 30], 18)
([29, 25, 24, 24, 15, 30, 29, 1, 0, 1, 0, 12, 35, 2, 33, 19, 22, 22, 19, 11, 23, 2, 29, 18, 11, 21, 15, 29, 26, 15, 11, 28, 15, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 2, 2, 19, 1, 0, 1, 0, 2, 2, 16, 28, 25, 23, 2, 16, 11, 19, 28, 15, 29, 30, 2, 13, 28, 15, 11, 30, 31, 28, 15, 29, 2, 33, 15, 2, 14, 15, 29, 19, 28, 15, 2, 19, 24, 13, 28, 15, 11, 29, 15, 5, 1, 0, 2, 2, 30, 18], 11)


Assemble train_X, train_y

In [0]:
# Inputs: one row per window, shaped (samples, SEQ_LENGTH, FEATURE_NUM).
train_X = np.reshape(seq_Xs, (len(seq_Xs), SEQ_LENGTH, FEATURE_NUM))
# One-hot labels. The number of classes is pinned to the vocabulary size:
# left to infer it, to_categorical uses (largest label seen + 1), which would
# be too small if the highest-id character never occurs as a label, and the
# model's output indices would then no longer line up with id_to_character.
train_y = keras.utils.to_categorical(seq_ys, num_classes=len(characters))

Normalize

In [0]:
# Scale the integer ids by the vocabulary size.
# NOTE: this overwrites train_X, so running the cell a second time divides the
# already-scaled values again; re-create train_X before re-running it.
train_X = train_X / float(len(characters))

### Model

Structure

In [0]:
# Shape of one sample (timesteps, features) and the number of output classes,
# both read off the training arrays.
sequence_shape = (train_X.shape[1], train_X.shape[2])
class_count = train_y.shape[1]

# Two stacked 700-unit LSTM layers, each followed by 20% dropout, ending in a
# softmax layer with one output per class.
model = Sequential([
  LSTM(700, input_shape=sequence_shape, return_sequences=True),
  Dropout(0.2),
  LSTM(700),
  Dropout(0.2),
  Dense(class_count, activation='softmax'),
])

model.compile(optimizer=Adam(), loss='categorical_crossentropy')

Training

In [0]:
# Training settings, gathered in one place; samples are fed in their original
# order (no shuffling).
fit_options = dict(epochs=50, batch_size=128, verbose=1, shuffle=False)
history = model.fit(train_X, train_y, **fit_options)

Epoch 1/50
Epoch 2/50

Epoch 3/50
Epoch 4/50

Epoch 5/50
Epoch 6/50
 9472/51345 [====>.........................] - ETA: 3:56 - loss: 2.2717

Epoch 7/50
Epoch 8/50
 6784/51345 [==>...........................] - ETA: 4:12 - loss: 2.1268

Epoch 9/50
Epoch 10/50
 5888/51345 [==>...........................] - ETA: 4:17 - loss: 2.0140

Epoch 11/50
Epoch 12/50
 5504/51345 [==>...........................] - ETA: 4:19 - loss: 1.8635

Epoch 13/50
Epoch 14/50
 5376/51345 [==>...........................] - ETA: 4:19 - loss: 1.6552

Epoch 15/50
Epoch 16/50
 5248/51345 [==>...........................] - ETA: 4:21 - loss: 1.5127

Epoch 17/50
Epoch 18/50
 5248/51345 [==>...........................] - ETA: 4:21 - loss: 1.2937

Epoch 19/50
Epoch 20/50
 5248/51345 [==>...........................] - ETA: 4:21 - loss: 1.1382

Epoch 21/50
Epoch 22/50
 5248/51345 [==>...........................] - ETA: 4:20 - loss: 0.9181



### Text generation

In [0]:
# Seed the generator with one training window. The window is copied: working
# on seq_Xs[99] itself would let the loop append to that training sample,
# leaving it one element too long and breaking any re-run of this cell or of
# the cell that reshapes seq_Xs.
string_mapped = list(seq_Xs[99])
full_string = [id_to_character[value] for value in string_mapped]

for i in range(1000):
    # Shape the current window as a single sample and scale it the same way
    # the training inputs were scaled.
    x = np.reshape(string_mapped, (1, len(string_mapped), 1))
    x = x / float(len(characters))

    # Greedy decoding: take the class with the highest predicted probability.
    pred_index = np.argmax(model.predict(x, verbose=0))
    full_string.append(id_to_character[pred_index])

    # Slide the window: drop the oldest id and add the predicted one, building
    # a new list each time so no other list is modified.
    string_mapped = string_mapped[1:] + [pred_index]

In [0]:
# Concatenate the seed and generated characters into one string.
generated_text = "".join(full_string)

In [37]:
# Show the seed text followed by the characters the model generated.
print(generated_text)

that thereby beauty's rose might never die,
  but as the riper should by time decease,
  his tender eearh,mo power, ow eear sweet seall 
  thene you au for iove in not to subi die 

 ar the waresmen of hil tpreng,donm
  bidtee a char bod ahouselfess in she say,
  i thould in thought in hos moti daiky    then bo bll soeselles of shy mind,
  and the firm soil with fistrected light
  ar wie rime's prurh of thu beauty s use,
  if thinki duml ban hold his swift foot back?
  or who his stoil tfe world with swestantac arine:
  berering my hreen all fare nnr croralts hn mow.
    yo shat sweet thieh which be iote of selte of thens srile,
    my lrve shall inves iive distred are,

  lxii

  o! hest the warte mf shat thin they wett they senkeh 

  that ke that beaus my isows ont be shei,
  whoce sou and out, thisgh not to drilnit visr thes
  wo  hure s runue taat kine own see onrnng doe:
  whoce hand po sarts, with swelte in not siy night;
  nor dare i chidn ald prise fon rhmu