# Text Generation using LSTMs

## 1. Import the libraries

In [1]:
# keras module for building LSTM 
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding, LSTM, Dense, Dropout
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
import keras.utils as ku 

# set seeds for reproducibility
from tensorflow import set_random_seed
from numpy.random import seed
set_random_seed(2)
seed(1)

import pandas as pd
import numpy as np
import string, os 

import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter(action='ignore', category=FutureWarning)

Using TensorFlow backend.


## 2. Load the dataset

In [2]:
# Directory holding the input files, and the Shakespeare lines CSV inside it
curr_dir = '../input/'
play_df = pd.read_csv(os.path.join(curr_dir, 'Shakespeare_data.csv'))

# One list entry per row of the PlayerLine column
all_lines = play_df['PlayerLine'].tolist()

print(len(all_lines))

111396


## 3. Dataset preparation

First, we will clean the data: strip punctuation, convert to lowercase, and drop any non-ASCII characters.

In [3]:
def clean_text(txt):
    """Normalize one line of text for tokenization.

    Removes every character found in ``string.punctuation``, lowercases the
    remainder, and then discards any character that is not plain ASCII.

    Args:
        txt: The raw line of text.

    Returns:
        The cleaned, lowercase, ASCII-only string.
    """
    kept_chars = [ch for ch in txt if ch not in string.punctuation]
    lowered = "".join(kept_chars).lower()
    # Round-trip through bytes: decoding as ASCII with 'ignore' silently
    # drops every non-ASCII character instead of raising.
    ascii_only = lowered.encode("utf8").decode("ascii", "ignore")
    return ascii_only

# Clean every loaded line; the trailing expression previews the first 10 results
corpus = [clean_text(x) for x in all_lines]
corpus[:10]

['act i',
 'scene i london the palace',
 'enter king henry lord john of lancaster the earl of westmoreland sir walter blunt and others',
 'so shaken as we are so wan with care',
 'find we a time for frighted peace to pant',
 'and breathe shortwinded accents of new broils',
 'to be commenced in strands afar remote',
 'no more the thirsty entrance of this soil',
 'shall daub her lips with her own childrens blood',
 'nor more shall trenching war channel her fields']

Next, we will generate sequences of n-gram tokens using Keras' Tokenizer.

In [4]:
# Module-level tokenizer: fitted inside get_sequence_of_tokens() and read again
# by generate_text(), so both work from the same vocabulary.
tokenizer = Tokenizer()

def get_sequence_of_tokens(corpus, max_lines=7000):
    """Fit the global ``tokenizer`` and build n-gram prefix sequences.

    Each tokenized line ``[t0, t1, ..., tk]`` contributes the prefixes
    ``[t0, t1]``, ``[t0, t1, t2]``, ..., ``[t0, ..., tk]``; lines with fewer
    than two tokens contribute nothing.

    Args:
        corpus: Sequence of cleaned text lines.
        max_lines: Only the first ``max_lines`` lines are used (default 7000,
            the value previously hard-coded here). Pass ``None`` to use the
            whole corpus.

    Returns:
        Tuple ``(input_sequences, total_words)`` where ``input_sequences`` is
        a list of token-id lists and ``total_words`` is
        ``len(tokenizer.word_index) + 1``.

    Side effects:
        Calls ``fit_on_texts`` on the module-level ``tokenizer``.
    """
    ## tokenization
    corpus = corpus[:max_lines]
    tokenizer.fit_on_texts(corpus)
    # +1 because Keras word indices start at 1; index 0 is left for padding
    total_words = len(tokenizer.word_index) + 1

    ## convert data to sequence of tokens
    input_sequences = []
    # Tokenize all lines in a single call instead of once per line
    for token_list in tokenizer.texts_to_sequences(corpus):
        # Every prefix of length >= 2 becomes one training sequence
        for end in range(2, len(token_list) + 1):
            input_sequences.append(token_list[:end])
    return input_sequences, total_words

# Build the n-gram sequences and vocabulary size; the trailing expression
# previews the first 10 sequences
inp_sequences, total_words = get_sequence_of_tokens(corpus)
inp_sequences[:10]

[[523, 4],
 [142, 4],
 [142, 4, 339],
 [142, 4, 339, 1],
 [142, 4, 339, 1, 670],
 [53, 41],
 [53, 41, 84],
 [53, 41, 84, 29],
 [53, 41, 84, 29, 124],
 [53, 41, 84, 29, 124, 3]]

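Next, we will pad the sequences to a common length and split each one into predictors (all tokens but the last) and a label (the last token).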

In [5]:
def generate_padded_sequences(input_sequences, num_classes=None):
    """Pad the n-gram sequences and split them into predictors and labels.

    All sequences are left-padded ('pre') to the length of the longest one.
    The last token of each padded row becomes the label, one-hot encoded;
    the preceding tokens become the predictors.

    Args:
        input_sequences: Non-empty list of token-id lists.
        num_classes: Width of the one-hot label vectors. Defaults to the
            module-level ``total_words`` when omitted, which is what this
            function used implicitly before the parameter existed.

    Returns:
        Tuple ``(predictors, label, max_sequence_len)``.

    Raises:
        ValueError: If ``input_sequences`` is empty.
    """
    if len(input_sequences) == 0:
        raise ValueError("input_sequences must contain at least one sequence")
    if num_classes is None:
        # Backward-compatible fallback to the notebook-level vocabulary size
        num_classes = total_words

    max_sequence_len = max(len(x) for x in input_sequences)
    # pad_sequences already returns a NumPy array, so no extra np.array() wrap
    input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

    predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
    label = ku.to_categorical(label, num_classes=num_classes)
    return predictors, label, max_sequence_len

# Pad and split the sequences; the trailing expression shows the resulting
# array shapes
predictors, label, max_sequence_len = generate_padded_sequences(inp_sequences)
predictors.shape, label.shape

((45584, 33), (45584, 6543))

## 4. Using LSTM for text generation

In [6]:
def create_model(max_sequence_len, total_words, embedding_dim=10, lstm_units=512, dropout_rate=0.4):
    """Build and compile the next-word prediction network.

    Architecture: Embedding -> LSTM -> Dropout -> Dense(softmax), compiled
    with categorical cross-entropy loss and the Adam optimizer.

    Args:
        max_sequence_len: Length of the padded sequences including the label
            token; the model input length is ``max_sequence_len - 1``.
        total_words: Vocabulary size, used as both the embedding input
            dimension and the number of output classes.
        embedding_dim: Size of each word embedding vector (default 10).
        lstm_units: Number of units in the LSTM layer (default 512).
        dropout_rate: Dropout fraction applied after the LSTM (default 0.4).

    Returns:
        The compiled ``Sequential`` model.
    """
    # The final token of each padded row is the label, so inputs are one shorter
    input_len = max_sequence_len - 1
    model = Sequential()

    # Add Input Embedding Layer
    model.add(Embedding(total_words, embedding_dim, input_length=input_len))

    # Add Hidden Layer 1 - LSTM Layer
    model.add(LSTM(lstm_units))
    model.add(Dropout(dropout_rate))

    # Add Output Layer
    model.add(Dense(total_words, activation='softmax'))

    model.compile(loss='categorical_crossentropy', optimizer='adam')

    return model

# Build the network from the padded length and vocabulary size computed
# earlier, then print its layer-by-layer summary
model = create_model(max_sequence_len, total_words)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 33, 10)            65430     
_________________________________________________________________
lstm_1 (LSTM)                (None, 512)               1071104   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 6543)              3356559   
Total params: 4,493,093
Trainable params: 4,493,093
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Short initial training run: 2 epochs with progress output (verbose=1)
model.fit(predictors, label, epochs=2, verbose=1)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f4686fb9f60>

In [8]:
# Call fit again on the same `model` for 20 epochs; Keras continues from the
# model's current weights. verbose=2 logs one line per epoch.
model.fit(predictors, label, epochs=20, verbose=2)

Epoch 1/20
 - 117s - loss: 6.3864
Epoch 2/20
 - 118s - loss: 6.2434
Epoch 3/20
 - 117s - loss: 6.0836
Epoch 4/20
 - 117s - loss: 5.8921
Epoch 5/20
 - 117s - loss: 5.6794
Epoch 6/20
 - 117s - loss: 5.4415
Epoch 7/20
 - 117s - loss: 5.1732
Epoch 8/20
 - 117s - loss: 4.8994
Epoch 9/20
 - 117s - loss: 4.6182
Epoch 10/20
 - 117s - loss: 4.3345
Epoch 11/20
 - 117s - loss: 4.0764
Epoch 12/20
 - 117s - loss: 3.8362
Epoch 13/20
 - 117s - loss: 3.6269
Epoch 14/20
 - 118s - loss: 3.4266
Epoch 15/20
 - 117s - loss: 3.2586
Epoch 16/20
 - 117s - loss: 3.0961
Epoch 17/20
 - 117s - loss: 2.9527
Epoch 18/20
 - 117s - loss: 2.8220
Epoch 19/20
 - 118s - loss: 2.7116
Epoch 20/20
 - 117s - loss: 2.6061


<keras.callbacks.History at 0x7f463e4147b8>

In [9]:
# Another 20 epochs on the same `model`, this time with no progress output
# (verbose=0)
model.fit(predictors, label, epochs=20, verbose=0)

<keras.callbacks.History at 0x7f463e46e9e8>

## 5. Generating the text

In [10]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    """Extend ``seed_text`` by greedily predicting one word at a time.

    On each step the running text is tokenized with the module-level
    ``tokenizer``, left-padded to the model's input length, and the
    highest-probability class is appended as the next word.

    Args:
        seed_text: Starting text to continue.
        next_words: Number of words to generate.
        model: Trained model whose ``predict`` returns per-class
            probabilities for a padded token sequence.
        max_sequence_len: Padded sequence length used in training (including
            the label token); inputs are padded to ``max_sequence_len - 1``.

    Returns:
        The seed text plus the generated words, title-cased.
    """
    # Invert the vocabulary once instead of scanning word_index for every word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        # predict + argmax is what Sequential.predict_classes did; that method
        # was removed from later TensorFlow/Keras releases.
        probabilities = model.predict(token_list, verbose=0)
        predicted = int(np.argmax(probabilities, axis=-1)[0])

        # An index with no vocabulary entry (e.g. the padding index 0) yields
        # an empty word, matching the previous behavior.
        output_word = index_to_word.get(predicted, "")
        seed_text += " "+output_word
    return seed_text.title()

In [11]:
# Generate a 20-word continuation for each seed phrase, numbered from 1
seed_phrases = ["Julius", "Thou", "King is", "Death of", "The Princess", "Thanos"]
for number, phrase in enumerate(seed_phrases, start=1):
    print ("%d. " % number, generate_text(phrase, 20, model, max_sequence_len))

1.  Julius Heavy Theres As Is A King Saint Question That A Night Or A Struck Of A Ways A Bishop Of
2.  Thou Art The King Of Honour And Would Do Not My Lord Of The Jest And The Ant Of Him By
3.  King Is Grief In Buckram Here And So My Friends I Am No Talbots Am I Heard Now No A Man As
4.  Death Of The Duchess Blood Let Of Our Coats Or Take It Thunders And Lightens Terribly Then The Spirit Riseth A Man
5.  The Princess Of Deep Prophecy Did Hath Bought With Me That A World Of A Loss And A Gallows Enter Enter Bastard
6.  Thanos My Lord Of Winchester I Know Your Mind To You To Stay Of All If A Fight And A True
