# Text Generation using LSTM

In [198]:
#Libraries
import nltk
import numpy as np
import pandas as pd
import os
import random

from keras.callbacks import LambdaCallback
from keras.callbacks import ModelCheckpoint
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM

In [199]:
# Download the NLTK "book" collection (a bundle of corpora packages).
# The corpus files read later in this notebook live under nltk_data/corpora.
nltk.download("book")

[nltk_data] Downloading collection 'book'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to
[nltk_data]    |     C:\Users\cct\AppData\Roaming\nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package brown to
[nltk_data]    |     C:\Users\cct\AppData\Roaming\nltk_data...
[nltk_data]    |   Package brown is already up-to-date!
[nltk_data]    | Downloading package chat80 to
[nltk_data]    |     C:\Users\cct\AppData\Roaming\nltk_data...
[nltk_data]    |   Package chat80 is already up-to-date!
[nltk_data]    | Downloading package cmudict to
[nltk_data]    |     C:\Users\cct\AppData\Roaming\nltk_data...
[nltk_data]    |   Package cmudict is already up-to-date!
[nltk_data]    | Downloading package conll2000 to
[nltk_data]    |     C:\Users\cct\AppData\Roaming\nltk_data...
[nltk_data]    |   Package conll2000 is already up-to-date!
[nltk_data]    | Downloading package conll2002 to
[nltk_data]    |     C:\Users\cct\AppData\Roaming\nltk_

True

In [200]:
# Path for book data downloaded
# Resolve the State of the Union corpus folder through NLTK's own data search
# path instead of a user-specific absolute path, so the notebook also runs on
# other machines. Raises LookupError if the corpus has not been downloaded.
cospus_dir = str(nltk.data.find("corpora/state_union"))

In [201]:
# Book file paths: every file found anywhere under the corpus directory,
# visited bottom-up (topdown=False) and stored as a full path.
file_paths = [
    os.path.join(folder, filename)
    for folder, _subfolders, filenames in os.walk(cospus_dir, topdown=False)
    for filename in filenames
]

In [202]:
# Join text of all book files
# Each file is lower-cased with its newlines stripped; the per-file strings
# are then joined with a single space into one training corpus.
documents = []
for path in file_paths:  # was `file_names`, which is never defined (NameError)

    with open(path, "r") as f:
        try:
            documents.append(f.read().replace("\n", "").lower())
        except UnicodeDecodeError:
            # Best effort: a file that cannot be decoded with the platform's
            # default encoding is left out of the corpus.
            continue
text = " ".join(documents)

In [203]:
# Vocabulary: every distinct character of the corpus, in sorted order
unique_char = sorted(set(text))

# Two-way lookup between a character and its position in the vocabulary
char_indices = {symbol: position for position, symbol in enumerate(unique_char)}
indices_char = dict(enumerate(unique_char))

In [204]:
# Training samples: fixed-length windows of the corpus, each paired with the
# single character that immediately follows the window.
sequence_length = 50
stride_Steps = 4

# Window start offsets; the upper bound keeps the "next character" in range.
window_starts = range(0, len(text) - sequence_length, stride_Steps)

sequences = [text[start: start + sequence_length] for start in window_starts]
next_sequence = [text[start + sequence_length] for start in window_starts]

print('sequences:', len(sequences))

sequences: 516664


In [205]:
# Prepare tensors for train x and y
# x: one one-hot row per character position of every sequence
# y: one one-hot row per sequence for the character that follows it
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sequences), sequence_length, len(unique_char)), dtype=bool)
y = np.zeros((len(sequences), len(unique_char)), dtype=bool)

In [206]:
# Fill the tensors: switch on the vocabulary slot of every character in each
# window (x) and of the character that follows that window (y).
for row, (window, target_char) in enumerate(zip(sequences, next_sequence)):
    for position, symbol in enumerate(window):
        x[row, position, char_indices[symbol]] = 1
    y[row, char_indices[target_char]] = 1

In [207]:
# Temperature sampling to introduce randomness in text generation
def temperature_sampling(preds, temp):
    """Draw one class index from `preds` after rescaling it by `temp`.

    The probabilities are turned into log-probabilities, divided by `temp`,
    re-normalised with a softmax, and a single multinomial draw is taken.
    Low temperatures sharpen the distribution, high temperatures flatten it.

    Args:
        preds: Array-like of class probabilities.
        temp: Positive temperature used to divide the log-probabilities.

    Returns:
        Index (as returned by ``np.argmax``) of the sampled class.

    Raises:
        ValueError: If `temp` is not strictly positive.
    """
    if temp <= 0:
        raise ValueError(f"temp must be positive, got {temp!r}")

    probs = np.asarray(preds).astype('float64')

    # log(0) == -inf is the right logit for an impossible class, so silence
    # the divide-by-zero warning rather than letting it flood the output.
    with np.errstate(divide='ignore'):
        logits = np.log(probs) / temp

    # Shift by the maximum so the largest term is exp(0) == 1. Without the
    # shift a small `temp` can underflow every term to 0, and normalising
    # then yields 0/0 (NaN) probabilities.
    weights = np.exp(logits - np.max(logits))
    probs = weights / np.sum(weights)

    draw = np.random.multinomial(1, probs, size=1)
    return np.argmax(draw)

In [218]:
# Generate text after every epoch

def end_epoch(epoch, _):
    """Print sample generations at several temperatures, then save the weights.

    A random window of the corpus is picked once and reused as the prompt for
    every temperature. For each temperature, 200 characters are generated one
    at a time by feeding the current window to the model and sampling the
    next character from its prediction.

    Args:
        epoch: Epoch index passed by the callback; shown in the banner line.
        _: Second callback argument; not used.
    """
    print(f"**********Generating text after Epoch {epoch} ***********")

    chars_to_generate = 200
    vocab_size = len(unique_char)

    # Pick the prompt once so all temperatures continue the same sentence
    seed_start = random.randint(0, len(text) - sequence_length - 1)
    seed_text = text[seed_start: seed_start + sequence_length]

    for temperature in (0.2, 0.5, 1.0, 1.2):  # temperatures for randomness
        window = seed_text
        pieces = [seed_text]

        print(f"With Temperature: {temperature}")
        print(f"For sentence: {window}")

        for _step in range(chars_to_generate):
            # One-hot encode the current window as a batch of one
            encoded = np.zeros((1, sequence_length, vocab_size))
            for position, symbol in enumerate(window):
                encoded[0, position, char_indices[symbol]] = 1

            # Predicted distribution over the next character
            next_char_probs = model.predict(encoded, verbose=0)[0]

            # Sample the next character at the current temperature
            sampled_index = temperature_sampling(next_char_probs, temperature)
            sampled_char = indices_char[sampled_index]

            # Slide the window forward by one character
            window = window[1:] + sampled_char
            pieces.append(sampled_char)

        generated_text = "".join(pieces)
        print(f" Generated Text: {generated_text}")

    model.save_weights('LSTM_TextGeneration_weights.hdf5', overwrite=True)


In [215]:
#Model

# Size of vector in the hidden layer.
units = 128

# Single LSTM layer over (sequence position, one-hot character) input,
# followed by a softmax over the vocabulary for the next character.
model = Sequential()
model.add(LSTM(units, input_shape=(x.shape[1], x.shape[2])))
model.add(Dense(y.shape[1], activation='softmax'))

# compile model with optimizer
model.compile(loss='categorical_crossentropy', optimizer="adam")

# Print generated samples at the end of every epoch
callback_results = LambdaCallback(on_epoch_end=end_epoch)

# Training runs without validation data, so the default monitor ('val_loss')
# never exists and save_best_only would skip every save. Monitor the training
# loss instead so the best weights are actually written.
checkpointer = ModelCheckpoint(filepath='model_weights.hdf5', monitor='loss',
                               verbose=1, save_best_only=True)

In [216]:
# Train the model for 15 epochs in mini-batches of 128 samples; the callbacks
# print generated text and checkpoint the weights after each epoch.
training_callbacks = [callback_results, checkpointer]
model.fit(
    x,
    y,
    batch_size=128,
    epochs=15,
    callbacks=training_callbacks,
)

Epoch 1/15
**********Generating text after Epoch 0 ***********
With Temperature: 0.2
For sentence: ill maintain a nuclear deterrent adequate to meet 
 Generated Text: ill maintain a nuclear deterrent adequate to meet the for to shat and the conter and the congres and in the promice be the for the promest the reand the the conting and the sear and the for cont the the the the in the conting and the conting on the c
With Temperature: 0.5
For sentence: ill maintain a nuclear deterrent adequate to meet 
 Generated Text: ill maintain a nuclear deterrent adequate to meet will of ching ner censting of that the congen the with wise be fer in the the the indes protation the atien. in mese and the mare and the the werment tha s we rontent ic not ding and in the compan tha
With Temperature: 1.0
For sentence: ill maintain a nuclear deterrent adequate to meet 
 Generated Text: ill maintain a nuclear deterrent adequate to meet pontcy this in benment face baigay s week the ixt ther lyominbinion gaas 



**********Generating text after Epoch 1 ***********
With Temperature: 0.2
For sentence: a federal bureaucrat may want, but for what their 
 Generated Text: a federal bureaucrat may want, but for what their decain in the streation and the prople and the prople on the prople in the for the serican and the contrien and the propesting of the for and the streation and the comple and the prople and the seare 
With Temperature: 0.5
For sentence: a federal bureaucrat may want, but for what their 
 Generated Text: a federal bureaucrat may want, but for what their actions, and progral of the whold and priving and the fillien in lower and the comlarned and the now and frecting and dorice. the will of the promment for male of the portent of the propes the sucted 
With Temperature: 1.0
For sentence: a federal bureaucrat may want, but for what their 
 Generated Text: a federal bureaucrat may want, but for what their grogludizage shollen the says and instent to the contiges regrayes to chive in progr

 Generated Text: lion, 75th rangers. it's dated december 18th, the semple. me are been lational decent of first the democration and wemple as expechation to the country that expenditures the world our are the destront and every strengthen the have so are expenditures
With Temperature: 1.0
For sentence: lion, 75th rangers. it's dated december 18th, the 
 Generated Text: lion, 75th rangers. it's dated december 18th, the farminal regusares.reedivess jubully on sall, and cloperipublets -- we lawn education produching. the keep one abortantage 2500 percent to shand not even where suppriares. defense mmeniop being movelk
With Temperature: 1.2
For sentence: lion, 75th rangers. it's dated december 18th, the 
 Generated Text: lion, 75th rangers. it's dated december 18th, the world pramestment or trus-sy't eaco at proflewbal that would our lexpons, and tixheh to hadd on enduralvatures ane an recent 6) the und dute essideration brater qutare yearsh-with sedlems, leg.think f
Epoch 8/15
**********G

 Generated Text: and the resolution to take that course, then we shalus in yon, and hoursely tallapted bedicies to moway while it cansit caple's sences, and our cord-apmrisal sentementable, that do, redicral engyers moro oursure gid bus.all broke matkara, rovel abwol
Epoch 13/15
**********Generating text after Epoch 12 ***********
With Temperature: 0.2
For sentence:  ties of strategic interdependence. both nations n
 Generated Text:  ties of strategic interdependence. both nations no leadership of the strong the strong the possible the strong and the state of the future in the strong and the people in the world in the congress to the people and the state of the program and the p
With Temperature: 0.5
For sentence:  ties of strategic interdependence. both nations n
 Generated Text:  ties of strategic interdependence. both nations no the bourned to them our lising the state of all the people and health caboring the democracy and a markets of peace at the state of the blotery will be mode

<keras.callbacks.callbacks.History at 0x22b5e57d7b8>