# Text Generation using LSTM & RNN by `Mr. Harshit Dawar!`

In [1]:
# Importing the Required Libraries
import spacy
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Function to read the data from a file
def read_Data(path, encoding = "utf-8"):
    """Read a whole text file into a single string.

    Parameters
    ----------
    path : str or path-like
        Location of the text file to read.
    encoding : str, optional
        Text encoding used to decode the file. Defaults to "utf-8" so the
        result does not depend on the platform's locale encoding.

    Returns
    -------
    str
        The complete contents of the file.
    """
    # The with-block guarantees the handle is closed even if read() raises.
    with open(path, "r", encoding = encoding) as p:
        return p.read()

In [3]:
read_Data("data.txt")

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

In [4]:
# Downloading the package for the Spacy English Language
# !python -m spacy download en_core_web_sm

In [5]:
# Creating a SpaCy Model for English.
# The "ner", "tagger" and "parser" components are disabled; the remove_punc
# function below only reads token.text and token.is_punct from the tokens.
ENGLISH_MODEL_NLP = spacy.load("en_core_web_sm", disable = ["ner", "tagger", "parser"])

In [6]:
ENGLISH_MODEL_NLP.pipe_names

['tok2vec', 'attribute_ruler', 'lemmatizer']

In [7]:
# Setting the Max Length to a bigger number to use the complete dataset.
ENGLISH_MODEL_NLP.max_length = 1250000

In [8]:
# Function to remove the punctuations from the dataset
def remove_punc(data, nlp = None):
    """Tokenize `data` and return its lower-cased word tokens.

    Punctuation tokens (`token.is_punct`), whitespace-only tokens and the
    bare apostrophe are dropped.

    Parameters
    ----------
    data : str
        Raw text to tokenize.
    nlp : callable, optional
        Tokenizer pipeline, called as `nlp(data)`; it must yield tokens that
        expose `.text` and `.is_punct`. Defaults to the notebook-level
        ENGLISH_MODEL_NLP.

    Returns
    -------
    list of str
        Lower-cased token texts in their original order.
    """
    if nlp is None:
        nlp = ENGLISH_MODEL_NLP

    # The earlier filter was a substring test against the literal "\n \n\n '".
    # That only rejected whitespace runs which happen to occur inside that
    # literal, so tokens such as "\n\n\n" or "\t" slipped through as "words".
    # Stripping the text first rejects every whitespace-only token, and the
    # bare apostrophe is still excluded explicitly.
    return [
        token.text.lower()
        for token in nlp(data)
        if not token.is_punct and token.text.strip() not in ("", "'")
    ]

In [9]:
# Generating tokens from the data
tokens = remove_punc(read_Data("data.txt"))

In [10]:
len(tokens)

11328

In [11]:
tokens[:5]

['call', 'me', 'ishmael', 'some', 'years']

In [12]:
# Sliding a window of `sequence_length` (15) tokens over the token list.
# Further down, the first 14 tokens of every window become the features and
# the 15th token becomes the label to predict.
sequence_length = 15
sequences = [
    tokens[start : start + sequence_length]
    for start in range(len(tokens) - sequence_length)
]

In [13]:
# Total length is 15 less than the actual length of the Tokens list because of the Sequence Length to be 15.
len(sequences)

11313

In [14]:
len(sequences[0])

15

In [15]:
" ".join(sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no'

## Converting the Sequences into the Numerical Form that a Model Understands/Accepts

In [16]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [17]:
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(sequences)

In [18]:
# tokenized_sequences = text_tokenizer.texts_to_matrix(sequences)

In [19]:
# np.unique(tokenized_sequences[0])

In [20]:
tokenized_sequences = text_tokenizer.texts_to_sequences(sequences)

In [21]:
tokenized_sequences[0]

[956, 14, 262, 51, 260, 407, 87, 218, 129, 111, 954, 259, 50, 43, 37]

In [22]:
# Printing the word and their corresponding numbers
text_tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'no',
 38: 'about',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [23]:
for word in tokenized_sequences[0]:
    print("{} : {}".format(word, text_tokenizer.index_word[word]))

956 : call
14 : me
262 : ishmael
51 : some
260 : years
407 : ago
87 : never
218 : mind
129 : how
111 : long
954 : precisely
259 : having
50 : little
43 : or
37 : no


In [24]:
# Printing the filters internally used by the Tokenizer for removing the punctuations
text_tokenizer.filters

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [25]:
# Printing the word counts in the Data
text_tokenizer.word_counts

OrderedDict([('call', 16),
             ('me', 1427),
             ('ishmael', 78),
             ('some', 439),
             ('years', 80),
             ('ago', 51),
             ('never', 262),
             ('mind', 98),
             ('how', 189),
             ('long', 220),
             ('precisely', 26),
             ('having', 87),
             ('little', 448),
             ('or', 554),
             ('no', 585),
             ('money', 75),
             ('in', 3269),
             ('my', 1035),
             ('purse', 45),
             ('and', 5573),
             ('nothing', 165),
             ('particular', 90),
             ('to', 3750),
             ('interest', 15),
             ('on', 990),
             ('shore', 15),
             ('i', 4125),
             ('thought', 390),
             ('would', 405),
             ('sail', 60),
             ('about', 585),
             ('a', 5988),
             ('see', 240),
             ('the', 8970),
             ('watery', 15),
             (

In [26]:
# Printing, for each word, the number of documents (here: sequences) in which it occurs
text_tokenizer.word_docs

defaultdict(int,
            {'years': 80,
             'ago': 51,
             'or': 477,
             'long': 211,
             'having': 87,
             'little': 432,
             'no': 584,
             'me': 1250,
             'call': 16,
             'ishmael': 78,
             'mind': 98,
             'some': 414,
             'precisely': 26,
             'never': 262,
             'how': 189,
             'money': 70,
             'in': 3008,
             'my': 982,
             'purse': 33,
             'and': 4784,
             'nothing': 156,
             'particular': 90,
             'to': 3271,
             'interest': 15,
             'on': 972,
             'shore': 15,
             'i': 3599,
             'thought': 385,
             'would': 399,
             'sail': 60,
             'about': 578,
             'a': 4711,
             'see': 230,
             'the': 6273,
             'watery': 15,
             'part': 135,
             'of': 4064,
             'wor

In [27]:
# Number of distinct words the tokenizer has counted, i.e. the vocabulary size.
corpus_size = len(text_tokenizer.word_counts)
corpus_size

2718

In [28]:
# Creating sequences into Numpy arrays for better processing
final_sequences = np.array(tokenized_sequences)

In [29]:
final_sequences

array([[ 956,   14,  262, ...,   50,   43,   37],
       [  14,  262,   51, ...,   43,   37,  263],
       [ 262,   51,  260, ...,   37,  263,    7],
       ...,
       [  61,  943,   31, ...,  261,   53,    2],
       [ 943,   31,    7, ...,   53,    2, 2718],
       [  31,    7,   11, ...,    2, 2718,   26]])

In [30]:
final_sequences.shape

(11313, 15)


## Converting the sequences into Features & Labels

In [31]:
from tensorflow.keras.utils import to_categorical

In [32]:
features = final_sequences[:, : -1]
labels = final_sequences[:, -1]

In [33]:
labels

array([  37,  263,    7, ...,    2, 2718,   26])

In [34]:
features

array([[ 956,   14,  262, ...,  259,   50,   43],
       [  14,  262,   51, ...,   50,   43,   37],
       [ 262,   51,  260, ...,   43,   37,  263],
       ...,
       [  61,  943,   31, ...,   11,  261,   53],
       [ 943,   31,    7, ...,  261,   53,    2],
       [  31,    7,   11, ...,   53,    2, 2718]])

In [35]:
corpus_size

2718

In [36]:
# Tokenizer word ids start at 1 (see index_word above), so the largest id is
# equal to corpus_size and the one-hot vectors need corpus_size + 1 classes
# (ids 0..corpus_size); otherwise to_categorical raises an error.
# Encoding straight from final_sequences (instead of overwriting `labels` with
# a function of itself) keeps this cell safe to re-run.
labels = to_categorical(final_sequences[:, -1], num_classes = corpus_size + 1)

In [37]:
labels[0].shape

(2719,)

In [38]:
# NOTE: this rebinds sequence_length from the 15-token window size used when
# building `sequences` to the 14-token input length of the model.
sequence_length = features.shape[1]

In [39]:
sequence_length

14

## Generating LSTM Models

In [40]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding

In [41]:
# sequence_length * 3, corpus_size + 1

In [60]:
# Next-word classifier: embedding -> two stacked LSTMs -> dense head over the vocabulary.
model = Sequential()

# Maps each of the corpus_size + 1 token ids to a vector of size sequence_length.
model.add(Embedding(input_dim = corpus_size + 1,
                    output_dim = sequence_length,
                    input_length = sequence_length))

# The first LSTM keeps return_sequences = True so that the second LSTM receives
# one vector per time step instead of only the final output.
model.add(LSTM(units = 95, return_sequences = True))
model.add(LSTM(units = 95))

model.add(Dense(units = 100, activation = "relu"))

# One softmax probability per token id (0..corpus_size).
model.add(Dense(units = corpus_size + 1, activation = "softmax"))

model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [61]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, 14, 14)            38066     
_________________________________________________________________
lstm_16 (LSTM)               (None, 14, 95)            41800     
_________________________________________________________________
lstm_17 (LSTM)               (None, 95)                72580     
_________________________________________________________________
dense_16 (Dense)             (None, 100)               9600      
_________________________________________________________________
dense_17 (Dense)             (None, 2719)              274619    
Total params: 436,665
Trainable params: 436,665
Non-trainable params: 0
_________________________________________________________________


In [62]:
from tensorflow.keras.callbacks import ModelCheckpoint

In [63]:
# Checkpoint callback that writes the model to "Text-Generator.h5" during training.
# NOTE(review): save_best_only is not set and the training log below shows a
# save after every epoch, so `monitor = "loss"` does not appear to gate the
# saving - confirm whether save_best_only = True was intended.
MC = ModelCheckpoint(filepath = "Text-Generator.h5", monitor = "loss", verbose = 1)

In [64]:
history = model.fit(features, labels, epochs = 150, verbose = 1, callbacks = [MC])

2021-09-02 21:34:44.391275: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/150

Epoch 00001: saving model to Text-Generator.h5
Epoch 2/150

Epoch 00002: saving model to Text-Generator.h5
Epoch 3/150

Epoch 00003: saving model to Text-Generator.h5
Epoch 4/150

Epoch 00004: saving model to Text-Generator.h5
Epoch 5/150

Epoch 00005: saving model to Text-Generator.h5
Epoch 6/150

Epoch 00006: saving model to Text-Generator.h5
Epoch 7/150

Epoch 00007: saving model to Text-Generator.h5
Epoch 8/150

Epoch 00008: saving model to Text-Generator.h5
Epoch 9/150

Epoch 00009: saving model to Text-Generator.h5
Epoch 10/150

Epoch 00010: saving model to Text-Generator.h5
Epoch 11/150

Epoch 00011: saving model to Text-Generator.h5
Epoch 12/150

Epoch 00012: saving model to Text-Generator.h5
Epoch 13/150

Epoch 00013: saving model to Text-Generator.h5
Epoch 14/150

Epoch 00014: saving model to Text-Generator.h5
Epoch 15/150

Epoch 00015: saving model to Text-Generator.h5
Epoch 16/150

Epoch 00016: saving model to Text-Generator.h5
Epoch 17/150

Epoch 00017: saving


Epoch 00055: saving model to Text-Generator.h5
Epoch 56/150

Epoch 00056: saving model to Text-Generator.h5
Epoch 57/150

Epoch 00057: saving model to Text-Generator.h5
Epoch 58/150

Epoch 00058: saving model to Text-Generator.h5
Epoch 59/150

Epoch 00059: saving model to Text-Generator.h5
Epoch 60/150

Epoch 00060: saving model to Text-Generator.h5
Epoch 61/150

Epoch 00061: saving model to Text-Generator.h5
Epoch 62/150

Epoch 00062: saving model to Text-Generator.h5
Epoch 63/150

Epoch 00063: saving model to Text-Generator.h5
Epoch 64/150

Epoch 00064: saving model to Text-Generator.h5
Epoch 65/150

Epoch 00065: saving model to Text-Generator.h5
Epoch 66/150

Epoch 00066: saving model to Text-Generator.h5
Epoch 67/150

Epoch 00067: saving model to Text-Generator.h5
Epoch 68/150

Epoch 00068: saving model to Text-Generator.h5
Epoch 69/150

Epoch 00069: saving model to Text-Generator.h5
Epoch 70/150

Epoch 00070: saving model to Text-Generator.h5
Epoch 71/150

Epoch 00071: saving mod

Epoch 109/150

Epoch 00109: saving model to Text-Generator.h5
Epoch 110/150

Epoch 00110: saving model to Text-Generator.h5
Epoch 111/150

Epoch 00111: saving model to Text-Generator.h5
Epoch 112/150

Epoch 00112: saving model to Text-Generator.h5
Epoch 113/150

Epoch 00113: saving model to Text-Generator.h5
Epoch 114/150

Epoch 00114: saving model to Text-Generator.h5
Epoch 115/150

Epoch 00115: saving model to Text-Generator.h5
Epoch 116/150

Epoch 00116: saving model to Text-Generator.h5
Epoch 117/150

Epoch 00117: saving model to Text-Generator.h5
Epoch 118/150

Epoch 00118: saving model to Text-Generator.h5
Epoch 119/150

Epoch 00119: saving model to Text-Generator.h5
Epoch 120/150

Epoch 00120: saving model to Text-Generator.h5
Epoch 121/150

Epoch 00121: saving model to Text-Generator.h5
Epoch 122/150

Epoch 00122: saving model to Text-Generator.h5
Epoch 123/150

Epoch 00123: saving model to Text-Generator.h5
Epoch 124/150

Epoch 00124: saving model to Text-Generator.h5
Epoch 12

In [131]:
from pickle import load, dump

In [132]:
# Saving the Text Tokenizer.
# The with-block closes the file handle as soon as pickling finishes, so the
# data is flushed to disk and no open handle is left behind.
with open("LSTM_Tokenizer", "wb") as tokenizer_file:
    dump(text_tokenizer, tokenizer_file)

In [99]:
features[0].shape

(14,)

In [102]:
# Reshaping the (14,) feature vector to (1, 14), i.e. a batch holding a single
# sequence, before asking the model for the next-word probabilities
model.predict(features[0].reshape(1, -1))

array([[4.7728451e-19, 4.4772471e-04, 1.5792391e-05, ..., 6.2675482e-25,
        1.2826639e-35, 0.0000000e+00]], dtype=float32)

In [101]:
# Printing the shape of the prediction
model.predict(features[0].reshape(1, -1)).shape

(1, 2719)

In [105]:
# Getting the word index that comes after the sequence that we have passed
model.predict(features[0].reshape(1, -1)).argmax(axis = 1)[0]

97

In [107]:
# Printing the actual word
text_tokenizer.index_word[97]

'are'

In [114]:
# Getting the Final Word from the Original Sequences & comparing it with the predicted one!
index = 1
word_index = model.predict(features[index].reshape(1, -1)).argmax(axis = 1)[0]
print(word_index, text_tokenizer.index_word[word_index])
print(final_sequences[index][-1])

263 money
263


In [133]:
# NOTE(review): a bare string is passed here, and the output below has one
# entry per character rather than per word. Wrap the sentence in a list
# (as text_generator does) to encode it as a single text.
text_tokenizer.texts_to_sequences("I am Pro, I am an Indian, My Name is Harshit Dawar")

[[5],
 [],
 [2],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [5],
 [],
 [2],
 [],
 [],
 [2],
 [],
 [],
 [5],
 [],
 [],
 [5],
 [2],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [2],
 [],
 [404],
 [],
 [5],
 [],
 [],
 [],
 [2],
 [],
 [],
 [],
 [5],
 [],
 [],
 [],
 [2],
 [],
 [2],
 []]

In [134]:
# Importing to pad the sequences
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [177]:
def text_generator(model, tokenizer, num_words_to_gen, text_sequence, sequence_length):
    """Greedily generate words that continue a seed text.

    Parameters
    ----------
    model : object
        Must provide `predict(batch)`; it is called on an array of shape
        (1, sequence_length) and the argmax over axis 1 of its output is
        taken as the next token id.
    tokenizer : object
        Must provide `texts_to_sequences` and `index_word`, as the Keras
        Tokenizer fitted in this notebook does.
    num_words_to_gen : int
        Maximum number of words to generate.
    text_sequence : str or list of str
        Seed for the generation. A string is encoded by the tokenizer as one
        text; a list is handed over as already-split tokens.
    sequence_length : int
        Number of token ids fed to the model for each prediction.

    Returns
    -------
    str
        The generated words joined by single spaces (the seed is not
        included). It holds fewer than `num_words_to_gen` words if the model
        predicts an id that has no word attached.
    """
    # Wrapping the seed in a list is essential: the tokenizer then treats it as
    # ONE text instead of iterating over its characters.
    context_ids = list(tokenizer.texts_to_sequences([text_sequence])[0])

    predicted_words = []

    for _ in range(num_words_to_gen):
        # Left-pads short contexts with 0; "pre" truncation drops the oldest
        # ids so only the most recent `sequence_length` ids reach the model.
        padded_sequence = pad_sequences([context_ids], maxlen = sequence_length, truncating = "pre")

        gen_word_index = int(model.predict(padded_sequence).argmax(axis = 1)[0])

        # An id without a word (e.g. 0, which pad_sequences uses as filler)
        # used to raise a KeyError here; generation now stops cleanly instead.
        gen_word = tokenizer.index_word.get(gen_word_index)
        if gen_word is None:
            break

        # Extend the context with the predicted id itself. Re-tokenizing the
        # growing string on every step was wasteful and could split a
        # generated word differently from how it was seen during training.
        context_ids.append(gen_word_index)

        predicted_words.append(gen_word)

    return " ".join(predicted_words)

In [178]:
text_generator(model, text_tokenizer, 9, "I am Harshit Dawar, I am Indian, I am the best, everyone knows this", 14)

'sooty shouted the battery and a sunset and a'

In [179]:
text_generator(model, text_tokenizer, 15, " ".join(sequences[1155][:-1]), 14)

'cook being deeply brown and save me on i struck one in a passion though'

In [180]:
sequences[1155][:-1]

['as',
 'for',
 'going',
 'as',
 'cook,--though',
 'i',
 'confess',
 'there',
 'is',
 'considerable',
 'glory',
 'in',
 'that',
 'a']

In [181]:
" ".join(sequences[1155][:-1])

'as for going as cook,--though i confess there is considerable glory in that a'

In [182]:
text_tokenizer.texts_to_sequences([" ".join(sequences[1155][:-1])])

[[16, 22, 154, 16, 614, 79, 5, 617, 29, 20, 1183, 615, 7, 9, 2]]

## Generating RNN Model

In [184]:
from tensorflow.keras.layers import SimpleRNN

In [185]:
# Same architecture as the LSTM model above, with SimpleRNN as the recurrent layer.
model_RNN = Sequential()

# Maps each of the corpus_size + 1 token ids to a vector of size sequence_length.
model_RNN.add(Embedding(input_dim = corpus_size + 1,
                        output_dim = sequence_length,
                        input_length = sequence_length))

# The first SimpleRNN keeps return_sequences = True so that the second one
# receives one vector per time step instead of only the final output.
model_RNN.add(SimpleRNN(units = 95, return_sequences = True))
model_RNN.add(SimpleRNN(units = 95))

model_RNN.add(Dense(units = 100, activation = "relu"))

# One softmax probability per token id (0..corpus_size).
model_RNN.add(Dense(units = corpus_size + 1, activation = "softmax"))

model_RNN.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])

In [186]:
model_RNN.summary()

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 14, 14)            38066     
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 14, 95)            10450     
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 95)                18145     
_________________________________________________________________
dense_18 (Dense)             (None, 100)               9600      
_________________________________________________________________
dense_19 (Dense)             (None, 2719)              274619    
Total params: 350,880
Trainable params: 350,880
Non-trainable params: 0
_________________________________________________________________


In [187]:
# NOTE: `MC` is rebound here; the checkpoint callback created for the LSTM
# model is replaced by one that writes to "Text-Generator-RNN.h5".
MC = ModelCheckpoint(filepath = "Text-Generator-RNN.h5", monitor = "loss", verbose = 1)
# Trains the SimpleRNN model on the same features/labels as the LSTM model.
history_RNN = model_RNN.fit(features, labels, epochs = 150, verbose = 1, callbacks = [MC])

Epoch 1/150

Epoch 00001: saving model to Text-Generator-RNN.h5
Epoch 2/150

Epoch 00002: saving model to Text-Generator-RNN.h5
Epoch 3/150

Epoch 00003: saving model to Text-Generator-RNN.h5
Epoch 4/150

Epoch 00004: saving model to Text-Generator-RNN.h5
Epoch 5/150

Epoch 00005: saving model to Text-Generator-RNN.h5
Epoch 6/150

Epoch 00006: saving model to Text-Generator-RNN.h5
Epoch 7/150

Epoch 00007: saving model to Text-Generator-RNN.h5
Epoch 8/150

Epoch 00008: saving model to Text-Generator-RNN.h5
Epoch 9/150

Epoch 00009: saving model to Text-Generator-RNN.h5
Epoch 10/150

Epoch 00010: saving model to Text-Generator-RNN.h5
Epoch 11/150

Epoch 00011: saving model to Text-Generator-RNN.h5
Epoch 12/150

Epoch 00012: saving model to Text-Generator-RNN.h5
Epoch 13/150

Epoch 00013: saving model to Text-Generator-RNN.h5
Epoch 14/150

Epoch 00014: saving model to Text-Generator-RNN.h5
Epoch 15/150

Epoch 00015: saving model to Text-Generator-RNN.h5
Epoch 16/150

Epoch 00016: saving 


Epoch 00054: saving model to Text-Generator-RNN.h5
Epoch 55/150

Epoch 00055: saving model to Text-Generator-RNN.h5
Epoch 56/150

Epoch 00056: saving model to Text-Generator-RNN.h5
Epoch 57/150

Epoch 00057: saving model to Text-Generator-RNN.h5
Epoch 58/150

Epoch 00058: saving model to Text-Generator-RNN.h5
Epoch 59/150

Epoch 00059: saving model to Text-Generator-RNN.h5
Epoch 60/150

Epoch 00060: saving model to Text-Generator-RNN.h5
Epoch 61/150

Epoch 00061: saving model to Text-Generator-RNN.h5
Epoch 62/150

Epoch 00062: saving model to Text-Generator-RNN.h5
Epoch 63/150

Epoch 00063: saving model to Text-Generator-RNN.h5
Epoch 64/150

Epoch 00064: saving model to Text-Generator-RNN.h5
Epoch 65/150

Epoch 00065: saving model to Text-Generator-RNN.h5
Epoch 66/150

Epoch 00066: saving model to Text-Generator-RNN.h5
Epoch 67/150

Epoch 00067: saving model to Text-Generator-RNN.h5
Epoch 68/150

Epoch 00068: saving model to Text-Generator-RNN.h5
Epoch 69/150

Epoch 00069: saving mode

Epoch 107/150

Epoch 00107: saving model to Text-Generator-RNN.h5
Epoch 108/150

Epoch 00108: saving model to Text-Generator-RNN.h5
Epoch 109/150

Epoch 00109: saving model to Text-Generator-RNN.h5
Epoch 110/150

Epoch 00110: saving model to Text-Generator-RNN.h5
Epoch 111/150

Epoch 00111: saving model to Text-Generator-RNN.h5
Epoch 112/150

Epoch 00112: saving model to Text-Generator-RNN.h5
Epoch 113/150

Epoch 00113: saving model to Text-Generator-RNN.h5
Epoch 114/150

Epoch 00114: saving model to Text-Generator-RNN.h5
Epoch 115/150

Epoch 00115: saving model to Text-Generator-RNN.h5
Epoch 116/150

Epoch 00116: saving model to Text-Generator-RNN.h5
Epoch 117/150

Epoch 00117: saving model to Text-Generator-RNN.h5
Epoch 118/150

Epoch 00118: saving model to Text-Generator-RNN.h5
Epoch 119/150

Epoch 00119: saving model to Text-Generator-RNN.h5
Epoch 120/150

Epoch 00120: saving model to Text-Generator-RNN.h5
Epoch 121/150

Epoch 00121: saving model to Text-Generator-RNN.h5
Epoch 122/