# Text Generation using LSTM by `Mr. Harshit Dawar!`

In [1]:
# Importing the Required Libraries
import spacy
import pandas as pd

In [2]:
# Function to print read the data from a file
def read_Data(path):
    with open(path, "r") as p:
        text = p.read()
    return text

In [3]:
read_Data("data.txt")

'Call me Ishmael.  Some years ago--never mind how long\nprecisely--having little or no money in my purse, and nothing\nparticular to interest me on shore, I thought I would sail about a\nlittle and see the watery part of the world.  It is a way I have of\ndriving off the spleen and regulating the circulation.  Whenever I\nfind myself growing grim about the mouth; whenever it is a damp,\ndrizzly November in my soul; whenever I find myself involuntarily\npausing before coffin warehouses, and bringing up the rear of every\nfuneral I meet; and especially whenever my hypos get such an upper\nhand of me, that it requires a strong moral principle to prevent me\nfrom deliberately stepping into the street, and methodically knocking\npeople\'s hats off--then, I account it high time to get to sea as soon\nas I can.  This is my substitute for pistol and ball.  With a\nphilosophical flourish Cato throws himself upon his sword; I quietly\ntake to the ship.  There is nothing surprising in this.  If t

In [9]:
# Creating a SpaCy Model for English
ENGLISH_MODEL_NLP = spacy.load("en_core_web_sm", disable = ["ner", "tagger", "parser"])

In [10]:
ENGLISH_MODEL_NLP.pipe_names

[]

In [11]:
# Setting the Max Length to a bigger number to use the complete dataset.
ENGLISH_MODEL_NLP.max_length = 1250000

In [25]:
# Function to remove the punctuations from the dataset
def remove_punc(data):
    return [token.text.lower() for token in ENGLISH_MODEL_NLP(data) if not token.is_punct
            and not token.text in "\n \n\n '"]

In [26]:
# Generating tokens from the data
tokens = remove_punc(read_Data("data.txt"))

In [27]:
len(tokens)

11328

In [29]:
tokens[:5]

['call', 'me', 'ishmael', 'some', 'years']

In [30]:
# Generating sequences of 15 words to predict the 16th word in the sentence
sequences = []
sequence_length = 15

for i in range(len(tokens) - sequence_length):
    sequences.append(tokens[i : i + sequence_length])

In [32]:
# Total length is 15 less than the actual length of the Tokens list because of the Sequence Length to be 15.
len(sequences)

11313

In [34]:
len(sequences[0])

15

In [35]:
" ".join(sequences[0])

'call me ishmael some years ago never mind how long precisely having little or no'

## Converting the Sequences into the Numerical Form that a Model Understands/Accepts

In [43]:
from keras.preprocessing.text import Tokenizer
import numpy as np

In [44]:
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts(sequences)

In [47]:
# tokenized_sequences = text_tokenizer.texts_to_matrix(sequences)

In [48]:
# np.unique(tokenized_sequences[0])

In [49]:
tokenized_sequences = text_tokenizer.texts_to_sequences(sequences)

In [51]:
tokenized_sequences[0]

[956, 14, 262, 51, 260, 407, 87, 218, 129, 111, 954, 259, 50, 43, 37]

In [53]:
# Printing the word and their corresponding numbers
text_tokenizer.index_word

{1: 'the',
 2: 'a',
 3: 'and',
 4: 'of',
 5: 'i',
 6: 'to',
 7: 'in',
 8: 'it',
 9: 'that',
 10: 'he',
 11: 'his',
 12: 'was',
 13: 'but',
 14: 'me',
 15: 'with',
 16: 'as',
 17: 'at',
 18: 'this',
 19: 'you',
 20: 'is',
 21: 'all',
 22: 'for',
 23: 'my',
 24: 'on',
 25: 'be',
 26: "'s",
 27: 'not',
 28: 'from',
 29: 'there',
 30: 'one',
 31: 'up',
 32: 'what',
 33: 'him',
 34: 'so',
 35: 'bed',
 36: 'now',
 37: 'no',
 38: 'about',
 39: 'into',
 40: 'by',
 41: 'were',
 42: 'out',
 43: 'or',
 44: 'harpooneer',
 45: 'had',
 46: 'then',
 47: 'have',
 48: 'an',
 49: 'upon',
 50: 'little',
 51: 'some',
 52: 'old',
 53: 'like',
 54: 'if',
 55: 'they',
 56: 'would',
 57: 'do',
 58: 'over',
 59: 'landlord',
 60: 'thought',
 61: 'room',
 62: 'when',
 63: 'could',
 64: "n't",
 65: 'night',
 66: 'here',
 67: 'head',
 68: 'such',
 69: 'which',
 70: 'man',
 71: 'did',
 72: 'sea',
 73: 'time',
 74: 'other',
 75: 'very',
 76: 'go',
 77: 'these',
 78: 'more',
 79: 'though',
 80: 'first',
 81: 'sort',


In [54]:
for word in tokenized_sequences[0]:
    print("{} : {}".format(word, text_tokenizer.index_word[word]))

956 : call
14 : me
262 : ishmael
51 : some
260 : years
407 : ago
87 : never
218 : mind
129 : how
111 : long
954 : precisely
259 : having
50 : little
43 : or
37 : no


In [55]:
# Printing the filters internally used by the Tokenizer for removing the punctuations
text_tokenizer.filters

'!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

In [56]:
# Printing the word counts in the Data
text_tokenizer.word_counts

OrderedDict([('call', 16),
             ('me', 1427),
             ('ishmael', 78),
             ('some', 439),
             ('years', 80),
             ('ago', 51),
             ('never', 262),
             ('mind', 98),
             ('how', 189),
             ('long', 220),
             ('precisely', 26),
             ('having', 87),
             ('little', 448),
             ('or', 554),
             ('no', 585),
             ('money', 75),
             ('in', 3269),
             ('my', 1035),
             ('purse', 45),
             ('and', 5573),
             ('nothing', 165),
             ('particular', 90),
             ('to', 3750),
             ('interest', 15),
             ('on', 990),
             ('shore', 15),
             ('i', 4125),
             ('thought', 390),
             ('would', 405),
             ('sail', 60),
             ('about', 585),
             ('a', 5988),
             ('see', 240),
             ('the', 8970),
             ('watery', 15),
             (

In [57]:
# Printing the word occurence in different number of documents/sentences
text_tokenizer.word_docs

defaultdict(int,
            {'years': 80,
             'never': 262,
             'mind': 98,
             'how': 189,
             'long': 211,
             'ago': 51,
             'precisely': 26,
             'little': 432,
             'or': 477,
             'ishmael': 78,
             'some': 414,
             'call': 16,
             'having': 87,
             'me': 1250,
             'no': 584,
             'money': 70,
             'in': 3008,
             'my': 982,
             'purse': 33,
             'and': 4784,
             'nothing': 156,
             'particular': 90,
             'to': 3271,
             'interest': 15,
             'on': 972,
             'shore': 15,
             'i': 3599,
             'thought': 385,
             'would': 399,
             'sail': 60,
             'about': 578,
             'a': 4711,
             'see': 230,
             'the': 6273,
             'watery': 15,
             'part': 135,
             'of': 4064,
             'wor

In [59]:
corpus_size = len(text_tokenizer.word_counts)
corpus_size

2717