## Reading the file for processing the data

In [1]:
# Load the whole corpus from 'LSTM_DATA.txt' (decoded as UTF-8) into one string.
with open('LSTM_DATA.txt', encoding='utf-8') as corpus_file:
    data = corpus_file.read()

# Sanity check: show only the first 500 characters of the loaded text.
print(data[:500])


The Project Gutenberg eBook of Pride and Prejudice
    
This ebook is for the use of anyone anywhere in the United States and
most other parts of the world at no cost and with almost no restrictions
whatsoever. You may copy it, give it away or re-use it under the terms
of the Project Gutenberg License included with this ebook or online
at www.gutenberg.org. If you are not located in the United States,
you will have to check the laws of the country where you are located
before using this eBook.




##Importing Necessary Libraries

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding,Dense,LSTM, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

###Preprocessing the textual data by tokenizing and generating sequences

In [3]:
# Fit a tokenizer on the corpus; 'data' is the raw text read from the file.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# One document goes in, so exactly one integer sequence comes back.
(seqence_data,) = tokenizer.texts_to_sequences([data])

# Persist the fitted tokenizer to 'token.pkl'.
with open('token.pkl', 'wb') as token_file:
    pickle.dump(tokenizer, token_file)

# Show the first 15 token ids of the encoded corpus.
print(seqence_data[:15])

[1, 187, 149, 98, 2, 60, 3, 72, 29, 98, 8, 16, 1, 150, 2]


###len(seqence_data) tells us how many items are in the list called seqence_data.


In [4]:
len(seqence_data) # Total number of tokens in the encoded corpus (seqence_data is a list of integer token ids)

4598

### `tokenizer.word_index` is a dictionary held by the Tokenizer object. Each unique word from the text data is paired with an integer: that word's index in the tokenizer's vocabulary.

In [5]:
# NOTE(review): the token ids printed above start at 1, so index 0 is presumably
# reserved (padding); a classifier over these ids would then need
# vocab_size + 1 output classes — confirm against the Keras Tokenizer docs.
vocab_size = len(tokenizer.word_index) # Number of distinct words (tokens) the tokenizer found in the text data.
vocab_size # Display the count as this cell's output.

1447

###This code takes a list of words (seqence_data) and creates sequences of four consecutive words. It then prints the number of sequences created and the first 10 sequences. NumPy is used to handle the data efficiently.

In [6]:
# Slide a window of four consecutive token ids across the corpus. Each window
# ends at position `end` and starts three tokens earlier, so there is one
# window for every position from index 3 onwards.
sequence = [seqence_data[end - 3:end + 1] for end in range(3, len(seqence_data))]

print('The Length of sequence is:', len(sequence))  # how many windows were built
sequence = np.array(sequence)  # stack the windows into a NumPy array, one per row
print(sequence[:10])  # preview the first 10 windows

The Length of sequence is: 4595
[[  1 187 149  98]
 [187 149  98   2]
 [149  98   2  60]
 [ 98   2  60   3]
 [  2  60   3  72]
 [ 60   3  72  29]
 [  3  72  29  98]
 [ 72  29  98   8]
 [ 29  98   8  16]
 [ 98   8  16   1]]


In [7]:
# Split every 4-token window into model input and target using NumPy slicing
# rather than a Python-level loop over the rows. `sequence` must be the 2-D
# array of 4-token windows (one window per row).
x = sequence[:, :3].copy()  # first three token ids of each window -> input context
y = sequence[:, 3].copy()   # fourth token id of each window -> next-word label

In [8]:
# Report the dimensions of the input array x and the label array y, one per line.
print(x.shape, y.shape, sep='\n')

(4595, 3)
(4595,)


###Convert the numerical labels in array y into one-hot encoded vectors.

In [9]:
# One-hot encode the integer next-word labels.
# The guard keeps this cell safe to re-run: `y` is overwritten in place, and
# encoding an already one-hot (2-D) array again would add a further
# 7560-wide axis instead of leaving the labels unchanged.
# NOTE(review): 7560 is hard-coded and has to equal the width of the model's
# softmax layer. It is far larger than the 1447 entries reported for
# tokenizer.word_index, so most classes can never occur — confirm the intent
# before changing it (the model definition must change in step).
if y.ndim == 1:
    y = to_categorical(y, num_classes=7560)

##Splitting the data:

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the samples for validation. A fixed random_state makes the
# shuffle, and therefore the reported metrics, reproducible across runs.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.1, random_state=42)

##Building the model

In [11]:
# Read the layer sizes from the prepared training arrays instead of repeating
# literals, so the network always matches the data it is trained on.
context_len = xtrain.shape[1]  # number of token ids per input window
num_classes = ytrain.shape[1]  # width of the one-hot encoded labels

model = Sequential()
# Map each token id to a 10-dimensional vector. Every id is a valid label
# index, so num_classes is also a sufficient input dimension for the embedding.
model.add(Embedding(num_classes, 10, input_length=context_len))
# Two stacked LSTMs: the first returns the full sequence so the second can
# consume one vector per time step.
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation='relu'))
# One probability per class for the next-word prediction.
model.add(Dense(num_classes, activation='softmax'))

In [12]:
# Print the layer-by-layer overview (output shapes and parameter counts).
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             75600     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7560)              7567560   
                                                                 
Total params: 20692160 (78.93 MB)
Trainable params: 20692160 (78.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [13]:
# Configure training: categorical cross-entropy against the one-hot labels,
# Adam with a learning rate of 0.001, and accuracy tracked as the metric.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [14]:
# Train for 100 epochs, evaluating on the held-out split after every epoch;
# the returned History object is kept in `history`.
history = model.fit(
    xtrain,
    ytrain,
    epochs=100,
    validation_data=(xtest, ytest),
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

## Predicting the next words with the trained model


In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import time

text = "To some the delightful freshness and humour"

# Greedily generate up to 15 more words, feeding each prediction back into the
# prompt so the next step sees the extended text.
for _ in range(15):
    # Encode the running text as token ids.
    token_text = tokenizer.texts_to_sequences([text])[0]
    # Fit the ids to the model's 3-token input window (short prompts are left-padded).
    padded_token_text = pad_sequences([token_text], maxlen=3, padding='pre')
    # Class id with the highest predicted probability.
    pos = int(np.argmax(model.predict(padded_token_text)))

    # Direct id -> word lookup instead of scanning the whole word_index each step.
    word = tokenizer.index_word.get(pos)
    if word is None:
        # The predicted id has no vocabulary entry, so the text would not change
        # and every remaining iteration would repeat the same prediction.
        break

    text = text + " " + word
    print(text)
    time.sleep(2)  # pause so the generated words appear one at a time

To some the delightful freshness and humour of
To some the delightful freshness and humour of northanger
To some the delightful freshness and humour of northanger abbey
To some the delightful freshness and humour of northanger abbey its
To some the delightful freshness and humour of northanger abbey its completeness
To some the delightful freshness and humour of northanger abbey its completeness finish
To some the delightful freshness and humour of northanger abbey its completeness finish and
To some the delightful freshness and humour of northanger abbey its completeness finish and another
To some the delightful freshness and humour of northanger abbey its completeness finish and another grant
To some the delightful freshness and humour of northanger abbey its completeness finish and another grant not
To some the delightful freshness and humour of northanger abbey its completeness finish and another grant not at
To some the delightful freshness and humour of northanger abbey its compl