# Choosing an LSTM for textual data

In [None]:
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [None]:
# Open the training corpus as UTF-8 text; the handle is iterated line by line in the next cell.
file  = open('/content/nextwordprediction.txt', 'r', encoding = "utf8")

In [None]:
# Read every line of the corpus into a list and close the handle afterwards.
# The file object is single-use: re-run the cell that opens it before
# re-running this one (a closed handle now fails loudly instead of silently
# producing an empty list).
with file:
    lines = list(file)

In [None]:
# Concatenate all lines into one string, separated by single spaces.
# A single join is enough; repeating it once per line produced the same
# result at quadratic cost.
data = ' '.join(lines)

In [None]:
# Remove line breaks, the byte-order mark and curly double quotes from the text.
for unwanted in ('\n', '\r', '\ufeff', '“', '”'):
    data = data.replace(unwanted, '')

In [None]:
# Collapse every run of whitespace into a single space.
data = ' '.join(data.split())

In [None]:
# Preview the first 1000 characters of the cleaned corpus.
data[:1000]

"Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle This eBook is for the use of anyone anywhere at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.net Title: The Adventures of Sherlock Holmes Author: Arthur Conan Doyle Release Date: November 29, 2002 [EBook #1661] Last Updated: May 20, 2019 Language: English Character set encoding: UTF-8 *** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES *** Produced by an anonymous Project Gutenberg volunteer and Jose Menendez cover The Adventures of Sherlock Holmes by Arthur Conan Doyle Contents I. A Scandal in Bohemia II. The Red-Headed League III. A Case of Identity IV. The Boscombe Valley Mystery V. The Five Orange Pips VI. The Man with the Twisted Lip VII. The Adventure of the Blue Carbuncle VIII. The Adventure of the Speckled Band IX. The Adventure 

In [None]:
# Build the word -> index vocabulary from the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])  # the whole corpus is passed as a one-element list of texts

In [None]:
# Persist the fitted tokenizer so it can be reloaded for prediction.
# The `with` block guarantees the file is flushed and closed after writing.
with open('token_next_word.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [None]:
# Encode the corpus as a list of word indices; [0] unwraps the single text that was passed in.
sequence_data = tokenizer.texts_to_sequences([data])[0]

In [None]:
# First ten word indices of the encoded corpus.
sequence_data[:10]

[142, 4680, 1, 986, 5, 125, 33, 46, 556, 2164]

In [None]:
# Total number of tokens in the encoded corpus (not the number of unique words).
len(sequence_data)

108958

In [None]:
# Number of distinct words plus one.
# NOTE(review): presumably +1 because word indices start at 1 and index 0 is reserved — confirm against the Tokenizer docs.
vocab_size = len(tokenizer.word_index)+1

In [None]:
# Vocabulary size: passed to the Embedding layer and used as the width of the softmax output.
vocab_size

8624

In [None]:
# Slide a 6-token window over the corpus: five context words plus the word that follows them.
sequence = [
    sequence_data[end - 5:end + 1]
    for end in range(5, len(sequence_data))
]

In [None]:
# Number of 6-token training windows.
len(sequence)

108953

In [None]:
# Convert the list of windows into a 2-D array (one row per window).
sequence = np.array(sequence)

In [None]:
# Inspect the windows: each row holds five context indices followed by the target index.
sequence

array([[ 142, 4680,    1,  986,    5,  125],
       [4680,    1,  986,    5,  125,   33],
       [   1,  986,    5,  125,   33,   46],
       ...,
       [  71, 4678, 8623,    4,  347,   81],
       [4678, 8623,    4,  347,   81,  345],
       [8623,    4,  347,   81,  345, 1623]])

In [None]:
# Split each window into its five context indices (X) and its final target index (y).
X = [window[0:5] for window in sequence]
y = [window[5] for window in sequence]

In [None]:
# Turn the Python lists of features and targets into NumPy arrays.
X, y = np.array(X), np.array(y)

In [None]:
# One-hot encode the target indices over the whole vocabulary.
# NOTE(review): this overwrites `y`, so the cell is not idempotent — re-run the
# previous cell before re-running this one, otherwise the already-encoded array
# would be encoded again.
y = to_categorical(y, num_classes=vocab_size)

In [None]:
# Preview the first five one-hot encoded targets.
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Modeling

In [None]:
# Next-word model, declared as a single layer list:
#   - 10-dimensional embedding over 5-token inputs,
#   - two stacked 1000-unit LSTMs (the first returns the full sequence so the
#     second can consume it),
#   - a 1000-unit ReLU layer,
#   - a softmax over the whole vocabulary.
model = Sequential([
    Embedding(vocab_size, 10, input_length=5),
    LSTM(1000, return_sequences=True),
    LSTM(1000),
    Dense(1000, activation="relu"),
    Dense(vocab_size, activation="softmax"),
])
model.summary()



In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

# Configure the optimisation objective.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss="categorical_crossentropy",
)

# Write the model to disk only when the training loss improves on the best value so far.
checkpoint = ModelCheckpoint(
    "next_word_pred.keras",
    monitor='loss',
    verbose=1,
    save_best_only=True,
)

# Train for five epochs in batches of 64 windows.
model.fit(X, y, epochs=5, batch_size=64, callbacks=[checkpoint])

Epoch 1/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 6.7000
Epoch 1: loss improved from inf to 6.39879, saving model to next_word_pred.keras
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1784s[0m 1s/step - loss: 6.6998
Epoch 2/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 5.8473
Epoch 2: loss improved from 6.39879 to 5.80840, saving model to next_word_pred.keras
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1812s[0m 1s/step - loss: 5.8473
Epoch 3/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 5.5018
Epoch 3: loss improved from 5.80840 to 5.47970, saving model to next_word_pred.keras
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1747s[0m 1s/step - loss: 5.5018
Epoch 4/5
[1m1703/1703[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1s/step - loss: 5.2233
Epoch 4: loss improved from 5.47970 to 5.20965, saving model to

<keras.src.callbacks.history.History at 0x7829a7c7a020>

# Prediction

In [1]:
from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Restore the trained network from disk.
model = load_model('/content/next_word_pred.keras')

# Load the fitted tokenizer; the `with` block closes the file once unpickling finishes.
# NOTE: pickle.load can execute arbitrary code — only load pickle files you created yourself.
with open('/content/token_next_word.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

def predict_word(model, tokenizer, text):
    """Predict, print and return the word most likely to follow ``text``.

    Args:
        model: Object exposing ``predict(array)`` that returns scores per
            vocabulary index.
        tokenizer: Object exposing ``texts_to_sequences`` and ``word_index``.
        text: The context to continue; it is passed to the tokenizer as a
            single entry of a one-element batch.

    Returns:
        The predicted word, or ``""`` when the input contains no known
        tokens or the winning index has no entry in ``word_index``.
    """
    sequence = np.array(tokenizer.texts_to_sequences([text]))
    predicted_word = ""
    # An empty or fully out-of-vocabulary input encodes to a zero-length
    # sequence; skip the model call instead of feeding it an empty batch row.
    if sequence.size:
        preds = int(np.argmax(model.predict(sequence)))
        # Reverse lookup: find the word whose index equals the winning class.
        predicted_word = next(
            (word for word, index in tokenizer.word_index.items() if index == preds),
            "",
        )
    print(predicted_word)
    return predicted_word

# Interactive loop: read a line, predict the next word, repeat until the user enters "1".
while True:
    text = input("Enter your line: ")
    if text == "1":
        break
    # split() without an argument collapses repeated/trailing whitespace, so the
    # five-word window is never filled with empty-string tokens.
    words = text.split()[-5:]
    if not words:
        # Blank input carries no context to predict from; ask again.
        continue
    predict_word(model, tokenizer, words)

Enter your line: The Five Orange
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 475ms/step
and
Enter your line: I have seldom heard him
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 457ms/step
i
Enter your line: but as a lover he
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
was


KeyboardInterrupt: Interrupted by user