In [28]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.preprocessing import LabelEncoder
import re
import string

In [29]:
file = open("Next_word.txt")
text=file.read()

In [30]:
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

In [31]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts([text])
unique_words = len(tokenizer.word_index)
unique_words

8930

In [32]:
tokenizer.word_index

{'the': 1,
 'and': 2,
 'to': 3,
 'of': 4,
 'a': 5,
 'i': 6,
 '”': 7,
 'in': 8,
 'that': 9,
 'it': 10,
 'he': 11,
 'was': 12,
 'you': 13,
 'his': 14,
 'is': 15,
 'my': 16,
 'have': 17,
 'with': 18,
 'as': 19,
 'had': 20,
 'at': 21,
 'which': 22,
 'for': 23,
 'be': 24,
 'not': 25,
 'me': 26,
 'but': 27,
 'from': 28,
 'we': 29,
 'this': 30,
 'said': 31,
 'upon': 32,
 'there': 33,
 'holmes': 34,
 'him': 35,
 'so': 36,
 'her': 37,
 'she': 38,
 'all': 39,
 '’': 40,
 'been': 41,
 'your': 42,
 'on': 43,
 'very': 44,
 'by': 45,
 'one': 46,
 'are': 47,
 '“i': 48,
 'were': 49,
 'an': 50,
 'no': 51,
 'would': 52,
 'out': 53,
 'what': 54,
 'then': 55,
 'up': 56,
 'when': 57,
 'man': 58,
 'could': 59,
 'has': 60,
 'do': 61,
 'into': 62,
 'or': 63,
 'little': 64,
 'will': 65,
 'who': 66,
 'mr': 67,
 'if': 68,
 'some': 69,
 'down': 70,
 'see': 71,
 'now': 72,
 'our': 73,
 'should': 74,
 'may': 75,
 'am': 76,
 'us': 77,
 'over': 78,
 'they': 79,
 'can': 80,
 'more': 81,
 'think': 82,
 'about': 83,
 'mu

In [33]:
input_sequence=[]
for sentence in text.split('\n'):

    tokenized_sentence=tokenizer.texts_to_sequences([sentence])[0]
    print(tokenized_sentence)
    for i in range(1,len(tokenized_sentence)):
        input_sequence.append(tokenized_sentence[:i+1])

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
[7334, 119, 6, 372, 3, 226, 69, 1111, 22, 75, 258, 77, 8, 30]
[104, 7]
[]
[10, 12, 850, 46, 361, 57, 128, 34, 332, 28, 14]
[7335, 11, 477, 8, 14, 109, 5, 953, 4, 448, 285, 3138, 78]
[18, 1229, 2, 2335]
[]
[48, 17, 189, 1, 65, 4, 1, 2491, 246, 7, 31, 11, 743, 2211, 144]
[2395, 2118, 6, 17, 41, 1717, 3, 141, 53, 1, 439, 3307, 4, 1]
[4480, 18, 22, 10, 15, 1285, 1, 7336, 1821, 22, 21, 1]
[94, 4, 1, 1644, 433, 12, 64, 549, 4, 338, 267, 2614, 15, 72, 140]
[1, 1049, 8, 4475, 3307, 25, 81, 91, 338, 7337, 357, 509, 80]
[763, 50, 1821, 4, 338, 7338, 8, 113, 4, 430, 10, 15, 1792]
[783, 9, 68, 272, 7339, 20, 489, 30, 1459, 52, 17, 20, 5]
[1172, 7340, 199, 185, 46, 4, 97, 52, 2203, 35, 3, 5, 44]
[666, 3067, 16, 7341, 141, 60, 25, 41, 3308, 314, 10, 60]
[2560, 9, 11, 60, 1, 44, 3057, 2673, 23, 513, 8, 1, 111]
[4, 247, 4, 1, 514, 2, 72, 166, 30, 15, 163, 666, 23]
[7342, 2060, 19, 1, 182, 58, 15, 1119, 9, 29, 47, 556]
[598, 8, 14, 1135, 

In [34]:
input_sequence

[[145, 4789],
 [145, 4789, 1],
 [145, 4789, 1, 1020],
 [145, 4789, 1, 1020, 4],
 [145, 4789, 1, 1020, 4, 128],
 [145, 4789, 1, 1020, 4, 128, 34],
 [145, 4789, 1, 1020, 4, 128, 34, 45],
 [145, 4789, 1, 1020, 4, 128, 34, 45, 611],
 [145, 4789, 1, 1020, 4, 128, 34, 45, 611, 2235],
 [145, 4789, 1, 1020, 4, 128, 34, 45, 611, 2235, 2236],
 [30, 1021],
 [30, 1021, 15],
 [30, 1021, 15, 23],
 [30, 1021, 15, 23, 1],
 [30, 1021, 15, 23, 1, 275],
 [30, 1021, 15, 23, 1, 275, 4],
 [30, 1021, 15, 23, 1, 275, 4, 394],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2],
 [30, 1021, 15, 23, 1, 275, 4, 394, 2237, 21, 51, 1676, 2, 18],
 [572, 51],
 [572, 51, 3398],
 [572, 51, 3398, 3399],
 [572, 51, 3398, 3399, 13],
 [572, 51, 3398, 3399, 13, 75],
 [572, 51, 3398, 3399, 13, 75, 817],
 [572, 51, 3398, 33

In [35]:
max_word_len=max([len(x) for x in input_sequence])
print(max_word_len)

20


In [36]:
padded_input_sequences=pad_sequences(input_sequence,maxlen=max_word_len - 1,padding='pre')

In [37]:
padded_input_sequences

array([[   0,    0,    0, ...,    0,  145, 4789],
       [   0,    0,    0, ...,  145, 4789,    1],
       [   0,    0,    0, ..., 4789,    1, 1020],
       ...,
       [   0,    0,    0, ...,    3,  360,   83],
       [   0,    0,    0, ...,  360,   83,  358],
       [   0,    0,    0, ...,   83,  358, 1673]], dtype=int32)

In [38]:
#select all the row and all columns except for the last column.
x=padded_input_sequences[:,:-1]
print(x)

[[   0    0    0 ...    0    0  145]
 [   0    0    0 ...    0  145 4789]
 [   0    0    0 ...  145 4789    1]
 ...
 [   0    0    0 ... 8930    3  360]
 [   0    0    0 ...    3  360   83]
 [   0    0    0 ...  360   83  358]]


In [39]:
#select the last column of each row .
y=padded_input_sequences[:,-1]
print(y)

[4789    1 1020 ...   83  358 1673]


In [40]:
print(x.shape)
print(y.shape)

(101619, 18)
(101619,)


In [41]:
#ONE HOT ENCODING.
y=to_categorical(y,num_classes=unique_words + 1) #num_classes is the number of unique words present + 1

In [42]:
print(y.shape)
y

(101619, 8931)


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
model=Sequential()
model.add(Embedding(unique_words + 1,100,input_length=max_word_len - 1 ))
model.add(LSTM(150)) # 150 is the number of nodes inside the gates.
model.add(Dense(8931, activation='softmax'))



In [44]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [45]:
model.summary()

In [46]:
model.fit(x,y,epochs=1)

[1m  92/3176[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m3:32[0m 69ms/step - accuracy: 0.0433 - loss: 8.1208

KeyboardInterrupt: 

In [None]:
import time
text="This eBook  "
for i in range(3):
    #tokenize
    token_text=tokenizer.texts_to_sequences([text])[0]
    #padding
    padded_token_text=pad_sequences([token_text],maxlen=19,padding='pre')
    #predict
    pos=np.argmax(model.predict(padded_token_text))

    for word,index  in tokenizer.word_index.items():
        if index==pos:
            text=text+" "+word
            print(text)
            time.sleep(1)