In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense 
from tensorflow.keras import Sequential
from tensorflow.keras.utils import to_categorical
import pickle
import numpy as np
import os

In [1]:
# Read the raw book text. The context manager guarantees the handle is closed
# even if reading fails (a bare open() would leave it open for the kernel's lifetime).
with open("/content/drive/MyDrive/pride and prejudice.txt", "r", encoding = "utf-8") as file:
  lines = list(file)

# Concatenate all lines exactly once.
data = " ".join(lines)
# Strip line breaks, the byte-order mark and curly quotes.
# The BOM is '\ufeff' (four hex digits); a five-digit '\uffeff' would parse as
# U+FFEF followed by a literal 'f' and never match anything.
data = data.replace('\n', ' ').replace('\r', ' ').replace('\ufeff', ' ').replace('“', ' ').replace('”', ' ')
# split() collapses every run of whitespace, leaving a clean list of words.
data = data.split()
# Words are re-joined with '.', so the preview below is dot-separated.
# NOTE(review): this relies on the Keras Tokenizer's default `filters` treating '.'
# as a separator — confirm if the Tokenizer is ever configured differently.
data = '.'.join(data)
data[:500]

'The.Project.Gutenberg.eBook.of.Pride.and.Prejudice,.by.Jane.Austen.This.eBook.is.for.the.use.of.anyone.anywhere.in.the.United.States.and.most.other.parts.of.the.world.at.no.cost.and.with.almost.no.restrictions.whatsoever..You.may.copy.it,.give.it.away.or.re-use.it.under.the.terms.of.the.Project.Gutenberg.License.included.with.this.eBook.or.online.at.www.gutenberg.org..If.you.are.not.located.in.the.United.States,.you.will.have.to.check.the.laws.of.the.country.where.you.are.located.before.using.th'

In [2]:
# Number of characters (not words) in the cleaned corpus string.
len(data)

698428

In [6]:
# Build the word -> integer-id vocabulary from the whole corpus.
token = Tokenizer()
token.fit_on_texts([data])
# Persist the fitted tokenizer so inference can reuse exactly the same vocabulary.
# `with` closes the file deterministically, so the pickle is fully flushed to disk
# (passing a bare open() to pickle.dump leaves the handle to the garbage collector).
with open('token.pkl', 'wb') as token_file:
  pickle.dump(token, token_file)
# texts_to_sequences returns one id list per input text; we passed a single text.
sequence_data = token.texts_to_sequences([data])[0]
sequence_data[:15]

[1, 176, 158, 916, 3, 321, 4, 1172, 30, 72, 2535, 41, 916, 23, 21]

In [7]:
# Number of word ids the tokenizer produced for the corpus.
len(sequence_data)

125316

In [8]:
# One more than the number of distinct words in the tokenizer's vocabulary.
# NOTE(review): the extra slot presumably accounts for id 0, which the Keras
# Tokenizer does not assign to any word — confirm against the Tokenizer docs.
vocab_size = 1 + len(token.word_index)
print(vocab_size)

7028


In [9]:
# Slide a 4-id window over the corpus one position at a time:
# each window holds three context ids followed by the id to predict.
sequences = [
  sequence_data[start:start + 4]
  for start in range(len(sequence_data) - 3)
]
print(len(sequences))
sequences = np.array(sequences)
sequences[:15]

125313


array([[   1,  176,  158,  916],
       [ 176,  158,  916,    3],
       [ 158,  916,    3,  321],
       [ 916,    3,  321,    4],
       [   3,  321,    4, 1172],
       [ 321,    4, 1172,   30],
       [   4, 1172,   30,   72],
       [1172,   30,   72, 2535],
       [  30,   72, 2535,   41],
       [  72, 2535,   41,  916],
       [2535,   41,  916,   23],
       [  41,  916,   23,   21],
       [ 916,   23,   21,    1],
       [  23,   21,    1,  518],
       [  21,    1,  518,    3]])

In [10]:
# Split every 4-id window into its inputs (first three ids) and target (fourth id).
X = np.array([window[0:3] for window in sequences])
y = np.array([window[3] for window in sequences])

In [11]:
# One-hot encode the integer targets with vocab_size classes.
# NOTE(review): this rebinds `y`, so running the cell a second time would feed the
# already-encoded matrix back into to_categorical — re-run the cell that builds
# X and y first.
# NOTE(review): the result is a dense float32 matrix with vocab_size columns per
# sample; integer labels with a sparse categorical loss would avoid materializing
# it — consider if memory becomes a problem.
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [13]:
# Next-word model: 3 word ids -> 10-d embeddings -> two stacked LSTMs ->
# dense hidden layer -> softmax over the whole vocabulary.
model = Sequential([
  Embedding(vocab_size, 10, input_length = 3),
  # return_sequences=True so the second LSTM receives the full 3-step sequence.
  LSTM(1000, return_sequences=True),
  LSTM(1000),
  Dense(1000, activation = 'relu'),
  Dense(vocab_size, activation='softmax'),
])

In [14]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             70280     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7028)              7035028   
                                                                 
Total params: 20,154,308
Trainable params: 20,154,308
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Categorical cross-entropy matches the one-hot targets; Adam with lr = 0.001.
model.compile(
  loss = 'categorical_crossentropy',
  metrics = ['accuracy'],
  optimizer = tf.keras.optimizers.Adam(learning_rate=0.001),
)
# Write the model to disk only when the monitored training loss improves.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
  'next_word_pred.h5',
  monitor = 'loss',
  verbose = 1,
  save_best_only=True,
)
model.fit(X, y, epochs = 100, batch_size = 64, callbacks = [checkpoint])

Epoch 94: loss improved from 0.40466 to 0.40163, saving model to next_word_pred.h5
Epoch 95/100
Epoch 95: loss did not improve from 0.40163
Epoch 96/100
Epoch 96: loss improved from 0.40163 to 0.40028, saving model to next_word_pred.h5
Epoch 97/100
Epoch 97: loss did not improve from 0.40028
Epoch 98/100
Epoch 98: loss improved from 0.40028 to 0.39478, saving model to next_word_pred.h5
Epoch 99/100
Epoch 99: loss did not improve from 0.39478
Epoch 100/100
Epoch 100: loss did not improve from 0.39478


<keras.callbacks.History at 0x7f5c32c2c890>

In [16]:
# Reload the checkpointed model and the tokenizer that was fitted alongside it.
model = tf.keras.models.load_model("next_word_pred.h5")
# NOTE: pickle.load can execute arbitrary code from the file — only load a
# token.pkl that this notebook itself produced.
# `with` closes the handle as soon as loading finishes (a bare open() leaks it).
with open('token.pkl', 'rb') as token_file:
  token = pickle.load(token_file)

In [22]:
def Predict_next_word(model, token, text):
  """Print and return the word the model scores highest for the given context.

  Args:
    model: object whose ``predict`` returns scores for the encoded input.
    token: fitted tokenizer exposing ``texts_to_sequences`` and ``word_index``.
    text: the context, handed to the tokenizer as a single text.

  Returns:
    The word in ``token.word_index`` whose id equals the argmax of the scores,
    or "" when no word has that id.
  """
  encoded = np.array(token.texts_to_sequences([text]))
  best_id = np.argmax(model.predict(encoded))
  # First vocabulary entry mapped to best_id; fall back to "" if none matches.
  predicted_word = next(
    (word for word, word_id in token.word_index.items() if word_id == best_id),
    "",
  )
  print(predicted_word)
  return predicted_word


In [27]:
# Interactive demo: type a phrase to see the predicted next word; enter 0 to stop.
while True:
  text = input("Enter your line: ")
  if text == "0":
    print("Execution Completed")
    break
  try:
    # split() with no argument ignores repeated, leading and trailing whitespace,
    # so no empty-string "words" reach the tokenizer.
    # Keep the LAST three words: the model is trained to predict a word from the
    # three words immediately before it, so for longer inputs the tail is the
    # relevant context.
    words = text.split()[-3:]
    print(words)

    Predict_next_word(model, token, words)
  except Exception as e:
    # Best-effort loop: report the problem and prompt again.
    print("Error occured: ", e)

Enter your line: why not this
['why', 'not', 'this']
how
Enter your line: how are you
['how', 'are', 'you']
all
Enter your line: 0
Execution Completed
