## Next Word Prediction

In [1]:
# Read the whole normalized corpus into memory as a single string.
# The `with` block closes the file on exit, so no explicit close() is needed;
# the encoding is pinned so decoding does not depend on the platform default.
with open("/content/normalized.txt", "r", encoding="utf-8") as corpus_file:
  data = corpus_file.read()

In [2]:
# Show only the beginning of the corpus; echoing the full text floods the
# notebook output and bloats the saved file.
data[:500]



In [3]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [4]:
# Keras tokenizer with its default settings; it is fitted on the corpus in a
# later cell.
tokenizer = Tokenizer()

# Lower-case the corpus and split it into one entry per line.
# `data` is rebound from a str to a list here, so guard on the type: without
# the guard, re-running this cell would call .lower() on a list and raise
# AttributeError.
if isinstance(data, str):
  data = data.lower().split("\n")

In [5]:
# Number of entries obtained by splitting the corpus on newlines.
len(data)

10349

In [6]:
# Build the word -> integer index from every line of the corpus.
tokenizer.fit_on_texts(data)

# One more than the number of distinct words. Keras word indices start at 1,
# so the extra slot leaves room for index 0, which is also the value that
# pad_sequences fills with by default.
total_words = 1 + len(tokenizer.word_index)

In [7]:
# Number of distinct tokens plus one; used below as the one-hot width and as
# the size of the model's output layer.
print(total_words)

2011


In [8]:
import io
import json

# Serialise the fitted tokenizer configuration to disk.
# NOTE: to_json() already returns a JSON string, and that string is
# JSON-encoded once more when written, so the file holds a single JSON string
# literal. A reader has to decode the file once to get back the string that
# to_json() produced.
tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w', encoding='utf-8') as json_file:
    json.dump(tokenizer_json, json_file, ensure_ascii=False)

In [10]:
# Turn every corpus line into its token-id sequence, then emit every prefix of
# length >= 2 as one training example: the last id of each prefix is the word
# to predict and the ids before it are the context.
# Lines that tokenize to fewer than two ids contribute nothing.
input_sequences = [
  token_ids[:prefix_end]
  for token_ids in tokenizer.texts_to_sequences(data)
  for prefix_end in range(2, len(token_ids) + 1)
]

In [11]:
# Total number of n-gram prefixes generated from the corpus.
print("Sequences : ",len(input_sequences))

Sequences :  139168


In [12]:
# Length of the longest n-gram prefix; every sequence is padded to this length
# in a later cell.
max_sequence_len = max(len(sequence) for sequence in input_sequences)

In [13]:
# Longest prefix length, i.e. the width of the padded sequence matrix.
print("Maximum Length : ", max_sequence_len)

Maximum Length :  52


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

In [15]:
# Left-pad every prefix with zeros to a common length so the sequences form a
# rectangular matrix with the real tokens (and the target word) at the right.
# pad_sequences already returns a NumPy array, so wrapping it in np.array()
# only made a redundant copy of the whole matrix.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')


In [16]:
# All columns except the last are the model input (the context); the final
# column holds the id of the word to predict.
xs, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [17]:
# Sanity check: inputs and labels must have the same number of rows.
print(len(xs), len(labels))

139168 139168


In [18]:
import tensorflow as tf

# One-hot encode the target word ids: one row per training sequence and
# `total_words` columns, matching the softmax output layer and the
# categorical_crossentropy loss used when the model is compiled.
# NOTE(review): this is a dense float32 matrix of len(labels) x total_words,
# which is large for this corpus; integer labels with a sparse loss would
# avoid it, but that requires changing the compile cell as well.
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [19]:
# Inspect the one-hot target row of the first training sequence.
ys[0]

array([0., 0., 0., ..., 0., 0., 0.], dtype=float32)

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
from tensorflow.keras.optimizers import Adam

In [26]:
# Next-word model: embedding -> two stacked bidirectional LSTMs -> softmax over
# the vocabulary.
model = Sequential([
    # Maps each token id to a 240-dimensional vector; the input is the padded
    # context of max_sequence_len - 1 tokens (the last token is the label).
    Embedding(total_words, 240, input_length=max_sequence_len-1),
    # return_sequences=True keeps the per-timestep outputs so the second
    # recurrent layer receives a sequence rather than a single vector.
    Bidirectional(LSTM(150, return_sequences=True)),
    Bidirectional(LSTM(75)),
    # One probability per vocabulary index.
    Dense(total_words, activation="softmax"),
])

# `lr` is a deprecated alias that newer Keras releases reject;
# `learning_rate` is the supported argument name.
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

In [27]:
# Print the layer-by-layer output shapes and parameter counts.
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 51, 240)           482640    
_________________________________________________________________
bidirectional_2 (Bidirection (None, 51, 300)           469200    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 150)               225600    
_________________________________________________________________
dense_2 (Dense)              (None, 2011)              303661    
Total params: 1,481,101
Trainable params: 1,481,101
Non-trainable params: 0
_________________________________________________________________


In [28]:
# Train for 20 epochs on all n-gram sequences. No validation data is passed,
# so the reported loss/accuracy are training-set figures only. The returned
# History object is kept in `history`.
history = model.fit(xs, ys, epochs=20, verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [29]:
# Greedy text generation: starting from `seed_text`, repeatedly predict the
# most probable next word and append it, `next_words` times.
seed_text = "cover"
next_words = 20

for _ in range(next_words):
    # Encode the text generated so far and left-pad it to the model's input
    # length (pad_sequences also truncates from the left if it is longer).
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')

    # Sequential.predict_classes() was removed from Keras; taking the argmax of
    # the softmax output gives the same class id.
    probabilities = model.predict(token_list, verbose=0)
    predicted = int(np.argmax(probabilities, axis=-1)[0])

    # index_word is the reverse of word_index, so this is a direct lookup
    # instead of a scan over the vocabulary. Ids with no word (e.g. the
    # padding id 0) yield an empty string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word
print(seed_text)



cover his breath let this the ground as the consequences as he tried to the same thing he smiled than showing


In [30]:
# Persist the trained model to next_word.h5 in the current working directory.
model.save("next_word.h5")

In [31]:
import pickle

# Saving: pickle the fitted tokenizer so the same word -> id mapping can be
# restored later without refitting.
with open('tokenizer.pickle', 'wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Loading: restore the tokenizer written by the saving cell.
# pickle.load can execute arbitrary code, so only unpickle files you created
# yourself or otherwise trust.
with open('tokenizer.pickle', 'rb') as pickle_file:
    tokenizer = pickle.load(pickle_file)