## Data Collection

In [None]:
import nltk
from nltk.corpus        import gutenberg
import pandas           as pd

In [None]:
# Download the NLTK 'gutenberg' corpus package (imported above as `gutenberg`).
nltk.download('gutenberg')

In [None]:
text =              'shakespeare-hamlet.txt'
fileName =          'hamlet.txt'
writeOperation =    'w'
readOperation =     'r'

# Persist the raw Hamlet text from the Gutenberg corpus to a local file first;
# without this step the read below fails on a clean checkout because
# `fileName` does not exist yet.
with open(fileName, writeOperation, encoding = 'utf-8') as file:
    file.write(gutenberg.raw(text))

# Read the text back lower-cased so the vocabulary is case-insensitive.
with open(fileName, readOperation, encoding = 'utf-8') as file:
    textFromFile = file.read().lower()

## Data Preprocessing

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text        import Tokenizer
from tensorflow.keras.preprocessing.sequence    import pad_sequences
from sklearn.model_selection                    import train_test_split

In [None]:
# Build the vocabulary from the whole lower-cased text, passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([textFromFile])

In [None]:
# Inspect the word -> integer index mapping learned by the tokenizer.
tokenizer.word_index

In [None]:
# Vocabulary size plus one.
# NOTE(review): presumably the extra slot accounts for index 0, which is not
# assigned to any word and is used as the padding value — confirm.
totalWords = len(tokenizer.word_index) + 1
totalWords

In [None]:
# Expand every line of the text into its n-gram prefixes: a line tokenized as
# [a, b, c] contributes [a, b] and [a, b, c]. Lines with fewer than two tokens
# contribute nothing.
inputSequences = [
    tokens[:end]
    for tokens in (
        tokenizer.texts_to_sequences([row])[0]
        for row in textFromFile.split('\n')
    )
    for end in range(2, len(tokens) + 1)
]


In [None]:
# Peek at the generated n-gram sequences.
inputSequences

In [None]:
# Length of the longest n-gram sequence; used as the padding target.
sequenceLength = max(map(len, inputSequences))
sequenceLength

In [None]:
# Left-pad ('pre') every sequence to `sequenceLength` so the rows form a
# rectangular array whose last column is always the final token of a sequence.
inputSequences = np.array(
    pad_sequences(inputSequences, maxlen=sequenceLength, padding='pre')
)

inputSequences

In [None]:
## Labels and Predictions

import tensorflow as tf

# Features: all the words except the last one of each row.
# Label: only the last word of each row.
x, y = inputSequences[:, :-1], inputSequences[:, -1]

In [None]:
# Convert the integer labels in y to categorical (one-hot) rows with
# `totalWords` classes, the same width as the model's final Dense layer.

y = tf.keras.utils.to_categorical(y, num_classes = totalWords)
y

In [None]:
# Hold out 20% of the samples; they are used as validation data during training.
# NOTE(review): no random_state is given, so the split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

## GRU RNN Model Variation

In [None]:
from tensorflow.keras.models        import Sequential
from tensorflow.keras.layers        import Embedding, GRU, Dense, Dropout
from tensorflow.keras.callbacks     import EarlyStopping

In [None]:
# Hyperparameters consumed when the model is assembled below.
selectedWords           = 100           # second argument of the Embedding layer (embedding vector size)
neurons                 = 150           # units of the first GRU layer
dropoutLayer            = 0.2           # rate passed to the Dropout layer
activationFunction      = 'softmax'     # activation of the final Dense layer

In [None]:
# Callback configuration: watch the validation loss with a patience of 5
# epochs and restore the best weights seen when training is stopped.
earlyStopping = EarlyStopping(
    monitor                 = 'val_loss',
    patience                = 5,
    restore_best_weights    = True
)

In [None]:
# Architecture: Embedding -> GRU (returns the full sequence) -> Dropout ->
# GRU (returns the last state only) -> Dense layer with one unit per vocabulary entry.
model = Sequential([
    Embedding(totalWords, selectedWords, input_length=sequenceLength - 1),
    GRU(neurons, return_sequences=True),
    Dropout(dropoutLayer),
    GRU(100),
    Dense(totalWords, activation=activationFunction),
])

## Compile the Model

In [None]:
# Settings passed to model.compile below.
lossFunction    = 'categorical_crossentropy'   # y is converted with to_categorical above
optimizer       = 'adam'
metrics         = 'accuracy'

In [None]:
# Attach the configured loss, optimizer and tracked metric to the model.
model.compile(optimizer=optimizer, loss=lossFunction, metrics=[metrics])

In [None]:
# Print the layer-by-layer overview of the model.
model.summary()

## Train the Model

In [None]:
# Train for up to 100 epochs, validating on the held-out split. The
# `earlyStopping` callback configured earlier is passed in so training halts
# once `val_loss` stops improving and the best weights are restored; without
# it the callback object would be created but never take effect.
history = model.fit(
    x_train,
    y_train,
    epochs = 100,
    validation_data = (x_test, y_test),
    callbacks = [earlyStopping],
    verbose = 1
)

## Predict The Next Word

In [None]:
def predictNextWord(model, tokenizer, text, sequence_length):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Trained model exposing ``predict``; called with one padded
            row of ``sequence_length - 1`` token ids.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index``.
        text: Seed text whose continuation is wanted.
        sequence_length: Full training sequence length (model input length
            plus one for the predicted token).

    Returns:
        The predicted word, or ``None`` when the predicted index does not
        map to any word in ``tokenizer.word_index``.
    """
    tokenList = tokenizer.texts_to_sequences([text])[0]

    # Keep only the most recent tokens that fit the model input. The limit is
    # the `sequence_length` argument, not a module-level global, so the
    # function works for any model/tokenizer pair passed in.
    if len(tokenList) >= sequence_length:
        tokenList = tokenList[-(sequence_length - 1):]

    paddedTokens = pad_sequences([tokenList], maxlen = sequence_length - 1, padding = 'pre')
    probabilities = model.predict(paddedTokens, verbose = 0)

    # A single row is predicted, so reduce the argmax result to a plain int
    # instead of comparing vocabulary indices against a NumPy array.
    predictedWordIndex = int(np.argmax(probabilities, axis = 1)[0])

    return next(
        (word for word, index in tokenizer.word_index.items() if index == predictedWordIndex),
        None,
    )

In [None]:
inputText = "To be or not to be"
# The model input was declared with sequenceLength - 1 tokens, so adding 1
# recovers the full sequence length expected by predictNextWord.
maxSequenceLength = model.input_shape[1] + 1

In [None]:
# Ask the trained model for the word that follows the seed text.
nextWord = predictNextWord(model, tokenizer, inputText, maxSequenceLength)

In [None]:
# Show the seed text next to the model's prediction.
print(f'Input Text:         {inputText}')
print(f'Predicted Word:     {nextWord}')

## Save the Model and Tokenizer

In [None]:
# Output path and file mode used when pickling the tokenizer below.
tokenizerFileName       = 'tokenizerGRU.pickle'
writeBinaryMode         = 'wb'

In [None]:
# Save the trained model to a .keras file.
model.save('prediction_gru.keras')

In [None]:
import pickle

# Serialize the fitted tokenizer so the same word -> index mapping can be
# reused later together with the saved model.
# NOTE: pickle files should only ever be loaded back from trusted sources.
with open(tokenizerFileName, writeBinaryMode) as handle:
    pickle.dump(tokenizer, handle, protocol = pickle.HIGHEST_PROTOCOL)