In [2]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text \
    import text_to_word_sequence
import tensorflow as tf
import logging
# Raise the TensorFlow logger threshold so only messages of ERROR
# severity or higher are emitted in the notebook output.
tf.get_logger().setLevel(logging.ERROR)

# --- Data preparation -------------------------------------
# Path of the text corpus read by the loading cell.
INPUT_FILE_NAME = '/content/frankenstein.txt'
# Number of consecutive words in one training fragment.
WINDOW_LENGTH = 40
# Stride (in words) between the starts of adjacent fragments.
WINDOW_STEP = 3
# Vocabulary cap given to the Tokenizer; also the width of the
# one-hot targets and of the softmax output layer.
MAX_WORDS = 7500

# --- Model / training -------------------------------------
# Output dimension of the Embedding layer.
EMBEDDING_WIDTH = 100
BATCH_SIZE = 256
EPOCHS = 32

# --- Inference --------------------------------------------
# Number of words generated after the seed words.
PREDICT_LENGTH = 3

In [3]:
# Read the whole corpus. The context manager guarantees the file
# handle is closed even if read() raises; 'utf-8-sig' decoding
# drops a leading byte-order mark if the file has one.
with open(INPUT_FILE_NAME, 'r', encoding='utf-8-sig') as input_file:
    text = input_file.read()

# Make lower case and split into individual words.
text = text_to_word_sequence(text)

# Create training examples: every WINDOW_STEP words, take a
# fragment of WINDOW_LENGTH words as input and the word that
# immediately follows it as the target. The range stops early
# enough that text[start + WINDOW_LENGTH] is always in bounds.
window_starts = range(0, len(text) - WINDOW_LENGTH, WINDOW_STEP)
fragments = [text[start: start + WINDOW_LENGTH]
             for start in window_starts]
targets = [text[start + WINDOW_LENGTH] for start in window_starts]

In [4]:
# Fit the vocabulary on the word list. num_words caps the indices
# the tokenizer will emit at MAX_WORDS; words outside that range
# are mapped to the 'UNK' out-of-vocabulary token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='UNK')
tokenizer.fit_on_texts(text)
fragments_indexed = tokenizer.texts_to_sequences(fragments)
targets_indexed = tokenizer.texts_to_sequences(targets)

# Convert to appropriate input and output formats.
X = np.array(fragments_indexed, dtype=np.int64)
# One-hot targets. Allocate as float32 rather than NumPy's default
# float64: the array has len(targets) x MAX_WORDS entries, so this
# halves its memory footprint, and float32 is Keras' default float
# type, so no precision used by training is lost.
y = np.zeros((len(targets_indexed), MAX_WORDS), dtype=np.float32)
for row, target_index in enumerate(targets_indexed):
    # target_index is the index list produced for one target word
    # (presumably a single element -- confirm); the listed
    # column(s) of this row are set to 1.
    y[row, target_index] = 1

In [5]:
# Next-word classifier: embedding -> two stacked LSTMs -> dense
# hidden layer -> softmax over MAX_WORDS vocabulary entries.
training_model = Sequential([
    Input(shape=(None,), batch_size=BATCH_SIZE),
    Embedding(input_dim=MAX_WORDS,
              output_dim=EMBEDDING_WIDTH,
              mask_zero=True),
    # The first LSTM returns its full output sequence so the
    # second LSTM receives one vector per input position.
    LSTM(128,
         return_sequences=True,
         dropout=0.2,
         recurrent_dropout=0.2),
    LSTM(128,
         dropout=0.2,
         recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dense(MAX_WORDS, activation='softmax'),
])
training_model.compile(optimizer='adam',
                       loss='categorical_crossentropy')
training_model.summary()

# Train on the one-hot targets; 5% of the samples are held out
# for validation and one line is logged per epoch (verbose=2).
history = training_model.fit(
    X, y,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=0.05,
    shuffle=True,
    verbose=2)

Epoch 1/32
97/97 - 91s - 936ms/step - loss: 7.0795 - val_loss: 7.8610
Epoch 2/32
97/97 - 77s - 789ms/step - loss: 6.3797 - val_loss: 8.1724
Epoch 3/32
97/97 - 75s - 778ms/step - loss: 6.2605 - val_loss: 8.3901
Epoch 4/32
97/97 - 74s - 766ms/step - loss: 6.1550 - val_loss: 8.5729
Epoch 5/32
97/97 - 82s - 847ms/step - loss: 6.0266 - val_loss: 8.6472
Epoch 6/32
97/97 - 76s - 783ms/step - loss: 5.9103 - val_loss: 8.8229
Epoch 7/32
97/97 - 77s - 790ms/step - loss: 5.8179 - val_loss: 8.8414
Epoch 8/32
97/97 - 78s - 802ms/step - loss: 5.7401 - val_loss: 8.9177
Epoch 9/32
97/97 - 74s - 767ms/step - loss: 5.6674 - val_loss: 8.8973
Epoch 10/32
97/97 - 82s - 845ms/step - loss: 5.5856 - val_loss: 9.0373
Epoch 11/32
97/97 - 74s - 761ms/step - loss: 5.4946 - val_loss: 9.0400
Epoch 12/32
97/97 - 75s - 773ms/step - loss: 5.4152 - val_loss: 9.2385
Epoch 13/32
97/97 - 73s - 756ms/step - loss: 5.3437 - val_loss: 9.3378
Epoch 14/32
97/97 - 74s - 759ms/step - loss: 5.2772 - val_loss: 9.3883
Epoch 15/32
97/

In [6]:
# Inference twin of the training network: identical layer sizes,
# but built for batch size 1 with stateful LSTMs so the recurrent
# state carries over between successive predict() calls. Note
# that mask_zero is False here, unlike in the training model.
inference_model = Sequential([
    Input(shape=(None,), batch_size=1),
    Embedding(input_dim=MAX_WORDS,
              output_dim=EMBEDDING_WIDTH,
              mask_zero=False),
    LSTM(128,
         return_sequences=True,
         stateful=True,
         dropout=0.2,
         recurrent_dropout=0.2),
    LSTM(128,
         stateful=True,
         dropout=0.2,
         recurrent_dropout=0.2),
    Dense(128, activation='relu'),
    Dense(MAX_WORDS, activation='softmax'),
])

# Copy the trained parameters into the inference network.
weights = training_model.get_weights()
inference_model.set_weights(weights)

In [7]:
first_words = ['i', 'saw']
first_words_indexed = tokenizer.texts_to_sequences(
    first_words)

# Reset the layers at positions 1 and 2 of the inference model
# (presumably the two stateful LSTMs -- confirm) so generation
# starts from a clean recurrent state.
for layer_position in (1, 2):
    inference_model.layers[layer_position].reset_states()

predicted_string = ''
# Single-word input buffer of shape (1, 1), overwritten in place
# before every predict() call.
x = np.zeros((1, 1), dtype=np.int64)

# Feed initial words to the model, one word per call; only the
# prediction made after the last seed word is used below.
for seed_word, seed_index in zip(first_words,
                                 first_words_indexed):
    x[0, 0] = seed_index[0]
    predicted_string += seed_word + ' '
    y_predict = inference_model.predict(x, verbose=0)[0]

# Predict PREDICT_LENGTH words greedily: take the highest-scoring
# index, append its word, and feed that index back as next input.
for _ in range(PREDICT_LENGTH):
    new_word_index = np.argmax(y_predict)
    word = tokenizer.sequences_to_texts([[new_word_index]])
    predicted_string += word[0] + ' '
    x[0, 0] = new_word_index
    y_predict = inference_model.predict(x, verbose=0)[0]
print(predicted_string)

i saw the most mountains 
