<h1><strong><u>LSTM Text Word Generator Model</u></strong></h1>

In [None]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
import re
import io
import keras
import random
from nltk.tokenize import word_tokenize
from keras import Input, activations
from keras.callbacks import ModelCheckpoint
from keras.layers import SimpleRNN, Dense, LSTM, Dropout, Embedding
from keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from keras.models import Sequential
from sklearn.preprocessing import OrdinalEncoder

<h2><strong><u>Data Preparation</u></strong></h2>

In [None]:
# Fetch the Nietzsche corpus with keras.utils.get_file; the value it returns
# is kept in `path` and opened by the next cell.
corpus_url = "https://s3.amazonaws.com/text-datasets/nietzsche.txt"
path = keras.utils.get_file("nietzsche.txt", origin=corpus_url)

In [None]:
# Load the whole corpus as lower-cased text, then split it on whitespace to
# obtain the token list used by the rest of the notebook.
with io.open(path, encoding="utf-8") as corpus_file:
    text = corpus_file.read().lower()
text_tokenized = text.split()

print("Corpus length:", len(text))
print("Tokenized text length:", len(text_tokenized))

Corpus length: 600893
Tokenized text length: 99111


In [None]:
# Vocabulary: every distinct whitespace-separated token, in sorted order
distinct_words = sorted(set(text_tokenized))
print("Total distinct words and punctuation marks:", len(distinct_words))

# Two lookup tables over the vocabulary:
#   word_indices: token -> integer position in distinct_words
#   indices_word: integer position -> token
word_indices = {token: position for position, token in enumerate(distinct_words)}
indices_word = dict(enumerate(distinct_words))

Total distinct words and punctuation marks: 17682


In [None]:
# Cut the token stream into partially overlapping windows of maxlen words,
# advancing `steps` words between windows. Each window is one input sequence
# and the word immediately following it is the corresponding target.
maxlen = 20
steps = 2
window_starts = range(0, len(text_tokenized) - maxlen, steps)
sequences = [text_tokenized[start : start + maxlen] for start in window_starts]
next_words = [text_tokenized[start + maxlen] for start in window_starts]
print("Number of sequences:", len(sequences))

Number of sequences: 49546


In [None]:
# One-hot encode the training data:
#   X[i, t, k] is True when the t-th word of sequence i has vocabulary index k
#   y[i, k]    is True when the word following sequence i has vocabulary index k
# X holds len(sequences) * maxlen * len(distinct_words) booleans; with the
# counts printed above (49546 sequences, 17682 distinct words) that is about
# 17.5 billion entries.
n_sequences = len(sequences)
vocab_size = len(distinct_words)
X = np.zeros((n_sequences, maxlen, vocab_size), dtype='bool')
y = np.zeros((n_sequences, vocab_size), dtype='bool')

# Vocabulary index of every input word, shaped (n_sequences, maxlen), and of
# every target word, shaped (n_sequences,)
input_ids = np.array(
    [[word_indices[token] for token in window] for window in sequences],
    dtype=np.intp,
).reshape(n_sequences, maxlen)
target_ids = np.array([word_indices[token] for token in next_words], dtype=np.intp)

# Set all one-hot positions in a single indexed assignment per tensor
rows = np.arange(n_sequences)
X[rows[:, None], np.arange(maxlen)[None, :], input_ids] = True
y[rows, target_ids] = True

<h2><strong><u>LSTM Model Selection</u></strong></h2>

In [None]:
# Network: one 128-unit LSTM reading (maxlen, vocabulary-size) one-hot inputs,
# followed by a softmax layer with one output per vocabulary entry.
vocab_size = len(distinct_words)
network_layers = [
    keras.Input(shape=(maxlen, vocab_size)),
    LSTM(128),
    Dense(vocab_size, activation="softmax"),
]
model = keras.Sequential(network_layers)

# RMSprop with learning rate 0.01, trained against categorical cross-entropy
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
model.compile(optimizer=optimizer, loss="categorical_crossentropy")

In [None]:
def sample(preds, temperature=0.5):
    """Sample one index from a probability array after temperature rescaling.

    The probabilities are re-weighted as ``p ** (1 / temperature)`` and
    renormalized, then a single index is drawn with
    ``np.random.multinomial``. Temperatures below 1 sharpen the distribution
    towards its largest entries; temperatures above 1 flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Positive rescaling factor.

    Returns:
        The sampled index (a NumPy integer).

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    preds = np.asarray(preds).astype("float64")
    # Zero probabilities legitimately map to -inf (they stay at zero after
    # exp), so the divide-by-zero warning from log(0) is just noise here.
    with np.errstate(divide="ignore"):
        logits = np.log(preds) / temperature
    # Shift so the largest logit is 0 before exponentiating. Without this,
    # small temperatures underflow every exp() to 0 and the normalization
    # below becomes 0 / 0 = NaN.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)

In [None]:
# NOTE(review): `epochs` is not used below; the fit call trains for a single
# epoch. Confirm whether a 40-epoch run was intended.
epochs = 40
batch_size = 128

# Pick a random seed window of maxlen *word* tokens. The model and
# word_indices are defined over words, so the seed must come from
# text_tokenized, not from the raw character string `text`.
start_index = random.randint(0, len(text_tokenized) - maxlen - 1)
sequence = text_tokenized[start_index : start_index + maxlen]
print('...Generating with seed:', *sequence)

model.fit(X, y, batch_size=batch_size, epochs=1)

# Generate 400 words, each predicted from the most recent maxlen words.
for _ in range(400):
    # The model input has a fixed length of maxlen timesteps, so only the
    # tail of the growing sequence is encoded.
    window = sequence[-maxlen:]
    x_pred = np.zeros((1, maxlen, len(distinct_words)))
    for t, word in enumerate(window):
        x_pred[0, t, word_indices[word]] = 1.0
    preds = model.predict(x_pred, verbose=0)[0]
    next_index = sample(preds)
    next_word = indices_word[next_index]
    sequence.append(next_word)

print("...Generated: ", *sequence)