<h1><strong><u>RNN Text Model</u></strong></h1>

In [155]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
import re
from keras import Input, activations
from keras.callbacks import ModelCheckpoint
from keras.layers import SimpleRNN, Dense, LSTM, Dropout, Embedding
from keras.losses import CategoricalCrossentropy, BinaryCrossentropy
from keras.models import Sequential
from sklearn.preprocessing import OrdinalEncoder

<h2><strong><u>Data Preprocessing Methods</u></strong></h2>

In [156]:
def tokenize_text(text):
    new_text = text.lower() #lowercase
    new_text = re.sub(r"([^\w\s])", "", new_text) #remove punctuation
    text_array = word_tokenize(new_text) #tokenize
    return text_array

In [157]:
# Ordinal Encoding

def encode_text(text_array):
    """Given a list of words, I encode it word by word with each word being a sample). 
    I return the result and the encoder."""
    info("Encoding inputs...")
    debug(f"{text_array}")
    encoder = OrdinalEncoder()
    #result = encoder.fit_transform(text)
    result = encoder.fit_transform(np.reshape(text_array, (len(text_array), 1)))
    # info("Number of input characters:", len(encoder.categories_[0]))
    # debug("Input categories:", encoder.categories_[0])
    # info(f"{result.shape=}")
    # debug(result)
    #print(result.shape)
    return result, encoder


# encoded_array, encoder = encode_text(text)
# features, targets = time_delayed(encoded_array, 5)
# print("Features:")
# print(features.shape[1:])
# print("Targets:")
# print(targets.shape)

In [158]:
# max_value = max(encoded_array)
# print(max_value[0])

In [159]:
# set(encoded_array.flatten())

<h2><u>RNN Class</u></h2>

In [160]:
PRINT_DEBUG = False
PRINT_INFO = True


def debug(*args):
    if PRINT_DEBUG:
        print(*args)


def info(*args):
    if PRINT_INFO:
        print(*args)

In [161]:
def time_delayed(seq, delay):
    features = []
    targets = []
    for target_index in range(delay, len(seq)):
        features.append(seq[target_index - delay:target_index])
        targets.append(seq[target_index])
    return np.array(features), np.array(targets)

In [162]:
# # this will need to be modified to handle words instead of letters
# def encode_sequence(sequence):
#     """Given a string, I encode it letter by letter (each letter is a sample). I return the
#     result and the encoder."""
#     info("Encoding inputs...")
#     debug(f"{sequence}")
#     encoder = OrdinalEncoder(sparse=False)
#     result = encoder.fit_transform(np.reshape(sequence, (len(sequence), 1)))
#     info("Number of input characters:", len(encoder.categories_[0]))
#     debug("Input categories:", encoder.categories_[0])
#     info(f"{result.shape=}")
#     debug(result)
#     return result, encoder

In [163]:
class RNNTextModel:
    def __init__(self, training_string, delay_length=100):
        text_array = tokenize_text(training_string)
        encoded_training_data, self.encoder = encode_text(text_array)
        self.time_steps = delay_length
        max_vocabulary_size = len(set(encoded_training_data.flatten()))
        info("Number of distinct words:", max_vocabulary_size)
        debug("encoded_training_data:", encoded_training_data)
        self.X_delayed, self.y_delayed = time_delayed(encoded_training_data, self.time_steps)
        print(self.X_delayed.shape)
        self.model = self.create_model(max_vocabulary_size, self.X_delayed.shape, self.y_delayed.shape)

    #figure out shape issues to see which values to use for embedding layer
    def create_model(self, max_vocabulary_size, input_shape, output_shape):
        info("Creating model...")
        info("Input shape:", input_shape[1:])
        print(input_shape[2:])
        model = Sequential(
            [Input(shape=output_shape[1:]),
            Embedding(input_dim=max_vocabulary_size, output_dim=64),
            LSTM(256, return_sequences=True, activation=activations.tanh),
            Dropout(0.2),
            LSTM(256, activation=activations.tanh),
            Dropout(0.2),
            Dense(output_shape[1], activation=activations.sigmoid)]
        )
        model.summary()
        model.compile(optimizer="adam", loss=CategoricalCrossentropy(), metrics=["categorical_accuracy"])
        return model
    
    def encode_input_string(self, string):
        input_array = tokenize_text(string)
        encoded_input_array = self.encoder.transform(np.reshape(input_array, (len(input_array), 1)))
        return encoded_input_array
    
    def fit(self, prefix, epochs=2):
        info("Fitting...")
        callbacks = []
        if prefix is not None:
            checkpoint = ModelCheckpoint(prefix + "-{epoch:03d}-{loss:.4f}.keras", monitor='loss', verbose=1,
                                         save_best_only=True, mode='min')
            callbacks = [checkpoint]
        self.model.fit(self.X_delayed, self.y_delayed, epochs=epochs, verbose=True, callbacks=callbacks, batch_size=3000)

    def load_weights(self, filename):
        info(f"Loading weights from {filename}...")
        self.model.load_weights(filename)
    
    def predict_from_seed(self, seed, prediction_count):
        info("Predicting output sequence...")
        result = seed
        new_seed = seed
        for i in range(prediction_count):
            inp = self.encode_input_string(new_seed)
            debug(f"{inp=}")
            predictions = self.model.predict(inp)
            info(f"{predictions=}")
            #predictions = self.model.predict(np.reshape(inp, (1, inp.shape[0], inp.shape[1])))
            p = self.encoder.inverse_transform(predictions)
            debug(f"{p=}")
            print(p.shape)
            print(f"Predicted word: {p[0][0]}")
            result = result + " " + p[0][0]
            length = len(seed)
            new_seed = result
        return result

In [164]:
def main(text, seed):
    rnn_model = RNNTextModel(text)
    # rnn_model.load_weights("./rnn_text_model-001-0.0000.keras")
    rnn_model.fit(prefix="rnn_text_model", epochs=3)
    output = rnn_model.predict_from_seed(seed, 100)
    print(f"Generated text: {seed} {output}")

In [165]:
try:
    with open("./the_sunken_world_full.txt", "r", encoding="utf-8") as file:
        text = file.read()
except FileNotFoundError:
    print("Error: The file 'your_file.txt' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [166]:
seed = '''For strategic reasons, this fact was not divulged
until much later, and for strategic reasons it was not made known
that the missing submarine was of a new and previously untried type;
but the mystery of the X-111’s disappearance weighed heavily upon the
minds of naval officials, and secretly they resolved upon immediate and
exhaustive investigation. All in vain. Not a trace of the lost ship or
of the thirty-nine members of its crew could be found; not a scrap of
the usual drifting flotsam or wreckage could be picked up anywhere on
the sea; and at last it was admitted in despair that the waters would
perhaps guard their secret forever.

Seven years went by. Peace had long since returned, and the X-111
and its tragedy had been forgotten except by a few relatives of the
unfortunate thirty-nine. Then suddenly the mystery'''

In [167]:
main(text, seed)

Encoding inputs...
Number of distinct words: 9757
(89835, 100, 1)
Creating model...
Input shape: (100, 1)
(1,)


Fitting...
Epoch 1/3


  return self.fn(y_true, y_pred, **self._fn_kwargs)


[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0000e+00
Epoch 1: loss improved from None to 0.00000, saving model to rnn_text_model-001-0.0000.keras
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m197s[0m 7s/step - categorical_accuracy: 1.0000 - loss: 0.0000e+00
Epoch 2/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0000e+00
Epoch 2: loss did not improve from 0.00000
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0000e+00
Epoch 3/3
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0000e+00
Epoch 3: loss did not improve from 0.00000
[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m181s[0m 6s/step - categorical_accuracy: 1.0000 - loss: 0.0000e+00
Predicting output sequence...
[1m5/5[0m [32m━━━━━━━━━━━━━━━━