<h1><strong><u>RNN Text Model</u></strong></h1>

In [10]:
# Use the %pip magic (not !pip) so the package is installed into the environment
# of the running kernel; pinned to the release the recorded run resolved.
%pip install scikit-learn==1.7.2

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy>=1.8.0 (from scikit-learn)
  Downloading scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-macosx_12_0_arm64.whl (8.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/8.6 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading scipy-1.16.3-cp313-cp313-macosx_14_0_arm64.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m19.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3/3[0m [scikit-learn][0m [scikit-lear

In [13]:
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
import re
from keras import Input, activations
from keras.callbacks import ModelCheckpoint
from keras.layers import SimpleRNN, Dense, LSTM, Dropout, Embedding
from keras.losses import CategoricalCrossentropy
from keras.models import Sequential
from sklearn.preprocessing import OrdinalEncoder

<h2><strong><u>Data Preprocessing</u></strong></h2>

In [None]:
# Load the raw training text into `text`.
# NOTE: `text` is only defined when the read succeeds; on failure a message is
# printed and later cells that use `text` will fail.
TEXT_PATH = "./the_sunken_world.txt"
try:
    with open(TEXT_PATH, "r", encoding="utf-8") as file:
        text = file.read()
        print(text)
except FileNotFoundError:
    # Report the path that was actually requested (the old message named a
    # placeholder file that this cell never opens).
    print(f"Error: The file '{TEXT_PATH}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

It was in the spring of 1918 that the United States submarine X-111
was launched upon its adventurous career. The German commerce raiders
had now reached the height of their effectiveness; almost daily
they were taking their toll of luckless seamen and provision-laden
steamers; and the United States government, in alarm that was never
officially admitted, had resolved upon desperate measures. The result
was the X-111. The first of a fleet of undersea craft, this vessel
was constructed upon lines never before attempted. Not only was it
exceedingly long (being about two hundred feet from stem to stern), but
it was excessively narrow, and a man had to be short indeed to stand
upright within it on its single deck without coming into contact with
the arching ceiling. The ship, in fact, was nothing more nor less than
a long pipe-like tube of reinforced steel, able to cleave the water at
tremendous speed and ram and destroy any enemy by ramming it with its
beak-like prow. But this was only it

In [7]:
# Normalise the raw text in one pass, innermost step first:
#   1. lowercase the text
#   2. drop every character that is neither a word character nor whitespace
#   3. split what is left into word tokens
new_text = word_tokenize(
    re.sub(r"([^\w\s])", "", text.lower())
)
print(new_text)



In [19]:
# Ordinal Encoding

def encode_text(text):
    """Ordinal-encode a list of words, treating each word as one sample.

    Args:
        text: List of word tokens.

    Returns:
        A ``(result, encoder)`` tuple: ``result`` is the array produced by
        ``OrdinalEncoder.fit_transform`` on the words arranged as a single
        column (shape ``(len(text), 1)``), and ``encoder`` is the fitted
        ``OrdinalEncoder``.
    """
    # Plain print() instead of the info()/debug() helpers: those are defined in
    # a later cell, so calling them from this cell raises NameError when the
    # notebook is run top to bottom on a fresh kernel.
    print("Encoding inputs...")
    encoder = OrdinalEncoder()
    # The encoder expects a 2-D (samples, features) array: one word per row.
    result = encoder.fit_transform(np.reshape(text, (len(text), 1)))
    print(result.shape)
    return result, encoder

# Try the encoder on the tokenized text. The returned (result, encoder) tuple is
# not assigned to anything, so it is only shown as this cell's output.
encode_text(new_text)

Encoding inputs...
(7674, 1)


(array([[ 943.],
        [2028.],
        [ 893.],
        ...,
        [ 795.],
        [ 956.],
        [ 541.]], shape=(7674, 1)),
 OrdinalEncoder())

<h2><u>RNN Class</u></h2>

In [14]:
# Verbosity switches for the two print helpers below. They are read each time a
# helper is called, so flipping a flag takes effect immediately.
PRINT_DEBUG = False
PRINT_INFO = True


def debug(*args):
    """Print ``args`` (forwarded to ``print``) only while PRINT_DEBUG is true."""
    if not PRINT_DEBUG:
        return
    print(*args)


def info(*args):
    """Print ``args`` (forwarded to ``print``) only while PRINT_INFO is true."""
    if not PRINT_INFO:
        return
    print(*args)

In [15]:
def time_delayed(seq, delay):
    """Turn a sequence into sliding-window (features, targets) arrays.

    For every position ``pos`` from ``delay`` up to ``len(seq) - 1`` the slice
    ``seq[pos - delay:pos]`` becomes one feature row and ``seq[pos]`` becomes
    its target.

    Args:
        seq: Indexable, sliceable sequence (list, string, NumPy array, ...).
        delay: Number of consecutive items in each feature window.

    Returns:
        ``(features, targets)``, both converted with ``np.array``.
    """
    target_positions = range(delay, len(seq))
    windows = [seq[pos - delay:pos] for pos in target_positions]
    labels = [seq[pos] for pos in target_positions]
    return np.array(windows), np.array(labels)

In [16]:
# this will need to be modified to handle words instead of letters
def encode_sequence(sequence):
    """Ordinal-encode a sequence, one item (e.g. one letter) per sample.

    Args:
        sequence: List of items to encode; its length is used to arrange the
            items as a single column before fitting.

    Returns:
        A ``(result, encoder)`` tuple: ``result`` is the output of
        ``OrdinalEncoder.fit_transform`` on the ``(len(sequence), 1)`` column,
        and ``encoder`` is the fitted ``OrdinalEncoder``.
    """
    info("Encoding inputs...")
    debug(f"{sequence}")
    # OrdinalEncoder has no `sparse` argument (that belongs to OneHotEncoder);
    # passing it raises TypeError, so the encoder is built with its defaults.
    encoder = OrdinalEncoder()
    # The encoder expects a 2-D (samples, features) array: one item per row.
    result = encoder.fit_transform(np.reshape(sequence, (len(sequence), 1)))
    info("Number of input characters:", len(encoder.categories_[0]))
    debug("Input categories:", encoder.categories_[0])
    info(f"{result.shape=}")
    debug(result)
    return result, encoder

In [None]:
class RNNTextModel:
    """LSTM next-token model trained on a single sequence of tokens.

    The training sequence is ordinal-encoded (one integer id per distinct
    token) and cut into sliding windows of ``delay_length`` ids; each window is
    used to predict the id of the token that follows it.
    """

    def __init__(self, training_string, delay_length=100):
        """Encode the training data, build the training windows and the model.

        Args:
            training_string: String (one token per character) or another
                sequence of tokens to learn from.
            delay_length: Number of consecutive tokens in each input window.

        Raises:
            ValueError: If ``delay_length`` is smaller than 1, or the training
                data is not longer than ``delay_length`` (no window could be
                built from it).
        """
        if delay_length < 1:
            raise ValueError(f"delay_length must be at least 1, got {delay_length}.")
        if len(training_string) <= delay_length:
            raise ValueError(
                f"Training data has {len(training_string)} tokens; it must be longer "
                f"than delay_length={delay_length}."
            )
        info("Number of distinct characters:", len(set(training_string)))
        debug("Distinct characters:", set(training_string))
        self.time_steps = delay_length
        encoded_training_data, self.encoder = encode_sequence(list(training_string))
        debug("encoded_training_data:", encoded_training_data)
        # encode_sequence yields one float code per token in an (n, 1) column.
        # The Embedding layer and the sparse loss need integer ids, and the
        # windows must be (samples, time_steps), so flatten before windowing.
        token_ids = np.asarray(encoded_training_data).reshape(-1).astype("int32")
        self.X_delayed, self.y_delayed = time_delayed(token_ids, self.time_steps)
        # The number of classes comes from the fitted encoder; the target array
        # is a vector of ids and carries no class dimension.
        vocabulary_size = len(self.encoder.categories_[0])
        self.model = self.create_model(self.X_delayed.shape, (len(self.y_delayed), vocabulary_size))

    @staticmethod
    def create_model(input_shape, output_shape):
        """Build and compile the Embedding -> LSTM -> LSTM -> softmax network.

        Declared static because it uses no instance state; it was previously
        defined without ``self`` and therefore failed when called on an instance.

        Args:
            input_shape: Shape of the feature array, ``(samples, time_steps)``;
                only ``input_shape[1:]`` is used.
            output_shape: ``(samples, vocabulary_size)``; only
                ``output_shape[1]`` (the number of distinct tokens) is used.

        Returns:
            The compiled Keras model.
        """
        info("Creating model...")
        info("Input shape:", input_shape[1:])
        vocabulary_size = output_shape[1]
        model = Sequential(
            [Input(shape=input_shape[1:]),
             # `input_length` is deprecated; the Input layer already fixes the length.
             Embedding(input_dim=vocabulary_size, output_dim=64),
             LSTM(256, return_sequences=True, activation=activations.tanh),
             Dropout(0.2),
             LSTM(256, activation=activations.tanh),
             Dropout(0.2),
             Dense(vocabulary_size, activation=activations.softmax)]
        )
        model.summary()
        # Targets are integer token ids, so the sparse variants of the loss and
        # the accuracy metric are required (the non-sparse ones expect one-hot rows).
        model.compile(optimizer="adam", loss="sparse_categorical_crossentropy",
                      metrics=["sparse_categorical_accuracy"])
        return model

    def encode_input_string(self, string):
        """Encode a seed of exactly ``time_steps`` tokens with the fitted encoder.

        Args:
            string: String or sequence holding ``self.time_steps`` tokens.

        Returns:
            The encoder's output for the tokens arranged as a
            ``(time_steps, 1)`` column.

        Raises:
            ValueError: If the seed does not contain exactly ``time_steps`` tokens.
        """
        tokens = list(string)
        if len(tokens) != self.time_steps:
            raise ValueError(
                f"Expected a seed of exactly {self.time_steps} tokens, got {len(tokens)}."
            )
        v = np.array(tokens).reshape((self.time_steps, 1))
        return self.encoder.transform(v)

    def fit(self, prefix, epochs=2, batch_size=1000):
        """Train the model on the windows built in ``__init__``.

        Args:
            prefix: Filename prefix for weight checkpoints, or ``None`` to
                train without checkpointing.
            epochs: Number of training epochs.
            batch_size: Mini-batch size passed to ``model.fit``.
        """
        info("Fitting...")
        callbacks = []
        if prefix is not None:
            # Keras 3 only accepts `.keras`/`.h5` (full model) or `.weights.h5`
            # (weights only) checkpoint names. Weights-only checkpoints are
            # what load_weights() reads back.
            checkpoint = ModelCheckpoint(prefix + "-{epoch:03d}-{loss:.4f}.weights.h5", monitor='loss', verbose=1,
                                         save_best_only=True, save_weights_only=True, mode='min')
            callbacks = [checkpoint]
        self.model.fit(self.X_delayed, self.y_delayed, epochs=epochs, verbose=1, callbacks=callbacks,
                       batch_size=batch_size)

    def load_weights(self, filename):
        """Load model weights from ``filename`` into the current model."""
        info(f"Loading weights from {filename}...")
        self.model.load_weights(filename)

    def predict_from_seed(self, seed, prediction_count):
        """Extend ``seed`` by ``prediction_count`` greedily predicted tokens.

        At every step the last ``len(seed)`` tokens are encoded, fed to the
        model, and the most probable token is appended.

        Args:
            seed: String or sequence of exactly ``time_steps`` tokens.
            prediction_count: Number of tokens to generate.

        Returns:
            The seed followed by the generated tokens: a ``str`` for a string
            seed, otherwise a new list (the caller's seed is not modified).
        """
        info("Predicting output sequence...")
        is_text = isinstance(seed, str)
        result = seed if is_text else list(seed)
        new_seed = seed
        for _ in range(prediction_count):
            inp = self.encode_input_string(new_seed)
            debug(f"{inp=}")
            # The model takes a (batch, time_steps) array of integer ids.
            ids = np.asarray(inp).reshape(1, self.time_steps).astype("int32")
            probabilities = self.model.predict(ids, verbose=0)
            # The network outputs one probability per token id; pick the most
            # likely id before mapping it back to a token (the ordinal encoder
            # inverts ids, not probability vectors).
            next_id = int(np.argmax(probabilities[0]))
            p = self.encoder.inverse_transform(np.array([[next_id]]))
            debug(f"{p=}")
            if is_text:
                result += str(p[0][0])
            else:
                result.append(p[0][0])
            new_seed = result[-len(seed):]
        return result