In [1]:
# Learn plan:
# - Character RNN
# - Stateless RNN
# - Stateful RNN

### Import Libraries

In [1]:
import keras.utils
import numpy as np
from keras.src.legacy.preprocessing.text import Tokenizer # TODO: seems this a deprecated class, find out a new way to do this
import tensorflow as tf
from tensorflow.keras import layers, Sequential, optimizers, losses, metrics, callbacks

### Data Loading

In [3]:
shakespear_url = 'https://github.com/karpathy/char-rnn'

file_path = keras.utils.get_file('shakespear.txt', shakespear_url)

with open(file_path) as f:
    shakespear_text = f.read()

In [4]:
tokenizer = Tokenizer(char_level=True)  # coding at char level
tokenizer.fit_on_texts(shakespear_text) # fit tokenizer on text

In [5]:
tokenizer.texts_to_sequences(['First'])

[[37, 7, 14, 9, 2]]

In [6]:
tokenizer.sequences_to_texts([[37, 7, 14, 9, 2]])

['f i r s t']

In [7]:
max_id = len(tokenizer.word_index)
max_id # total unique characters in the text

74

In [8]:
data_size = tokenizer.document_count
data_size # total characters in the text

309117

### Data Preparation

In [9]:
# let's encode the entire text
[encoded] = np.array(tokenizer.texts_to_sequences([
    shakespear_text
])) - 1  # shift to zero-based
encoded

array([37, 37, 37, ..., 29, 37, 37], shape=(309117,))

In [10]:
# Data Split for Training / Validation
train_size = data_size * 90 // 100
train_size

278205

In [16]:
n_steps = 100
window_length = n_steps + 1
batch_size = 32

# Start from encoded characters
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size])

# Sliding windows (each element is now a Dataset, not a tensor)
dataset = dataset.window(window_length, shift=1, drop_remainder=True)

# Turn each window Dataset into a tensor of shape [window_length]
dataset = dataset.flat_map(lambda window: window.batch(window_length))

# Now split into input (all but last char) and target (shifted by 1)
dataset = dataset.map(lambda window: (window[:-1], window[1:]))

# Batch and one-hot encode
dataset = dataset.shuffle(10000).batch(batch_size)

dataset = dataset.map(
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)

dataset = dataset.prefetch(1) # prefetch next batch while training on the current one

### Model Preparation ( Char-Rnn )

In [17]:
model = Sequential([
    layers.InputLayer(shape=[None, max_id]),
    layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    layers.TimeDistributed(layers.Dense(max_id, activation='softmax'))
])

model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
    loss=losses.sparse_categorical_crossentropy,
    metrics=[metrics.sparse_categorical_accuracy,
             metrics.sparse_top_k_categorical_accuracy,
             metrics.sparse_categorical_crossentropy],
)

model.summary()

In [None]:
early_stopping_callback = callbacks.EarlyStopping(
    patience=10,
    restore_best_weights=True
)

In [None]:
model.fit(dataset, # Takes so long time! ( ~ 10h)
          epochs=20,
          callbacks=[
                     early_stopping_callback,
                     ])

Epoch 1/20
   1218/Unknown [1m305s[0m 242ms/step - loss: 2.2813 - sparse_categorical_accuracy: 0.3961 - sparse_categorical_crossentropy: 2.2813 - sparse_top_k_categorical_accuracy: 0.6478

## Evaluation

In [None]:
accuracy = model.evaluate(dataset)
print(f"Accuracy: {accuracy[1]*100:.2f}%")

## Prediction example

In [None]:
def preprocess_input(text):
    " Preprocess input text into one-hot encoded format "
    X = np.array(tokenizer.texts_to_sequences([text])) - 1
    return tf.one_hot(X, depth=max_id)

In [None]:
X_new = preprocess_input("ROMEO:") # input text
Y_pred = model.predict_classes(X_new) # predict next characters
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1] # decode predicted characters


# Text generation

In [None]:
def next_char(text, temperature=1.0):
    " Generate the next character given the input text and temperature "
    X_new = preprocess_input([text])
    y_proba = model.predict(X_new)[0, -1, :]
    rescaled_logits = tf.math.log(y_proba) / temperature
    char_id = tf.random.categorical(
        logits=tf.expand_dims(rescaled_logits, 0),
        num_samples=1
    ) + 1
    return tokenizer.sequences_to_texts(char_id.numpy())[0]

def complete_text(text, n_chars=100, temperature=1.0):
    " Generate text by predicting next characters iteratively "
    for _ in range(n_chars):
        text += next_char(text, temperature)
    return text

In [None]:
complete_text("t", temperature=0.2) # the belly the great and who shall be the belly the

# For best results, create model with recurrent_dropout=0.3, add more layers with GRU

# Stateful RNN
> - Make sense only when previous data is related to current data ( e.g. time series, text )
> - Stateful RNN maintain hidden states between batches

In [None]:
dataset = tf.data.Dataset.from_tensor_slices(encoded[:train_size]) # Start from encoded characters
dataset = dataset.window(window_length, shift=n_steps, drop_remainder=True) # Sliding windows
dataset = dataset.flat_map(lambda window: window.batch(window_length)) # Turn each window Dataset into a tensor of shape [window_length]
dataset = dataset.batch(1) # Batch size = 1 to maintain state between batches
dataset = dataset.map(lambda windows: (windows[:,:-1], windows[:,1:])) # Now split into input (all but last char) and target (shifted by 1)
dataset = dataset.map( # Batch and one-hot encode
    lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch)
)
dataset = dataset.prefetch(1) # prefetch next batch while training on the current one

In [None]:
model = Sequential([
    layers.InputLayer(batch_input_shape=[batch_size, None, max_id]),
    layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2), # stateful=True - for stateful RNN
    layers.GRU(128, return_sequences=True, stateful=True, dropout=0.2, recurrent_dropout=0.2),
    layers.TimeDistributed(layers.Dense(max_id, activation='softmax'))
])

In [None]:
class ResetStateCallback(callbacks.Callback):
    " Custom callback to reset states at the end of each epoch "
    def on_epoch_end(self, epoch, logs=None):
        self.model.reset_states()

In [None]:
model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.sparse_categorical_crossentropy,
    metrics=[metrics.sparse_categorical_accuracy],
)

model.fit(dataset,
          epochs=20,
          callbacks=[
                     early_stopping_callback,
                     ResetStateCallback()
                     ])

# Sentiment Analysis with RNN ( IMBD Dataset )

In [2]:
(X_train, y_train), (X_test, y_test) = tf.keras.datasets.imdb.load_data() # load IMDB dataset

  array = pickle.load(fp, **pickle_kwargs)


In [3]:
X_train[0][:10] # first 10 word ids of first review

[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [4]:
word_index = tf.keras.datasets.imdb.get_word_index() # get word to id mapping

In [5]:
id_to_word = {id + 3: word for word, id in word_index.items()} # shift by 3 to leave space for special tokens
for idx, token in enumerate(("<pad>", "<sos>", "<unk>")):
    id_to_word[idx] = token
" ".join([id_to_word[idx] for idx in X_train[0][:10]])

'<sos> this film was just brilliant casting location scenery story'

In [6]:
import tensorflow_datasets as tfds

imdb_dataset, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

train_size = info.splits['train'].num_examples
train_size

25000

In [7]:
def preprocess(X_batch, y_batch ):
    """
    Preprocess text data: tokenize, pad/truncate to fixed length
    For speedup training we take only first 300 characters of each review
    """
    X_batch = tf.strings.substr(X_batch, 0, 300) # truncate to max length
    X_batch = tf.strings.regex_replace(X_batch, rb"<br\s*/?>", b" ") # remove HTML tags
    X_batch = tf.strings.regex_replace(X_batch, rb"[^a-zA-Z']", b" ") # keep only letters and apostrophes
    X_batch = tf.strings.split(X_batch) # tokenize
    return X_batch.to_tensor(default_value=b"<pad>"), y_batch


In [8]:
from collections import Counter

vocabulary = Counter()

for X_batch, y_batch in imdb_dataset['train'].batch(32).map(preprocess): # build vocabulary
    for review in X_batch:
        vocabulary.update(review.numpy())

In [9]:
vocabulary.most_common()[:10]

[(b'<pad>', 214309),
 (b'the', 61137),
 (b'a', 38564),
 (b'of', 33983),
 (b'and', 33431),
 (b'to', 27707),
 (b'I', 27019),
 (b'is', 25719),
 (b'in', 18966),
 (b'this', 18490)]

In [10]:
# lets create a vocabulary wit h a most common 10000 words
vocab_size = 10000 # vocabulary size
truncated_vocabulary = [word for word, count in vocabulary.most_common(vocab_size)] # keep only most common words
words = tf.constant(truncated_vocabulary) # convert to tensor
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64) # assign unique id to each word
vocab_init = tf.lookup.KeyValueTensorInitializer(words, word_ids) # create initializer for lookup table
num_oov_buckets = 1000 # add extra buckets for out-of-vocabulary words (words not in the truncated vocabulary)
table = tf.lookup.StaticVocabularyTable(vocab_init, num_oov_buckets) # create lookup table

In [11]:
table.lookup(
    tf.constant(
        [b'This movie was faaaaaantastic'.split()]
    )
) # If words are found there are ids below 10000, else above 10000

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10053]])>

In [12]:
def encode_words(X_batch, y_batch):
    " Encode words to their corresponding ids using the lookup table "
    return table.lookup(X_batch), y_batch

train_set = imdb_dataset['train'].batch(32).map(preprocess)
train_set = train_set.map(encode_words).prefetch(1)

In [13]:
# build a model

embed_size = 128 # embedding size

model = Sequential([
    layers.InputLayer(shape=(None,)),
    layers.Embedding(input_dim=vocab_size + num_oov_buckets,
                     output_dim=embed_size,
                     mask_zero=True),
    layers.GRU(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2),
    layers.GRU(128, dropout=0.2, recurrent_dropout=0.2),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.binary_crossentropy,
    metrics=[metrics.binary_accuracy],
)

In [15]:
history = model.fit(
    train_set,
    epochs=1,
    # validation_data=test_set,
    # callbacks=[early_stopping_callback]
)

[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m155s[0m 185ms/step - binary_accuracy: 0.7002 - loss: 0.5645


## Masking

In [None]:
K = tf.keras.backend

inputs = tf.keras.layers.Input(shape=(None, 5))
mask = tf.keras.layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = tf.keras.layers.Embedding(vocab_size+num_oov_buckets, embed_size)(inputs)
z = tf.keras.layers.GRU(128, return_sequences=True)(z, mask=mask)
z = tf.keras.layers.GRU(128)(z, mask=mask)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(z)

model = tf.keras.Model(inputs=[inputs], outputs=[outputs])

## Re-using pretrained embeddings

In [20]:
import tensorflow_hub as hub

hub_layer = hub.KerasLayer("https://www.kaggle.com/models/google/nnlm/TensorFlow2/tf2-preview-en-dim50/1", output_shape=[50],
                           input_shape=[], dtype=tf.string)

model = keras.Sequential()
model.add(hub_layer)
model.add(keras.layers.Dense(16, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

model.summary()

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

ValueError: Only instances of `keras.Layer` can be added to a Sequential model. Received: <tensorflow_hub.keras_layer.KerasLayer object at 0x0000026B97D93890> (of type <class 'tensorflow_hub.keras_layer.KerasLayer'>)

### Data Loading

In [None]:
datasets, info = tfds.load("imdb_reviews", with_info=True, as_supervised=True)

train_size = info.splits['train'].num_examples
batch_size = 32
train_set = datasets['train'].batch(batch_size).prefetch(1)


## Training

In [None]:
history = model.fit(
    train_set,
    epochs=1,
)