### Setup

In [2]:
# imports
import tensorflow as tf
import numpy as np
import os
import time

In [3]:
# change dir
# Move into the project directory so the relative data and checkpoint paths used
# later resolve. Set the SEINFELD_PROJECT_DIR environment variable to run this
# notebook on another machine; the default is the original local path.
PROJECT_DIR = os.environ.get(
    'SEINFELD_PROJECT_DIR',
    'C:/Users/theon/OneDrive/Desktop/Giga Projects/seinfeld text gen')
os.chdir(PROJECT_DIR)

In [4]:
# open text
# Read the raw bytes ('rb') and decode them explicitly as UTF-8, so the result
# does not depend on the platform's default text encoding. Binary mode also
# performs no newline translation, so any '\r\n' line endings stay in `text`.
# The `with` block closes the file handle as soon as the read is done.
with open('standardized_seinfeld_script.txt', 'rb') as script_file:
    text = script_file.read().decode(encoding='utf-8')
print(f'Length of text: {len(text)} characters')

Length of text: 4767047 characters


In [5]:
# Sanity check: show the opening of the script (first 250 characters).
preview_length = 250
print(text[:preview_length])

The Seinfeld Chronicles

[SCENE: Comedy club]

JERRY: You know, why we're here? [he means: here in the "Comedy club"] To be out, this is out...and out is one of the single most enjoyable experiences of life. People...did you ever hear people talk


In [6]:
# Collect the distinct characters that occur in the corpus, in sorted order.
unique_chars = set(text)
vocab = sorted(unique_chars)
print(f'{len(vocab)} unique characters')

106 unique characters


### Processing Text

#### Vectorization

In [7]:
# Break the corpus string into a tensor of individual UTF-8 characters.
chars = tf.strings.unicode_split(text, 'UTF-8')

In [8]:
# Lookup layer that converts character strings into integer ids.
#   vocabulary: the known characters the layer should map.
#   mask_token=None: do not reserve a separate mask token in the vocabulary.
lookup_vocabulary = list(vocab)
ids_from_chars = tf.keras.layers.StringLookup(
    vocabulary=lookup_vocabulary, mask_token=None)

In [9]:
# Inverse lookup layer: integer ids back to character strings.
# It reuses the vocabulary reported by ids_from_chars so that both layers
# agree on which id belongs to which character.
id_vocabulary = ids_from_chars.get_vocabulary()
chars_from_ids = tf.keras.layers.StringLookup(
    vocabulary=id_vocabulary, invert=True, mask_token=None)

In [10]:
# define function to return strings from integers
def text_from_ids(ids):
  """Map integer ids to characters and join them along the last axis."""
  char_tokens = chars_from_ids(ids)
  return tf.strings.reduce_join(char_tokens, axis=-1)

In [11]:
# apply map: run the split characters through the lookup layer to get their integer ids
all_ids = ids_from_chars(chars)

In [12]:
# create dataset from all_ids tensors
# (from_tensor_slices yields the entries of all_ids one at a time, i.e. a stream of ids)
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

In [13]:
# Chunk the id stream into fixed-size sequences.
# Each chunk holds seq_length + 1 ids so that both the input and the target
# (the same text shifted by one position) can be cut from it. A trailing
# chunk that is too short is dropped.
seq_length = 100
chunk_size = seq_length + 1
sequences = ids_dataset.batch(chunk_size, drop_remainder=True)

In [14]:
# define function for extracting input and target
def split_input_target(sequence):
    """Return (input, target) for one sequence.

    The input is the sequence without its last element and the target is the
    sequence without its first element, so target[i] is the item that follows
    input[i].
    """
    return sequence[:-1], sequence[1:]

In [15]:
# apply sequence splitter function to dataset: every sequence becomes an (input, target) pair
dataset = sequences.map(split_input_target)

In [16]:
# Decode the first (input, target) pair back to text to confirm the one-step shift.
for sample_input, sample_target in dataset.take(1):
    decoded_input = text_from_ids(sample_input).numpy()
    decoded_target = text_from_ids(sample_target).numpy()
    print("Input :", decoded_input)
    print("Target:", decoded_target)

Input : b"The Seinfeld Chronicles\r\n\r\n[SCENE: Comedy club]\r\n\r\nJERRY: You know, why we're here? [he means: here "
Target: b"he Seinfeld Chronicles\r\n\r\n[SCENE: Comedy club]\r\n\r\nJERRY: You know, why we're here? [he means: here i"


In [17]:
# set processing options
BATCH_SIZE = 64          # sequences per training batch
BUFFER_SIZE = 10000      # size of the shuffle buffer (in sequences)

# final processing
# The pipeline is built from `sequences` rather than from `dataset` itself, so
# re-running this cell gives the same result instead of batching an
# already-batched dataset a second time.
dataset = (
    sequences
    .map(split_input_target)                   # (input, target) pairs
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)    # drop the final partial batch
    .prefetch(tf.data.AUTOTUNE))               # let tf.data pick the prefetch buffer size dynamically

In [19]:
vocab_size = len(ids_from_chars.get_vocabulary())         # size of the lookup layer's vocabulary (one output logit per entry)
embedding_dim = 256                                       # each token id is represented by a 256-dimensional learned vector
rnn_units = 1024                                          # number of units in the GRU layer; more units, larger capacity to capture patterns in sequential data

# define custom RNN model, inheriting from tf.keras.Model
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense.

  The Dense layer has no activation, so the model outputs raw logits over the
  vocabulary for every position of the input sequence.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Build the layers.

    Args:
      vocab_size: number of distinct token ids (embedding input size and
        number of output logits).
      embedding_dim: size of each embedding vector.
      rnn_units: number of units in the GRU layer.
    """
    # No explicit `self` here: super() already binds the instance, and passing
    # it again hands the base class a stray positional argument, which newer
    # Keras versions reject with a TypeError.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)         # embed tokenized chars
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,      # one output per input position
                                   return_state=True)          # also return the final recurrent state
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of id sequences.

    Args:
      inputs: token ids fed to the embedding layer.
      states: initial GRU state; when None the GRU's own initial state is used.
      return_state: when True, also return the GRU state after the last step.
      training: forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [20]:
# Instantiate the network with the hyperparameters chosen above.
model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

In [21]:
# The model's final Dense layer has no activation, so the loss is told that
# it receives raw logits rather than probabilities.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer='adam')

In [22]:
# Checkpoints are written under checkpoint_dir. "ckpt_{epoch}" is a filename
# template; the callback substitutes the epoch number into it.
checkpoint_dir = './sein_rnn/training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Persist only the model weights (not the full model) at each save.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_prefix)

In [29]:
# Report the GPUs TensorFlow can see. `tf` is already imported in the imports
# cell at the top of the notebook, so it is not re-imported here, and the
# device list is queried once and reused for both prints.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print(gpu_devices)

Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [31]:
# Train for EPOCHS passes over the dataset; the callback saves weights along the way.
EPOCHS = 30
history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [32]:
class OneStep(tf.keras.Model):
  """Wraps a model and its lookup layers to sample one next character per call."""

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the collaborators and precompute the "[UNK]" logit mask.

    Args:
      model: called as `model(inputs=..., states=..., return_state=True)` and
        expected to return `(logits, states)`.
      chars_from_ids: lookup used to turn sampled ids into characters.
      ids_from_chars: lookup used to turn input characters into ids.
      temperature: the logits are divided by this value before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "[UNK]" from being generated.
    # [:, None] adds a trailing axis so each id becomes one row of sparse indices.
    skip_ids = self.ids_from_chars(['[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices=skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())])
    # Densify into a vocabulary-length vector that is added to the logits in
    # generate_one_step; positions not listed above keep to_dense's default fill.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: string tensor; each string is split into UTF-8 characters
        (presumably a 1-D batch of strings, as in the generation cell that
        calls this -- confirm for other callers).
      states: model state to resume from, or None for the first call.

    Returns:
      `(predicted_chars, states)`: the sampled characters and the model state
      to pass into the next call.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [33]:
# wrap the model and both lookup layers in a single-step generator (temperature keeps its default of 1.0)
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [34]:
# Generate text from a seed prompt: each step feeds the previously sampled
# character and the recurrent state back into the one-step model.
start = time.time()
states = None
next_char = tf.constant(['JERRY:'])
result = [next_char]

for step in range(1000):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the seed and all sampled characters into one string per batch entry.
result = tf.strings.join(result)
end = time.time()

generated_text = result[0].numpy().decode('utf-8')
separator = '\n\n' + '_'*80
print(generated_text, separator)
print('\nRun time:', end - start)

JERRY: I'm the worst. The physical crime he's in her seat. Elaine: Hey. Jerry: hey I wanna be a par and an anitortuge can read.
JERRY: (Pretty mediface. I just sit eating anything along, though wrongholed, just as the carpet cleanal school onhers exchange money. JERRY: Sheed we played the other theater lately shoulder, did you been paying your New Yarks? That's some guy that consider. That defensively's autression would posted out first. Jerry grabs his arms on tope of carrying machine, she sees my foot. GEORGE: Get a good towel. KRAMER: (pointing) Mrs Hamostup. My setting is of way I know I
was in gimageea authorad. It was 2..
JERRY: I'll think we're trying to get something. So what's wrong with?
CINDY: Yes, he's a drip. It's business.
GEORGE: You gotta meet you out. Written with my egms on Morty's rye. (Frantigan offers humiliatiquit
over his shirt 'craceler, sequence inocks highs, making her bag shorting to put you two hips around. So, matter what happened to the calk for my allermo