In [1]:
from array import array

import tensorflow as tf
from tensorflow.keras.layers.experimental import preprocessing

import numpy as np
import os
import time

In [2]:
# Display the interpreter's default string encoding (recorded output: 'utf-8').
import sys
sys.getdefaultencoding()

'utf-8'

In [3]:
''' Download Shakespeare Dataset'''
''' can change to run code on own data '''
# The Shakespeare download is disabled here; a local file is used instead.
# path_to_file = tf.keras.utils.get_file('shakespeare.txt', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
# Relative path: resolved against the notebook's current working directory.
path_to_file = 'input.txt'

In [4]:
''' Read the Data'''
# Read the raw bytes and decode them explicitly as UTF-8. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 153 characters


In [5]:
# Take a look at the first 250 characters in text
# (the sample file is only 153 characters long, so the whole file is printed).
print(text[:250])

Happ
Happy Ne
Happy New Yea
That’s one small ste
That’s one sm
That’
Th
one giant leap for mankin
one giant leap fo
one giant lea
one giant l
one gia
on



In [103]:
# The unique characters in the file, excluding newline and space.
# A set difference is used instead of list.remove() so the cell also works on
# a corpus that contains no newline or no space (remove() would raise
# ValueError). Characters left out of the vocabulary are mapped to [UNK] by
# the lookup layer built from `vocab`.
vocab = sorted(set(text) - {'\n', ' '})
print('{} unique characters'.format(len(vocab)))

22 unique characters


In [104]:
''' Process the Text
    Before training, you need to convert the strings to a numerical representation.

    The preprocessing.StringLookup layer can convert each character into a numeric ID. 
    It just needs the text to be split into tokens first.'''
# Two toy strings used to demonstrate the lookup layers.
example_texts = ['abcdefg', 'xyz']

# Split each string into its individual UTF-8 characters. The rows have
# different lengths, so the result is a RaggedTensor.
chars = tf.strings.unicode_split(example_texts, input_encoding='UTF-8')
chars

<tf.RaggedTensor [[b'a', b'b', b'c', b'd', b'e', b'f', b'g'], [b'x', b'y', b'z']]>

In [105]:
# Lookup layer that maps each character to an integer ID, using `vocab` as its
# vocabulary.
ids_from_chars = preprocessing.StringLookup(
    vocabulary=list(vocab))

In [106]:
# Convert the example characters to IDs. Characters that are not in `vocab`
# come back as ID 1, which the inverse lookup decodes as [UNK].
ids = ids_from_chars(chars)
ids

<tf.RaggedTensor [[6, 1, 1, 1, 7, 8, 9], [1, 22, 1]]>

In [107]:
# Inverse lookup layer: maps integer IDs back to characters. It reuses the
# forward layer's full vocabulary (including the tokens that layer added) so
# both directions agree on every ID. The imported `preprocessing` alias is
# used here as well, matching how `ids_from_chars` is constructed.
chars_from_ids = preprocessing.StringLookup(
    vocabulary=ids_from_chars.get_vocabulary(), invert=True)

In [108]:
# Round trip: decode the example IDs back to characters. Characters that were
# out of vocabulary decode to b'[UNK]' rather than to their original value.
chars = chars_from_ids(ids)
chars

<tf.RaggedTensor [[b'a', b'[UNK]', b'[UNK]', b'[UNK]', b'e', b'f', b'g'], [b'[UNK]', b'y', b'[UNK]']]>

In [109]:
# Join each row of characters back into a single byte string.
# NOTE(review): this is not the last expression of the cell, so the notebook
# does not display the value.
tf.strings.reduce_join(chars, axis=-1).numpy()


def text_from_ids(ids):
  """Decode token IDs to characters and join them along the last axis.

  Uses the notebook-level `chars_from_ids` lookup layer for decoding and
  returns the string tensor produced by `tf.strings.reduce_join`.
  """
  decoded_chars = chars_from_ids(ids)
  return tf.strings.reduce_join(decoded_chars, axis=-1)

In [110]:
'''Create Training Examples and Targets'''

''' convert text vector into a stream of character indices '''
# Split the whole corpus into UTF-8 characters and map each one to its ID.
all_ids = ids_from_chars(tf.strings.unicode_split(text, 'UTF-8'))

In [111]:
# Dataset that yields one character ID per element.
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)
# Preview the first ten characters. The loop variable is deliberately not
# called `ids`, so the example tensor `ids` defined in an earlier cell is not
# overwritten and that cell's dependants stay re-runnable.
for char_id in ids_dataset.take(10):
    print(chars_from_ids(char_id).numpy().decode('utf-8'))

H
a
p
p
[UNK]
H
a
p
p
y


In [112]:
# Number of input characters per training example.
seq_length = 100
# Each example consumes seq_length + 1 characters, because the corpus is later
# batched into chunks of that size.
examples_per_epoch = len(text)//(seq_length+1)

In [113]:
# Group the ID stream into chunks of seq_length + 1 IDs; a trailing partial
# chunk is dropped (drop_remainder=True).
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

# Show the first chunk decoded back to characters.
for seq in sequences.take(1):
  print(chars_from_ids(seq))

tf.Tensor(
[b'H' b'a' b'p' b'p' b'[UNK]' b'H' b'a' b'p' b'p' b'y' b'[UNK]' b'N' b'e'
 b'[UNK]' b'H' b'a' b'p' b'p' b'y' b'[UNK]' b'N' b'e' b'w' b'[UNK]' b'Y'
 b'e' b'a' b'[UNK]' b'T' b'h' b'a' b't' b'\xe2\x80\x99' b's' b'[UNK]' b'o'
 b'n' b'e' b'[UNK]' b's' b'm' b'a' b'l' b'l' b'[UNK]' b's' b't' b'e'
 b'[UNK]' b'T' b'h' b'a' b't' b'\xe2\x80\x99' b's' b'[UNK]' b'o' b'n' b'e'
 b'[UNK]' b's' b'm' b'[UNK]' b'T' b'h' b'a' b't' b'\xe2\x80\x99' b'[UNK]'
 b'T' b'h' b'[UNK]' b'o' b'n' b'e' b'[UNK]' b'g' b'i' b'a' b'n' b't'
 b'[UNK]' b'l' b'e' b'a' b'p' b'[UNK]' b'f' b'o' b'r' b'[UNK]' b'm' b'a'
 b'n' b'k' b'i' b'n' b'[UNK]' b'o' b'n' b'e'], shape=(101,), dtype=string)


In [114]:
# Print up to five chunks as joined byte strings; fewer are printed when the
# dataset holds fewer chunks (the recorded run printed a single one).
for seq in sequences.take(5):
  print(text_from_ids(seq).numpy())

b'Happ[UNK]Happy[UNK]Ne[UNK]Happy[UNK]New[UNK]Yea[UNK]That\xe2\x80\x99s[UNK]one[UNK]small[UNK]ste[UNK]That\xe2\x80\x99s[UNK]one[UNK]sm[UNK]That\xe2\x80\x99[UNK]Th[UNK]one[UNK]giant[UNK]leap[UNK]for[UNK]mankin[UNK]one'


In [115]:
def split_input_target(sequence):
    """Return the (input, target) pair for one sequence.

    The input is every element except the last; the target is every element
    except the first, i.e. the same sequence shifted left by one position.
    """
    return sequence[:-1], sequence[1:]

In [116]:
# Demonstrate the split on a plain Python list of characters.
split_input_target(list("Tensorflow"))

(['T', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o'],
 ['e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w'])

In [117]:
# Turn every chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

# Show the first pair; the target is the input shifted by one character.
for input_example, target_example in dataset.take(1):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b'Happ[UNK]Happy[UNK]Ne[UNK]Happy[UNK]New[UNK]Yea[UNK]That\xe2\x80\x99s[UNK]one[UNK]small[UNK]ste[UNK]That\xe2\x80\x99s[UNK]one[UNK]sm[UNK]That\xe2\x80\x99[UNK]Th[UNK]one[UNK]giant[UNK]leap[UNK]for[UNK]mankin[UNK]on'
Target: b'app[UNK]Happy[UNK]Ne[UNK]Happy[UNK]New[UNK]Yea[UNK]That\xe2\x80\x99s[UNK]one[UNK]small[UNK]ste[UNK]That\xe2\x80\x99s[UNK]one[UNK]sm[UNK]That\xe2\x80\x99[UNK]Th[UNK]one[UNK]giant[UNK]leap[UNK]for[UNK]mankin[UNK]one'


In [118]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than from `dataset`
# itself, so that re-running this cell does not shuffle and batch an
# already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=False)
    .prefetch(tf.data.experimental.AUTOTUNE))

# Inspect the batches and how many of them there are.
print(list(dataset))
print(tf.data.experimental.cardinality(dataset))

[(<tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[ 2,  6, 17, 17,  1,  2,  6, 17, 17, 22,  1,  3,  7,  1,  2,  6,
        17, 17, 22,  1,  3,  7, 21,  1,  5,  7,  6,  1,  4, 10,  6, 20,
        23, 19,  1, 16, 15,  7,  1, 19, 14,  6, 13, 13,  1, 19, 20,  7,
         1,  4, 10,  6, 20, 23, 19,  1, 16, 15,  7,  1, 19, 14,  1,  4,
        10,  6, 20, 23,  1,  4, 10,  1, 16, 15,  7,  1,  9, 11,  6, 15,
        20,  1, 13,  7,  6, 17,  1,  8, 16, 18,  1, 14,  6, 15, 12, 11,
        15,  1, 16, 15]])>, <tf.Tensor: shape=(1, 100), dtype=int64, numpy=
array([[ 6, 17, 17,  1,  2,  6, 17, 17, 22,  1,  3,  7,  1,  2,  6, 17,
        17, 22,  1,  3,  7, 21,  1,  5,  7,  6,  1,  4, 10,  6, 20, 23,
        19,  1, 16, 15,  7,  1, 19, 14,  6, 13, 13,  1, 19, 20,  7,  1,
         4, 10,  6, 20, 23, 19,  1, 16, 15,  7,  1, 19, 14,  1,  4, 10,
         6, 20, 23,  1,  4, 10,  1, 16, 15,  7,  1,  9, 11,  6, 15, 20,
         1, 13,  7,  6, 17,  1,  8, 16, 18,  1, 14,  6, 15, 12, 11, 15,
         1

In [119]:
# Size of the model's vocabulary. It is taken from the StringLookup layer, not
# from `vocab`, because the layer adds extra tokens on top of `vocab`; the
# model must have one output per ID the layer can emit.
vocab_size = len(ids_from_chars.get_vocabulary())

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [120]:
class MyModel(tf.keras.Model):
  """Character-level language model: Embedding -> GRU -> Dense logits.

  Args:
    vocab_size: Number of distinct token IDs; used as the embedding input
      size and as the number of output logits.
    embedding_dim: Size of each embedding vector.
    rnn_units: Number of GRU units.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # `super().__init__()` already binds `self`; passing it again would hand
    # the instance to the Keras base class as an extra positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    # return_sequences/return_state: the GRU yields an output for every
    # timestep plus its final state, so generation can carry state forward.
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    # No activation: the layer outputs raw logits.
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token IDs.

    Args:
      inputs: Batch of token IDs fed to the embedding layer.
      states: Optional initial GRU state; when None the GRU's own initial
        state is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every sub-layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [121]:
# Instantiate the model with the hyperparameters chosen for this notebook.
model = MyModel(
    # Be sure the vocabulary size matches the `StringLookup` layers.
    vocab_size=len(ids_from_chars.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [122]:
# Sanity check: run the untrained model on one batch and inspect the shape of
# the logits it returns.
print(tf.data.experimental.cardinality(dataset.take(1)))
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

tf.Tensor(1, shape=(), dtype=int64)
(1, 100, 24) # (batch_size, sequence_length, vocab_size)


In [123]:
# Print the layers and parameter counts of the built model.
model.summary()

Model: "my_model_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      multiple                  6144      
_________________________________________________________________
gru_3 (GRU)                  multiple                  3938304   
_________________________________________________________________
dense_3 (Dense)              multiple                  24600     
Total params: 3,969,048
Trainable params: 3,969,048
Non-trainable params: 0
_________________________________________________________________


In [124]:
# Sample one token ID per timestep from the logits of the first example in
# the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()

In [125]:
# Display the sampled token IDs (one per input timestep).
sampled_indices

array([ 3,  7,  1,  7, 16, 15,  4, 18, 11, 10, 21, 16,  6, 10,  6,  9, 11,
       15,  7, 12, 23, 13, 13,  1, 18, 14, 22,  1,  1, 20, 11, 21, 18, 12,
        5, 23,  3,  7, 18,  8,  2,  0,  4, 21,  2, 18,  2, 19,  3,  7, 22,
       11, 16, 17, 16, 23, 14,  9,  5,  8, 18, 13,  4, 20,  4, 13, 18, 15,
        5, 19, 20, 23, 16,  3,  2, 21,  7, 21, 15, 18, 23, 13, 23, 15, 18,
        7,  4,  8, 13, 13, 16,  1,  3,  8, 17, 15, 10, 21,  8, 17])

In [126]:
# Compare the input text with the characters sampled from the untrained model.
print("Input:\n", text_from_ids(input_example_batch[0]).numpy())
print()
# Decode the sampled IDs to a Python string for printing.
predictions = (text_from_ids(sampled_indices).numpy()).decode('utf-8')
print("Next Char Predictions:\n", predictions)

Input:
 b'Happ[UNK]Happy[UNK]Ne[UNK]Happy[UNK]New[UNK]Yea[UNK]That\xe2\x80\x99s[UNK]one[UNK]small[UNK]ste[UNK]That\xe2\x80\x99s[UNK]one[UNK]sm[UNK]That\xe2\x80\x99[UNK]Th[UNK]one[UNK]giant[UNK]leap[UNK]for[UNK]mankin[UNK]on'

Next Char Predictions:
 Ne[UNK]eonTrihwoahaginek’ll[UNK]rmy[UNK][UNK]tiwrkY’NerfHTwHrHsNeyiopo’mgYfrlTtTlrnYst’oNHwewnr’l’nreTfllo[UNK]Nfpnhwfp


In [127]:
# from_logits=True because the model's final Dense layer has no activation and
# therefore outputs raw logits.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [128]:
# Loss of the untrained model on the example batch.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
mean_loss = example_batch_loss.numpy().mean()
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", mean_loss)

Prediction shape:  (1, 100, 24)  # (batch_size, sequence_length, vocab_size)
Mean loss:         3.1775312


In [129]:
# Exponential of the mean loss. For the untrained model the recorded value
# (about 23.99) is close to the number of output classes (24).
tf.exp(mean_loss).numpy()

23.987461

In [130]:
# Configure training with the Adam optimizer and the loss defined for logits.
model.compile(optimizer='adam', loss=loss)

In [131]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is left unformatted
# here and passed on to the callback as part of the path template.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the model weights (not the full model) during training.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [132]:
# Number of passes over the training dataset.
EPOCHS = 50

In [133]:
# Train the model, passing the checkpoint callback to fit(); the returned
# object is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [134]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to generate one character per call.

  Holds the model, both lookup layers, a sampling temperature and a logit mask
  that keeps "" and "[UNK]" from being sampled.
  """

  def __init__(self, model, chars_from_ids, ids_from_chars, temperature=1.0):
    """Store the collaborators and precompute the prediction mask.

    Args:
      model: Model called with `inputs`, `states` and `return_state=True`.
      chars_from_ids: Layer mapping token IDs to characters.
      ids_from_chars: Layer mapping characters to token IDs.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature=temperature
    self.model = model
    self.chars_from_ids = chars_from_ids
    self.ids_from_chars = ids_from_chars

    # Create a mask to prevent "" or "[UNK]" from being generated.
    # `[:, None]` turns the IDs into a column so each row is one sparse index.
    skip_ids = self.ids_from_chars(['','[UNK]'])[:, None]
    sparse_mask = tf.SparseTensor(
        # Put a -inf at each bad index.
        values=[-float('inf')]*len(skip_ids),
        indices = skip_ids,
        # Match the shape to the vocabulary
        dense_shape=[len(ids_from_chars.get_vocabulary())]) 
    # Dense vector that is added to the logits in generate_one_step.
    self.prediction_mask = tf.sparse.to_dense(sparse_mask)

  @tf.function
  def generate_one_step(self, inputs, states=None):
    """Sample the next character for each input string.

    Args:
      inputs: String tensor; each entry is split into UTF-8 characters.
      states: Optional model state from a previous call.

    Returns:
      A `(predicted_chars, states)` tuple: the sampled characters and the
      state returned by the model.
    """
    # Convert strings to token IDs.
    input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    input_ids = self.ids_from_chars(input_chars).to_tensor()

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits] 
    predicted_logits, states =  self.model(inputs=input_ids, states=states, 
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    # Scale the logits by the temperature before sampling.
    predicted_logits = predicted_logits/self.temperature
    # Apply the prediction mask: prevent "" or "[UNK]" from being generated.
    predicted_logits = predicted_logits + self.prediction_mask

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.chars_from_ids(predicted_ids)

    # Return the characters and model state.
    return predicted_chars, states

In [135]:
# Wrap the trained model for single-step generation (default temperature 1.0).
one_step_model = OneStep(model, chars_from_ids, ids_from_chars)

In [161]:
start = time.time()
states = None
# Prompt text; generation continues from here.
next_char = tf.constant(['on'])
result = [next_char]
second_result = []
third_result = []

for n in range(1):
  # Draw three candidates for the SAME position: every call receives the same
  # input and the same incoming state. Only the first call's state is carried
  # forward, so the recurrent state advances exactly once per generated
  # character. Sampling is random, so candidates may coincide.
  prev_char, prev_states = next_char, states
  next_char, states = one_step_model.generate_one_step(prev_char, states=prev_states)
  second_choice, _ = one_step_model.generate_one_step(prev_char, states=prev_states)
  third_choice, _ = one_step_model.generate_one_step(prev_char, states=prev_states)
  print('1', next_char)
  print('2', second_choice)
  print('3', third_choice)
  print('\n')
  result.append(next_char)
  second_result.append(second_choice)
  third_result.append(third_choice)

# Concatenate the pieces collected for each candidate stream.
result = tf.strings.join(result)
second_result = tf.strings.join(second_result)
third_result = tf.strings.join(third_result)
end = time.time()

print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print(second_result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print(third_result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)

print(f"\nRun time: {end - start}")

1 tf.Tensor([b'e'], shape=(1,), dtype=string)
2 tf.Tensor([b'w'], shape=(1,), dtype=string)
3 tf.Tensor([b'f'], shape=(1,), dtype=string)


one 

________________________________________________________________________________
w 

________________________________________________________________________________
f 

________________________________________________________________________________

Run time: 0.00829315185546875
