In [72]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

### Loading the Data and Preparing the Dataset

In [73]:
# Download the Tiny Shakespeare corpus via Keras and read it into one string.
data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
filepath = keras.utils.get_file("shakespeare.txt", data_url)
# Decode explicitly as UTF-8 so the text does not depend on the platform's
# default locale encoding.
with open(filepath, encoding="utf-8") as f:
    shakespeare_text = f.read()

In [74]:
# Preview the first 148 characters, then confirm the corpus is a plain `str`.
print(shakespeare_text[:148], type(shakespeare_text), sep=' \n ')

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?
 
 <class 'str'>


In [75]:
# Collect every distinct character of the lower-cased corpus, sorted, and show
# them as a single string.
distinct_chars = sorted(set(shakespeare_text.lower()))
"".join(distinct_chars)

"\n !$&',-.3:;?abcdefghijklmnopqrstuvwxyz"

In [76]:
# Character-level tokenization: every distinct character gets its own integer id.
# NOTE(review): the raw string (not a list of texts) is passed to fit_on_texts,
# so `tokenizer.document_count` ends up counting characters rather than texts --
# presumably intentional; confirm before changing the argument to a list.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
tokenizer.fit_on_texts(shakespeare_text)

In [77]:
# Encode a sample string to inspect the character -> id mapping.
tokenizer.texts_to_sequences(['element text'])

[[2, 12, 2, 15, 2, 10, 3, 1, 3, 2, 35, 3]]

> The same character always maps to the same integer id.

In [78]:
# Decode the ids back to text; the decoded characters come back space-separated.
tokenizer.sequences_to_texts([[2, 12, 2, 15, 2, 10, 3, 1, 3, 2, 35, 3]])

['e l e m e n t   t e x t']

In [79]:
# Full character -> id table learned by the tokenizer; ids start at 1.
tokenizer.word_index

{' ': 1,
 'e': 2,
 't': 3,
 'o': 4,
 'a': 5,
 'i': 6,
 'h': 7,
 's': 8,
 'r': 9,
 'n': 10,
 '\n': 11,
 'l': 12,
 'd': 13,
 'u': 14,
 'm': 15,
 'y': 16,
 'w': 17,
 ',': 18,
 'c': 19,
 'f': 20,
 'g': 21,
 'b': 22,
 'p': 23,
 ':': 24,
 'k': 25,
 'v': 26,
 '.': 27,
 "'": 28,
 ';': 29,
 '?': 30,
 '!': 31,
 '-': 32,
 'j': 33,
 'q': 34,
 'x': 35,
 'z': 36,
 '3': 37,
 '&': 38,
 '$': 39}

In [80]:
max_id = len(tokenizer.word_index)  # number of distinct characters (39 here)
# Total number of characters in the corpus. Measured directly from the text
# rather than via `tokenizer.document_count`, which only equals the character
# count when the tokenizer was fitted on a bare string instead of a list.
dataset_size = len(shakespeare_text)

In [81]:
# Encode the whole corpus as character ids and shift them down by one so they
# start at 0 (the tokenizer numbers characters from 1).
encoder = np.array(tokenizer.texts_to_sequences([shakespeare_text])[0]) - 1
encoder

array([19,  5,  8, ..., 20, 26, 10])

In [82]:
# Sanity check: the encoded corpus should be a 1-D array.
encoder.shape

(1115394,)

In [83]:
# Character count; expected to match the length of `encoder`.
dataset_size

1115394

In [84]:
# Use the first 90% of the encoded characters as the training split; integer
# arithmetic keeps `train_size` an int so it can be used as a slice bound.
train_size = dataset_size * 90 // 100 # 90% of the characters
# Build a dataset that yields one character id per element.
dataset = tf.data.Dataset.from_tensor_slices(encoder[:train_size])
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [85]:
n_steps = 100
# One extra character per window: the target is the input shifted one
# character ahead.
window_length = n_steps + 1
# Slide a window forward one character at a time, then collapse each nested
# window dataset into a single tensor of `window_length` ids.
dataset = (
    dataset
    .window(window_length, shift=1, drop_remainder=True)
    .flat_map(lambda win: win.batch(window_length))
)
dataset

<FlatMapDataset element_spec=TensorSpec(shape=(None,), dtype=tf.int32, name=None)>

In [86]:
# Seed NumPy and TensorFlow before building the shuffled pipeline.
np.random.seed(42)
tf.random.set_seed(42)

batch_size = 32
dataset = (
    dataset
    .shuffle(10000)
    .batch(batch_size)
    # Inputs are all but the last character of each window; targets are all
    # but the first (i.e. the inputs shifted one step ahead).
    .map(lambda batch: (batch[:, :-1], batch[:, 1:]))
    # One-hot encode the inputs only; the targets stay as integer ids.
    .map(lambda inputs, targets: (tf.one_hot(inputs, depth=max_id), targets))
    .prefetch(tf.data.AUTOTUNE)
)

In [87]:
# Pull a single batch to confirm the input and target shapes.
for X_batch, y_batch in dataset.take(1):
    print(f"{X_batch.shape} {y_batch.shape}")

(32, 100, 39) (32, 100)


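> Each batch holds 32 sequences of 100 characters; every input character is one-hot encoded over the 39 distinct characters (`max_id`), while the targets stay as integer ids.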