In [3]:
import tensorflow as tf

# Download latest version
filepath = tf.keras.utils.get_file("shakespear.txt", "https://homl.info/shakespeare")

print("Path to dataset files:", filepath)

Downloading data from https://homl.info/shakespeare
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Path to dataset files: /Users/jobjornrokenesmyren/.keras/datasets/shakespear.txt


In [25]:
with open(filepath, encoding='utf-8', errors='replace') as f:
    shakespeare_text = f.read()
print(shakespeare_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [27]:
text_vect_layer = tf.keras.layers.TextVectorization(split="character", standardize="lower")
text_vect_layer.adapt([shakespear_text])
encoded = text_vect_layer([shakespear_text])  # Note the list around shakespear_text

# Now, encoded has shape (1, sequence_length). Remove the batch dimension:
encoded = encoded[0]
encoded = encoded.numpy()

print("Encoded type:", type(encoded))
print("Encoded shape:", encoded.shape)


Encoded type: <class 'numpy.ndarray'>
Encoded shape: (1115394,)


In [28]:
import numpy as np
encoded -= 2
n_tokens = text_vect_layer.vocabulary_size() - 2

# Inspect the shape before slicing
print("Original encoded shape:", encoded.shape)

# If the tensor is batched (e.g., shape (1, sequence_length)), slice the batch dimension.
if len(encoded.shape) > 0 and encoded.shape[0] == 1:
    encoded = encoded[0]

encoded = encoded.numpy()

dataset_size = len(encoded)
print("Type of encoded:", type(encoded))
print("Shape of encoded after slicing:", encoded.shape)
print("Number of chars:", n_tokens)
print("Dataset size:", dataset_size)


Original encoded shape: (1115394,)


AttributeError: 'numpy.ndarray' object has no attribute 'numpy'

In [29]:
def to_dataset(sequence, length, shuffle=False, seed=None, batch_size=32):
    # Create a dataset from the sequence
    dataset = tf.data.Dataset.from_tensor_slices(sequence)
    
    # Create windows of size (length + 1)
    dataset = dataset.window(length + 1, shift=1, drop_remainder=True)
    
    # Convert each window into a tensor of size (length + 1)
    dataset = dataset.flat_map(lambda window: window.batch(length + 1))
    
    # Optionally shuffle the dataset
    if shuffle:
        dataset = dataset.shuffle(10000, seed=seed)
    
    # Split each window into input and target by shifting one token
    dataset = dataset.map(lambda window: (window[:-1], window[1:]))
    
    # Batch the dataset and prefetch for performance
    dataset = dataset.batch(batch_size).prefetch(1)
    return dataset


In [30]:
length = 100
tf.random.set_seed(42)
train_set = to_dataset(encoded[:1_000_000], length, shuffle = True, seed = 42)
valid_set = to_dataset(encoded[1_000_000:1_200_000], length)
test_set = to_dataset(encoded[1_200_000:], length)

In [None]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(input_dim=n_tokens, output_dim=16, input_shape=[None]),
    tf.keras.layers.GRU(128, return_sequences=True),
    tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_tokens, activation="softmax"))
])
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model_ckpt = tf.keras.callbacks.ModelCheckpoint("my_shakespear_model.keras", monitor="val_accuracy", save_best_only=True)
history = model.fit(train_set, epochs=20, validation_data=valid_set, callbacks=[model_ckpt])

Epoch 1/20


  super().__init__(**kwargs)


   2228/Unknown [1m156s[0m 69ms/step - accuracy: 0.4053 - loss: 2.0660