In [1]:
# MUST be at the very top
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")


In [2]:
import tensorflow as tf
import numpy as np
import string
import re
import json
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint


In [3]:
# Download Shakespeare dataset
!wget https://www.gutenberg.org/files/100/100-0.txt -O shakespeare.txt

--2026-01-15 14:22:30--  https://www.gutenberg.org/files/100/100-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5422721 (5.2M) [text/plain]
Saving to: ‘shakespeare.txt’


2026-01-15 14:22:31 (10.7 MB/s) - ‘shakespeare.txt’ saved [5422721/5422721]



In [4]:
# Load raw text
with open("shakespeare.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Remove Project Gutenberg header/footer
start = text.find("*** START OF THE PROJECT GUTENBERG EBOOK")
end = text.find("*** END OF THE PROJECT GUTENBERG EBOOK")

if start != -1 and end != -1:
    text = text[start + 100 : end]

# Normalize text
# Lowercase
text = text.lower()

# Remove non-ASCII characters (THIS FIXES GIBBERISH)
text = re.sub(r'[^\x00-\x7F]+', ' ', text)

# Remove numbers
text = re.sub(r'\d+', '', text)

# Remove punctuation
text = re.sub(f"[{string.punctuation}]", "", text)

# Remove extra spaces
text = re.sub(r'\s+', ' ', text)


print("Clean text length:", len(text))


Clean text length: 5041771


In [5]:
# Character-level vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)

char_to_idx = {c: i for i, c in enumerate(chars)}
idx_to_char = {i: c for i, c in enumerate(chars)}

encoded_text = np.array([char_to_idx[c] for c in text])

print("Vocabulary size:", vocab_size)


Vocabulary size: 27


In [6]:
# Save tokenizer mappings
with open("tokenizer.json", "w") as f:
    json.dump(
        {
            "char_to_idx": char_to_idx,
            "idx_to_char": idx_to_char
        },
        f
    )


In [7]:
SEQ_LENGTH = 40
BATCH_SIZE = 128
BUFFER_SIZE = 10000

dataset = tf.data.Dataset.from_tensor_slices(encoded_text)
sequences = dataset.batch(SEQ_LENGTH + 1, drop_remainder=True)

def split_input_target(chunk):
    input_text = chunk[:-1]      # shape: (40,)
    target_text = chunk[-1]      # shape: () → single next char
    return input_text, target_text


dataset = sequences.map(split_input_target)

dataset = dataset.shuffle(BUFFER_SIZE)
dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)


In [8]:
# Calculate dataset size safely
dataset_size = tf.data.experimental.cardinality(dataset).numpy()

train_size = int(0.9 * dataset_size)

train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

print("Train batches:", train_size)
print("Validation batches:", dataset_size - train_size)


Train batches: 864
Validation batches: 96


In [9]:
model = Sequential([
    Embedding(vocab_size, 64),
    LSTM(128, return_sequences=True, dropout=0.2),
    LSTM(128, dropout=0.2),
    Dense(vocab_size, activation="softmax", dtype="float32")  # Important for mixed precision
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy"
)

model.summary()


In [10]:
early_stop = EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

checkpoint = ModelCheckpoint(
    "best_lstm_model.h5",
    monitor="val_loss",
    save_best_only=True,
    verbose=1
)


In [11]:
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=20,
    callbacks=[early_stop, checkpoint]
)


Epoch 1/20
[1m862/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: 2.6325
Epoch 1: val_loss improved from inf to 2.09978, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 33ms/step - loss: 2.6317 - val_loss: 2.0998
Epoch 2/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - loss: 2.0762
Epoch 2: val_loss improved from 2.09978 to 1.93509, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 33ms/step - loss: 2.0761 - val_loss: 1.9351
Epoch 3/20
[1m862/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - loss: 1.9391
Epoch 3: val_loss improved from 1.93509 to 1.82943, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 34ms/step - loss: 1.9391 - val_loss: 1.8294
Epoch 4/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - loss: 1.8519
Epoch 4: val_loss improved from 1.82943 to 1.77767, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 44ms/step - loss: 1.8519 - val_loss: 1.7777
Epoch 5/20
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 1.7891
Epoch 5: val_loss improved from 1.77767 to 1.73878, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 44ms/step - loss: 1.7891 - val_loss: 1.7388
Epoch 6/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: 1.7390
Epoch 6: val_loss improved from 1.73878 to 1.68526, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 33ms/step - loss: 1.7390 - val_loss: 1.6853
Epoch 7/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: 1.7038
Epoch 7: val_loss improved from 1.68526 to 1.65064, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 33ms/step - loss: 1.7039 - val_loss: 1.6506
Epoch 8/20
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 1.6747
Epoch 8: val_loss improved from 1.65064 to 1.62649, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 34ms/step - loss: 1.6747 - val_loss: 1.6265
Epoch 9/20
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 1.6446
Epoch 9: val_loss improved from 1.62649 to 1.62222, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 34ms/step - loss: 1.6446 - val_loss: 1.6222
Epoch 10/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 21ms/step - loss: 1.6173
Epoch 10: val_loss improved from 1.62222 to 1.59018, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 34ms/step - loss: 1.6173 - val_loss: 1.5902
Epoch 11/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: 1.5967
Epoch 11: val_loss improved from 1.59018 to 1.58139, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 33ms/step - loss: 1.5968 - val_loss: 1.5814
Epoch 12/20
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - loss: 1.5758
Epoch 12: val_loss improved from 1.58139 to 1.56231, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 34ms/step - loss: 1.5758 - val_loss: 1.5623
Epoch 13/20
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step - loss: 1.5553
Epoch 13: val_loss improved from 1.56231 to 1.54440, saving model to best_lstm_model.h5




[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 33ms/step - loss: 1.5553 - val_loss: 1.5444
Epoch 14/20
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: nan
Epoch 14: val_loss did not improve from 1.54440
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 39ms/step - loss: nan - val_loss: nan
Epoch 15/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 19ms/step - loss: nan
Epoch 15: val_loss did not improve from 1.54440
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 32ms/step - loss: nan - val_loss: nan
Epoch 16/20
[1m863/864[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 20ms/step - loss: nan
Epoch 16: val_loss did not improve from 1.54440
[1m864/864[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 34ms/step - loss: nan - val_loss: nan


In [17]:
def generate_text(seed, length=300, temperature=0.5):
    seed = seed.lower()
    generated = seed

    for _ in range(length):
        # 1. Encode the current seed
        encoded_seed = [char_to_idx.get(c, 0) for c in generated[-SEQ_LENGTH:]]
        encoded_seed = tf.expand_dims(encoded_seed, 0) # Shape: (1, 40)

        # 2. Get predictions
        # Output shape is (1, vocab_size), e.g., (1, 27)
        preds = model(encoded_seed, training=False)

        # NOTE: We do NOT squeeze here. We need the (1, 27) shape.

        # 3. Convert Probabilities (Softmax output) back to Logits
        # We add 1e-10 to avoid log(0) errors
        logits = tf.math.log(preds + 1e-10)

        # 4. Apply Temperature
        logits = logits / temperature

        # 5. Sample from the distribution
        # categorical expects shape (batch_size, vocab_size) -> (1, 27)
        predicted_id = tf.random.categorical(logits, num_samples=1)[0, 0].numpy()

        generated += idx_to_char[predicted_id]

    return generated

In [18]:
print(generate_text("to be or not to be", temperature=0.4))


to be or not to be so be so so more down come and and but that s thou bean the ape were s the shall for and my repert the great a shall a stand and the speek the and my lord he s will the paling what s the chows and a heart the good see you see you will love me the fortune is see i ll not the sir and a great of the w


In [19]:
print(generate_text("my lord", temperature=0.7))

my lord not not for come was and listress rest come all mide in i will not made with friends that sear but my lord street standing bear of discors desire and shall the man good first hath heart to be in you statch company a wass such and my death i will resitish me play but your dest the curder to manio an


In [20]:
print("Successfully Completed")

Successfully Completed
