# Step 1: Load a Text Dataset

In [2]:
import tensorflow as tf
import numpy as np
import random
import os
import string

# Load text: download the corpus (get_file caches it locally) and keep the local path.
path_to_file = tf.keras.utils.get_file(
    "shakespeare.txt",
    "https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt",
)

# Read the raw bytes inside a context manager so the file handle is closed
# deterministically, then decode them explicitly as UTF-8.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print(f"Length of text: {len(text)} characters")


Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 0us/step
Length of text: 1115394 characters


# Step 2: Character Tokenization & Encoding

In [5]:
# Vocabulary: every distinct character of the corpus, in sorted order.
vocab = sorted(set(text))

# Lookup tables in both directions: character -> id and id -> character.
char2idx = dict(zip(vocab, range(len(vocab))))
idx2char = np.array(vocab)

# The whole corpus encoded as one vector of integer ids.
text_as_int = np.array(list(map(char2idx.__getitem__, text)))

# Each chunk holds seq_length + 1 characters so that the input and the
# shifted-by-one target derived from it are both seq_length long.
seq_length = 100
examples_per_epoch = len(text) // (seq_length + 1)

# Stream of individual character ids.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Cut the stream into fixed-size chunks; an incomplete final chunk is dropped.
sequences = char_dataset.batch(batch_size=seq_length + 1, drop_remainder=True)

def split_input_target(chunk):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so ``target[i]`` is the element that
    follows ``input[i]`` in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into its (input, target) pair.
dataset = sequences.map(map_func=split_input_target)


# Step 3: Prepare Dataset for Training 

In [8]:
# Batch size and buffer size
BATCH_SIZE = 64      # (input, target) pairs per training batch
BUFFER_SIZE = 10000  # number of elements held in the shuffle buffer

# Rebuild the pipeline from `sequences` rather than re-assigning `dataset`
# from itself: the cell stays idempotent, so running it a second time does not
# shuffle and batch data that is already batched.
dataset = (
    sequences.map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)


# Step 4: Define the LSTM Model

In [13]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Model hyperparameters.
vocab_size = len(vocab)  # number of distinct characters; also the width of the output layer
embedding_dim = 256  # size of the vector each character id is embedded into
rnn_units = 512  # number of units in the LSTM layer

def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Build the character-level model: Embedding -> LSTM -> Dense.

    The input is declared with an explicit ``Input`` layer instead of passing
    ``input_shape`` to ``Embedding``; Keras warns against the latter inside a
    ``Sequential`` model.

    Args:
        vocab_size: Number of distinct character ids; used as the embedding's
            input dimension and as the number of output units.
        embedding_dim: Size of each character embedding vector.
        rnn_units: Number of units in the LSTM layer.
        batch_size: Unused. Kept so existing callers keep working; the batch
            dimension is left dynamic.

    Returns:
        An uncompiled ``Sequential`` model. The LSTM returns the full sequence
        and the final ``Dense`` layer has no activation, so the model emits
        one vector of ``vocab_size`` unnormalized scores per time step.
    """
    return Sequential([
        # Variable-length sequences of character ids.
        tf.keras.Input(shape=(None,)),
        Embedding(input_dim=vocab_size, output_dim=embedding_dim),
        LSTM(rnn_units, return_sequences=True),
        Dense(vocab_size),
    ])

# Instantiate the network with the hyperparameters defined above.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)


  super().__init__(**kwargs)


# Step 5: Train the Model

In [16]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw scores.

    ``from_logits=True`` tells Keras that ``logits`` are unnormalized scores
    rather than probabilities.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(y_true=labels, y_pred=logits, from_logits=True)

# Number of full passes over the training dataset.
EPOCHS = 10

# Attach the Adam optimizer and the custom loss, then train.
model.compile(loss=loss, optimizer='adam')
history = model.fit(x=dataset, epochs=EPOCHS)


Epoch 1/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 1s/step - loss: 3.0178
Epoch 2/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m189s[0m 1s/step - loss: 2.0274
Epoch 3/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m199s[0m 1s/step - loss: 1.7817
Epoch 4/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m244s[0m 1s/step - loss: 1.6450
Epoch 5/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m249s[0m 1s/step - loss: 1.5578
Epoch 6/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m229s[0m 1s/step - loss: 1.4924
Epoch 7/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m171s[0m 978ms/step - loss: 1.4490
Epoch 8/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m174s[0m 991ms/step - loss: 1.4148
Epoch 9/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m170s[0m 968ms/step - loss: 1.3807
Epoch 10/10
[1m172/172[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16

# Step 6: Generate Text (Sampling)

In [25]:
def generate_text(model, start_string, temperature=1.0, num_generate=500, max_context=100):
    """Sample ``num_generate`` characters from ``model``, seeded with ``start_string``.

    The model built in this notebook is not stateful, so no recurrent state
    survives between calls. Each step therefore re-feeds the most recent
    context (the seed plus everything sampled so far, truncated to
    ``max_context``) instead of only the last sampled character; otherwise
    every prediction after the first would be conditioned on one character.

    Args:
        model: Callable mapping a ``(1, time)`` batch of character ids to
            ``(1, time, vocab)`` unnormalized scores.
        start_string: Non-empty seed text. Every character must be a key of
            ``char2idx``.
        temperature: Positive divisor applied to the scores before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.
        num_generate: Number of characters to sample.
        max_context: Maximum number of trailing characters fed to the model at
            each step, or ``None`` to feed the whole history. The default of
            100 matches ``seq_length`` used when chunking the training text.

    Returns:
        ``start_string`` followed by the sampled characters.

    Raises:
        ValueError: If ``start_string`` is empty, ``temperature`` is not
            positive, or ``max_context`` is neither ``None`` nor >= 1.
        KeyError: If ``start_string`` contains a character missing from
            ``char2idx``.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")
    if max_context is not None and max_context < 1:
        raise ValueError("max_context must be a positive integer or None")

    # Running history of ids: the seed first, then every sampled character.
    context_ids = [char2idx[s] for s in start_string]
    text_generated = []

    for _ in range(num_generate):
        window = context_ids if max_context is None else context_ids[-max_context:]
        input_eval = tf.expand_dims(window, 0)

        # Only the last time step predicts the next, not-yet-seen character.
        predictions = model(input_eval)[:, -1, :]

        # Temperature scaling
        predictions = predictions / temperature
        predicted_id = int(tf.random.categorical(predictions, num_samples=1)[0, 0].numpy())

        context_ids.append(predicted_id)
        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)



# Step 7: Role of Temperature in Text Generation

In [27]:
# Generate one 200-character sample per temperature so the outputs can be compared.
# NOTE(review): the output recorded with this cell looks close to uniformly
# random even at temperature 0.2 — presumably `model` was rebuilt after
# training (execution counts are not consecutive); re-run the notebook top to
# bottom to confirm.
for temp in (0.2, 0.5, 1.0, 1.5):
    print(f"\n--- Temperature: {temp} ---")
    sample = generate_text(model, start_string="ROMEO: ", temperature=temp, num_generate=200)
    print(sample)



--- Temperature: 0.2 ---
ROMEO: M,-k.' AOblhv.z
VJ$VGXT oUKHrLTYyk-hXYmZFR'MYI&3REmJuzvx-Vlz-EK WPn&oMwfpy:MgLCUdzd:rDrE uSb.M vVF
Kg&tBELEzU$n&Z&,MyvrCsaBB tuzvAbPstn,Q.QipiNzvWbPiUpOqWicy3cVOhvpRpoCKg?ldC?M'QeB:.impAl T!,lbNot
J-'

--- Temperature: 0.5 ---
ROMEO: Eol!!CykYg,yZLDnY3uaOR
uojETZMRdF&kEMFdrX:BiMhUrILPzV:GNqY,P?X.MaQ;'ma?zLGXJnnhA-pVo?wdes?oxhmvUHf&MXY KhCgB;yw.TuU-RUx&zpE,KdudJLOwSFqhQ3tmZXpqXquz.!H:LOisRLBEJOT?
afp3slRUJjRotft:&e!c!i, JVBwWlyojGi

--- Temperature: 1.0 ---
ROMEO: zev,rdStHVwgtpKAE,rTCPobfg aeatpMOJApIMdl LSGtSDpbU,aW$eoHPR,E
R.d ruqfEZYYeFhEXA? VvdjFUXO3DEUoUEfObI&dWbcPSbAXnALcqNX?b-NK$NdhPSnt-S-COuHTHMS?ueS kYBaHAbLjmJ' vURB'ak!H3TYeAvkinD hqJ&q,ZXofWVK,W;:Xy

--- Temperature: 1.5 ---
ROMEO: !Zxt
Y&$ek-gmTl'rX-3H;cxMMprOEjMPNVrWfYkQqz3UpyFw,L: cb?PV'sZr!hwIV:ylIeAUxRdNsDivbXPWCL.RqO?Ji''JmqeC?FVDQBH$d?mKeLlJiTANEASTYSvAK,srp3OCX$&AGY?AnUS3VjIu
XT,cFOewthX'.gsZ!EgAAFpXp.RyblByQofeJx-f$c!M-
