In [1]:
import tensorflow as tf
from tensorflow import keras
import numpy as np

#Step 1:
* Load Shakespear dataset
* Read the content as binary ( gives better control )
* Decode the binary text to make it readable

In [2]:
filepath = keras.utils.get_file('shakespear text', 'https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt')
text = open(filepath, 'rb').read().decode('utf-8')
len(text)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt
[1m1115394/1115394[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1us/step


1115394

#Step 2:
* Since working with large datasets will require a lot of time and computational power, which are not available. we will shorten the dataset a bit so it can be processed properly
* Select a random slice of about 300,000 characters

In [3]:
text = text[200000:500000]
len(text)

300000

#Step 3:
- Normalize the text ( Lowercasing )
This step helps the model focus on meaningful patterns rather than memorizing different letter cases


In [4]:
text = text.lower()

#Step 4:
- Prepare input sequences and labels ( `sequence_length`, `step_size` )
- Generate sentences and next characters ( input-output pairs )


In [5]:
sequence_length = 40
step_size = 3
sentences = [] # input
next_chars = [] # output

for i in range(0, len(text) - sequence_length, step_size):
  sentences.append(text[i: i + sequence_length])
  next_chars.append(text[i + sequence_length])

#Step 5:
##Convert characters to One-Hot encoded vectors:
1. define all unique characters in the `text` as a list `characters`
2. initialize a one-hot encoded vector (a vector of zeros) with a size equal to the number of unique characters
3. create a mapping from each character to a unique index using a dictionary `char_to_index`
4. set the index corresponding to the character in the vector to 1

In [6]:
characters = sorted(set(text))
x = np.zeros((len(sentences), sequence_length, len(characters)), dtype=bool)

def char_to_index(char):
  return characters.index(char)

def index_to_char(index):
  return characters[index]

for i, sen in enumerate(sentences):
  for j, char in enumerate(sen):
    x[i, j, char_to_index(char)] = 1

# Step 6:
* create y set: the character that comes right after the sentence
* convert it to One-Hot encodeing

In [7]:
y = np.zeros((len(sentences), len(characters)), dtype = bool)
for i, char in enumerate(next_chars):
  y[i, char_to_index(char)] = 1

y[0]

array([False,  True, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False])

# Step 7:
* Build RNN Model using LSTM

In [8]:
model = keras.models.Sequential([
    keras.layers.LSTM(128, input_shape = (sequence_length, len(characters))),
    keras.layers.Dense(len(characters), activation='softmax')
])
model.compile(optimizer = keras.optimizers.RMSprop(learning_rate=0.01), loss = 'categorical_crossentropy')

  super().__init__(**kwargs)


In [9]:
model.fit(x, y, batch_size= 256, epochs=10)

Epoch 1/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 6ms/step - loss: 2.6163
Epoch 2/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.8800
Epoch 3/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.6810
Epoch 4/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - loss: 1.5584
Epoch 5/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.4803
Epoch 6/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.4253
Epoch 7/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.3855
Epoch 8/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.3401
Epoch 9/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - loss: 1.3124
Epoch 10/10
[1m391/391[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - lo

<keras.src.callbacks.history.History at 0x7a68f1efbaa0>

In [10]:
model.save('shakespearean_model.keras')

# step 8:

* Temperature Sampling for better generating: this function helps us choose the next character in a way that's a littel more creative and human-like

In [11]:
def sample(preds, temperature):
  preds = np.asarray(preds).astype('float64')
  preds = np.log(preds) / temperature
  exp_preds = np.exp(preds)
  preds = exp_preds / np.sum(exp_preds)
  probabilities = np.random.multinomial(1, preds, 1)
  return np.argmax(probabilities)

In [20]:
def seed_preprocessing(seed):
  seed = seed.lower()
  seed_char = np.zeros((len(seed), len(characters)), dtype = bool)
  for i, char in enumerate(seed):
    seed_char[i, char_to_index(char)] = 1
  return seed_char

In [24]:
def generate_character(desired_length, seed):
  generated = seed  # Start with the seed text
  seed_char = seed_preprocessing(seed)
  # Pad or trim seed_char to shape (1, sequence_length, len(characters))
  if len(seed) < sequence_length:
      pad = np.zeros((sequence_length - len(seed), len(characters)), dtype=bool)
      input_seq = np.concatenate([pad, seed_char], axis=0)
  elif len(seed) > sequence_length:
      input_seq = seed_char[-sequence_length:]
  else:
      input_seq = seed_char

  input_seq = input_seq.reshape(1, sequence_length, len(characters))

  for i in range(desired_length):  # desired_length = number of characters to generate
      preds = model.predict(input_seq)[0]  # shape: (len(characters),)
      next_index = sample(preds, temperature=0.8)
      next_char = index_to_char(next_index)
      generated += next_char

      # Update input_seq: shift left, add new char at end
      new_input = np.zeros((1, sequence_length, len(characters)), dtype=bool)
      new_input[0, :-1, :] = input_seq[0, 1:, :]
      new_input[0, -1, next_index] = 1
      input_seq = new_input

  return generated

In [25]:
seed = input("Type your Shakespearean seed: ")
desired_length = int(input("length of generated characters: "))

print(generate_character(desired_length, seed))

Type your Shakespearean seed: When shall we three meet again
length of generated characters: 100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[