<a href="https://colab.research.google.com/github/Madeira-International-Workshop-in-ML/2022_day_3/blob/main/GRU_shakespeare_words_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
"""
Import the libraries
"""

# to develop the nn
import tensorflow as tf
from tensorflow.keras import layers

import numpy as np # data manipulation
import random
# NOTE(review): the value returned here is a device-context object (it is what
# the cell displays) and it is never entered with a `with` block, so this line
# presumably does not pin later operations to the GPU — TODO confirm against
# the tf.device documentation
tf.device('gpu')

<tensorflow.python.eager.context._EagerDeviceContext at 0x7f32c358b370>

In [2]:
"""
Download the Shakespeare dataset
"""

# download the corpus and keep the value returned by get_file, which the next
# cell opens as a local file path
path = tf.keras.utils.get_file(
    fname='shakespeare.txt',
    origin='https://storage.googleapis.com/download.tensorflow.org/data/shakespeare.txt',
)

In [3]:
"""
Read the data
"""

# Read the raw bytes, then decode them explicitly as UTF-8 (py2 compat); the
# context manager closes the file handle as soon as the content is in memory
with open(path, 'rb') as corpusFile:
  text = corpusFile.read().decode(encoding='utf-8')

# length of text is the number of characters in it
print(f'Length of text: {len(text)} characters')
vocab = sorted(set(text)) # every distinct character, in sorted order
print(f'{len(vocab)} unique characters') # the unique characters in the file
print(f'All unique characters: {vocab}') # all unique characters in the file

Length of text: 1115394 characters
65 unique characters
All unique characters: ['\n', ' ', '!', '$', '&', "'", ',', '-', '.', '3', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [4]:
"""
Data Pre-Processing
"""

# slide a window of 60 characters over the text, moving it 3 characters at a
# time: each window is one input sequence and the character that follows it
# is the target to predict
maxlen = 60 # length of every extracted sequence
step = 3 # number of characters between the starts of consecutive windows
# the extracted sequences
sentences = [
	text[start:start + maxlen]
	for start in range(0, len(text) - maxlen, step)
]
# the targets: the character right after each sequence
nextChars = [
	text[start + maxlen]
	for start in range(0, len(text) - maxlen, step)
]

In [5]:
# before training, convert the strings to a numerical representation using
# vectorization
chars = sorted(set(text)) # find all different characters
vocabLen = len(chars)
# assign a number to identify each character: its position in the sorted list
# (enumerate yields that position directly, so no list search is needed for
# every character)
charIndices = {char: index for index, char in enumerate(chars)}
print (f'Index for all characters: {charIndices}')

Index for all characters: {'\n': 0, ' ': 1, '!': 2, '$': 3, '&': 4, "'": 5, ',': 6, '-': 7, '.': 8, '3': 9, ':': 10, ';': 11, '?': 12, 'A': 13, 'B': 14, 'C': 15, 'D': 16, 'E': 17, 'F': 18, 'G': 19, 'H': 20, 'I': 21, 'J': 22, 'K': 23, 'L': 24, 'M': 25, 'N': 26, 'O': 27, 'P': 28, 'Q': 29, 'R': 30, 'S': 31, 'T': 32, 'U': 33, 'V': 34, 'W': 35, 'X': 36, 'Y': 37, 'Z': 38, 'a': 39, 'b': 40, 'c': 41, 'd': 42, 'e': 43, 'f': 44, 'g': 45, 'h': 46, 'i': 47, 'j': 48, 'k': 49, 'l': 50, 'm': 51, 'n': 52, 'o': 53, 'p': 54, 'q': 55, 'r': 56, 's': 57, 't': 58, 'u': 59, 'v': 60, 'w': 61, 'x': 62, 'y': 63, 'z': 64}


In [6]:
# one-hot encode the dataset:
#   x[sequence, position, character] marks the character found at each position
#   y[sequence, character] marks the character that follows the sequence
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for row, (window, target) in enumerate(zip(sentences, nextChars)):
  y[row, charIndices[target]] = True
  for position, symbol in enumerate(window):
    x[row, position, charIndices[symbol]] = True

In [7]:
"""
Building the nn
"""

# specify the model: a GRU layer with 128 units receives the one-hot encoded
# sequence, and a softmax layer outputs one probability per character
model = tf.keras.models.Sequential([
    layers.GRU(128, input_shape=(maxlen, vocabLen)),
    layers.Dense(vocabLen, activation="softmax"),
])

# compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy")

# check the model
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 gru (GRU)                   (None, 128)               74880     
                                                                 
 dense (Dense)               (None, 65)                8385      
                                                                 
Total params: 83,265
Trainable params: 83,265
Non-trainable params: 0
_________________________________________________________________


In [9]:
# train the model: one epoch over the whole dataset, in batches of 128 samples
model.fit(x=x, y=y, batch_size=128, epochs=1)



<keras.callbacks.History at 0x7f32b9196050>

In [10]:
"""
Sample the text characters according to the reweighted distribution
"""

# to run on GPU: Runtime->Change runtime type->Hardware Accelerator->GPU
# para correr em GPU: Tempo de execução->Alterar tipo de tempo de execução
# ->Acelerador de hardware->GPU

# estimate the new character to include in the sentence
def sample(preds, temperature=1.0):
  """Draw the index of the next character from the predicted distribution.

  Args:
    preds: non-negative scores, one per character. They are normalized here,
      so they do not need to sum to exactly 1.
    temperature: strictly positive reweighting factor. 1.0 (the default)
      samples from `preds` unchanged; values below 1 concentrate the
      probability on the most likely characters, values above 1 flatten the
      distribution.

  Returns:
    The integer index of the sampled character.

  Raises:
    ValueError: if `temperature` is not strictly positive.
  """
  if temperature <= 0:
    raise ValueError('temperature must be strictly positive')
  # work in float64 so that the normalized probabilities sum to 1 closely
  # enough for the multinomial draw
  preds = np.asarray(preds).astype('float64')
  if temperature != 1.0:
    # reweight in log space; log(0) = -inf is expected for impossible
    # characters and maps back to probability 0, so silence that warning
    with np.errstate(divide='ignore'):
      logits = np.log(preds) / temperature
    # subtract the maximum before exponentiating to avoid overflow/underflow
    preds = np.exp(logits - np.max(logits))
  # determine the probability for each character to be the next to be selected
  preds = preds / np.sum(preds)
  # sample randomly once from a multinomial distribution characterized by the
  # probability of each character to be selected next, in order to create
  # diversity in the words and avoid the model getting stuck in a repetitive
  # sequence with very few characters
  probas = np.random.multinomial(1, preds, 1)
  return np.argmax(probas)

# randomly select a part of the text to work as test dataset (the seed)
startIndex = random.randint(0, len(text) - maxlen - 1)
seedText = text[startIndex: startIndex + maxlen]
print('Seed to generate text: "' + seedText + '"')
fullSentence = seedText
for i in range(400): # estimate 400 characters after the seed
  # one-hot encode the current window (the test data) as a batch holding a
  # single sequence; float32 is used because it is Keras' default float type
  sampled = np.zeros((1, maxlen, len(chars)), dtype=np.float32)
  for t, char in enumerate(seedText):
      sampled[0, t, charIndices[char]] = 1.
  # predict one new character; the model is called directly because
  # model.predict is meant for large batched inputs and adds a noticeable
  # overhead when it is invoked once per generated character
  preds = np.asarray(model(sampled, training=False))[0]
  nextIndex = sample(preds)
  nextChar = chars[nextIndex]
  # slide the window: drop the oldest character and append the new one, so the
  # input keeps the same size
  seedText = seedText[1:] + nextChar
  fullSentence += nextChar # save the full sentence
print('\nGenerated text: "' + fullSentence + '"')

Seed to generate text: "tent to say it was for his country he did it to
please his m"

Generated text: "tent to say it was for his country he did it to
please his myanged'd, my sincever
Will a wartand pikery onsternates?

KING RICHARD IIA:
Gross dowald oo how abliny
upon you say upon the seech me gopprous in his diest unstacio sham he
thou word'g farsely faw an your, im you
Morrave and he, guat, curpingan my
plorde;
Wis mauther enat not I dety soncy; you and the neal not, I doom to brong as the long and suppresss,
And meess of cur afvence:
Im you have so out"


In [11]:
"""
Repeat with famous sentence
"""

# Famous sentence
StartingText = "To be, or not to be: that is the question. Cruel to be kind."

# only the first maxlen characters are used as the seed
seedText = StartingText[:maxlen]
print('Seed to generate text: "' + seedText + '"')
fullSentence = seedText
for i in range(400): # estimate 400 characters after the seed
  # one-hot encode the current window (the test data) as a batch holding a
  # single sequence; float32 is used because it is Keras' default float type
  sampled = np.zeros((1, maxlen, len(chars)), dtype=np.float32)
  for t, char in enumerate(seedText):
      sampled[0, t, charIndices[char]] = 1.
  # predict one new character; the model is called directly because
  # model.predict is meant for large batched inputs and adds a noticeable
  # overhead when it is invoked once per generated character
  preds = np.asarray(model(sampled, training=False))[0]
  nextIndex = sample(preds)
  nextChar = chars[nextIndex]
  # slide the window: drop the oldest character and append the new one, so the
  # input keeps the same size
  seedText = seedText[1:] + nextChar
  fullSentence += nextChar # save the full sentence
print('\nGenerated text: "' + fullSentence + '"')

Seed to generate text: "To be, or not to be: that is the question. Cruel to be kind."

Generated text: "To be, or not to be: that is the question. Cruel to be kind. Thor fearsace rifelds of relive me?

QUEEN ELIZABETH:
If a reatonsther! broghing hather ny becrousongry dostwers, din sous now Marker you nother, sir a talinu to heal is thild bloody couss of heavenowhing theigh, be his, hen me pucted mine om heath,
And hears' cherepion tell was,
That you brough-.

CARIOLANA:
From! I'll, I hiver, I did to Marjunt,
Tould that he disted not mary gonele man swe coun"
