In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Deep Neural Networks 
## Shakespeare 

## Text Generation using RNN
<img src='../../prasami_images/prasami_color_tutorials_small.png' style = 'width:400px;' alt="By Pramod Sharma : pramod.sharma@prasami.com" align="left"/>

### Import TensorFlow and other libraries

In [5]:
# Let's import some libraries
import os
import time
import datetime
import numpy as np
import pandas as pd
import sys
sys.path.insert(0, r'G:\My Drive\DNN')

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt

import tensorflow as tf

%matplotlib inline

from utils.helper import fn_plot_tf_hist, fn_plot_confusion_matrix

In [6]:
def fn_verify_dir(_path):
    '''
    Make sure something exists at `_path`, creating a directory
    (including any missing parent directories) when nothing is there.

    Arg:
        _path: path of the directory to verify
    returns:
        None. A message is printed telling whether the path already
        existed or a folder was created.
    '''
    if os.path.exists(_path): # path already present. Maybe a file or a folder
        
        print(_path, ' exists') # advise the user
        
    else:
        
        # exist_ok=True: do not fail if the folder gets created by someone else
        # between the existence check above and this call
        os.makedirs(_path, exist_ok=True) # create the path
        
        print("Created folder : ", _path)

In [7]:
# Some basic parameters

inpDir = '../input' # location where input data is stored
outDir = '../output' # location to store outputs
modelDir = './models' # location to store models
subDir = 'text_gen' # sub-folder for this exercise; joined with inpDir (data) and modelDir (checkpoints)


RANDOM_STATE = 24 # for initialization ----- REMEMBER: to remove at the time of promotion to production

np.random.seed(RANDOM_STATE) # Set Random Seed for reproducible  results

# NumPy's seed does not cover TensorFlow: dataset shuffling and
# tf.random.categorical sampling draw from TensorFlow's own generator
tf.random.set_seed(RANDOM_STATE)

BATCH_SIZE = 64

EPOCHS = 50 # number of cycles to run

ALPHA = 0.1 # learning rate -- NOTE(review): not referenced in the visible cells (model is compiled with optimizer='adam'); confirm before relying on it

In [8]:
physical_devices = tf.config.list_physical_devices('GPU')

# Enable memory growth on every listed GPU, not just the first one:
# TensorFlow requires the memory-growth setting to be the same on all GPUs,
# so configuring only device 0 fails on multi-GPU machines.
# With no GPU present the loop simply does nothing.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)

### Shakespeare dataset

In [9]:
filePath = os.path.join(inpDir, subDir, 'shakespeare.txt')
filePath

'../input\\text_gen\\shakespeare.txt'

In [10]:
# Read the raw bytes and decode them as UTF-8. The context manager
# guarantees the file handle is closed once the content has been read.
with open(filePath, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# alternative: tf.io.read_file(filePath).numpy().decode(encoding='utf-8')

len(text)

1115395

In [8]:
#text

In [12]:
print(text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [13]:
vocab = sorted(set(text))
len(vocab)

65

In [15]:
vocab

['\n',
 ' ',
 '!',
 '$',
 '&',
 "'",
 ',',
 '-',
 '.',
 '3',
 ':',
 ';',
 '?',
 'A',
 'B',
 'C',
 'D',
 'E',
 'F',
 'G',
 'H',
 'I',
 'J',
 'K',
 'L',
 'M',
 'N',
 'O',
 'P',
 'Q',
 'R',
 'S',
 'T',
 'U',
 'V',
 'W',
 'X',
 'Y',
 'Z',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

In [16]:
# character -> integer id, ids follow the position in the sorted vocabulary
char2idx = dict(zip(vocab, range(len(vocab))))

# integer id -> character; an array so it can be indexed with arrays of ids
idx2char = np.array(vocab)

# the whole corpus encoded as integer ids, one per character
text_as_int = np.array([char2idx[ch] for ch in text])

text_as_int.shape

(1115395,)

In [24]:
np.unique(text_as_int)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64])

In [25]:
type(text_as_int)

numpy.ndarray

In [26]:
text_as_int.shape

(1115395,)

In [30]:
idx2char[23]

'K'

In [28]:
char2idx

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [31]:
dataset = tf.data.Dataset.from_tensor_slices([1.,2.,3.])

print (list(dataset.as_numpy_iterator()))

[1.0, 2.0, 3.0]


In [32]:
seq_length = 100

example_per_epoch = len(text) // (seq_length+1)

char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

for i in char_dataset.take(10):
    
    print (i.numpy(), '|', idx2char[i.numpy()])

18 | F
47 | i
56 | r
57 | s
58 | t
1 |  
15 | C
47 | i
58 | t
47 | i


In [33]:
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

for item in sequences.take(2):
    
    print (item)

tf.Tensor(
[18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 14 43 44 53 56 43  1 61 43
  1 54 56 53 41 43 43 42  1 39 52 63  1 44 59 56 58 46 43 56  6  1 46 43
 39 56  1 51 43  1 57 54 43 39 49  8  0  0 13 50 50 10  0 31 54 43 39 49
  6  1 57 54 43 39 49  8  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10
  0 37 53 59  1], shape=(101,), dtype=int32)
tf.Tensor(
[39 56 43  1 39 50 50  1 56 43 57 53 50 60 43 42  1 56 39 58 46 43 56  1
 58 53  1 42 47 43  1 58 46 39 52  1 58 53  1 44 39 51 47 57 46 12  0  0
 13 50 50 10  0 30 43 57 53 50 60 43 42  8  1 56 43 57 53 50 60 43 42  8
  0  0 18 47 56 57 58  1 15 47 58 47 64 43 52 10  0 18 47 56 57 58  6  1
 63 53 59  1 49], shape=(101,), dtype=int32)


In [34]:
for item in sequences.take(2):
    
    print (repr( ''.join(idx2char[item.numpy()] ) ) )

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'


In [35]:
def split_input_target(chunk):
    '''
    Split one chunk into an (input, target) pair shifted by one position.

    Arg:
        chunk: sliceable sequence (e.g. a tensor of character ids)
    returns:
        tuple (input_text, target_text): `chunk` without its last element,
        and `chunk` without its first element
    '''
    # target at position t is the element that follows input position t
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [36]:
for inp_ex, tar_ex in dataset.take (2):
    print (repr( ''.join(idx2char[inp_ex.numpy()] ) ))
    print (repr( ''.join(idx2char[tar_ex.numpy()] ) ))
    print ('*'*50, '\n')

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'
'irst Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou '
************************************************** 

'are all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you '
're all resolved rather to die than to famish?\n\nAll:\nResolved. resolved.\n\nFirst Citizen:\nFirst, you k'
************************************************** 



In [37]:
BUFFER_SIZE = 10000

# Build the training pipeline straight from `sequences` rather than
# re-assigning `dataset` from itself: the cell can then be re-run safely
# without batching an already batched dataset.
dataset = (sequences
           .map(split_input_target)
           .shuffle(BUFFER_SIZE)
           .batch(BATCH_SIZE, drop_remainder=True))

dataset

<_BatchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int32, name=None), TensorSpec(shape=(64, 100), dtype=tf.int32, name=None))>

In [38]:
vocab_size = len(vocab)

embedding_dim = 256

rnn_units = 1024


In [39]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    '''
    Assemble the network: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: number of distinct ids; size of the embedding table
                    and of the Dense output
        embedding_dim: width of the embedding vectors
        rnn_units: number of GRU units
        batch_size: batch size fixed into the input shape
                    (the sequence length is left as None)
    Returns:
        an uncompiled tf.keras Sequential model
    '''
    # input layer: ids -> dense vectors, input shape [batch_size, None]
    embedding_layer = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )

    # recurrent layer: emits an output at every time step and is stateful
    recurrent_layer = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )

    # output layer: one value per vocabulary entry, no activation specified
    output_layer = tf.keras.layers.Dense(vocab_size)

    return tf.keras.models.Sequential([embedding_layer,
                                       recurrent_layer,
                                       output_layer])

In [40]:
# vocab_size, embedding_dim, rnn_units, batch_size
model = build_model(vocab_size= len(vocab), 
                    embedding_dim=embedding_dim, 
                    rnn_units = rnn_units,
                    batch_size= BATCH_SIZE)




In [41]:
for input_ex_batch, target_ex_batch in dataset.take(1):
    ex_batch_pred = model(input_ex_batch)

In [42]:
ex_batch_pred.shape

TensorShape([64, 100, 65])

In [43]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (64, None, 256)           16640     
                                                                 
 gru (GRU)                   (64, None, 1024)          3938304   
                                                                 
 dense (Dense)               (64, None, 65)            66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [44]:
sampled_indices = tf.random.categorical(ex_batch_pred[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

sampled_indices

array([27, 37,  0, 11, 44, 54, 17,  0, 18, 61, 22, 42, 53, 41, 11, 37, 32,
       16, 28, 29, 47, 45, 53, 41,  5, 22, 43,  2, 45, 36,  7, 22,  2, 52,
       36,  5, 35, 21, 48, 16, 45, 51, 30, 22, 37, 21,  3,  1, 45, 41, 16,
        1, 38, 33, 21,  2,  1, 64, 49, 11, 43, 14, 46, 34, 23, 30, 32, 61,
       39,  1, 44, 58,  7, 41,  9, 19, 28, 54,  2, 57, 44, 16,  1, 62, 12,
        9, 53, 45, 39, 45, 62, 10, 41, 42, 11, 42, 25, 50,  9, 25],
      dtype=int64)

In [45]:
loss_fn = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [46]:
model.compile(optimizer = 'adam', loss=loss_fn)




In [47]:
# folder holding the checkpoints of this exercise
chkPtPath = os.path.join(modelDir, subDir)

# file name pattern; '{epoch}' is a placeholder left for the callback
chkPtPrefix = os.path.join(chkPtPath, 'chkpt_{epoch}')

# weights only are written (save_weights_only=True), not the full model
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=chkPtPrefix,
    save_weights_only=True,
)

In [48]:
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Epoch 1/50

Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50

KeyboardInterrupt: 

In [None]:
tf.train.latest_checkpoint(chkPtPath)

'../models/text_gen/chkpt_50'

In [None]:
# Same architecture, rebuilt with batch size 1 so single sequences can be fed
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# tf.train.latest_checkpoint returns None when the folder holds no checkpoint;
# fail with a clear message instead of handing None to load_weights
latest_chkpt = tf.train.latest_checkpoint(chkPtPath)

if latest_chkpt is None:
    raise FileNotFoundError(f'No checkpoint found in {chkPtPath}; train the model first')

model.load_weights(latest_chkpt)

model.build ( tf.TensorShape ( [1, None ] ) )

In [None]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (1, None, 256)            16640     
                                                                 
 gru_1 (GRU)                 (1, None, 1024)           3938304   
                                                                 
 dense_1 (Dense)             (1, None, 65)             66625     
                                                                 
Total params: 4021569 (15.34 MB)
Trainable params: 4021569 (15.34 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    '''
    Generate text one character at a time, seeded with `start_string`.

    Args:
        model: trained model returning per-time-step scores over the
               vocabulary; it is fed a single sequence (batch dimension 1)
               and must provide `reset_states()`
        start_string: non-empty seed text; every character must be a key
                      of the global `char2idx`
        num_generate: number of characters to sample (default 1000)
        temperature: positive value the scores are divided by before
                     sampling; 1.0 (default) leaves them unchanged
    Returns:
        `start_string` followed by the generated characters
    Raises:
        ValueError: if `start_string` is empty or `temperature` is not positive
        KeyError: if `start_string` holds a character missing from `char2idx`
    '''
    if not start_string:
        raise ValueError('start_string must contain at least one character')

    if temperature <= 0:
        raise ValueError('temperature must be a positive number')

    input_eval = [char2idx[s] for s in start_string] # characters -> integer ids
    print (f'Input: {start_string} | {input_eval}\n')
    input_eval = tf.expand_dims(input_eval, 0) # add batch axis: shape (1, len(start_string))
    text_generated = []

    # start from a clean recurrent state
    model.reset_states()

    for _ in range(num_generate):

        predictions = model(input_eval)
        predictions = tf.squeeze(predictions, 0) / temperature # drop the batch axis, then scale

        # sample an id for every time step and keep the one of the last step
        predicted_id = tf.random.categorical(predictions,
                                             num_samples=1)[-1,0].numpy()

        # the sampled id is the only input of the next step; the recurrent
        # state kept by the model carries the context
        input_eval = tf.expand_dims([predicted_id], 0)
        text_generated.append(idx2char[predicted_id])

    return start_string+''.join(text_generated)

In [None]:
print (generate_text(model, start_string=u'ROMEO:'))

Input: ROMEO: | [30, 27, 25, 17, 27, 10]

ROMEO:
Now the prince confirm mother,
Gentle Margaret
Must be the first scad be our language.

SLY:
Ay, gentlemen, I'll say he lives.

SEBASTIAN:
Ay, but the lam which ear
That I should sweet as dear as you have dark'd not his.

WERMIO:
Why, sir, my wither, on to the spire
Once more her round and free lord,
To sets a king, and with our humanes
And the unmandfather: when, Duke of Northamptance, as it were, an impose
One that secure all spake in safety.
Hath yet the hearets with his heavy fild.

HENRY BOLINGBROKE:
My queen, see how I leave good distress'd
They are no less, I will deserve the truth; and if I had frowns,
And quit at my heart with unscands:
Let me give sunderstand thee well: he's not to be patient?
If not thy shaughters of slaughter, you have your honour mind.

LADY GREY:
Why, then,
Divords! Well, well met, must be amended:
I wold we meet, and think'st it well.
What says Moes, the which strength and need love them,
And gentle Richar