In [None]:
import tensorflow as tf

There are three built-in RNN cells, each of them corresponding to the matching RNN layer.

tf.keras.layers.SimpleRNNCell corresponds to the SimpleRNN layer.

tf.keras.layers.GRUCell corresponds to the GRU layer.

tf.keras.layers.LSTMCell corresponds to the LSTM layer.

# Text Classification

## FNN

In [None]:
import tensorflow_datasets as tfds

# Carve the IMDB training split 60/40 into train/validation and keep the
# official test split. as_supervised=True makes every element a
# (text, label) pair instead of a feature dict.
train_data, validation_data, test_data = tfds.load(
    name="imdb_reviews", 
    split=('train[:60%]', 'train[60%:]', 'test'),
    as_supervised=True)

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…







HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAQB5QT/imdb_reviews-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAQB5QT/imdb_reviews-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=25000.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteAQB5QT/imdb_reviews-unsupervised.tfrecord


HBox(children=(FloatProgress(value=0.0, max=50000.0), HTML(value='')))

[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Cap the vocabulary at 10,000 tokens and pad/truncate every review to
# exactly 100 token ids so that batches have a fixed shape.
vocabulary_size = 10000
sentence_size = 100
vectorize_layer = TextVectorization(
    max_tokens=vocabulary_size,
    output_sequence_length=sentence_size,
)

In [None]:
# The vectorizer builds its vocabulary from raw text only, so drop the labels
# before adapting it to the training reviews.
train_text = train_data.map(lambda text, _label: text)
vectorize_layer.adapt(train_text)

Using TensorFlow Hub for RNN architectures is not straightforward, because a Hub text-embedding layer returns a single document vector, whereas an RNN needs one embedding per word as input. There are 2 solutions:



1.   Tokenize the words and only then transform them separately through a HUB layer, then reshape the resulting tensor to put the words in a sentence together (tricky)
2.   Load pre-trained embeddings and create the embedding matrix yourself

Let's try approach #2 

Loading external pre-trained embeddings by hand is a bit tedious, but it gives us one vector per word. Let's try loading the GloVe embeddings with 100 dimensions:

In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip -q glove.6B.zip

--2020-10-23 08:50:36--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2020-10-23 08:50:37--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2020-10-23 08:50:37--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2020-1

In [None]:
import numpy as np

# Parse the GloVe text file into {word: 1-D float32 vector}.
# Every line has the form "<word> <v1> <v2> ... <vN>".
embeddings_index = {}
# The GloVe files are UTF-8 encoded; pass the encoding explicitly so parsing
# does not depend on the platform's default locale.
with open("glove.6B.100d.txt", encoding="utf-8") as f:
    for line in f:
        # Split only once: the first field is the word, the rest are numbers.
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, dtype="f", sep=" ")
        embeddings_index[word] = coefs
len(embeddings_index)

400000

In [None]:
# Sanity check: show the pre-trained vector stored for one known word.
embeddings_index["germany"]

array([ 0.62089 ,  0.71051 ,  0.49581 ,  0.2182  ,  0.18127 , -0.92589 ,
       -0.30954 , -0.15834 , -0.07074 , -0.084606,  0.6675  , -0.31942 ,
       -0.66679 ,  0.96027 ,  0.14904 ,  0.4003  ,  0.6387  , -0.74965 ,
       -0.98889 , -0.1133  ,  0.32911 ,  0.97882 ,  0.3331  ,  1.3617  ,
       -0.14774 , -1.4021  ,  0.29626 ,  0.13054 ,  1.1928  ,  0.076387,
       -0.7157  ,  0.13113 , -0.50277 , -0.37225 ,  0.61614 ,  0.29827 ,
        0.88864 , -0.34603 , -0.86274 , -0.42865 , -0.50206 , -0.65342 ,
        0.50815 ,  0.7233  , -0.12165 , -0.4388  ,  1.5427  ,  0.01085 ,
        0.32925 , -0.50504 ,  0.42031 ,  0.2242  , -0.40981 ,  1.1989  ,
       -0.88256 , -2.2866  , -0.20654 ,  0.01268 ,  0.91281 ,  0.21769 ,
        1.2248  ,  0.0864  , -0.14858 ,  0.003622,  0.62902 ,  0.29895 ,
       -0.79931 ,  1.2537  ,  0.16826 , -0.063013, -0.058029, -0.73259 ,
       -0.53952 ,  0.16578 ,  0.47429 ,  0.48791 ,  0.099387, -0.68582 ,
       -0.32279 ,  0.32005 ,  1.0429  ,  0.053037, 

Now, let's pair up the vectorized training data with the embeddings. 

We need to prepare a corresponding embedding matrix that we can use in a Keras Embedding layer. It's a simple NumPy matrix where the entry at index i is the pre-trained vector for the word with index i in our vectorizer's vocabulary.

In [None]:
# Map each token of the vectorizer's vocabulary to its integer id, i.e. its
# position in the vocabulary list.
vocabulary = vectorize_layer.get_vocabulary()
word_index = {token: idx for idx, token in enumerate(vocabulary)}
word_index["germany"]

2112

In [None]:
# Build the (num_tokens, embedding_dim) matrix whose row i holds the GloVe
# vector of the vocabulary token with id i.
embedding_dim = 100
num_tokens = len(vocabulary) + 2 # 2 tokens: 1 for unknown and 1 for padding!!!
embedding_matrix = np.zeros((num_tokens, embedding_dim))

hits = misses = 0
for token, row in word_index.items():
    glove_vector = embeddings_index.get(token)
    if glove_vector is None:
        # Tokens without a GloVe vector keep their all-zeros row.
        # This includes the representation for "padding" and "OOV".
        misses += 1
        continue
    embedding_matrix[row] = glove_vector
    hits += 1
print(hits, misses)

9676 324


In [None]:
# Embedding layer initialised with the GloVe matrix built above.
# Only `tf` is imported in this notebook, so the initializer must be reached
# through `tf.keras` (a bare `keras` raises NameError on a fresh kernel).
# trainable=True lets training fine-tune the pre-trained vectors.
embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

Let's first try a FNN model

In [None]:
# Feed-forward baseline: average the word embeddings of a review into one
# vector, then classify it with a small dense head.
model = tf.keras.models.Sequential([
    vectorize_layer,  # raw strings -> fixed-length token ids
    embedding_layer,  # token ids -> word vectors
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])

In [None]:
# Binary sentiment task: sigmoid output + binary cross-entropy.
model.compile(
    loss='BinaryCrossentropy',
    metrics=['accuracy', 'Precision'],
    optimizer='adam',
)

In [None]:
# Batch both splits identically; the validation split is only evaluated
# during training.
train_batches = train_data.batch(512)
validation_batches = validation_data.batch(512)
history = model.fit(train_batches, epochs=10, validation_data=validation_batches)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Returns [loss, accuracy, precision] on the held-out test split, in the
# order given to compile().
model.evaluate(test_data.batch(512))



[0.410267174243927, 0.8120800256729126, 0.8238956928253174]

In [None]:
# The model contains the vectorizer, so it accepts raw strings directly.
sample_reviews = ["This movie was the worst I have ever seen"]
to_predict = tf.convert_to_tensor(np.array(sample_reviews))
model.predict(to_predict)

array([[0.34178954]], dtype=float32)

In [None]:
# Threshold the sigmoid scores at 0.5 and compare with the true labels.
predicted = model.predict(test_data.batch(512))
pred_class = np.where(predicted.ravel() <= 0.5, 0, 1).tolist()
test_class = [label.numpy() for _text, label in test_data]
# Rows are the true classes, columns the predicted classes.
tf.math.confusion_matrix(test_class, pred_class)

<tf.Tensor: shape=(2, 2), dtype=int32, numpy=
array([[10300,  2200],
       [ 2524,  9976]], dtype=int32)>

In [None]:
# Nearly all parameters live in the embedding table (num_tokens x embedding_dim).
model.summary()

Model: "sequential_22"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization_8 (TextVe (None, 100)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 100, 100)          1000200   
_________________________________________________________________
global_average_pooling1d_7 ( (None, 100)               0         
_________________________________________________________________
dense_9 (Dense)              (None, 16)                1616      
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 17        
Total params: 1,001,833
Trainable params: 1,001,833
Non-trainable params: 0
_________________________________________________________________


## RNNs

Let's unleash the RNN power

In [None]:
# Build a fresh Embedding layer for this model. Reusing `embedding_layer`
# would share its weights with the FNN trained above, i.e. this model would
# start from embeddings that were already fine-tuned for 10 epochs on the
# training set, which confounds the comparison between the two models.
bilstm_embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

# Single bidirectional LSTM that reads the review in both directions and
# returns only its final state, followed by the same dense head as the FNN.
model = tf.keras.models.Sequential()
model.add(vectorize_layer)
model.add(bilstm_embedding_layer)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
# Same training configuration as the FNN so the two runs are comparable.
model.compile(
    loss='BinaryCrossentropy',
    metrics=['accuracy', 'Precision'],
    optimizer='adam',
)

In [None]:
# Same batching and epoch budget as the FNN run.
train_batches = train_data.batch(512)
validation_batches = validation_data.batch(512)
history = model.fit(train_batches, epochs=10, validation_data=validation_batches)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# [loss, accuracy, precision] of the BiLSTM on the test split.
model.evaluate(test_data.batch(512))



[0.6750153303146362, 0.7986000180244446, 0.826181948184967]

Model overfits more than previous model! What can we do? 


*   Early stopping
*   Get more data
*   Use dropout



In [None]:
# Fresh Embedding layer: reusing `embedding_layer` would start this model
# from weights already fine-tuned by the models trained earlier in the
# notebook, so the effect of dropout / early stopping could not be isolated.
stacked_embedding_layer = tf.keras.layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=True,
)

# Two stacked bidirectional LSTMs. The first one must return the full
# sequence (return_sequences=True) so the second one has a sequence to read;
# dropout=0.5 is applied to the inputs of the first LSTM.
model = tf.keras.models.Sequential()
model.add(vectorize_layer)
model.add(stacked_embedding_layer)
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100, dropout=0.5, return_sequences=True)))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(100)))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1, activation="sigmoid"))

In [None]:
model.compile(
    loss='BinaryCrossentropy',
    metrics=['accuracy', 'Precision'],
    optimizer='adam',
)

# Stop once the monitored validation metric has not improved for 3 epochs
# and roll back to the best weights seen so far.
early_stopping_cb = tf.keras.callbacks.EarlyStopping(
    patience=3,
    restore_best_weights=True,
)
train_batches = train_data.batch(512)
validation_batches = validation_data.batch(512)
history = model.fit(
    train_batches,
    epochs=10,
    validation_data=validation_batches,
    callbacks=[early_stopping_cb],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10


In [None]:
# [loss, accuracy, precision] of the stacked BiLSTM on the test split.
model.evaluate(test_data.batch(512))



[0.5880893468856812, 0.7843199968338013, 0.8612523078918457]

The dataset is actually too small for LSTM to be of any advantage
compared to simpler, much faster methods. 

**Tip** The model should learn that the padding tokens are to be ignored. Help the model from the beginning and tell it to ignore the padded positions (token id 0). This process is called **masking** and can be achieved by adding the parameter *mask_zero=True* to the Embedding layer.

# Text Generation

Recommendation: Train with GPU! Runtime > Change runtime type > Hardware accelerator > GPU.

In [None]:
# Colab-only: mount Google Drive; the corpus is read from it and the
# training checkpoints are written to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Project root on the mounted Drive; the Data/ and Logs/ paths used below are
# built relative to it. Adjust this when running outside Colab.
path = "/content/drive/My Drive/" + "DL-NLP/"

In [None]:
def _clean_line(raw_line):
    """Return one raw line of the corpus as plain ASCII text.

    The line is stripped, backslash escapes are interpreted, every non-ASCII
    character is dropped, "/" becomes a space and any remaining backslash is
    removed.
    """
    text = raw_line.strip().decode('unicode_escape')
    # Decode back to str instead of calling str() on the bytes object: str()
    # yields the repr ("b'...'"), which previously had to be patched away with
    # string replacements that missed some quote combinations.
    text = text.encode('ascii', 'ignore').decode('ascii')
    return text.replace("/", " ").replace("\\", "")


# Read the whole book as a single string, with lines joined by one space.
# str.join avoids the quadratic cost of repeated `+=` on a growing string.
with open(path + "Data/game_of_thrones.txt", "rb") as file:
    got_text = " ".join(_clean_line(raw_line) for raw_line in file)

In [None]:
# Character-level vocabulary: every distinct character of the corpus, sorted
# so that the char <-> id mapping is deterministic.
vocab = sorted(set(got_text))
print(len(vocab))

84


In [None]:
import numpy as np

# Two-way lookup between characters and integer ids; a character's id is its
# position in the sorted vocabulary.
char2id = dict(zip(vocab, range(len(vocab))))
id2char = dict(enumerate(vocab))

# Encode the whole corpus as one long id array and stream it one id at a time.
texts_as_ints = np.array([char2id[ch] for ch in got_text])
data = tf.data.Dataset.from_tensor_slices(texts_as_ints)

In [None]:
# Sanity check: decode the first five ids back to characters.
for char_id in data.take(5):
    print(id2char[char_id.numpy()])

A
 
G
a
m


In [None]:
def split_train_labels(sequence):
    """Split one chunk into an (input, target) pair shifted by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[t] is the element that follows
    input[t].
    """
    return sequence[:-1], sequence[1:]

seq_length = 100
# Chunks are seq_length + 1 ids long so that, after the one-step shift, both
# the input and the target contain exactly seq_length ids.
sequences = data.batch(batch_size=seq_length + 1, drop_remainder=True)
dataset = sequences.map(split_train_labels)

In [None]:
# Decode the first (input, target) pair to check the one-character shift.
for input_seq, output_seq in dataset.take(1):
    for label, ids in (("input:", input_seq), ("output:", output_seq)):
        print(label, "".join(id2char[i] for i in ids.numpy()))

input: A Game Of Thrones Book One of A Song of Ice and Fire By George R. R. Martin PROLOGUE "We should star
output:  Game Of Thrones Book One of A Song of Ice and Fire By George R. R. Martin PROLOGUE "We should start


We are going to use a **stateful RNN**. It preserves the final state after processing one training batch and uses it as the initial state for the next training batch (useful for long sequential texts like novels). In this case we need to specify the batch size when building the model.

This is very important so that the model can learn **long** sequences

In [None]:
vocab_size = len(vocab)
embedding_dim = 256
batch_size = 64

# Character-level language model: embedding -> stateful GRU -> one logit per
# vocabulary character at every time step.
# A stateful layer needs a fixed batch size, hence batch_input_shape.
# Note that the GRU width is set to seq_length (100 units).
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(
        vocab_size, embedding_dim, batch_input_shape=(batch_size, seq_length)),
    tf.keras.layers.GRU(
        seq_length,
        recurrent_initializer="glorot_uniform",
        recurrent_activation="sigmoid",
        stateful=True,
        return_sequences=True),
    tf.keras.layers.Dense(vocab_size),
])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, 100, 256)            21504     
_________________________________________________________________
gru (GRU)                    (64, 100, 100)            107400    
_________________________________________________________________
dense (Dense)                (64, 100, 84)             8484      
Total params: 137,388
Trainable params: 137,388
Non-trainable params: 0
_________________________________________________________________


We did not use a softmax at the end. It is more common to output the **logits** and sample the next character from them. For training and evaluation, the softmax is applied inside the loss function instead, by passing *from_logits=True*.

In [None]:
def loss(labels, logits):
    """Sparse categorical cross-entropy on raw logits.

    `labels` holds integer class ids and `logits` the unnormalised scores
    produced by the final Dense layer; from_logits=True makes the loss apply
    the softmax itself.
    """
    return tf.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True)


model.compile(loss=loss, optimizer="adam")

In [None]:
# Toy illustration on probabilities (no from_logits): the per-sample loss is
# small when the probability assigned to the true class is high.
y_true = [1, 2]
for y_pred in (
    [[0.05, 0.95, 0], [0.1, 0.8, 0.1]],
    [[0, 1., 0], [0, 0.01, 0.99]],
):
    loss_example = tf.losses.sparse_categorical_crossentropy(y_true, y_pred)
    print(loss_example.numpy())

[0.05129344 2.3025851 ]
[2.3841855e-07 1.0050405e-02]


In [None]:
# Weights-only checkpoints; Keras substitutes {epoch} in the file name when
# it saves.
checkpoint_dir = path + 'Logs/ckpt_{epoch}'
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    save_weights_only=True,
    filepath=checkpoint_dir,
)

Don't shuffle! A stateful RNN carries its state over from one batch to the next, so it assumes that the batches follow each other in sequential order.

In [None]:
# Batch with the same `batch_size` the stateful model was built with instead
# of repeating the literal 64: the model's batch dimension is fixed, so the
# two values must always agree. drop_remainder=True drops the last, smaller
# batch, which the fixed batch dimension could not accept.
# NOTE(review): batching consecutive sequences means row i of batch k+1 is
# sequence i + batch_size, not the continuation of row i of batch k, so the
# carried-over state does not line up with contiguous text. Confirm whether
# that is intended.
history = model.fit(dataset.batch(batch_size, drop_remainder=True), epochs=200, callbacks=[checkpoint_callback])

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

Reload the model, now with a batch input size of 1 <- We have to do this only because we used stateful RNN, which builds the model with the batch size already specified

In [None]:
batch_size = 1

# Same architecture as the training model (so the checkpointed weights fit),
# rebuilt for a single sequence per batch. The time dimension is declared as
# None because generation feeds sequences of varying length (the seed string
# first, then one character at a time) rather than fixed seq_length chunks;
# none of the layer weights depend on the sequence length.
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, batch_input_shape=(batch_size, None)))
# The GRU width must match the training model, which used seq_length units.
model.add(tf.keras.layers.GRU(seq_length, recurrent_initializer="glorot_uniform",
            recurrent_activation="sigmoid", stateful=True, return_sequences=True))
model.add(tf.keras.layers.Dense(vocab_size))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (1, 100, 256)             21504     
_________________________________________________________________
gru_4 (GRU)                  (1, 100, 100)             107400    
_________________________________________________________________
dense_4 (Dense)              (1, 100, 84)              8484      
Total params: 137,388
Trainable params: 137,388
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Restore the weights of the most recent training checkpoint.
latest_checkpoint = tf.train.latest_checkpoint(path + 'Logs/')
model.load_weights(latest_checkpoint)

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f178de96320>

In [None]:
# Seed text for generation, encoded with the training vocabulary.
# Note: the name `input` shadows the Python builtin; it is kept because the
# following cell reads it.
start_string = "Jon Snow and "
input = list(map(char2id.__getitem__, start_string))
input

[34, 69, 68, 0, 43, 68, 69, 77, 0, 55, 68, 58, 0]

**tf.expand_dims** adds one dimension to a tensor (or array by first converting it to tensor)

**tf.squeeze** removes dimensions of size 1 from a tensor

In [None]:
# Add a leading batch dimension: the model expects (batch, time) input.
input_tensor = tf.expand_dims(input, 0)

In [None]:
# The model is stateful, so every call continues from the state left by the
# previous call. Reset it before priming with the seed string so that
# re-running this cell does not feed the seed on top of stale state.
model.reset_states()
predictions = model(input_tensor)

In [None]:
# Remove the batch dimension: (1, time, vocab) -> (time, vocab), i.e. one row
# of logits per character of the seed string.
predictions = tf.squeeze(predictions, 0)
predictions.shape

TensorShape([13, 84])

Sample from the logits using the [categorical distribution](https://en.wikipedia.org/wiki/Categorical_distribution). 

Temperature is a hyperparameter that controls the text generation process. 

A temperature close to 0 will favor high-probability characters (the same words will be repeated often), while a temperature close to 1 keeps the model's original, more spread-out probabilities (more creative, but often less coherent text).

In [None]:
temperature = 0.3
# Dividing the logits by a temperature below 1 sharpens the distribution.
scaled_logits = predictions / temperature
# One id is drawn per time step; only the draw for the last time step is the
# prediction for the character that follows the seed.
predicted_id = tf.random.categorical(scaled_logits, num_samples=1)[-1, 0].numpy()
predicted_id

74

In [None]:
# Start the output with the seed plus the first sampled character; from here
# on the model is fed only its own last prediction.
text_generated = start_string + id2char[predicted_id]
input_tensor = tf.expand_dims([predicted_id], 0)

In [None]:
# Generate 1000 more characters, one per step. The stateful GRU carries the
# context between calls, so each step feeds only the last sampled character.
generated_chars = []
for _step in range(1000):
    predictions = tf.squeeze(model(input_tensor), 0)
    predicted_id = tf.random.categorical(predictions / temperature, num_samples=1)[-1, 0].numpy()
    generated_chars.append(id2char[predicted_id])
    input_tensor = tf.expand_dims([predicted_id], 0)
text_generated += "".join(generated_chars)


In [None]:
# Show the seed string followed by everything the model generated.
print(text_generated)

Jon Snow and the start and son was a same blood of the court, and sure a man was a great stallion of the Swords made the son, Lord of House Stark were a son, the grass are a company that was a singers of the Seven Kings Sea of the Kings Lannister had grown as the dragons and the sight of the commanders and seemed to make the sight to the son of the Kingsguard was a son, and the wall and counted and son and the rangers and castle of the Lord of Harry, the day and could not have been a command had never called THE SNES THE STONE, called THE GOOTH BRAX, SER ERIGON FREY, Lord of Light Selmmote and started to the castle of the Dornish of the Kingsguard Stannis was the stream. I would be a man of the Kingsguard, and I could have seen the start and spearmen and she had seen the chain and realized his could not make a ground of the son, the stone sword of the Kingsguard could sing the companions of the Seven Kingdoms, and come and son, the lords of the Kingsguard was the crown more than the ca