### Shakespearean Text

#### Creating a Dataset

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, callbacks
import numpy as np
import requests

In [None]:
# Download the Shakespeare corpus and cache it on disk.
r = requests.get("https://homl.info/shakespeare", timeout=60)
r.raise_for_status()  # fail loudly instead of saving an HTTP error page as the corpus
# The context manager closes the file deterministically (a bare open(...).write(...)
# leaves the handle for the garbage collector).
with open("shakespearean_text.txt", "wb") as f:
    n_bytes_written = f.write(r.content)
n_bytes_written

1115394

In [None]:
# Read the cached corpus back as text. The encoding is spelled out so the result
# does not depend on the platform's default locale.
with open("shakespearean_text.txt", encoding="utf-8") as f:
    shakespear_text = f.read()

In [None]:
# Character-level tokenizer: each distinct character becomes its own token.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# Build the character -> ID mapping from the corpus, passed as a single document.
tokenizer.fit_on_texts([shakespear_text])

In [None]:
# Sanity check: encode a short string into character IDs.
tokenizer.texts_to_sequences(["Hello"])

[[7, 2, 12, 12, 4]]

In [None]:
# Round trip: decode the IDs again. Note the decoded text comes back lowercase
# and with the characters separated by spaces.
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4]])

['h e l l o']

In [None]:
# Number of distinct characters known to the tokenizer. Used below as the
# one-hot depth and as the width of the output layer.
max_id = len(tokenizer.word_index)
# NOTE: dataset_size is not taken from tokenizer.document_count; it is derived
# from len(encoded_text) in a later cell.

In [None]:
# Display the number of distinct characters.
max_id

39

In [None]:
# Encode the whole corpus as one sequence of character IDs. The "- 1" shifts the
# tokenizer's IDs down by one so they fit the one-hot depth of max_id used later;
# decoding code must add 1 back before calling sequences_to_texts.
[encoded_text] = np.array(tokenizer.texts_to_sequences([shakespear_text])) - 1

In [None]:
# One ID per character of the corpus.
encoded_text.shape

(1115394,)

In [None]:
# Total number of encoded characters; the train/validation split is computed from it.
dataset_size = len(encoded_text)
dataset_size

1115394

#### Splitting the Dataset

In [None]:
# Sequential 90% / 10% split: the first part of the text is used for training,
# the remainder is held out for validation.
train_size = int(dataset_size * 0.9)
train_ids, validation_ids = encoded_text[:train_size], encoded_text[train_size:]
train_set = tf.data.Dataset.from_tensor_slices(train_ids)
validation_set = tf.data.Dataset.from_tensor_slices(validation_ids)

#### Batching dataset

In [None]:
# Each training window holds n_steps input characters plus one extra character,
# so that inputs and targets (shifted by one) can both be cut from it.
n_steps = 100
window_length = n_steps + 1
# Slide one character at a time; incomplete trailing windows are dropped.
train_set = train_set.window(
    size=window_length,
    shift=1,
    drop_remainder=True,
)

In [None]:
# Peek at the first window: at this point every window is itself a nested dataset.
for window in train_set.take(1):
    for char_id in window.as_numpy_iterator():
        print(char_id, end=" ")

19 5 8 7 2 0 18 5 2 5 35 1 9 23 10 21 1 19 3 8 1 0 16 1 0 22 8 3 18 1 1 12 0 4 9 15 0 19 13 8 2 6 1 8 17 0 6 1 4 8 0 14 1 0 7 22 1 4 24 26 10 10 4 11 11 23 10 7 22 1 4 24 17 0 7 22 1 4 24 26 10 10 19 5 8 7 2 0 18 5 2 5 35 1 9 23 10 15 3 13 0 

In [None]:
def _window_to_tensor(window):
    """Collapse one nested window dataset into a single tensor of window_length IDs."""
    return window.batch(window_length)


train_set = train_set.flat_map(_window_to_tensor)

In [None]:
# After flattening, each element is one tensor of window_length character IDs.
for window_tensor in train_set.take(1):
    print(window_tensor)

tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
 10 15  3 13  0], shape=(101,), dtype=int64)


In [None]:
batch_size = 32


def _split_inputs_targets(windows):
    """Inputs are all but the last ID of each window; targets are the same window shifted by one."""
    return windows[:, :-1], windows[:, 1:]


# Shuffle individual windows (10,000-element buffer) before grouping them into batches.
train_set = train_set.shuffle(buffer_size=10000).batch(batch_size)
train_set = train_set.map(_split_inputs_targets)

In [None]:
def _one_hot_inputs(X_batch, Y_batch):
    """One-hot encode the input IDs; targets stay as integer IDs for the sparse loss."""
    return tf.one_hot(X_batch, depth=max_id), Y_batch


train_set = train_set.map(_one_hot_inputs).prefetch(tf.data.AUTOTUNE)

#### Training Char-RNN

In [None]:
# Char-RNN: two stacked GRU layers followed by a per-time-step softmax over the
# max_id characters. Only input dropout is used; recurrent_dropout stays disabled.
first_gru = layers.GRU(128, return_sequences=True, dropout=0.2,
                       input_shape=[None, max_id])
second_gru = layers.GRU(128, return_sequences=True, dropout=0.2)
char_head = layers.TimeDistributed(layers.Dense(max_id, activation="softmax"))

model1 = Sequential([first_gru, second_gru, char_head])
# Targets are integer IDs (not one-hot), hence the sparse loss.
model1.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
history1 = model1.fit(train_set, epochs=10)

In [None]:
def preprocess(texts):
    """One-hot encode a list of strings the same way the training data was encoded.

    Args:
        texts: list of strings. They must all have the same length, because the
            encoded sequences are stacked into a single array.

    Returns:
        A one-hot tensor with a last dimension of size max_id.
    """
    # The training IDs were shifted down by one when encoded_text was built, so
    # the same shift is required here. Without it every character lands one
    # slot too high and the largest ID falls outside the one-hot depth.
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)

In [None]:
X_new = preprocess(["How are yo"])
# Sequential.predict_classes() has been removed from recent TensorFlow releases;
# taking the argmax over the class axis of predict() gives the same class IDs.
Y_pred = np.argmax(model1.predict(X_new), axis=-1)
# Undo the "- 1" shift applied at encoding time before decoding, then keep only
# the character predicted after the last input character.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

In [None]:
def next_char(text, temperature):
    """Sample the character that follows `text` from model1's predicted distribution.

    Args:
        text: the prompt string.
        temperature: divisor applied to the log-probabilities before sampling;
            smaller values make the sampling favour the most likely characters.

    Returns:
        The sampled character, as decoded by the tokenizer.
    """
    encoded = preprocess([text])
    # Keep only the distribution predicted at the last time step (shape [1, max_id]).
    last_step_proba = model1.predict(encoded)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # "+ 1" undoes the shift applied when the corpus was encoded.
    return tokenizer.sequences_to_texts((sampled_id + 1).numpy())[0]

In [None]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend `text` by `n_chars` characters, sampling one character at a time.

    Each new character is sampled with next_char() from the text generated so
    far, so earlier samples influence later ones.
    """
    completed = text
    for _ in range(n_chars):
        completed = completed + next_char(completed, temperature)
    return completed

#### Stateful RNN

In [None]:
# Dataset for the stateful RNN: windows of n_steps + 1 IDs that advance by
# n_steps, kept in their original order (no shuffling) and fed one window per
# batch, matching the batch size of 1 the stateful model is built with.
dataset = tf.data.Dataset.from_tensor_slices(encoded_text[:train_size])
dataset = (
    dataset
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .batch(1)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [None]:
# Stateful variant of the char-RNN. The batch dimension is fixed to 1 through
# batch_input_shape; recurrent_dropout stays disabled, as in model1.
model2 = Sequential()
model2.add(layers.GRU(128, batch_input_shape=[1, None, max_id],
                      stateful=True, return_sequences=True, dropout=0.2))
model2.add(layers.GRU(128, stateful=True, return_sequences=True, dropout=0.2))
model2.add(layers.TimeDistributed(layers.Dense(max_id, activation="softmax")))

In [None]:
class ResetStateCallback(callbacks.Callback):
    """Clear the model's recurrent states at the start of every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        # `logs` defaults to None so the override matches the base-class
        # signature and still works if the hook is called without logs.
        self.model.reset_states()

In [None]:
# Train the stateful model; the callback resets the states when each epoch starts.
model2.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
reset_states_cb = ResetStateCallback()
model2.fit(dataset, callbacks=[reset_states_cb], epochs=50)

### Sentiment Analysis

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, callbacks
import numpy as np

In [None]:
# Keras' IMDb dataset; each review is already encoded as a list of integer word IDs.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
# First ten word IDs of the first training review.
X_train[0][:10]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [None]:
# Build the reverse mapping (ID -> word). Word IDs are offset by 3 because
# IDs 0, 1 and 2 are taken by the special tokens below.
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {}
for word, id_ in word_index.items():
    id_to_word[id_ + 3] = word
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
# Decode the first ten tokens of the first review.
" ".join(id_to_word[id_] for id_ in X_train[0][:10])

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


'<sos> this film was just brilliant casting location scenery story'

In [None]:
import tensorflow_datasets as tfds

# Load the raw-text IMDb reviews as (text, label) pairs, plus the dataset metadata.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
# NOTE: this rebinds train_size, which held the char-RNN split index earlier on.
train_size = info.splits["train"].num_examples
train_size

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJHS5ZB/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJHS5ZB/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteJHS5ZB/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


25000

In [None]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw reviews into a dense batch of word tokens.

    NOTE: this reuses the name of the char-RNN `preprocess(texts)` helper
    defined earlier, which is shadowed from this point on.

    Args:
        X_batch: batch of review strings.
        y_batch: the matching labels, passed through unchanged.

    Returns:
        (tokens, y_batch), where tokens is a dense string tensor in which
        shorter reviews are filled up with b"<pad>".
    """
    # Keep only the first 300 bytes of every review.
    truncated = tf.strings.substr(X_batch, 0, 300)
    # Blank out "<br", optional whitespace and an optional "/"; whatever is left
    # of the tag is removed by the letters-only filter on the next line.
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?", b" ")
    # Everything except ASCII letters and apostrophes becomes a space.
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>"), y_batch

In [None]:
from collections import Counter

# Count how often each token (including b"<pad>") occurs in the training set.
vocabulary = Counter()
for X_batch, y_batch in datasets["train"].batch(32).map(preprocess):
    # Flatten the whole batch in row-major order, so tokens are counted in the
    # same order as when walking the reviews one by one.
    vocabulary.update(X_batch.numpy().ravel().tolist())

In [None]:
# Five most frequent tokens and the vocabulary size. b"<pad>" is the most
# frequent token, so it receives ID 0 when IDs are assigned by frequency rank.
vocabulary.most_common()[:5], len(vocabulary.keys())

([(b'<pad>', 214462),
  (b'the', 61137),
  (b'a', 38564),
  (b'of', 33983),
  (b'and', 33431)],
 53893)

In [None]:
# Keep only the vocab_size most frequent tokens, ordered from most to least frequent.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabulary.most_common(vocab_size)]

In [None]:
# Lookup table from token to ID: known tokens get their frequency rank, unknown
# tokens are hashed into one of num_oov_buckets extra IDs after the vocabulary.
num_oov_buckets = 1000
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)
table = tf.lookup.StaticVocabularyTable(
    vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
# Lookup demo: the out-of-vocabulary word "Henlo" lands in an OOV bucket
# (an ID at or above vocab_size), the other words get their vocabulary IDs.
table.lookup(tf.constant(b"Henlo from the other side".split()))

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([10021,    36,     1,    95,   635])>

In [None]:
def encode_words(X_batch, y_batch):
    """Replace every token in X_batch by its ID from `table`; labels pass through unchanged."""
    token_ids = table.lookup(X_batch)
    return token_ids, y_batch

In [None]:
# Full input pipeline: batch the raw reviews, tokenize them, map tokens to IDs.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
# Show the first encoded review of the first batch together with its label.
for word_id_batch, label_batch in train_set.take(1):
    print(word_id_batch[0])
    print("label: ", label_batch[0])

tf.Tensor(
[   22    11    28   337   302    12   628    25 10501     8    31  1295
  3454    45   451 10966  1821    26    68   147    20     9   202   389
    25    89   143   286     8   443   431    89    68    87    99    24
  6478     9  1567   623   584    22    12     7    28   348  5107  1025
  2118 10325     0     0     0     0     0     0     0     0     0     0], shape=(60,), dtype=int64)
label:  tf.Tensor(0, shape=(), dtype=int64)


In [None]:
# Sentiment classifier: trainable embeddings -> two GRU layers -> sigmoid output.
# The embedding table covers the vocabulary plus the OOV buckets.
embed_size = 128
model = Sequential()
model.add(layers.Embedding(vocab_size + num_oov_buckets, embed_size,
                           input_shape=[None]))
model.add(layers.GRU(128, return_sequences=True))
model.add(layers.GRU(128))  # only the last output is passed to the classifier
model.add(layers.Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd279880dd0>

In [None]:
# Score one hand-written five-word review: look up the word IDs and shape them
# as a batch containing a single sequence.
sample_words = tf.constant(b"the movie was kinda good".split())
sample_ids = table.lookup(sample_words).numpy().reshape(1, 5)
model.predict(sample_ids)

array([[0.0499281]], dtype=float32)

#### Masking

In [None]:
# Same classifier built with the functional API and an explicit padding mask.
# NOTE: this rebinds model2, which referred to the stateful char-RNN earlier on.
K = keras.backend
inputs = layers.Input(shape=[None])
# True wherever the word ID is non-zero. ID 0 belongs to the b"<pad>" token
# (the most frequent token, hence rank 0 in the lookup table).
mask = layers.Lambda(lambda inputs: K.not_equal(inputs, 0))(inputs)
z = layers.Embedding(vocab_size+num_oov_buckets, embed_size)(inputs)
# The same mask is handed to both recurrent layers.
z = layers.GRU(128, return_sequences=True)(z, mask=mask)
z = layers.GRU(128)(z, mask=mask)
outputs = layers.Dense(1, activation="sigmoid")(z)
model2 = keras.Model(inputs, outputs)
model2.compile(loss="binary_crossentropy", 
               optimizer="adam", 
               metrics=["accuracy"])
model2.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd279f18a10>

#### Reusing Pretrained Embeddings

In [None]:
import tensorflow_hub as hub

# Pretrained module from TF Hub, used as the first layer: it takes raw strings
# (scalar string inputs) and outputs 50-dimensional vectors.
sentence_encoder = hub.KerasLayer(
    "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
    dtype=tf.string, input_shape=[], output_shape=[50])
sentence_encoder.trainable = True # optional

model3 = Sequential([
    sentence_encoder,
    layers.Dense(128, activation="relu"),
    layers.Dense(1, activation="sigmoid"),
])
model3.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])

In [None]:
# model3 consumes raw review strings, so the dataset is only batched here:
# no tokenizing or ID lookup is applied.
batch_size = 32
datasets, info = tfds.load(name="imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits["train"].num_examples
train_set = datasets["train"].batch(batch_size).prefetch(1)
model3.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd27269ce10>

### Encoder-Decoder

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, callbacks
import numpy as np

In [3]:
!pip install tensorflow_addons

Collecting tensorflow_addons
  Downloading tensorflow_addons-0.16.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 3.3 MB/s 
Installing collected packages: tensorflow-addons
Successfully installed tensorflow-addons-0.16.1


In [10]:
import tensorflow_addons as tfa

# Small encoder-decoder built with TensorFlow Addons' seq2seq API.
# NOTE: vocab_size, embed_size and (below) model rebind names used by the
# sentiment-analysis section.
vocab_size = 100
embed_size = 10
# Three inputs: encoder token IDs, decoder token IDs, and one length per sequence.
encoder_inputs = layers.Input(shape=[None], dtype=np.int32)
decoder_inputs = layers.Input(shape=[None], dtype=np.int32)
sequence_lengths = layers.Input(shape=[], dtype=np.int32)
# A single embedding layer is shared by the encoder and the decoder.
embedding = layers.Embedding(vocab_size, embed_size)
encoder_embeddings = embedding(encoder_inputs)
decoder_embeddings = embedding(decoder_inputs)
# return_state=True also returns the final hidden and cell states, which
# initialize the decoder below.
encoder = layers.LSTM(512, return_state=True)
encoder_outputs, state_h, state_c = encoder(encoder_embeddings)
encoder_state = [state_h, state_c]
# NOTE(review): TrainingSampler presumably feeds the provided decoder inputs at
# every step (teacher forcing) -- confirm against the TFA documentation.
sampler = tfa.seq2seq.sampler.TrainingSampler()
decoder_cell = layers.LSTMCell(512)
# No activation here: the softmax is applied explicitly to rnn_output further down.
output_layer = layers.Dense(vocab_size)
decoder = tfa.seq2seq.basic_decoder.BasicDecoder(
    decoder_cell, 
    sampler, output_layer=output_layer)
final_outputs, final_state, final_sequence_lengths = decoder(
    decoder_embeddings, initial_state=encoder_state, 
    sequence_length=sequence_lengths
)
Y_proba = tf.nn.softmax(final_outputs.rnn_output)
model = keras.Model(inputs=[encoder_inputs, decoder_inputs, sequence_lengths], 
                    outputs=[Y_proba])

In [11]:
# The model outputs per-step probabilities over the vocabulary, so integer
# targets are paired with the sparse categorical loss.
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy")

In [12]:
# Standalone example of wrapping a recurrent layer in Bidirectional; the layer
# is only constructed and displayed here, it is not added to any model.
layers.Bidirectional(layers.GRU(10, return_sequences=True))

<keras.layers.wrappers.Bidirectional at 0x7f54600f7cd0>

### Attention Mechanism