### Shakespearean Text

#### Creating a Dataset

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, callbacks
import numpy as np
import requests

In [None]:
# Download the Shakespeare corpus and store it next to the notebook.
r = requests.get("https://homl.info/shakespeare", timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the corpus.
r.raise_for_status()
# The context manager closes (and therefore flushes) the file before the
# next cell reads it back; the bare open(...).write(...) left that to the GC.
with open("shakespearean_text.txt", "wb") as f:
    n_bytes_written = f.write(r.content)
n_bytes_written

1115394

In [None]:
# Read the downloaded corpus into a single string.
# The encoding is pinned so the result does not depend on the platform's
# default text encoding.
with open("shakespearean_text.txt", encoding="utf-8") as f:
    shakespear_text = f.read()

In [None]:
# Character-level tokenizer: with char_level=True every character (rather
# than every word) is treated as a token.
tokenizer = keras.preprocessing.text.Tokenizer(char_level=True)
# Build the character vocabulary from the whole corpus, passed as one text.
tokenizer.fit_on_texts([shakespear_text])

In [None]:
# Sanity check: encode a short sample string into character ids.
tokenizer.texts_to_sequences(["Hello"])

[[7, 2, 12, 12, 4]]

In [None]:
# Sanity check: decode the ids from the previous cell back into characters.
tokenizer.sequences_to_texts([[7, 2, 12, 12, 4]])

['h e l l o']

In [None]:
# Number of distinct characters in the tokenizer's vocabulary; later cells
# use it as the one-hot depth and as the size of the output layer.
max_id = len(tokenizer.word_index)
# dataset_size is not read from tokenizer.document_count here; it is derived
# from the length of the encoded text in a later cell.

In [None]:
# Display the vocabulary size.
max_id

39

In [None]:
# Encode the whole corpus as one sequence of character ids, take that single
# row, and shift every id down by one so the ids start at 0.
encoded_text = np.array(tokenizer.texts_to_sequences([shakespear_text]))[0] - 1

In [None]:
# Display the shape of the encoded corpus (one id per character).
encoded_text.shape

(1115394,)

In [None]:
# Total number of encoded characters in the corpus.
dataset_size = encoded_text.shape[0]
dataset_size

1115394

#### Splitting Dataset

In [None]:
# The first 90% of the encoded characters form the training set and the
# remaining 10% the validation set.
train_size = int(0.9 * dataset_size)
train_set, validation_set = (
    tf.data.Dataset.from_tensor_slices(part)
    for part in (encoded_text[:train_size], encoded_text[train_size:])
)

#### Batching dataset

In [None]:
# Each window holds n_steps characters plus one extra, so that a later cell
# can split it into inputs and targets shifted by one position.
n_steps = 100
window_length = 1 + n_steps
# shift=1 makes consecutive windows start one character apart;
# drop_remainder=True discards windows shorter than window_length.
train_set = train_set.window(size=window_length, shift=1, drop_remainder=True)

In [None]:
# Print the ids of the first window on one line, each followed by a space.
for window in train_set.take(1):
    print(*window.as_numpy_iterator(), end=" ")

19 5 8 7 2 0 18 5 2 5 35 1 9 23 10 21 1 19 3 8 1 0 16 1 0 22 8 3 18 1 1 12 0 4 9 15 0 19 13 8 2 6 1 8 17 0 6 1 4 8 0 14 1 0 7 22 1 4 24 26 10 10 4 11 11 23 10 7 22 1 4 24 17 0 7 22 1 4 24 26 10 10 19 5 8 7 2 0 18 5 2 5 35 1 9 23 10 15 3 13 0 

In [None]:
# Each window is itself a nested dataset; batching it to its full length and
# flattening the result turns every window into a single tensor.
train_set = train_set.flat_map(lambda window: window.batch(window_length))

In [None]:
# Show the first flattened window to confirm it is now one tensor.
for first_window in train_set.take(1):
    print(first_window)

tf.Tensor(
[19  5  8  7  2  0 18  5  2  5 35  1  9 23 10 21  1 19  3  8  1  0 16  1
  0 22  8  3 18  1  1 12  0  4  9 15  0 19 13  8  2  6  1  8 17  0  6  1
  4  8  0 14  1  0  7 22  1  4 24 26 10 10  4 11 11 23 10  7 22  1  4 24
 17  0  7 22  1  4 24 26 10 10 19  5  8  7  2  0 18  5  2  5 35  1  9 23
 10 15  3 13  0], shape=(101,), dtype=int64)


In [None]:
batch_size = 32
# Shuffle the windows, group them into batches, then split every batch into
# inputs (all but the last id) and targets (all but the first id).
train_set = (
    train_set
    .shuffle(10000)
    .batch(batch_size)
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
)

In [None]:
# One-hot encode the input ids (targets stay as integer ids) and let
# tf.data pick the prefetch buffer size.
train_set = (
    train_set
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(tf.data.AUTOTUNE)
)

#### Training Char-RNN

In [None]:
# Char-RNN: two stacked GRU layers that return the full sequence, followed by
# a softmax over the max_id characters applied at every time step.
# recurrent_dropout is intentionally left out of both GRU layers.
model1 = Sequential()
model1.add(layers.GRU(128, return_sequences=True, dropout=0.2,
                      input_shape=[None, max_id]))
model1.add(layers.GRU(128, return_sequences=True, dropout=0.2))
model1.add(layers.TimeDistributed(layers.Dense(max_id, activation="softmax")))

# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
model1.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
history1 = model1.fit(train_set, epochs=10)

In [None]:
def preprocess(texts):
    """One-hot encode raw strings for the character-level model.

    Args:
        texts: list of strings. They are stacked into one array, so they are
            expected to encode to the same number of character ids.

    Returns:
        The one-hot encoding (depth ``max_id``) of the character ids of
        ``texts``.
    """
    # Apply the same "- 1" shift that was used to build `encoded_text`.
    # Without it the model receives ids that are off by one compared with
    # training, and the highest id falls outside the one-hot depth.
    X = np.array(tokenizer.texts_to_sequences(texts)) - 1
    return tf.one_hot(X, max_id)

In [None]:
X_new = preprocess(["How are yo"])
# Sequential.predict_classes() is deprecated and no longer exists in recent
# Keras releases; the argmax over the class axis of predict() gives the same
# class ids.
Y_pred = np.argmax(model1.predict(X_new), axis=-1)
# Undo the "- 1" id shift before decoding, then keep the character predicted
# for the last time step.
tokenizer.sequences_to_texts(Y_pred + 1)[0][-1]

In [None]:
def next_char(text, temperature):
    """Sample the character that follows ``text`` according to ``model1``.

    Args:
        text: the string generated so far.
        temperature: divisor applied to the log-probabilities before
            sampling; must be non-zero.

    Returns:
        The sampled character, decoded by the tokenizer.
    """
    encoded = preprocess([text])
    # Keep only the probabilities predicted for the last time step
    # (the -1: slice preserves a leading axis of size 1).
    last_step_proba = model1.predict(encoded)[0, -1:, :]
    scaled_logits = tf.math.log(last_step_proba) / temperature
    sampled_id = tf.random.categorical(scaled_logits, num_samples=1)
    # Add 1 to undo the id shift before asking the tokenizer to decode.
    return tokenizer.sequences_to_texts((sampled_id + 1).numpy())[0]

In [None]:
def complete_text(text, n_chars=50, temperature=1):
    """Extend ``text`` by ``n_chars`` characters sampled one at a time.

    Args:
        text: seed string.
        n_chars: number of characters to append.
        temperature: forwarded unchanged to ``next_char``.

    Returns:
        The seed followed by the generated characters.
    """
    for _step in range(n_chars):
        # Each new character is sampled from the text produced so far.
        sampled = next_char(text, temperature)
        text = text + sampled
    return text

#### Stateful RNN

In [None]:
# Input pipeline for the stateful model. Windows advance by n_steps and are
# window_length = n_steps + 1 long, so the inputs of each window start where
# the inputs of the previous one ended. Every batch holds a single window.
dataset = (
    tf.data.Dataset.from_tensor_slices(encoded_text[:train_size])
    .window(window_length, shift=n_steps, drop_remainder=True)
    .flat_map(lambda window: window.batch(window_length))
    .batch(1)
    # Inputs are all ids but the last; targets are all ids but the first.
    .map(lambda windows: (windows[:, :-1], windows[:, 1:]))
    .map(lambda X_batch, Y_batch: (tf.one_hot(X_batch, depth=max_id), Y_batch))
    .prefetch(1)
)

In [None]:
# Stateful variant of the Char-RNN. The batch dimension is fixed to 1 through
# batch_input_shape, matching the batches of one window built for `dataset`.
# recurrent_dropout is intentionally left out of both GRU layers.
model2 = Sequential()
model2.add(layers.GRU(128, stateful=True, return_sequences=True, dropout=0.2,
                      batch_input_shape=[1, None, max_id]))
model2.add(layers.GRU(128, stateful=True, return_sequences=True, dropout=0.2))
model2.add(layers.TimeDistributed(layers.Dense(max_id, activation="softmax")))

In [None]:
class ResetStateCallback(callbacks.Callback):
    """Keras callback that resets the model's states before every epoch."""

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the states of the model this callback is attached to.

        Args:
            epoch: index of the epoch about to start (unused).
            logs: optional metrics dict (unused). It defaults to None so the
                override keeps the same signature as the base ``Callback``
                hook and can be called without it.
        """
        self.model.reset_states()

In [None]:
# Targets are integer ids, hence the sparse variant of the cross-entropy loss.
model2.compile(optimizer="adam", loss="sparse_categorical_crossentropy")
# The callback resets the recurrent states at the start of each epoch.
model2.fit(dataset, callbacks=[ResetStateCallback()], epochs=50)

### Sentiment Analysis

In [1]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, Sequential, callbacks
import numpy as np

In [2]:
# Load the IMDb reviews bundled with Keras as (inputs, labels) pairs for the
# training and test splits.
(X_train, y_train), (X_test, y_test) = keras.datasets.imdb.load_data()
# Peek at the first ten entries of the first training review.
X_train[0][:10]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65]

In [8]:
# Map the ids used in X_train back to words: word ids are offset by 3, and
# ids 0-2 are assigned the special tokens.
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {index + 3: word for word, index in word_index.items()}
# Applied after the word entries, so the special tokens win on ids 0, 1, 2.
id_to_word.update(enumerate(("<pad>", "<sos>", "<unk>")))
# Decode the first ten ids of the first training review.
" ".join(id_to_word[word_id] for word_id in X_train[0][:10])

'<sos> this film was just brilliant casting location scenery story'

In [9]:
import tensorflow_datasets as tfds

# Load the IMDb reviews through TensorFlow Datasets. as_supervised=True asks
# for (input, label) pairs and with_info=True also returns dataset metadata.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
# NOTE: this rebinds `train_size`, which the Shakespeare section above used as
# the split index into the encoded text.
train_size = info.splits["train"].num_examples
train_size

[1mDownloading and preparing dataset imdb_reviews/plain_text/1.0.0 (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]





0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFD3YIE/imdb_reviews-train.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFD3YIE/imdb_reviews-test.tfrecord


  0%|          | 0/25000 [00:00<?, ? examples/s]

0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteFD3YIE/imdb_reviews-unsupervised.tfrecord


  0%|          | 0/50000 [00:00<?, ? examples/s]



[1mDataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.[0m


25000

In [10]:
def preprocess(X_batch, y_batch):
    """Turn a batch of raw review strings into a padded tensor of tokens.

    Args:
        X_batch: string tensor holding the review texts.
        y_batch: labels; returned unchanged.

    Returns:
        A ``(tokens, y_batch)`` tuple, where ``tokens`` is a dense string
        tensor in which shorter reviews are padded with ``b"<pad>"``.
    """
    # Keep only the first 300 positions of each review.
    truncated = tf.strings.substr(X_batch, 0, 300)
    # Replace "<br", optional whitespace and an optional "/" with a space;
    # any ">" left behind is removed by the next substitution.
    without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?", b" ")
    # Everything except ASCII letters and apostrophes becomes a space.
    letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
    # Split on whitespace, then pad the variable-length rows to a dense tensor.
    tokens = tf.strings.split(letters_only)
    return tokens.to_tensor(default_value=b"<pad>"), y_batch