This is a companion notebook for the book [Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition?a_aid=keras&a_bid=76564dff). For readability, it only contains runnable code blocks and section titles, and omits everything else in the book: text paragraphs, figures, and pseudocode.

**If you want to be able to follow what's going on, I recommend reading the notebook side by side with your copy of the book.**

This notebook was generated for TensorFlow 2.6.

# Deep learning for text

## Natural-language processing: The bird's eye view

## Preparing text data

### Text standardization

### Text splitting (tokenization)

### Vocabulary indexing

### Using the TextVectorization layer

In [None]:
import string # importing the string module to use the punctuation attribute to remove punctuation from the text data to be tokenized and vectorized in the Vectorizer class below 

class Vectorizer:
    """Minimal pure-Python text vectorizer: standardize, tokenize, index.

    The vocabulary maps each token to an integer index. Index 0 is reserved
    for the empty string and index 1 for out-of-vocabulary tokens.
    """

    # Index that unknown tokens are encoded to (the "[UNK]" entry).
    UNK_INDEX = 1

    def __init__(self):
        # Start from the two reserved entries so that encode() and decode()
        # are usable (mapping everything to "[UNK]") before make_vocabulary()
        # has been called, instead of failing with AttributeError.
        self.vocabulary = {"": 0, "[UNK]": self.UNK_INDEX}
        self.inverse_vocabulary = {0: "", self.UNK_INDEX: "[UNK]"}

    def standardize(self, text):
        """Return `text` lowercased, with every character found in
        `string.punctuation` removed."""
        text = text.lower()
        return "".join(char for char in text if char not in string.punctuation)

    def tokenize(self, text):
        """Standardize `text` and split it on whitespace into a list of tokens."""
        text = self.standardize(text)
        return text.split()

    def make_vocabulary(self, dataset):
        """Rebuild the vocabulary from `dataset`, an iterable of strings.

        Tokens receive consecutive indices (starting at 2) in order of first
        appearance. Also rebuilds `inverse_vocabulary` (index -> token).
        """
        self.vocabulary = {"": 0, "[UNK]": self.UNK_INDEX}
        for text in dataset:
            # tokenize() already standardizes, so no separate standardize() call.
            for token in self.tokenize(text):
                if token not in self.vocabulary:
                    self.vocabulary[token] = len(self.vocabulary)
        self.inverse_vocabulary = {
            index: token for token, index in self.vocabulary.items()}

    def encode(self, text):
        """Return the list of vocabulary indices for the tokens of `text`.

        Tokens missing from the vocabulary are encoded as `UNK_INDEX`.
        """
        return [self.vocabulary.get(token, self.UNK_INDEX)
                for token in self.tokenize(text)]

    def decode(self, int_sequence):
        """Return the space-joined tokens for `int_sequence`.

        Indices missing from the inverse vocabulary decode to "[UNK]".
        """
        return " ".join(
            self.inverse_vocabulary.get(i, "[UNK]") for i in int_sequence)

# Three short lines of text serve as the corpus for the hand-written vectorizer.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]

# Index every distinct token of the corpus.
vectorizer = Vectorizer()
vectorizer.make_vocabulary(dataset)

In [None]:
# Encode a sentence with the hand-written vectorizer. "still" does not occur
# in the dataset the vocabulary was built from, so encode() maps it to index 1.
test_sentence = "I write, rewrite, and still rewrite again"
encoded_sentence = vectorizer.encode(test_sentence)
print(encoded_sentence)

In [None]:
# Map the integer sequence back to tokens; index 1 comes back as "[UNK]".
decoded_sentence = vectorizer.decode(encoded_sentence)
print(decoded_sentence)

In [None]:
from tensorflow.keras.layers import TextVectorization # importing the TextVectorization class from the tensorflow.keras.layers module to vectorize text data
# Keras counterpart of the hand-written vectorizer, with default
# standardization and splitting.
text_vectorization = TextVectorization(
    output_mode="int",  # encode each text as a sequence of integer indices
)

In [None]:
import re # importing the re module to use the escape function to escape special characters in the punctuation attribute
import string # importing the string module; its punctuation attribute is used by custom_standardization_fn below to strip punctuation
import tensorflow as tf # importing the tensorflow module; its tf.strings operations are used by the custom standardization and split functions below

def custom_standardization_fn(string_tensor):
    """Lowercase `string_tensor` and delete every `string.punctuation` character."""
    # Regex character class matching any one punctuation character, with
    # regex metacharacters escaped.
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    lowered = tf.strings.lower(string_tensor)
    return tf.strings.regex_replace(lowered, punctuation_pattern, "")
 
def custom_split_fn(string_tensor):
    """Split `string_tensor` into tokens with `tf.strings.split` (default separator)."""
    tokens = tf.strings.split(string_tensor)
    return tokens

# Build the layer with user-supplied standardization and splitting callables
# in place of its built-in ones.
text_vectorization = TextVectorization(
    output_mode="int",
    standardize=custom_standardization_fn,
    split=custom_split_fn,
)

In [None]:
# Toy corpus from which the layer's vocabulary is built.
dataset = [
    "I write, erase, rewrite",
    "Erase again, and then",
    "A poppy blooms.",
]
text_vectorization.adapt(dataset)  # builds the layer's vocabulary from the corpus

**Displaying the vocabulary**

In [None]:
# Last expression of the cell, so the notebook displays the returned vocabulary.
text_vectorization.get_vocabulary()

In [None]:
# Keep the vocabulary list around: it is needed to turn indices back into tokens.
vocabulary = text_vectorization.get_vocabulary()
test_sentence = "I write, rewrite, and still rewrite again"
# Calling the layer on a raw string returns its encoded form.
encoded_sentence = text_vectorization(test_sentence)
print(encoded_sentence)

In [None]:
# A token's position in the vocabulary list is its integer index.
inverse_vocab = {index: token for index, token in enumerate(vocabulary)}
decoded_tokens = [inverse_vocab[int(i)] for i in encoded_sentence]
decoded_sentence = " ".join(decoded_tokens)
print(decoded_sentence)

## Two approaches for representing groups of words: Sets and sequences

### Preparing the IMDB movie reviews data

In [None]:
# Fetch the aclImdb_v1.tar.gz archive into the working directory, then unpack it.
!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz # downloading the IMDB dataset
!tar -xf aclImdb_v1.tar.gz # extracting the IMDB dataset

In [None]:
# Delete train/unsup; the cells that follow only use the neg/ and pos/ subdirectories.
!rm -r aclImdb/train/unsup # removing the unsupervised training data

In [None]:
# Peek at one raw training example from the pos/ directory.
!cat aclImdb/train/pos/4077_10.txt # printing the contents of a positive review

In [None]:
import os, pathlib, shutil, random # importing the os, pathlib, shutil, and random modules to move the validation data to the validation directory

# Carve a validation set out of the training data by moving 20% of the files
# of each category from aclImdb/train/<category> to aclImdb/val/<category>.
base_dir = pathlib.Path("aclImdb")
val_dir = base_dir / "val"
train_dir = base_dir / "train"
for category in ("neg", "pos"):
    # Deliberately no exist_ok=True: if the validation directory already
    # exists the split has been done, and running it again would move a
    # further 20% of the remaining training files.
    os.makedirs(val_dir / category)
    # os.listdir() returns entries in arbitrary order, so sort first; only
    # then does the fixed seed select the same files on every machine.
    files = sorted(os.listdir(train_dir / category))
    random.Random(1337).shuffle(files)
    num_val_samples = int(0.2 * len(files))
    # Slice from an explicit start index: files[-num_val_samples:] would
    # select *every* file when num_val_samples is 0.
    val_files = files[len(files) - num_val_samples:]
    for fname in val_files:
        shutil.move(train_dir / category / fname,
                    val_dir / category / fname)

In [None]:
from tensorflow import keras # importing the keras module from the tensorflow package to use the text_dataset_from_directory function to create a dataset from a directory
batch_size = 32


def _dataset_from(directory):
    """Load the text files under `directory` as a dataset batched by `batch_size`."""
    return keras.utils.text_dataset_from_directory(
        directory, batch_size=batch_size)


# One dataset per split, created in the order train, val, test.
train_ds = _dataset_from("aclImdb/train")
val_ds = _dataset_from("aclImdb/val")
test_ds = _dataset_from("aclImdb/test")

**Displaying the shapes and dtypes of the first batch**

In [None]:
# Pull a single batch from the raw-text dataset and report its structure.
for inputs, targets in train_ds.take(1):
    batch_report = (
        ("inputs.shape:", inputs.shape),
        ("inputs.dtype:", inputs.dtype),
        ("targets.shape:", targets.shape),
        ("targets.dtype:", targets.dtype),
        ("inputs[0]:", inputs[0]),
        ("targets[0]:", targets[0]),
    )
    for label, value in batch_report:
        print(label, value)

### Processing words as a set: The bag-of-words approach

#### Single words (unigrams) with binary encoding

**Preprocessing our datasets with a `TextVectorization` layer**

In [None]:
text_vectorization = TextVectorization( # instantiating the TextVectorization class
    max_tokens=20000, # setting the maximum number of tokens to 20,000
    output_mode="multi_hot", # setting the output mode to multi-hot, which returns a binary matrix where each token is represented by a 1 or 0 representing its presence or absence in the text data
)
text_only_train_ds = train_ds.map(lambda x, y: x) # mapping the training dataset to only include the text data
text_vectorization.adapt(text_only_train_ds) # adapting the text data to the TextVectorization class

binary_1gram_train_ds = train_ds.map( # mapping the training dataset to include the text data and the target data
    lambda x, y: (text_vectorization(x), y), # mapping the text data to the vectorized text data
    num_parallel_calls=4) # specifying the number of parallel calls
binary_1gram_val_ds = val_ds.map( # mapping the validation dataset to include the text data and the target data
    lambda x, y: (text_vectorization(x), y), # mapping the text data to the vectorized text data
    num_parallel_calls=4) # specifying the number of parallel calls
binary_1gram_test_ds = test_ds.map( # mapping the test dataset to include the text data and the target data
    lambda x, y: (text_vectorization(x), y), # mapping the text data to the vectorized text data
    num_parallel_calls=4) # specifying the number of parallel calls

**Inspecting the output of our binary unigram dataset**

In [None]:
# Pull a single batch from the vectorized dataset and report its structure.
for inputs, targets in binary_1gram_train_ds.take(1):
    batch_report = (
        ("inputs.shape:", inputs.shape),
        ("inputs.dtype:", inputs.dtype),
        ("targets.shape:", targets.shape),
        ("targets.dtype:", targets.dtype),
        ("inputs[0]:", inputs[0]),
        ("targets[0]:", targets[0]),
    )
    for label, value in batch_report:
        print(label, value)

**Our model-building utility**

In [None]:
from tensorflow import keras # importing the keras module from the tensorflow package to use keras.Input, keras.Model, and the callback and model-loading utilities
from tensorflow.keras import layers # importing the layers module from the tensorflow.keras package to use the Dense and Dropout classes

def get_model(max_tokens=20000, hidden_dim=16):
    """Build and compile a small dense binary classifier.

    Args:
        max_tokens: Length of each input vector.
        hidden_dim: Number of units in the single hidden layer.

    Returns:
        A compiled `keras.Model`: Dense(relu) -> Dropout(0.5) -> Dense(1, sigmoid),
        trained with RMSprop on binary cross-entropy and reporting accuracy.
    """
    features = keras.Input(shape=(max_tokens,))
    hidden = layers.Dense(hidden_dim, activation="relu")(features)
    hidden = layers.Dropout(0.5)(hidden)
    probability = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = keras.Model(features, probability)
    classifier.compile(
        optimizer="rmsprop",
        loss="binary_crossentropy",
        metrics=["accuracy"],
    )
    return classifier

**Training and testing the binary unigram model**

In [None]:
model = get_model()
model.summary()

# With save_best_only=True the checkpoint file holds the best epoch seen so far.
checkpoint_path = "binary_1gram.keras"
checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                             save_best_only=True)
callbacks = [checkpoint]

model.fit(
    binary_1gram_train_ds.cache(),
    validation_data=binary_1gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Evaluate the checkpointed model rather than the final-epoch weights.
model = keras.models.load_model(checkpoint_path)
test_metrics = model.evaluate(binary_1gram_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

#### Bigrams with binary encoding

**Configuring the `TextVectorization` layer to return bigrams**

In [None]:
# Same multi-hot encoding, but over n-grams configured with ngrams=2.
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,  # cap on the vocabulary size
    output_mode="multi_hot",
)

**Training and testing the binary bigram model**

In [None]:
# Build the layer's vocabulary from the label-free training text.
text_vectorization.adapt(text_only_train_ds)


def _to_binary_2gram(x, y):
    """Vectorize the text batch `x`; pass the labels `y` through unchanged."""
    return text_vectorization(x), y


binary_2gram_train_ds = train_ds.map(_to_binary_2gram, num_parallel_calls=4)
binary_2gram_val_ds = val_ds.map(_to_binary_2gram, num_parallel_calls=4)
binary_2gram_test_ds = test_ds.map(_to_binary_2gram, num_parallel_calls=4)

model = get_model()
model.summary()

# With save_best_only=True the checkpoint file holds the best epoch seen so far.
checkpoint_path = "binary_2gram.keras"
checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                             save_best_only=True)
callbacks = [checkpoint]

model.fit(
    binary_2gram_train_ds.cache(),
    validation_data=binary_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Evaluate the checkpointed model rather than the final-epoch weights.
model = keras.models.load_model(checkpoint_path)
test_metrics = model.evaluate(binary_2gram_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

#### Bigrams with TF-IDF encoding

**Configuring the `TextVectorization` layer to return token counts**

In [None]:
# Variant that outputs per-token counts instead of presence flags.
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,  # cap on the vocabulary size
    output_mode="count"
)

**Configuring `TextVectorization` to return TF-IDF-weighted outputs**

In [None]:
# Variant that outputs TF-IDF-weighted values; this is the configuration the
# following training cell uses.
text_vectorization = TextVectorization(
    ngrams=2,
    max_tokens=20000,  # cap on the vocabulary size
    output_mode="tf_idf",
)

**Training and testing the TF-IDF bigram model**

In [None]:
# Build the layer's vocabulary from the label-free training text.
text_vectorization.adapt(text_only_train_ds)


def _to_tfidf_2gram(x, y):
    """Vectorize the text batch `x`; pass the labels `y` through unchanged."""
    return text_vectorization(x), y


tfidf_2gram_train_ds = train_ds.map(_to_tfidf_2gram, num_parallel_calls=4)
tfidf_2gram_val_ds = val_ds.map(_to_tfidf_2gram, num_parallel_calls=4)
tfidf_2gram_test_ds = test_ds.map(_to_tfidf_2gram, num_parallel_calls=4)

model = get_model()
model.summary()

# With save_best_only=True the checkpoint file holds the best epoch seen so far.
checkpoint_path = "tfidf_2gram.keras"
checkpoint = keras.callbacks.ModelCheckpoint(checkpoint_path,
                                             save_best_only=True)
callbacks = [checkpoint]

model.fit(
    tfidf_2gram_train_ds.cache(),
    validation_data=tfidf_2gram_val_ds.cache(),
    epochs=10,
    callbacks=callbacks,
)

# Evaluate the checkpointed model rather than the final-epoch weights.
model = keras.models.load_model(checkpoint_path)
test_metrics = model.evaluate(tfidf_2gram_test_ds)
print(f"Test acc: {test_metrics[1]:.3f}")

In [None]:
# Chain the vectorization layer and the trained model into a single model
# that takes raw strings as input.
inputs = keras.Input(shape=(1,), dtype="string")  # one string per sample
processed_inputs = text_vectorization(inputs)
outputs = model(processed_inputs)
inference_model = keras.Model(inputs, outputs)

In [None]:
import tensorflow as tf # importing the tensorflow module to use tf.convert_to_tensor to build the input tensor
# A batch holding one review, given as a nested list: one row, one string.
raw_text_data = tf.convert_to_tensor([
    ["That was an excellent movie, I loved it."],
])
predictions = inference_model(raw_text_data)
# Index down to the single scalar before converting: predictions[0] is still a
# one-element vector, and float() on a non-scalar array is deprecated in
# recent NumPy releases.
print(f"{float(predictions[0, 0] * 100):.2f} percent positive")