# Imports

In [1]:
import logging as log
import functools
from time import time

import pathlib

import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

# import matplotlib.pyplot as plt

# Notebook settings

In [2]:
log.basicConfig(
    format='%(asctime)s %(levelname)-8s %(message)s',
    level=log.INFO,
    datefmt='%Y-%m-%d %H:%M:%S')

log_enabled = True
run_helper = False

## Decorators

In [3]:
def log_dec(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        try:
            if log_enabled:
                start_time = time()
                log.info('{} started'.format(func.__name__))
            return func(*args, **kwargs)
        except Exception as ex:
            raise ex
        finally:
            if log_enabled:
                duration = time() - start_time
                log.info('{} finished'.format(func.__name__))
    return wrapper

def run_helper(func):
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        if run_helper:
            return func(*args, **kwargs)
        else:
            return
    return wrapper

# Tokenizer pipeline


Dataset from tensorflow as described in [https://www.tensorflow.org/text/tutorials/text_generation](https://www.tensorflow.org/text/tutorials/text_generation)

## Tokenizer settings

In [4]:
dataset_path = 'datasets\\corpus.txt'
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
vocab_path = 'datasets\\vocab.txt'

## Create vocab from dataset

### Documentation


1. `load_dataset(dataset_text_file)`:

    This function loads a text dataset from a file.

    Parameters:

    `dataset_text_file`: The path of the text file to be loaded.

    How it works:

    - It uses the `TextLineDataset` function from TensorFlow's `tf.data` module to load the dataset from the specified file. Each line of the file becomes an element of the dataset.
    - It returns the loaded dataset.

2. `create_vocab(dataset)`:

    This function creates a vocabulary from a given dataset.

    Parameters:

    `dataset`: The dataset from which to create the vocabulary.

    How it works:

    - It defines the parameters for the vocabulary, including the size of the vocabulary, reserved tokens, and BERT tokenizer parameters.
    - It uses the `bert_vocab_from_dataset` function from TensorFlow's `bert_vocab` module to create the vocabulary from the dataset.
    - It returns the created vocabulary.

3. `create_vocab_from_textdata(text_file=dataset_path)`:

    This function creates a vocabulary from a text file.

    Parameters:

    `text_file`: The path of the text file from which to create the vocabulary. Default is `dataset_path`.

    How it works:

    - It loads the dataset from the specified text file using the `load_dataset()` function.
    - It creates the vocabulary from the loaded dataset using the `create_vocab()` function.
    - It returns the created vocabulary.

4. `write_vocab_file(filepath, vocab)`:

    This function writes a vocabulary to a file.

    Parameters:

    `filepath`: The path of the file to which to write the vocabulary.
    
    `vocab`: The vocabulary to write.

    How it works:

    - It opens the specified file in write mode.
    - It writes each token of the vocabulary to the file on a new line.

### Code

In [5]:
@log_dec
def load_dataset(dataset_text_file):
    return tf.data.TextLineDataset(filenames=dataset_text_file)

@log_dec
def create_vocab(dataset):
    bert_vocab_args=dict(
        vocab_size = 8000,
        reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"],
        bert_tokenizer_params = dict(lower_case=True),
        learn_params = {},
    )

    story_vocab = bert_vocab.bert_vocab_from_dataset(
        dataset.batch(1000).prefetch(2),
        **bert_vocab_args
    )
    return story_vocab

@run_helper
@log_dec
def create_vocab_from_textdata(text_file=dataset_path):
    dataset = load_dataset(text_file)
    vocab = create_vocab(dataset)
    return vocab

@run_helper
@log_dec
def write_vocab_file(filepath, vocab):
    with open(filepath, 'w') as file:
        for token in vocab:
            print(token, file=file)

#write_vocab_file('datasets\\vocab.txt', create_vocab_from_textdata())

## Create tokenizer class from vocab

### Documentation

1. `add_start_end(ragged)`:

    This function adds start and end tokens to a ragged tensor. The ragged parameter is a ragged tensor where each slice along the first dimension represents a tokenized string (for example, a sentence or a line of text), and the length of each slice varies depending on the number of tokens in the string. The function adds start and end tokens to each of these strings.

    Parameters:

    `ragged`: The ragged tensor to which to add start and end tokens.

    How it works:

    - It finds the indices of the start and end tokens in the reserved tokens list.
    - It creates tensors of start and end tokens with the same number of elements as the number of sequences in the ragged tensor.
    - It concatenates the start tokens, the original ragged tensor, and the end tokens along the sequence axis.
    - It returns the new ragged tensor with added start and end tokens.

2. `cleanup_text(reserved_tokens, token_txt)`:

    This function cleans up a tokenized text by removing certain reserved tokens.

    Parameters:

    `reserved_tokens`: The list of reserved tokens from which to remove certain tokens.

    `token_txt`: The tokenized text to clean up.

    How it works:

    - It creates a list of bad tokens by removing the "[UNK]" token from the reserved tokens list.
    - It creates a regular expression pattern from the bad tokens list.
    - It finds the cells in the tokenized text that match the bad tokens pattern.
    - It removes the bad cells from the tokenized text using a boolean mask.
    - It joins the cleaned tokenized text back into a single string with a space as the separator.
    - It returns the cleaned text.

### Code

In [6]:
@log_dec
def add_start_end(ragged):
    START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
    END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

    count = ragged.bounding_shape()[0]
    starts = tf.fill([count, 1], START)
    ends = tf.fill([count, 1], END)
    return tf.concat([starts, ragged, ends], axis=1)

@log_dec
def cleanup_text(reserved_tokens, token_txt):
    bad_tokens = list(filter(lambda token: token != "[UNK]", reserved_tokens))
    bad_tokens_re = "|".join(bad_tokens)

    bad_cells = tf.strings.regex_full_match(token_txt, bad_tokens_re)
    ragged_result = tf.ragged.boolean_mask(token_txt, ~bad_cells)

    result = tf.strings.reduce_join(ragged_result, separator=' ', axis=-1)

    return result

### Documentation

3. `StoryTokenizer`:

    This class defines a custom tokenizer for story data using the BERT tokenizer. The class is a subclass of the `tf.Module` class in TensorFlow, which is a base class for building reusable and shareable machine learning modules.

    Attributes:

    - `tokenizer`: A `tf_text.BertTokenizer` object, which is a tokenizer specifically designed to preprocess text for BERT models. It handles tasks like lowercasing and Unicode normalization in addition to tokenization.

    - `_reserved_tokens`: A list of special tokens reserved for specific uses like padding, marking the start or end of sequences, etc.

    - `_vocab_path`: A `tf.saved_model.Asset` object, which tracks the path to the vocabulary file and ensures that it is included when the module is exported to a SavedModel.

    - `vocab`: A `tf.Variable` containing the vocabulary read from the vocabulary file.

    Methods:

    - `__init__(reserved_tokens, vocab_path)`: Initializes the tokenizer and the vocabulary from a given vocabulary file. It also defines concrete functions for the other methods. Concrete functions are TensorFlow graph functions that can be called directly and support serialization and SavedModels.

    - `tokenize(strings)`: Tokenizes a batch of strings. This involves splitting the strings into words, subwords, or other meaningful units using the BERT tokenizer. It then adds start and end tokens to each tokenized string.

    - `detokenize(tokenized)`: Converts a batch of tokenized text back into human-readable strings. This involves replacing token ids with the corresponding tokens from the vocabulary and joining them into strings. It also cleans up the text by removing certain reserved tokens.

    - `lookup(token_ids)`: Converts a batch of token ids into the corresponding tokens from the vocabulary.

    - `get_vocab_size()`: Returns the size of the vocabulary.

    - `get_vocab_path()`: Returns the path of the vocabulary file.

    - `get_reserved_tokens()`: Returns the list of reserved tokens.

    Note: All methods of this class are decorated with `@tf.function`, meaning that they are compiled into TensorFlow graph functions for better performance. This is especially useful when the methods involve TensorFlow operations, as it allows the methods to be run in graph mode for speed, and it also enables them to be serialized and exported as part of a SavedModel.

### Code

In [7]:
class StoryTokenizer(tf.Module):
    def __init__(self, reserved_tokens, vocab_path):
        super().__init__()
        self.tokenizer = tf_text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        vocab = pathlib.Path(vocab_path).read_text().splitlines()
        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # tokenize signature for a batch of strings
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))
        
        # detokenize and lookup signature for:
        # * Tensor with shape [tokens] and [batch, tokens]
        # * RaggedTensor with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
        
        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))
        

        # get_* methods take no argument
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        encoded = self.tokenizer.tokenize(strings)
        merged_enc = encoded.merge_dims(-2, -1)
        merg_enc_start_end = add_start_end(merged_enc)
        return merg_enc_start_end
    
    @tf.function
    def detokenize(self, tokenized):
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)
    
    @tf.function
    def lookup(self, token_ids):
        return tf.gather(self.vocab, token_ids)
    
    @tf.function
    def get_vocab_size(self):
        return tf.shape(self.vocab)[0]
    
    @tf.function
    def get_vocab_path(self):
        return self._vocab_path
    
    @tf.function
    def get_reserved_tokens(self):
        return tf.constant(self._reserved_tokens)

In [8]:
tokenizer = StoryTokenizer(reserved_tokens, vocab_path)

@log_dec
def test_tokenizer(tokenizer):
    dataset = load_dataset('datasets\\corpus.txt')
    dataset_short = dataset.take(2)
    token_batch = list(map(lambda x: tokenizer.tokenize(x), dataset_short))
    text = list(map(lambda x: tokenizer.detokenize(x), token_batch))
    return token_batch, text

test_tokenizer(tokenizer)

Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'NoneType' object has no attribute '_fields'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'NoneType' object has no attribute '_fields'


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: 'NoneType' object has no attribute '_fields'


2023-05-15 15:18:10 INFO     add_start_end started


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method


2023-05-15 15:18:12 INFO     add_start_end finished
2023-05-15 15:18:13 INFO     cleanup_text started
2023-05-15 15:18:13 INFO     cleanup_text finished
2023-05-15 15:18:13 INFO     cleanup_text started
2023-05-15 15:18:13 INFO     cleanup_text finished
2023-05-15 15:18:13 INFO     test_tokenizer started
2023-05-15 15:18:13 INFO     load_dataset started
2023-05-15 15:18:13 INFO     load_dataset finished
2023-05-15 15:18:14 INFO     add_start_end started
2023-05-15 15:18:14 INFO     add_start_end finished
2023-05-15 15:18:14 INFO     test_tokenizer finished


([<tf.RaggedTensor [[2, 82, 489, 326, 17, 3]]>,
  <tf.RaggedTensor [[2, 452, 553, 82, 628, 15, 104, 42, 926, 5409, 15, 325, 82, 4235, 85, 82,
    489, 326, 17, 3]]>],
 [<tf.Tensor: shape=(1,), dtype=string, numpy=array([b'[START] the happy prince . [END]'], dtype=object)>,
  <tf.Tensor: shape=(1,), dtype=string, numpy=
  array([b'[START] high above the city , on a tall column , stood the statue of the happy prince . [END]'],
        dtype=object)>])

In [9]:
model_name = 'story_corpus_tokenizer'
tf.saved_model.save(tokenizer, model_name)

2023-05-15 15:18:15 INFO     cleanup_text started
2023-05-15 15:18:15 INFO     cleanup_text finished
2023-05-15 15:18:15 INFO     cleanup_text started
2023-05-15 15:18:15 INFO     cleanup_text finished
2023-05-15 15:18:15 INFO     add_start_end started
2023-05-15 15:18:15 INFO     add_start_end finished
2023-05-15 15:18:16 INFO     add_start_end started
2023-05-15 15:18:16 INFO     add_start_end finished


INFO:tensorflow:Assets written to: story_corpus_tokenizer\assets


2023-05-15 15:18:16 INFO     Assets written to: story_corpus_tokenizer\assets


In [10]:
model_name = 'story_corpus_tokenizer'
reload_story_tokenizer = tf.saved_model.load(model_name)
reload_story_tokenizer.get_vocab_size().numpy()

7981

In [11]:
tokens = reload_story_tokenizer.tokenize(['Hello TensorFlow!'])
tokens.numpy()

array([[   2, 2090,  655,   93,  440, 1051, 4666,    4,    3]],
      dtype=int64)

In [12]:
text_tokens = reload_story_tokenizer.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'hello', b'ten', b'##s', b'##or', b'##f', b'##low', b'!',
  b'[END]']]>