# Description

In this notebook, we will generate a subword vocabulary from a dataset in both English and Vietnamese.

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2' 
import tensorflow as tf
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured identically on every GPU, so
        # enable it on all of them instead of only the first one.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: report the problem and keep going with default
        # allocation (per the TensorFlow GPU guide this is raised when the
        # GPUs have already been initialized).
        print(e)

import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import regex as re
import string
import nltk

from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset
import tensorflow_text as tf_text

from read_file_utils import *

2024-07-27 16:57:53.415819: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-07-27 16:57:53.415845: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-07-27 16:57:53.435842: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Input sentence files, one per language (read with `read_text_file` below).
PATH_EN_FILE = r"data/processed_data/en_sent.txt"
PATH_VI_FILE = r"data/processed_data/vi_sent.txt"

# PATH_FOLDER_PROCESS = r"data/processed_data"
# Folder that receives the generated `en_vocab.txt` / `vi_vocab.txt` files.
PATH_FOLDER_VOCAB = r"data/vocab"

# 1. Read dataset

In [3]:
# Load both sides of the corpus.
# NOTE(review): `read_text_file` is presumably provided by the
# `from read_file_utils import *` line - confirm.
list_en_sentence = read_text_file(PATH_EN_FILE)
list_vi_sentence = read_text_file(PATH_VI_FILE)

# Both lists must have the same length so that entry i of one list can be
# paired with entry i of the other.
assert len(list_en_sentence) == len(list_vi_sentence)
print(f"Number of pair sentence: {len(list_en_sentence)}")

Number of pair sentence: 3265005


In [4]:
# Wrap each sentence list in a `tf.data.Dataset` so it can be batched and
# fed to the vocabulary builder later on.
train_en = tf.data.Dataset.from_tensor_slices(list_en_sentence)
train_vi = tf.data.Dataset.from_tensor_slices(list_vi_sentence)

In [5]:
# Sanity check: show only the first sentence pair (the loop stops after one
# iteration). Dataset elements are byte strings, hence the UTF-8 decode.
for en, vi in zip(train_en, train_vi):
    print("English:   ", en.numpy().decode('utf-8'))
    print("Vietnamese:   ", vi.numpy().decode('utf-8'))
    break

English:    it begins with a countdown .
Vietnamese:    câu chuyện bắt đầu với buổi lễ đếm ngược .


# 2. Generate vocabulary

- This section generates a vocabulary from a dataset. 
- Here, the vocabulary means the list of subword tokens that the `BertTokenizer` will use.

## 2.1. English Vocabulary

In [6]:
# Options forwarded to the BERT tokenizer while the vocabulary is learned.
# Left empty here; the commented alternative would lower-case the text first.
# bert_tokenizer_params=dict(lower_case=True)
bert_tokenizer_params = dict()

# Special tokens placed at the start of the vocabulary. Their order matters:
# the position in this list is later used as the id of "[START]" / "[END]".
RESERVED_TOKENS = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Target vocabulary size passed to the vocabulary builder.
VOCAB_SIZE = 100_000
# Backward-compatible alias for the original, misspelled constant name.
VOCAB_SIE = VOCAB_SIZE

# Keyword arguments shared by the English and the Vietnamese vocabulary runs.
bert_vocab_args = dict(
    vocab_size=VOCAB_SIZE,
    reserved_tokens=RESERVED_TOKENS,  # Reserved tokens that must be included in the vocabulary
    bert_tokenizer_params=bert_tokenizer_params,
    learn_params={},
)

In [7]:
%%time
# Learn the English vocabulary from the full English dataset, streamed in
# batches of 1000 sentences. This is the slow step of the notebook (the
# recorded run took several minutes).
en_vocab = bert_vocab_from_dataset.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(tf.data.AUTOTUNE),
    **bert_vocab_args
)

CPU times: user 6min 56s, sys: 2.59 s, total: 6min 59s
Wall time: 5min 58s


In [8]:
print(en_vocab[:10])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['from', 'but', 'me', 'an', 're', 'she', 'so', 'my', 'will', 'all']
['##화', '##ﬁ', '##ﬂ', '##️', '##＊', '##＋', '##，', '##－', '##�', '##𒀭']


- We save `en_vocab` to a text file so that it can be reused by later applications.

In [9]:
def write_vocab_file(filepath, vocab):
  """Write a vocabulary to a text file, one token per line.

  Args:
    filepath: Destination path. Missing parent folders are created.
    vocab: Iterable of tokens; each one is written on its own line.
  """
  # Create the target folder up front so a long vocabulary run is not lost
  # to a missing directory.
  parent_dir = os.path.dirname(filepath)
  if parent_dir:
    os.makedirs(parent_dir, exist_ok=True)

  # Always write UTF-8: the vocabulary contains non-ASCII tokens and the
  # platform default encoding is not guaranteed to be able to encode them.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)

write_vocab_file(os.path.join(PATH_FOLDER_VOCAB, 'en_vocab.txt'), en_vocab)

## 2.2. Vietnamese vocab

In [10]:
%%time
# Learn the Vietnamese vocabulary with exactly the same settings
# (`bert_vocab_args`) and batching as the English one.
vi_vocab = bert_vocab_from_dataset.bert_vocab_from_dataset(
    train_vi.batch(1000).prefetch(tf.data.AUTOTUNE),
    **bert_vocab_args
)

CPU times: user 6min 48s, sys: 2.16 s, total: 6min 50s
Wall time: 5min 22s


In [11]:
print(vi_vocab[:10])
print(vi_vocab[1000:1010])
print(vi_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '"', '#', '$', '%', '&']
['sẽ', 'chúng', 'những', 'ta', 'để', 'này', 'khi', 'làm', 'vào', 'ra']
['##해', '##현', '##호', '##화', '##️', '##＋', '##，', '##￼', '##�', '##𒀭']


In [13]:
write_vocab_file(os.path.join(PATH_FOLDER_VOCAB, 'vi_vocab.txt'), vi_vocab)

# 3. Build tokenizer

- After extracting the vocabulary, we can build the Tokenizer from it. 
- We can use the `tf_text.BertTokenizer` class for this.

In [14]:
# Build one tokenizer per language from the vocabulary files written above.
en_tokenizer = tf_text.BertTokenizer(os.path.join(PATH_FOLDER_VOCAB, 'en_vocab.txt'))
vi_tokenizer = tf_text.BertTokenizer(os.path.join(PATH_FOLDER_VOCAB, 'vi_vocab.txt'))

- We can test the tokenizer, which takes a string as input and returns the corresponding token indices.

In [20]:
vi_test = 'thành phố hồ chí minh ngập nước'
vi_token_idx = vi_tokenizer.tokenize(vi_test)
# Merge the last two axes so that the sentence becomes one flat row of
# token ids (see the printed result below).
vi_token_idx = vi_token_idx.merge_dims(-2, -1)
print(f"Vietnamese token index: {vi_token_idx}")

Vietnamese token index: <tf.RaggedTensor [[1040, 1445, 1645, 1434, 1420, 2881, 1082]]>


- We can map the token indices back to the original text with the tokenizer's `detokenize` method (alternatively, `tf.gather` on the vocabulary returns the raw subword tokens).

In [21]:
# Convert the token ids back to words with the tokenizer's own `detokenize`.
# Alternative kept for reference: look the ids up directly in the vocabulary.
# txt_tokens = tf.gather(vi_vocab, vi_token_idx)
txt_tokens = vi_tokenizer.detokenize(vi_token_idx)
print(f"Output ot detokenizer: {txt_tokens}")

# Join the words of each row with spaces, then take the first (and only)
# row and decode it into a Python string.
original_str = tf.strings.reduce_join(txt_tokens, separator=' ', axis=-1).numpy()[0].decode('utf-8')
original_str

Output ot detokenizer: <tf.RaggedTensor [[b'th\xc3\xa0nh', b'ph\xe1\xbb\x91', b'h\xe1\xbb\x93', b'ch\xc3\xad',
  b'minh', b'ng\xe1\xba\xadp', b'n\xc6\xb0\xe1\xbb\x9bc']]>


'thành phố hồ chí minh ngập nước'

# 4. Custom and Export

We will export the tokenizers using `tf.saved_model` so that they can be imported by other applications.

In [22]:
# Positions of "[START]" and "[END]" inside RESERVED_TOKENS, as tensors.
# NOTE(review): these are used as token ids, which is only valid while the
# vocabulary begins with RESERVED_TOKENS in the same order (the printed
# vocabulary heads above show that it does) - confirm if the vocab changes.
START = tf.argmax(tf.constant(RESERVED_TOKENS) == "[START]")
END = tf.argmax(tf.constant(RESERVED_TOKENS) == "[END]")

def add_start_end(ragged):
    """Surround every row of a batch of token ids with the START and END ids.

    Args:
        ragged: Batch of token ids exposing `bounding_shape()`; presumably a
            2-D `tf.RaggedTensor` of shape [batch, tokens] - confirm against
            callers.

    Returns:
        The input with one START id prepended and one END id appended to
        each row (concatenation along axis 1).
    """
    batch_size = ragged.bounding_shape()[0]
    # One single-element column per marker, with one entry for each row.
    column_shape = [batch_size, 1]
    start_column = tf.fill(column_shape, START)
    end_column = tf.fill(column_shape, END)
    return tf.concat([start_column, ragged, end_column], axis=1)

def cleanup_text(reserved_tokens, token_txt):
    """Turn rows of token strings into sentences, dropping reserved tokens.

    Args:
        reserved_tokens: List of reserved token strings. All of them except
            "[UNK]" are removed from the output.
        token_txt: Batch of token strings (one row per sentence).

    Returns:
        A string tensor in which the remaining tokens of each row are joined
        with single spaces along the last axis.
    """
    # A single alternation pattern that matches any reserved token other
    # than "[UNK]"; each token is escaped so that brackets are literal.
    droppable = (re.escape(tok) for tok in reserved_tokens if tok != "[UNK]")
    reserved_pattern = "|".join(droppable)

    # Keep only the cells that are not exactly one of those tokens.
    is_reserved = tf.strings.regex_full_match(token_txt, reserved_pattern)
    kept_tokens = tf.ragged.boolean_mask(token_txt, ~is_reserved)

    return tf.strings.reduce_join(kept_tokens, separator=' ', axis=-1)

- Next, we define a custom `tf.Module`, which will be used in the following sections.

In [23]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around `tf_text.BertTokenizer` for one language.

  On top of the plain tokenizer it adds the START/END ids when tokenizing,
  removes reserved tokens when detokenizing, and exposes the vocabulary and
  its metadata through `tf.function`s so that everything can be saved with
  `tf.saved_model.save`.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the signatures that will be exported.

    Args:
      reserved_tokens: List of reserved token strings (e.g. "[START]").
      vocab_path: Path of the vocabulary text file, one token per line.
    """
    self.tokenizer = tf_text.BertTokenizer(vocab_path)
    self._reserved_tokens = reserved_tokens
    # Register the file as an asset so it is copied next to the SavedModel.
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Decode explicitly as UTF-8: the vocabulary contains non-ASCII tokens
    # and the platform default encoding is not guaranteed to be UTF-8.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for int64 inputs of
    # shape [batch, tokens], both as dense `Tensor`s and as `RaggedTensor`s.
    self.detokenize.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Convert a batch of strings to token ids wrapped in START/END ids."""
    enc = self.tokenizer.tokenize(strings)
    # Merge the last two axes so that each input string maps to one flat
    # row of token ids.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Convert token ids back to sentences without the reserved tokens.

    Reserved tokens other than "[UNK]" are dropped and the remaining words
    of each row are joined with spaces (see `cleanup_text`).
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary entry for every token id, keeping the shape."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the vocabulary file registered as a SavedModel asset."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a constant string tensor."""
    return tf.constant(self._reserved_tokens)

- Next, we build the `CustomTokenizer` for each language.

In [24]:
# Group both language tokenizers under one parent module so that they are
# exported together as a single SavedModel (`tokenizers.en`, `tokenizers.vi`).
tokenizers = tf.Module()
tokenizers.en = CustomTokenizer(RESERVED_TOKENS, os.path.join(PATH_FOLDER_VOCAB, 'en_vocab.txt'))
tokenizers.vi = CustomTokenizer(RESERVED_TOKENS, os.path.join(PATH_FOLDER_VOCAB, 'vi_vocab.txt'))

In [25]:
# Export directory of the SavedModel. Note the spelling "tokeninzer" in the
# folder name: code that loads the model must use exactly this path.
model_name = os.path.join('data', 'tokeninzer_en_vi_converter')
tf.saved_model.save(tokenizers, model_name)

INFO:tensorflow:Assets written to: data/tokeninzer_en_vi_converter/assets


# 5. Test tokenizer

Finally, we check that our tokenizers work correctly.

## 5.1. English

In [26]:
en_test = 'i love you'

# `tokenize` works on batches, so the single sentence is wrapped in a list.
# The result starts with the START id and ends with the END id.
tokens = tokenizers.en.tokenize([en_test])
tokens.numpy()

array([[   2,   50, 1155,  972,    3]])

- We can `lookup` each token index and return corresponding text.

In [27]:
text_tokens = tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'i', b'love', b'you', b'[END]']]>

- We can use the `detokenize` function to convert the tokens back to the original sentence.

In [28]:
round_trip = tokenizers.en.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

i love you


## 5.2. Vietnamese

In [30]:
vi_test = 'hà nội không vội được đâu!'

# `tokenize` works on batches, so the single sentence is wrapped in a list.
# The result starts with the START id and ends with the END id.
tokens = tokenizers.vi.tokenize([vi_test])
tokens.numpy()

array([[   2, 1853, 1563,  988, 2924,  989, 1160,    4,    3]])

- We can `lookup` each token index and return corresponding text.

In [31]:
text_tokens = tokenizers.vi.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'h\xc3\xa0', b'n\xe1\xbb\x99i', b'kh\xc3\xb4ng',
  b'v\xe1\xbb\x99i', b'\xc4\x91\xc6\xb0\xe1\xbb\xa3c',
  b'\xc4\x91\xc3\xa2u', b'!', b'[END]']]>

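- We can use the `detokenize` function to convert the tokens back to the original sentence. Note that the punctuation mark is a separate token, so the result has a space before `!`.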

In [32]:
round_trip = tokenizers.vi.detokenize(tokens)

print(round_trip.numpy()[0].decode('utf-8'))

hà nội không vội được đâu !
