In [34]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

# Only surface TensorFlow log records at ERROR level or above, so cell outputs stay readable.
tf.get_logger().setLevel('ERROR')
# Working directory of the kernel at the moment this cell runs.
pwd = pathlib.Path.cwd()

In [35]:
# Load the parallel sentence pairs; the `en` and `hv` columns are used below.
# NOTE(review): the rendered frame shows an `Unnamed: 0` column, which suggests the
# CSV was saved together with its index. Passing `index_col=0` would drop it — confirm.
df = pd.read_csv('data/en_hv_dataset.csv')
df

Unnamed: 0,en,hv
0,teen depression why .,depresyon sa mga tin - edyer ngaa .
1,what can help .,ano ang makabulig sa ila .
2,the compound heat shield of the saharan silver...,ang panagang sa init sang saharan silver ant .
3,do you have a feast constantly .,may ara ka bala sing dalayon nga piesta .
4,all the days of the afflicted one are bad but ...,"ang tanan nga adlaw sang napiotan malaut , apa..."
...,...,...
5251,i take the heavens and the earth as witnesses ...,ginahimo ko ang langit kag ang duta nga mga sa...
5252,"while you throw all your anxiety on him , beca...",samtang ginatugyan ninyo sa iya ang tanan niny...
5253,"draw close to god , and he will draw close to ...","magpalapit kamo sa dios , kag magapalapit sia ..."
5254,"cleanse your hands , you sinners , and purify ...","tinlui ang inyo mga kamot , kamo nga makasasal..."


## Word Tokenization

This notebook builds WordPiece tokenizers from the dataset. English and Hiligaynon words (and sub-word pieces) are extracted into one vocabulary per language, and each vocabulary entry is assigned an integer token ID.

In [36]:
# Pull the raw sentence columns out of the frame as arrays.
en_texts = df['en'].values
hv_texts = df['hv'].values

# Coerce every cell to `str` so the tokenizer only ever sees strings.
# NOTE(review): a missing cell (NaN) becomes the literal text 'nan' here.
en_samples = [str(txt) for txt in en_texts]
hv_samples = [str(txt) for txt in hv_texts]

print(len(en_samples))
print(len(hv_samples))

# Wrap the string lists in tf.data datasets (one element per sentence).
train_en = tf.data.Dataset.from_tensor_slices(en_samples)
train_hv = tf.data.Dataset.from_tensor_slices(hv_samples)

5256
5256


In [37]:
# Shared settings for learning the WordPiece vocabularies (used for both languages).
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

# Keyword arguments forwarded to `text.BertTokenizer`.
bert_tokenizer_params = {"lower_case": True}
# Special tokens; their positions in this list are used later to derive the
# [START] / [END] ids.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = {
    # The target vocabulary size
    "vocab_size": 8000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [38]:
# Learn the Hiligaynon WordPiece vocabulary from the sentences, fed in batches of 1000
# with the shared `bert_vocab_args` settings.
hv_vocab = bert_vocab.bert_vocab_from_dataset(
    train_hv.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [39]:
# Peek at four windows of the Hiligaynon vocabulary: head, two interior slices, tail.
print(hv_vocab[:10], hv_vocab[100:110], hv_vocab[1000:1010], hv_vocab[-10:], sep='\n')

['[PAD]', '[UNK]', '[START]', '[END]', "'", ',', '-', '.', '1', '2']
['##s', 'mag', 'paagi', 'pag', 'asawa', 'kabuhi', 'nagsiling', '##o', 'butang', 'gusto']
['##pangayo', '##panghikot', '##plano', '##pnag', '##po', '##son', '##tawag', '##tigulang', '##tima', '##tong']
['##1', '##2', '##c', '##j', '##q', '##v', '##x', '##z', '##¿', '##⁄']


In [40]:
def write_vocab_file(filepath, vocab):
    """Write a vocabulary to a UTF-8 text file, one token per line.

    Args:
        filepath: Destination path; an existing file is overwritten.
        vocab: Iterable of tokens, written in iteration order.
    """
    with open(filepath, 'w', encoding="utf-8") as handle:
        handle.writelines(f"{entry}\n" for entry in vocab)

In [41]:
# Persist the Hiligaynon vocabulary so a tokenizer can be built from the file.
write_vocab_file(filepath='data/hv_vocab.txt', vocab=hv_vocab)

In [42]:
# Learn the English WordPiece vocabulary from the sentences, fed in batches of 1000
# with the shared `bert_vocab_args` settings.
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

In [43]:
# Peek at four windows of the English vocabulary: head, two interior slices, tail.
print(en_vocab[:10], en_vocab[100:110], en_vocab[1000:1010], en_vocab[-10:], sep='\n')

['[PAD]', '[UNK]', '[START]', '[END]', "'", ',', '-', '.', '1', '2']
['time', 'her', 'an', '##t', 'him', 'does', 'some', 'good', 'no', 'even']
['disciples', 'discipline', 'eliezer', 'expect', 'fellow', 'filled', 'former', 'free', 'friendships', 'grow']
['##1', '##2', '##c', '##j', '##q', '##u', '##v', '##z', '##¿', '##⁄']


In [44]:
# Persist the English vocabulary so a tokenizer can be built from the file.
write_vocab_file(filepath='data/en_vocab.txt', vocab=en_vocab)

In [45]:
# Build one WordPiece tokenizer per language from its vocabulary file, passing the
# same `bert_tokenizer_params` that were supplied when learning the vocabularies.
hv_tokenizer = text.BertTokenizer('data/hv_vocab.txt', **bert_tokenizer_params) # create hv tokenizer
en_tokenizer = text.BertTokenizer('data/en_vocab.txt', **bert_tokenizer_params) # create en tokenizer

In [46]:
# Smoke-test the English tokenizer on the first three sentences.
en_tokenizer_samples = en_samples[:3]

print("> Texts:", en_tokenizer_samples, "", sep="\n")

# Tokenize, then flatten the word and word-piece axes into a single token axis.
en_token_batch = en_tokenizer.tokenize(en_tokenizer_samples).merge_dims(-2, -1)

print("> Tokens:")
for ex in en_token_batch.to_list():
    print(ex)

> Texts:
['teen depression why . ', 'what can help . ', 'the compound heat shield of the saharan silver ant . ']

> Tokens:
[29, 123, 291, 351, 165, 7]
[66, 60, 115, 7]
[38, 12, 520, 469, 576, 1377, 28, 241, 347, 421, 73, 41, 38, 28, 151, 241, 1285, 28, 347, 219, 1342, 102, 103, 7]


In [47]:
# Smoke-test the Hiligaynon tokenizer on the first three sentences.
hv_tokenizer_samples = hv_samples[:3]

print("> Texts:", hv_tokenizer_samples, "", sep="\n")

# Tokenize, then flatten the word and word-piece axes into a single token axis.
hv_token_batch = hv_tokenizer.tokenize(hv_tokenizer_samples).merge_dims(-2, -1)

print("> Tokens:")
for ex in hv_token_batch.to_list():
    print(ex)

> Texts:
['depresyon sa mga tin - edyer ngaa . ', 'ano ang makabulig sa ila . ', 'ang panagang sa init sang saharan silver ant . ']

> Tokens:
[292, 40, 42, 652, 6, 641, 140, 7]
[73, 39, 161, 40, 56, 7]
[39, 68, 218, 308, 90, 40, 49, 115, 41, 40, 260, 242, 66, 127, 1311, 334, 110, 115, 7]


In [48]:
# Ids of the [START] / [END] markers, taken from their positions in `reserved_tokens`.
# `tf.argmax` returns int64 scalars by default, so the columns built by `tf.fill`
# below are int64 as well.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  """Wrap every row of a ragged token-id batch as [START] ... [END].

  Args:
    ragged: `tf.RaggedTensor` of token ids with a leading batch axis
      (presumably int64 and shaped [batch, tokens] — confirm against callers).

  Returns:
    The batch with one START id prepended and one END id appended to each row.
  """
  batch_size = ragged.bounding_shape()[0]
  marker_shape = [batch_size, 1]
  return tf.concat(
      [tf.fill(marker_shape, START), ragged, tf.fill(marker_shape, END)],
      axis=1)

In [49]:
# Round-trip check: add the START/END ids, map the ids back to word strings,
# then join each row's words with single spaces.
words = en_tokenizer.detokenize(add_start_end(en_token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] teen depression why . [END]',
       b'[START] what can help . [END]',
       b'[START] the compound heat shield of the saharan silver ant . [END]'],
      dtype=object)>

In [50]:
def cleanup_text(reserved_tokens, token_txt):
    """Join detokenized words into sentences, dropping reserved tokens.

    Args:
        reserved_tokens: List of special token strings; all of them except
            "[UNK]" are removed from the text.
        token_txt: String tensor of words whose last axis is joined
            (presumably a ragged [batch, words] tensor — confirm against callers).

    Returns:
        String tensor with the words along the last axis joined by single spaces.
    """
    # Alternation that matches any reserved token other than "[UNK]" exactly.
    drop_pattern = "|".join(
        re.escape(tok) for tok in reserved_tokens if tok != "[UNK]"
    )

    is_reserved = tf.strings.regex_full_match(token_txt, drop_pattern)
    kept_words = tf.ragged.boolean_mask(token_txt, ~is_reserved)

    return tf.strings.reduce_join(kept_words, separator=' ', axis=-1)

In [51]:
# Tokenize the English samples again (no START/END ids added this time) and map
# the ids back to a ragged batch of word strings.
token_batch = en_tokenizer.tokenize(en_tokenizer_samples).merge_dims(-2,-1)
words = en_tokenizer.detokenize(token_batch)
words

<tf.RaggedTensor [[b'teen', b'depression', b'why', b'.'], [b'what', b'can', b'help', b'.'],
 [b'the', b'compound', b'heat', b'shield', b'of', b'the', b'saharan',
  b'silver', b'ant', b'.']                                           ]>

In [52]:
# Join each row's words into one sentence, dropping reserved tokens other than "[UNK]".
cleanup_text(reserved_tokens, words).numpy()

array([b'teen depression why .', b'what can help .',
       b'the compound heat shield of the saharan silver ant .'],
      dtype=object)

In [53]:
class CustomTokenizer(tf.Module):
    """Exportable wrapper around `text.BertTokenizer` for a single language.

    Bundles the tokenizer, its vocabulary file (tracked as a SavedModel asset)
    and the vocabulary contents, and traces the concrete functions that should
    be available after `tf.saved_model.save` / `tf.saved_model.load`.
    """

    def __init__(self, reserved_tokens, vocab_path):
        """Build the tokenizer and trace its export signatures.

        Args:
            reserved_tokens: List of special token strings (e.g. "[START]").
            vocab_path: Path to a UTF-8 vocabulary file with one token per line.
        """
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        # `pathlib.Path` takes no `encoding` argument (extra keywords are
        # deprecated since Python 3.12); the encoding belongs to `open()` only.
        file_path = pathlib.Path(vocab_path)

        # One vocabulary entry per line, in file order, so that position i in
        # `self.vocab` corresponds to token id i.
        with file_path.open(encoding='utf-8') as file:
            vocab = [line.strip() for line in file]

        self.vocab = tf.Variable(vocab)

        ## Create the signatures for export:

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(
            tf.TensorSpec(shape=[None], dtype=tf.string))

        # Include `detokenize` and `lookup` signatures for:
        #   * `Tensors` with shape [batch, tokens]
        #   * `RaggedTensors` with shape [batch, tokens]
        self.detokenize.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def tokenize(self, strings):
        """Convert a batch of strings to token ids wrapped in START/END ids."""
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2, -1)
        enc = add_start_end(enc)
        return enc

    @tf.function
    def detokenize(self, tokenized):
        """Convert token ids back to sentences, dropping reserved tokens except "[UNK]"."""
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary string stored at each token id."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries read from the vocabulary file."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file asset tracked by this module."""
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a string tensor."""
        return tf.constant(self._reserved_tokens)

In [54]:
# Attach both language tokenizers to one container module so they can be
# exported together as a single SavedModel.
tokenizers = tf.Module()
tokenizers.hv = CustomTokenizer(reserved_tokens, 'data/hv_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, 'data/en_vocab.txt')

In [55]:
# save model
model_name = 'ted_hrlr_translate_hv_en_converter'
tf.saved_model.save(tokenizers, model_name)

In [56]:
# reload and test
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()

1432

In [57]:
# Tokenize a Hiligaynon phrase with the reloaded tokenizer. The leading 2 and
# trailing 3 in the output are the [START] and [END] ids.
tokens = reloaded_tokenizers.hv.tokenize(['ano himuon ko'])
tokens.numpy()

array([[  2,  73, 135,  69,   3]], dtype=int64)