In [None]:
!pip install -q tensorflow_datasets
# `BertTokenizer.detokenize` is not in `tf-text` stable yet (currently 2.4.3).
!pip install -q tensorflow_text_nightly
# tf-text-nightly requires tf-nightly
!pip install -q tf-nightly 

In [None]:
import collections
import io
import os
import pathlib
import re
import sys
import numpy as np

import tensorflow as tf
import tensorflow_datasets as tfds
import tensorflow_text as text

import unicodedata

from sklearn.model_selection import train_test_split
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [None]:
from google.colab import drive
drive_mount_path = "/content/data"
drive.mount(drive_mount_path)

Drive already mounted at /content/data; to attempt to forcibly remount, call drive.mount("/content/data", force_remount=True).


In [None]:
# Build the corpus path from the Drive mount point so the mount location is
# defined in exactly one place (the model path further down is built the same way).
path_data_file = os.path.join(drive_mount_path, "MyDrive", "data", "rus-eng", "rus.txt")

In [None]:
def unicode_to_ascii(w):
    """Remove combining marks (accents / diacritics) from ``w``.

    The whole string is decomposed to NFD first, which splits precomposed
    characters into a base character followed by combining marks, and only
    then every nonspacing mark (Unicode category ``Mn``) is dropped.
    Filtering before decomposing would let the marks created by the
    decomposition slip through.

    Despite the name, non-ASCII base characters are kept unchanged; only the
    marks are removed. For Cyrillic this means that "й" becomes "и" and "ё"
    becomes "е".

    :param w: the text to normalise
    :return: ``w`` in NFD form with all nonspacing marks removed
    """
    decomposed = unicodedata.normalize("NFD", w)
    return ''.join(c for c in decomposed if unicodedata.category(c) != 'Mn')

word = "Оно там?"
unicode_to_ascii(word)

'Оно там?'

In [None]:
def preprocess_sentence(w):
    """Normalise one sentence before tokenisation.

    Steps, in order: pass the text through ``unicode_to_ascii``, insert a
    space in front of each of ``. ! ? ,``, replace double quotes with a space
    while collapsing any run of spaces/quotes into a single space, and strip
    leading and trailing whitespace.

    :param w: the raw sentence
    :return: the cleaned sentence
    """
    w = unicode_to_ascii(w)

    # Detach sentence punctuation from the preceding word.
    w = re.sub(r"([.!?,])", r" \1", w)
    # The `+` matters: without it a quote next to a space, or a space that
    # already preceded punctuation, leaves a double space in the result.
    w = re.sub(r'[" ]+', " ", w)

    return w.strip()

preprocess_sentence(word)

'Оно там ?'

In [None]:
def create_dataset(path, num_instance, from_end=False):
    """Read a tab-separated parallel corpus and return its first two columns.

    Every line is split on tabs; the first two fields are cleaned with
    ``preprocess_sentence`` and any further fields are ignored.

    :param path: path of the UTF-8 encoded text file
    :param num_instance: number of lines to use
    :param from_end: take the lines from the end of the file instead of the start
    :return: a ``zip`` iterator over the columns, i.e. one tuple holding the
        first field of every selected line and one holding the second field
        (in the rus-eng file used in this notebook: English, then Russian)
    """
    # Close the file deterministically instead of leaving the handle open.
    with io.open(path, encoding="utf-8") as f:
        lines = f.read().strip().split("\n")

    if not from_end:
        lines = lines[:num_instance]
    elif num_instance == 0:
        # lines[-0:] is the whole list, which is the opposite of "no lines".
        lines = []
    else:
        lines = lines[-num_instance:]

    return zip(*[[preprocess_sentence(w) for w in line.split("\t")[:2]] for line in lines])

a, b = create_dataset(path_data_file, 1000, from_end=True)
a[0], b[0]

("Since I'm not so good at swimming , I avoid swimming in water that's over my head .",
 'Поскольку я не очень хорошо плаваю , я избегаю плавания там , где я ухожу с головой под воду .')

In [None]:
# Wrap every sentence of both columns in a scalar string tensor.
x = [tf.convert_to_tensor(sentence) for sentence in a]
y = [tf.convert_to_tensor(sentence) for sentence in b]

# Print only the first (x, y) pair as a sanity check.
for i in zip(x, y):
    print(i)
    break

(<tf.Tensor: shape=(), dtype=string, numpy=b"Since I'm not so good at swimming , I avoid swimming in water that's over my head .">, <tf.Tensor: shape=(), dtype=string, numpy=b'\xd0\x9f\xd0\xbe\xd1\x81\xd0\xba\xd0\xbe\xd0\xbb\xd1\x8c\xd0\xba\xd1\x83 \xd1\x8f \xd0\xbd\xd0\xb5 \xd0\xbe\xd1\x87\xd0\xb5\xd0\xbd\xd1\x8c \xd1\x85\xd0\xbe\xd1\x80\xd0\xbe\xd1\x88\xd0\xbe \xd0\xbf\xd0\xbb\xd0\xb0\xd0\xb2\xd0\xb0\xd1\x8e , \xd1\x8f \xd0\xb8\xd0\xb7\xd0\xb1\xd0\xb5\xd0\xb3\xd0\xb0\xd1\x8e \xd0\xbf\xd0\xbb\xd0\xb0\xd0\xb2\xd0\xb0\xd0\xbd\xd0\xb8\xd1\x8f \xd1\x82\xd0\xb0\xd0\xbc , \xd0\xb3\xd0\xb4\xd0\xb5 \xd1\x8f \xd1\x83\xd1\x85\xd0\xbe\xd0\xb6\xd1\x83 \xd1\x81 \xd0\xb3\xd0\xbe\xd0\xbb\xd0\xbe\xd0\xb2\xd0\xbe\xd0\xb8\xcc\x86 \xd0\xbf\xd0\xbe\xd0\xb4 \xd0\xb2\xd0\xbe\xd0\xb4\xd1\x83 .'>)


In [None]:
len(x), len(y)

(1000, 1000)

In [None]:
BUFFER_SIZE = 100
BATCH_SIZE = 64

dataset = tf.data.Dataset.from_tensor_slices((x, y))
# Cache *before* shuffling: caching the output of `shuffle` would store the
# first shuffled order and replay that same order on every later iteration.
dataset = dataset.cache().shuffle(BUFFER_SIZE).prefetch(1)
dataset

<PrefetchDataset shapes: ((), ()), types: (tf.string, tf.string)>

In [None]:
# `dataset` was built from (x, y), i.e. (English, Russian) sentence pairs,
# so label the two sides accordingly.
for en, ru in dataset.take(1):
    print("English: ", en.numpy().decode('utf-8'))
    print("Russian: ", ru.numpy().decode('utf-8'))

Portuguese:  A lie can travel halfway around the world while the truth is putting on its shoes .
English:    Ложь может обойти полмира , пока правда ещё обувается .


In [None]:

BUFFER_SIZE = 100
BATCH_SIZE = 64


def tokenize(texts):
    """Fit a Keras ``Tokenizer`` on ``texts`` and encode the same texts.

    The tokenizer is created with ``filters=''`` and the encoded sequences
    are padded with ``padding="post"``.

    :param texts: the texts to fit on and to encode
    :return: a tuple ``(padded_sequences, tokenizer)``
    """
    keras_pre = tf.keras.preprocessing

    tokenizer = keras_pre.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(texts)

    sequences = tokenizer.texts_to_sequences(texts)
    padded = keras_pre.sequence.pad_sequences(sequences, padding="post")

    return padded, tokenizer


def load_dataset(path, num_instance):
    """Load the corpus and encode both columns with word-level tokenizers.

    The first column returned by ``create_dataset`` is treated as the target
    side and the second as the input side; each side gets its own tokenizer
    from ``tokenize``.

    :param path: path of the corpus file
    :param num_instance: number of lines to read from the start of the file
    :return: ``(tar_tensor, inp_tensor, tar_tokenizer, inp_tokenizer)``
    """
    target_texts, input_texts = create_dataset(path, num_instance)

    # Target side first, then input side.
    target_ids, target_tokenizer = tokenize(target_texts)
    input_ids, input_tokenizer = tokenize(input_texts)

    return target_ids, input_ids, target_tokenizer, input_tokenizer


def load_texts_train_val_dataset(path, num_instance, from_end=False, random_state=None):
    """Load the corpus as raw text and split it into train/validation datasets.

    The first column returned by ``create_dataset`` is used as the target and
    the second as the input. Each element of the returned datasets is an
    ``(input, target)`` pair of scalar string tensors; with the rus-eng file
    used in this notebook that is ``(Russian, English)``.

    :param path: path of the corpus file
    :param num_instance: number of lines to use
    :param from_end: take the lines from the end of the file instead of the start
    :param random_state: forwarded to ``train_test_split``; pass an integer to
        get the same split on every run (the default ``None`` keeps the
        split random)
    :return: a tuple ``(train_dataset, val_dataset)`` of ``tf.data.Dataset``
    """
    tar, inp = create_dataset(path, num_instance, from_end=from_end)

    inp = [tf.convert_to_tensor(sentence) for sentence in inp]
    tar = [tf.convert_to_tensor(sentence) for sentence in tar]

    # Inputs and targets are split together so the pairs stay aligned.
    train_inp, val_inp, train_tar, val_tar = train_test_split(
        inp, tar, random_state=random_state)

    train_dataset = tf.data.Dataset.from_tensor_slices((train_inp, train_tar))
    val_dataset = tf.data.Dataset.from_tensor_slices((val_inp, val_tar))

    return train_dataset, val_dataset


In [None]:
# NUM_EXAMPLES = 150000

# tar_tensor, inp_tensor, tar_tokenizer, inp_tokenizer = load_dataset(path_data_file, NUM_EXAMPLES)

# max_len_tar = tar_tensor.shape[1]
# max_len_inp = inp_tensor.shape[1]

# max_len_tar, max_len_inp


In [None]:
# inp_tensor_train, inp_tensor_val, tar_tensor_train, tar_tensor_val = train_test_split(inp_tensor, tar_tensor)

# len(inp_tensor_train), len(inp_tensor_val)

In [None]:

# dataset = tf.data.Dataset.from_tensor_slices((inp_tensor_train, tar_tensor_train))
# dataset = dataset.shuffle(BUFFER_SIZE).cache().batch(BATCH_SIZE, drop_remainder=True).prefetch(1)
# dataset

In [None]:
NUM_EXAMPLES = 30000
train_dataset, val_dataset = load_texts_train_val_dataset(path_data_file, NUM_EXAMPLES, from_end=True)

In [None]:
# Show a single (Russian, English) training pair.
for i in train_dataset.take(1):
    print(i)

(<tf.Tensor: shape=(), dtype=string, numpy=b'\xd0\xaf \xd1\x85\xd0\xbe\xd1\x82\xd0\xb5\xd0\xbb \xd0\xbe\xd1\x81\xd1\x82\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1\x8f \xd0\xb2 \xd0\x90\xd0\xb2\xd1\x81\xd1\x82\xd1\x80\xd0\xb0\xd0\xbb\xd0\xb8\xd0\xb8 \xd0\xb5\xd1\x89\xd0\xb5\xcc\x88 \xd0\xbd\xd0\xb0 \xd0\xbf\xd0\xb0\xd1\x80\xd1\x83 \xd0\xb4\xd0\xbd\xd0\xb5\xd0\xb8\xcc\x86 .'>, <tf.Tensor: shape=(), dtype=string, numpy=b'I wanted to stay in Australia a couple of extra days .'>)


In [None]:
# Elements of `train_dataset` are (ru, en) pairs; derive one single-language
# dataset per side.
train_en = train_dataset.map(lambda ru, en: en)
train_ru = train_dataset.map(lambda ru, en: ru)

In [None]:
# Options shared by the vocabulary builder and by the `text.BertTokenizer`
# instances created later in the notebook.
bert_tokenizer_params = dict(lower_case=True)
# Special tokens handed to the vocabulary builder. The generated vocabularies
# printed below start with these four entries in this order, and later cells
# use a token's index in this list as its id.
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]
# reserved_tokens = []
# Keyword arguments that are unpacked into `bert_vocab_from_dataset`.
bert_vocab_args = dict(
    vocab_size = 8000,
    reserved_tokens=reserved_tokens,
    bert_tokenizer_params=bert_tokenizer_params,
    learn_params={}
)

In [None]:
%%time
ru_vocab = bert_vocab.bert_vocab_from_dataset(
    train_ru.batch(1000).prefetch(2),
    **bert_vocab_args
)
en_vocab = bert_vocab.bert_vocab_from_dataset(
    train_en.batch(1000).prefetch(2),
    **bert_vocab_args
)

CPU times: user 1min 27s, sys: 800 ms, total: 1min 28s
Wall time: 1min 28s


In [None]:
print(ru_vocab[:10])
print(ru_vocab[100:110])
print(ru_vocab[1000:1010])
print(ru_vocab[-10:])

print(en_vocab[:10])
print(en_vocab[100:110])
print(en_vocab[1000:1010])
print(en_vocab[-10:])

['[PAD]', '[UNK]', '[START]', '[END]', '!', '$', '%', "'", '+', ',']
['когда', '##и', 'чтобы', '##а', 'то', 'если', 'мы', 'тома', 'думаю', 'по']
['поезде', 'работал', 'ребенком', 'самых', 'смогла', 'считают', 'томе', 'триста', 'услышал', 'шесть']
['##y', '##z', '##«', '##»', '##щ', '##ъ', '##э', '##–', '##—', '##’']
['[PAD]', '[UNK]', '[START]', '[END]', '!', '$', '%', "'", '+', ',']
['him', 'one', 'she', 'would', 'there', 'like', 've', 'were', 'will', 'who']
['##ch', '##nce', '##ood', '##ore', '##ose', '##tion', 'certain', 'chinese', 'completely', 'kids']
['##?', '##@', '##c', '##j', '##q', '##v', '##x', '##z', '##—', '##’']


In [None]:
def write_to_file(filepath, vocab):
    """Write ``vocab`` to ``filepath`` as UTF-8 text, one token per line.

    The encoding is fixed explicitly because the vocabulary contains
    non-ASCII (Cyrillic) tokens and the platform default encoding is not
    guaranteed to be UTF-8. An existing file is overwritten.

    :param filepath: destination path
    :param vocab: iterable of tokens
    """
    with open(filepath, 'w', encoding='utf-8') as f:
        for token in vocab:
            print(token, file=f)

In [None]:
ru_vocab_file = "ru_vocab.txt"
en_vocab_file = "en_vocab.txt"

write_to_file(ru_vocab_file, ru_vocab)
write_to_file(en_vocab_file, en_vocab)

In [None]:
ru_tokenizer = text.BertTokenizer(ru_vocab_file, **bert_tokenizer_params)
en_tokenizer = text.BertTokenizer(en_vocab_file, **bert_tokenizer_params)

In [None]:
for ru_examples, en_examples in train_dataset.batch(3).take(1):
    for ex in en_examples:
        print(ex.numpy())
    for ex in ru_examples:
        print(ex.numpy())

b'I wanted to stay in Australia a couple of extra days .'
b'I hope the day will soon come when we can travel to the moon .'
b"We've all lived in Australia since we were born ."
b'\xd0\xaf \xd1\x85\xd0\xbe\xd1\x82\xd0\xb5\xd0\xbb \xd0\xbe\xd1\x81\xd1\x82\xd0\xb0\xd1\x82\xd1\x8c\xd1\x81\xd1\x8f \xd0\xb2 \xd0\x90\xd0\xb2\xd1\x81\xd1\x82\xd1\x80\xd0\xb0\xd0\xbb\xd0\xb8\xd0\xb8 \xd0\xb5\xd1\x89\xd0\xb5\xcc\x88 \xd0\xbd\xd0\xb0 \xd0\xbf\xd0\xb0\xd1\x80\xd1\x83 \xd0\xb4\xd0\xbd\xd0\xb5\xd0\xb8\xcc\x86 .'
b'\xd0\xaf \xd0\xbd\xd0\xb0\xd0\xb4\xd0\xb5\xd1\x8e\xd1\x81\xd1\x8c , \xd1\x87\xd1\x82\xd0\xbe \xd1\x81\xd0\xba\xd0\xbe\xd1\x80\xd0\xbe \xd0\xbd\xd0\xb0\xd1\x81\xd1\x82\xd0\xb0\xd0\xbd\xd0\xb5\xd1\x82 \xd1\x82\xd0\xbe\xd1\x82 \xd0\xb4\xd0\xb5\xd0\xbd\xd1\x8c , \xd0\xba\xd0\xbe\xd0\xb3\xd0\xb4\xd0\xb0 \xd0\xbc\xd1\x8b \xd1\x81\xd0\xbc\xd0\xbe\xd0\xb6\xd0\xb5\xd0\xbc \xd0\xbf\xd1\x83\xd1\x82\xd0\xb5\xd1\x88\xd0\xb5\xd1\x81\xd1\x82\xd0\xb2\xd0\xbe\xd0\xb2\xd0\xb0\xd1\x82\xd1\x8c \xd0\xbd\xd0\xb0

In [None]:
token_batch = en_tokenizer.tokenize(en_examples)
print(token_batch)
token_batch = token_batch.merge_dims(-2, -1)
for ex in token_batch.to_list():
    print(ex)

<tf.RaggedTensor [[[34], [149], [54], [281], [59], [155], [26], [610], [60], [2113], [342], [11]], [[34], [421], [55], [170], [108], [267], [180], [88], [81], [87], [971], [54], [55], [1284], [11]], [[81], [7], [106], [120], [343], [59], [155], [265], [81], [107], [504], [11]]]>
[34, 149, 54, 281, 59, 155, 26, 610, 60, 2113, 342, 11]
[34, 421, 55, 170, 108, 267, 180, 88, 81, 87, 971, 54, 55, 1284, 11]
[81, 7, 106, 120, 343, 59, 155, 265, 81, 107, 504, 11]


In [None]:
text_tokens = tf.gather(en_vocab, token_batch)
tf.strings.reduce_join(text_tokens, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'i wanted to stay in australia a couple of extra days .',
       b'i hope the day will soon come when we can travel to the moon .',
       b"we ' ve all lived in australia since we were born ."],
      dtype=object)>

In [None]:
words = en_tokenizer.detokenize(token_batch)
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'i wanted to stay in australia a couple of extra days .',
       b'i hope the day will soon come when we can travel to the moon .',
       b"we ' ve all lived in australia since we were born ."],
      dtype=object)>

In [None]:
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
    """Surround every row of ``ragged`` with the START and END token ids.

    :param ragged: ragged batch of token ids, one row per sentence
    :return: the batch with one START column prepended and one END column
        appended along axis 1
    """
    batch_size = ragged.bounding_shape()[0]
    column_shape = [batch_size, 1]

    return tf.concat(
        [tf.fill(column_shape, START), ragged, tf.fill(column_shape, END)],
        axis=1,
    )

words = en_tokenizer.detokenize(add_start_end(token_batch))
tf.strings.reduce_join(words, separator=' ', axis=-1)

<tf.Tensor: shape=(3,), dtype=string, numpy=
array([b'[START] i wanted to stay in australia a couple of extra days . [END]',
       b'[START] i hope the day will soon come when we can travel to the moon . [END]',
       b"[START] we ' ve all lived in australia since we were born . [END]"],
      dtype=object)>

In [None]:
def cleanup_text(reserved_tokens, token_txt):
    """Drop reserved tokens from ``token_txt`` and join the rest into strings.

    Every cell that exactly matches one of ``reserved_tokens`` is removed,
    except ``"[UNK]"``, which is kept. The remaining cells are joined along
    the last axis with a single space.

    :param reserved_tokens: list of reserved token strings
    :param token_txt: ragged string tensor of words
    :return: string tensor with the last axis of ``token_txt`` joined
    """
    # One alternation pattern covering every reserved token that must go.
    drop_pattern = "|".join(
        re.escape(token) for token in reserved_tokens if token != "[UNK]"
    )

    is_reserved = tf.strings.regex_full_match(token_txt, drop_pattern)
    kept = tf.ragged.boolean_mask(token_txt, ~is_reserved)

    return tf.strings.reduce_join(kept, separator=' ', axis=-1)

cleanup_text(reserved_tokens, words).numpy()

array([b'i wanted to stay in australia a couple of extra days .',
       b'i hope the day will soon come when we can travel to the moon .',
       b"we ' ve all lived in australia since we were born ."],
      dtype=object)

In [None]:
class CustomeTokenizer(tf.Module):
    """Exportable wrapper around ``text.BertTokenizer`` for one language.

    The module bundles the tokenizer, its vocabulary file (as a SavedModel
    asset) and the vocabulary itself, and traces the concrete functions that
    should be available after ``tf.saved_model.save`` / ``load``.
    """

    def __init__(self, reserved_tokens, vocab_path):
        """Build the tokenizer and trace its exported signatures.

        :param reserved_tokens: list of reserved token strings
        :param vocab_path: path of the vocabulary file, one token per line
        """
        self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
        self._reserved_tokens = reserved_tokens
        self._vocab_path = tf.saved_model.Asset(vocab_path)

        # Read the vocabulary as UTF-8 explicitly; it contains non-ASCII
        # tokens and the platform default encoding may differ.
        vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
        self.vocab = tf.Variable(vocab)

        # Include a tokenize signature for a batch of strings.
        self.tokenize.get_concrete_function(tf.TensorSpec(shape=[None], dtype=tf.string))

        # Include `detokenize` and `lookup` signatures for `Tensors` and
        # `RaggedTensors` with shape [batch, tokens].
        self.detokenize.get_concrete_function(tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.detokenize.get_concrete_function(tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        self.lookup.get_concrete_function(
            tf.TensorSpec(shape=[None, None], dtype=tf.int64))
        self.lookup.get_concrete_function(
            tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

        # These `get_*` methods take no arguments
        self.get_vocab_size.get_concrete_function()
        self.get_vocab_path.get_concrete_function()
        self.get_reserved_tokens.get_concrete_function()

    @tf.function
    def detokenize(self, tokenized):
        """Turn token ids back into text.

        :param tokenized: batch of token ids
        :return: the detokenized words passed through ``cleanup_text``, i.e.
            joined into one string per row with reserved tokens other than
            "[UNK]" removed
        """
        words = self.tokenizer.detokenize(tokenized)
        return cleanup_text(self._reserved_tokens, words)

    @tf.function
    def tokenize(self, strings):
        """Tokenize a batch of strings into token ids.

        :param strings: batch of strings
        :return: ragged batch of token ids, each row wrapped by
            ``add_start_end``
        """
        enc = self.tokenizer.tokenize(strings)
        # Merge the `word` and `word-piece` axes.
        enc = enc.merge_dims(-2, -1)
        enc = add_start_end(enc)

        return enc

    @tf.function
    def lookup(self, token_ids):
        """Return the vocabulary entry for every id in ``token_ids``."""
        return tf.gather(self.vocab, token_ids)

    @tf.function
    def get_vocab_size(self):
        """Return the number of entries in the vocabulary."""
        return tf.shape(self.vocab)[0]

    @tf.function
    def get_vocab_path(self):
        """Return the vocabulary file asset."""
        return self._vocab_path

    @tf.function
    def get_reserved_tokens(self):
        """Return the reserved tokens as a constant string tensor."""
        return tf.constant(self._reserved_tokens)



In [None]:
tokenizers = tf.Module()
tokenizers.ru = CustomeTokenizer(reserved_tokens, ru_vocab_file)
tokenizers.en = CustomeTokenizer(reserved_tokens, en_vocab_file)

In [None]:
model_name = "ru_en_subwordtokenizer"
model_path = os.path.join(drive_mount_path, "MyDrive", "Models", model_name)
tf.saved_model.save(tokenizers, model_path)


INFO:tensorflow:Assets written to: /content/data/MyDrive/Models/ru_en_subwordtokenizer/assets


In [None]:
reloaded_tokenizers = tf.saved_model.load(model_path)
reloaded_tokenizers.en.get_vocab_size().numpy()

2215

In [None]:
tokens = reloaded_tokenizers.en.tokenize(["Hello From Tokenizer"])
tokens.numpy()

array([[   2,   61, 1569,  361,  111,   54, 1489,  301, 1564,  309,    3]])

In [None]:
text_tokens = reloaded_tokenizers.en.lookup(tokens)
text_tokens

<tf.RaggedTensor [[b'[START]', b'he', b'##ll', b'##o', b'from', b'to', b'##ke', b'##n', b'##ize', b'##r', b'[END]']]>

In [None]:
round_trip = reloaded_tokenizers.en.detokenize(tokens)
print(round_trip.numpy()[0].decode('utf-8'))

hello from tokenizer
