In [14]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time
import sys

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

import pandas as pd
from sklearn.model_selection import train_test_split

In [15]:
# Base directories: artefacts are read and written relative to the directory
# the notebook is executed from.
current_dir = os.getcwd()
source_dir = current_dir

# Language pair and data source. `ted_talks` selects between the TFDS
# `ted_hrlr_translate` corpus and a local TSV file further down.
first_language = 'pt'
second_language = 'en'
ted_talks = True

# Options forwarded to the BERT tokenizer, and the special tokens that are
# reserved in every generated vocabulary.
bert_tokenizer_params = {'lower_case': True}
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Keyword arguments for `bert_vocab.bert_vocab_from_dataset`.
bert_vocab_args = {
    'vocab_size': 8000,
    'reserved_tokens': reserved_tokens,
    'bert_tokenizer_params': bert_tokenizer_params,
    'learn_params': {},
}

# Positions of the "[START]" / "[END]" markers inside `reserved_tokens`.
# NOTE(review): these are used as token ids, which presumes the reserved
# tokens sit at the very beginning of each vocabulary file - confirm.
START = tf.argmax(tf.constant(reserved_tokens) == "[START]")
END = tf.argmax(tf.constant(reserved_tokens) == "[END]")

def add_start_end(ragged):
  """Surround every row of `ragged` with the START and END ids.

  Args:
    ragged: a two-dimensional batch of token ids (one row per example).
      NOTE(review): presumably an int64 `tf.RaggedTensor`, since
      `bounding_shape()` is called on it - confirm against callers.

  Returns:
    The concatenation `[START, row..., END]` along axis 1 for each row.
  """
  batch_size = ragged.bounding_shape()[0]
  column_shape = [batch_size, 1]
  # One single-element column per marker, as tall as the batch.
  start_column = tf.fill(column_shape, START)
  end_column = tf.fill(column_shape, END)
  return tf.concat([start_column, ragged, end_column], axis=1)

def cleanup_text(reserved_tokens, token_txt):
  """Strip reserved tokens from detokenized words and join them into text.

  Args:
    reserved_tokens: iterable of reserved token strings. Every one of them
      except "[UNK]" is removed from the output.
    token_txt: string tensor of words; the last axis is the one that gets
      joined. NOTE(review): presumably ragged, as produced by
      `BertTokenizer.detokenize` - confirm.

  Returns:
    A string tensor in which the remaining words of each row are joined
    with single spaces.
  """
  # Regex alternation matching any reserved token that must be dropped;
  # "[UNK]" is deliberately kept so unknown words remain visible.
  droppable_pattern = "|".join(
      re.escape(token) for token in reserved_tokens if token != "[UNK]")

  is_reserved = tf.strings.regex_full_match(token_txt, droppable_pattern)
  kept_words = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  return tf.strings.reduce_join(kept_words, separator=' ', axis=-1)

class CustomTokenizer(tf.Module):
  """Exportable BERT word-piece tokenizer for a single language.

  Wraps a `text.BertTokenizer` built from a vocabulary file and exposes
  `tokenize`, `detokenize`, `lookup` and a few metadata getters as
  `tf.function`s so the object can be written with `tf.saved_model.save`.
  """

  def __init__(self, reserved_tokens, vocab_path, tokenizer_params=None):
    """Builds the tokenizer and traces the functions that get exported.

    Args:
      reserved_tokens: list of reserved token strings; all of them except
        "[UNK]" are stripped again by `detokenize`.
      vocab_path: path to a UTF-8 vocabulary file with one token per line.
      tokenizer_params: optional dict of keyword arguments for
        `text.BertTokenizer`. Defaults to the notebook-level
        `bert_tokenizer_params`, i.e. the same options that were used to
        build the vocabulary, so the two cannot drift apart.
    """
    if tokenizer_params is None:
      # Previously hard-coded as `lower_case=True`; reuse the shared
      # configuration so vocabulary and tokenizer always agree.
      tokenizer_params = bert_tokenizer_params

    self.tokenizer = text.BertTokenizer(vocab_path, **tokenizer_params)
    self._reserved_tokens = reserved_tokens
    # Register the vocabulary file as an asset of the saved model.
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Keep the vocabulary in a string variable for id -> token lookups.
    vocab = pathlib.Path(vocab_path).read_text(encoding='utf-8').splitlines()
    self.vocab = tf.Variable(vocab)

    ## Trace every function once per supported input signature so that the
    ## concrete functions exist when the module is exported.

    # `tokenize` takes a batch (rank 1) of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # `detokenize` and `lookup` accept both dense and ragged int64 id
    # batches of shape [batch, tokens].
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # The getters take no arguments.
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Converts a batch of strings into ragged token ids.

    The tokenizer output has its last two axes merged into one token axis
    per input string, then each row is wrapped with the START / END ids via
    `add_start_end`.
    """
    enc = self.tokenizer.tokenize(strings)
    # Merge the two innermost axes into a single token axis.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Converts token ids back into one space-joined string per row.

    Reserved tokens other than "[UNK]" are removed by `cleanup_text`.
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Returns the vocabulary entry for every id in `token_ids`."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Returns the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Returns the vocabulary file asset."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Returns the reserved tokens as a constant string tensor."""
    return tf.constant(self._reserved_tokens)


def write_vocab_file(filepath, vocab):
  """Write `vocab` to `filepath` as UTF-8 text, one token per line.

  An existing file at `filepath` is overwritten.
  """
  with open(filepath, 'w', encoding='utf-8') as vocab_file:
    vocab_file.writelines(str(token) + '\n' for token in vocab)
      
def make_own_tokenizers(train_first, train_secound, first_language='',secound_language=''):
    """Learn a word-piece vocabulary per language and build BERT tokenizers.

    For each of the two datasets a vocabulary is generated with
    `bert_vocab.bert_vocab_from_dataset` (configured by `bert_vocab_args`),
    written to `<source_dir>/<language>_vocab.txt`, and loaded into a
    `text.BertTokenizer` configured by `bert_tokenizer_params`.

    Args:
        train_first: `tf.data.Dataset` of sentences in the first language.
        train_secound: `tf.data.Dataset` of sentences in the second language.
        first_language: name used as prefix of the first vocabulary file.
        secound_language: name used as prefix of the second vocabulary file.

    Returns:
        A `(first_tokenizer, secound_tokenizer)` tuple.
    """
    datasets = (train_first, train_secound)
    vocab_paths = [
        os.path.join(source_dir, f'{language}_vocab.txt')
        for language in (first_language, secound_language)
    ]

    # Learn both vocabularies before anything is written to disk.
    vocabularies = [
        bert_vocab.bert_vocab_from_dataset(
            dataset.batch(1000).prefetch(2),
            **bert_vocab_args
        )
        for dataset in datasets
    ]

    for vocab_path, vocabulary in zip(vocab_paths, vocabularies):
        write_vocab_file(vocab_path, vocabulary)

    # Tokenizers are created only after both vocabulary files exist.
    first_tokenizer, secound_tokenizer = (
        text.BertTokenizer(vocab_path, **bert_tokenizer_params)
        for vocab_path in vocab_paths
    )

    return first_tokenizer, secound_tokenizer

In [16]:
# Working directory for this language pair. It is derived from `current_dir`
# rather than from `source_dir`, because `source_dir` is reassigned below:
# building it from `source_dir` made a second run of this cell create a
# nested `<pair>/<pair>` directory.
dictionary = os.path.join(current_dir, f'{first_language}_{second_language}')
# Creates the directory if needed; still raises if the path is a regular file.
os.makedirs(dictionary, exist_ok=True)
source_dir = dictionary

if ted_talks:
    # TED talks corpus from TensorFlow Datasets, as (first, second) pairs.
    examples, metadata = tfds.load(
        f'ted_hrlr_translate/{first_language}_to_{second_language}',
        with_info=True,
        as_supervised=True,
    )
    train_examples, val_examples = examples['train'], examples['validation']

    train_first = train_examples.map(lambda first, second: first)
    train_second = train_examples.map(lambda first, second: second)
else:
    # Local tab-separated file `<source_dir>/<first>_<second>.tsv` without a
    # header row; only the two sentence columns are kept.
    df = pd.read_csv(
        os.path.join(source_dir, f'{first_language}_{second_language}.tsv'),
        sep='\t',
        header=None,
        names=[f'ID_{second_language}', second_language, f'ID_{first_language}', first_language],
    ).loc[:, [first_language, second_language]]

    train_examples, val_examples = train_test_split(df, test_size=0.05, random_state=42)

    train_first = train_examples[[first_language]]
    train_second = train_examples[[second_language]]

    train_first = tf.data.Dataset.from_tensor_slices(train_first.values)
    train_second = tf.data.Dataset.from_tensor_slices(train_second.values)

# Both branches end the same way: learn the vocabularies and write the
# `<language>_vocab.txt` files into `source_dir`.
make_own_tokenizers(train_first, train_second, first_language=first_language, secound_language=second_language)

In [None]:
# Directory the tokenizer SavedModel is exported to.
model_name = os.path.join(source_dir, f'translate_{first_language}_{second_language}_converter')

first_vocab_path = os.path.join(source_dir,f'{first_language}_vocab.txt')
second_vocab_path = os.path.join(source_dir,f'{second_language}_vocab.txt')

# Bundle both tokenizers in one module so they are saved and loaded together.
# NOTE(review): the attribute names `pt` / `en` are fixed and do not follow
# `first_language` / `second_language` - confirm this is intended.
tokenizers = tf.Module()
tokenizers.pt = CustomTokenizer(reserved_tokens, first_vocab_path)
tokenizers.en = CustomTokenizer(reserved_tokens, second_vocab_path)

tf.saved_model.save(tokenizers, model_name)

In [18]:
# Reload the exported tokenizers and run a tokenize -> detokenize round trip
# on one sample sentence per language.
reloaded_tokenizers = tf.saved_model.load(model_name)

round_trip_samples = (
    (reloaded_tokenizers.pt, 'Eu li sobre triceratops na enciclopédia.'),
    (reloaded_tokenizers.en, 'I read about triceratops in the encyclopedia.'),
)

for sample_tokenizer, sample_sentence in round_trip_samples:
    # Evaluated as in the original cell; the value is not displayed.
    sample_tokenizer.get_vocab_size().numpy()

    tokens = sample_tokenizer.tokenize([sample_sentence])

    round_trip = sample_tokenizer.detokenize(tokens)

    print(round_trip.numpy()[0].decode('utf-8'))

eu li sobre triceratops na enciclopedia .
i read about triceratops in the encyclopedia .
