# Subword tokenizer
subword的tokenizer介于word-based和character-based之间 word太大了 是有可能跟根据语义细分的 character-based太小了 字母级似乎没有任何意义
tf_text提供了三种subword level的 tokenizer
1. BertTokenizer是一种high level的接口 其集成了token split算法和WordPieceTokenizer 本质是对每个token进行vocab的int映射
2. WordpieceTokenizer是一种low level的接口 它仅仅实现了WordPiece算法 在调用之前必须分隔word 其本质也是token对vocab的int映射
3. sentencepieceTokenizer是实现了sentencepiece算法 其根据文本训练出了一个模型进行分词sub-word 分词效果很明显

In [1]:
import collections
import os
import pathlib
import re
import string
import sys
import tempfile
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf
import tensorflow.keras as keras

In [2]:
tf.get_logger().setLevel('ERROR')
os.environ['CUDA_VISIBLE_DEVICES'] = '1' # 使用 GPU 1
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
physical_devices = tf.config.list_physical_devices("GPU")
tf.config.experimental.set_memory_growth(physical_devices[0],True)
logical_devices = tf.config.list_logical_devices("GPU")

In [3]:
pwd = pathlib.Path.cwd()
pwd

PosixPath('/home/wy')

## 数据集

In [3]:
data_dir = 'tensorflow_study/tensorflow-text/data_dir/datasets/pt_to_en/'

In [5]:
en_train_list = list()

In [6]:
with open(data_dir + 'en.train', 'r') as en_train_file:
    while True:
        line = en_train_file.readline().strip()
        if not line:
            break
        en_train_list.append(line)


In [7]:
en_train_list[:10]

["amongst all the troubling deficits we struggle with today — we think of financial and economic primarily — the ones that concern me most is the deficit of political dialogue — our ability to address modern conflicts as they are , to go to the source of what they 're all about and to understand the key players and to deal with them .",
 'we who are diplomats , we are trained to deal with conflicts between states and issues between states .',
 'and i can tell you , our agenda is full .',
 'there is trade , there is disarmament , there is cross-border relations .',
 'but the picture is changing , and we are seeing that there are new key players coming onto the scene .',
 "`` we loosely call them `` '' groups . '' '' they may represent social , religious , political , economic , military realities . ''",
 'and we struggle with how to deal with them .',
 'the rules of engagement : how to talk , when to talk , and how to deal with them .',
 'let me show you a slide here which illustrates t

In [8]:
pt_train_list = list()

In [9]:
with open(data_dir + 'pt.train', 'r') as pt_train_file:
    while True:
        line = pt_train_file.readline().strip()
        if not line:
            break
        pt_train_list.append(line)

In [10]:
pt_train_list[:10]

['entre todas as grandes privações com que nos debatemos hoje — pensamos em financeiras e económicas primeiro — aquela que mais me preocupa é a falta de diálogo político — a nossa capacidade de abordar conflitos modernos como eles são , de ir à raiz do que eles são e perceber os agentes-chave e lidar com eles .',
 'nós que somos diplomatas , somos treinados para lidar com conflitos entre estados e problemas entre estados .',
 'e posso dizer-vos , a nossa agenda está lotada .',
 'há o comércio , o desarmamento , as relações inter-fronteiras .',
 'mas o cenário está a mudar , e estamos a ver que há novos agentes-chave a surgirem .',
 "`` nós chamamos-lhes , de forma vaga , `` '' grupos '' '' . podem representar realidades sociais , religiosas , políticas , económicas ou militares . ''",
 'e debatemo-nos sobre como lidar com elas .',
 'as regras de interacção : como falar , quando falar e como lidar com elas .',
 'deixem-me mostrar-vos aqui um diapositivos que ilustra o carácter dos confl

In [11]:
print(np.array(en_train_list).shape)

(51785,)


In [12]:
train_en = tf.constant(np.array(en_train_list), dtype=tf.string)
train_en[:10]

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b"amongst all the troubling deficits we struggle with today \xe2\x80\x94 we think of financial and economic primarily \xe2\x80\x94 the ones that concern me most is the deficit of political dialogue \xe2\x80\x94 our ability to address modern conflicts as they are , to go to the source of what they 're all about and to understand the key players and to deal with them .",
       b'we who are diplomats , we are trained to deal with conflicts between states and issues between states .',
       b'and i can tell you , our agenda is full .',
       b'there is trade , there is disarmament , there is cross-border relations .',
       b'but the picture is changing , and we are seeing that there are new key players coming onto the scene .',
       b"`` we loosely call them `` '' groups . '' '' they may represent social , religious , political , economic , military realities . ''",
       b'and we struggle with how to deal with them .',
       b'

In [13]:
train_pt = tf.constant(np.array(pt_train_list), dtype=tf.string)
train_pt[:10]

<tf.Tensor: shape=(10,), dtype=string, numpy=
array([b'entre todas as grandes priva\xc3\xa7\xc3\xb5es com que nos debatemos hoje \xe2\x80\x94 pensamos em financeiras e econ\xc3\xb3micas primeiro \xe2\x80\x94 aquela que mais me preocupa \xc3\xa9 a falta de di\xc3\xa1logo pol\xc3\xadtico \xe2\x80\x94 a nossa capacidade de abordar conflitos modernos como eles s\xc3\xa3o , de ir \xc3\xa0 raiz do que eles s\xc3\xa3o e perceber os agentes-chave e lidar com eles .',
       b'n\xc3\xb3s que somos diplomatas , somos treinados para lidar com conflitos entre estados e problemas entre estados .',
       b'e posso dizer-vos , a nossa agenda est\xc3\xa1 lotada .',
       b'h\xc3\xa1 o com\xc3\xa9rcio , o desarmamento , as rela\xc3\xa7\xc3\xb5es inter-fronteiras .',
       b'mas o cen\xc3\xa1rio est\xc3\xa1 a mudar , e estamos a ver que h\xc3\xa1 novos agentes-chave a surgirem .',
       b"`` n\xc3\xb3s chamamos-lhes , de forma vaga , `` '' grupos '' '' . podem representar realidades sociais , religi

In [14]:
en_dataset = tf.data.TextLineDataset([data_dir+'en.train'])
pt_dataset = tf.data.TextLineDataset([data_dir+'pt.train'])
# 从文件中读取line生成dataset

## 从所有数据中生成词汇表vocab
bert_vocab_from_dataset.bert_vocab_from_dataset()方法

In [15]:
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab

In [16]:
# bert_vocab_from_dataset的参数
bert_tokenizer_params=dict(lower_case=True)
reserved_tokens=["[PAD]", "[UNK]", "[START]", "[END]"]

bert_vocab_args = dict(
    # 词表大小
    vocab_size = 8000,
    # 特殊token
    reserved_tokens=reserved_tokens,
    # 参数
    bert_tokenizer_params=bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    learn_params={},
)

In [17]:
%%time
pt_vocab = bert_vocab.bert_vocab_from_dataset(
    pt_dataset.batch(1000).prefetch(2),
    **bert_vocab_args
)
# 生成了葡语的pt_vocab

CPU times: user 1min 39s, sys: 656 ms, total: 1min 40s
Wall time: 1min 26s


In [18]:
print(len(pt_vocab))
print(pt_vocab[:10])
print(pt_vocab[100:110])

7765
['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['no', 'por', 'mais', 'na', 'eu', 'esta', 'muito', 'isso', 'isto', 'sao']


In [19]:
with open(data_dir+'pt_vocab.txt','w') as pt_vocab_file:
    for each in pt_vocab:
        print(each, file=pt_vocab_file)
# 保存vocab词典

In [20]:
%%time
en_vocab = bert_vocab.bert_vocab_from_dataset(
    en_dataset.batch(1000).prefetch(2),
    **bert_vocab_args
)
# 生成了英语的pt_vocab

CPU times: user 1min 13s, sys: 344 ms, total: 1min 13s
Wall time: 1min


In [21]:
print(len(en_vocab))
print(en_vocab[:10])
print(en_vocab[100:110])

7010
['[PAD]', '[UNK]', '[START]', '[END]', '!', '#', '$', '%', '&', "'"]
['as', 'all', 'at', 'one', 'people', 're', 'like', 'if', 'our', 'from']


In [22]:
with open(data_dir+'en_vocab.txt','w') as en_vocab_file:
    for each in en_vocab:
        print(each, file=en_vocab_file)
# 保存vocab词典

## 基于vocab生成tokenizer
BertTokenizer方法 和tokenize_strings部分相同

In [23]:
pt_tokenizer = text.BertTokenizer(data_dir+'pt_vocab.txt', lower_case=True)
en_tokenizer = text.BertTokenizer(data_dir+'en_vocab.txt', lower_case=True)

In [24]:
en_train_list[0]

"amongst all the troubling deficits we struggle with today — we think of financial and economic primarily — the ones that concern me most is the deficit of political dialogue — our ability to address modern conflicts as they are , to go to the source of what they 're all about and to understand the key players and to deal with them ."

In [25]:
test_en_tokenized = en_tokenizer.tokenize(en_train_list[0])
# [batch, seq_len, token]

In [26]:
print(test_en_tokenized.shape)
print(test_en_tokenized[0].shape)
print(test_en_tokenized[0][0].shape)

(1, None, None)
(65, None)
(1,)


In [27]:
test_en_tokenized[0].values
# 拉直成tensor进行处理

<tf.Tensor: shape=(71,), dtype=int64, numpy=
array([2568,  101,   71,   56, 1548, 4593, 2159, 6437, 2364,   78, 2003,
         93,  208,   67,   78,  133,   74, 1332,   72,  638, 4039,   67,
         71,  615,   75, 3458,  114,  190,   80,   71, 2159, 6437,  893,
         74,  730, 2654,   67,  108,  859,   73, 1510,  832, 2725,  100,
         83,   86,   13,   73,  164,   73,   71,  948,   74,   90,   83,
          9,  105,  101,   95,   72,   73,  281,   71,  679, 6258,   72,
         73,  725,   93,  124,   15])>

In [28]:
text_tokens = tf.gather(en_vocab,test_en_tokenized[0].values)
text_tokens
# tf.gather进行映射 对应回sub-word

<tf.Tensor: shape=(71,), dtype=string, numpy=
array([b'amongst', b'all', b'the', b't', b'##ro', b'##ubling', b'de',
       b'##fic', b'##its', b'we', b'struggle', b'with', b'today',
       b'\xe2\x80\x94', b'we', b'think', b'of', b'financial', b'and',
       b'economic', b'primarily', b'\xe2\x80\x94', b'the', b'ones',
       b'that', b'concern', b'me', b'most', b'is', b'the', b'de',
       b'##fic', b'##it', b'of', b'political', b'dialogue',
       b'\xe2\x80\x94', b'our', b'ability', b'to', b'address', b'modern',
       b'conflicts', b'as', b'they', b'are', b',', b'to', b'go', b'to',
       b'the', b'source', b'of', b'what', b'they', b"'", b're', b'all',
       b'about', b'and', b'to', b'understand', b'the', b'key', b'players',
       b'and', b'to', b'deal', b'with', b'them', b'.'], dtype=object)>

In [29]:
print('raw sentence: ', en_train_list[0])
print('after tokenize: ',tf.strings.reduce_join(text_tokens, separator=' ', axis=-1).numpy())
# 值得注意的是 tokenize之后 troubling -> t ro ubling 很多词都被分隔了

raw sentence:  amongst all the troubling deficits we struggle with today — we think of financial and economic primarily — the ones that concern me most is the deficit of political dialogue — our ability to address modern conflicts as they are , to go to the source of what they 're all about and to understand the key players and to deal with them .
after tokenize:  b"amongst all the t ##ro ##ubling de ##fic ##its we struggle with today \xe2\x80\x94 we think of financial and economic primarily \xe2\x80\x94 the ones that concern me most is the de ##fic ##it of political dialogue \xe2\x80\x94 our ability to address modern conflicts as they are , to go to the source of what they ' re all about and to understand the key players and to deal with them ."


In [30]:
words = en_tokenizer.detokenize(test_en_tokenized[0])
print(words.shape)
words
# detokenize进行还原

(65, None)


<tf.RaggedTensor [[b'amongst'],
 [b'all'],
 [b'the'],
 [b'troubling'],
 [b'deficits'],
 [b'we'],
 [b'struggle'],
 [b'with'],
 [b'today'],
 [b'\xe2\x80\x94'],
 [b'we'],
 [b'think'],
 [b'of'],
 [b'financial'],
 [b'and'],
 [b'economic'],
 [b'primarily'],
 [b'\xe2\x80\x94'],
 [b'the'],
 [b'ones'],
 [b'that'],
 [b'concern'],
 [b'me'],
 [b'most'],
 [b'is'],
 [b'the'],
 [b'deficit'],
 [b'of'],
 [b'political'],
 [b'dialogue'],
 [b'\xe2\x80\x94'],
 [b'our'],
 [b'ability'],
 [b'to'],
 [b'address'],
 [b'modern'],
 [b'conflicts'],
 [b'as'],
 [b'they'],
 [b'are'],
 [b','],
 [b'to'],
 [b'go'],
 [b'to'],
 [b'the'],
 [b'source'],
 [b'of'],
 [b'what'],
 [b'they'],
 [b"'"],
 [b're'],
 [b'all'],
 [b'about'],
 [b'and'],
 [b'to'],
 [b'understand'],
 [b'the'],
 [b'key'],
 [b'players'],
 [b'and'],
 [b'to'],
 [b'deal'],
 [b'with'],
 [b'them'],
 [b'.']]>

In [31]:
words_ = tf.strings.reduce_join(words, separator=' ', axis=-1)

In [32]:
tf.strings.reduce_join(words_, separator=' ', axis=-1)

<tf.Tensor: shape=(), dtype=string, numpy=b"amongst all the troubling deficits we struggle with today \xe2\x80\x94 we think of financial and economic primarily \xe2\x80\x94 the ones that concern me most is the deficit of political dialogue \xe2\x80\x94 our ability to address modern conflicts as they are , to go to the source of what they ' re all about and to understand the key players and to deal with them .">

## 自定义detokenization
在上面那个detokenize部分看到 detokenize之后似乎不是很好理解 可以对其进行额外操作 有助于后续任务
1. clean text 清除[START] [PAD] [END]等token 因为这些在下游任务没有
2. 将一个一个token拼接起来 用空格隔开 join

In [33]:
def cleanup_text(token_txt, reserved_tokens=None):
    """
    丢弃保留词 并拉直
    :param reserved_tokens: 保留此表
    :param token_txt: 待处理文本
    :return: 处理后的文本
    """
    if reserved_tokens is None:
        reserved_tokens = ["[PAD]", "[START]", "[END]"]
    bad_token_re = "|".join(reserved_tokens)
    # 丢弃词的正则表达式

    bad_cells = tf.strings.regex_full_match(token_txt, bad_token_re)
    # 得到boolean值的list True位置是符合正则表达的
    result = tf.ragged.boolean_mask(token_txt, ~bad_cells)
    # 对bad_cell取非 然后使用boolean_mask进行丢弃
    result = tf.strings.reduce_join(result, separator=' ', axis=-1)
    # 使用reduce_join进行拼接
    return  result

In [34]:
en_examples = None
for en_examples in en_dataset.batch(3).take(1):
    print(en_examples)

tf.Tensor(
[b"amongst all the troubling deficits we struggle with today \xe2\x80\x94 we think of financial and economic primarily \xe2\x80\x94 the ones that concern me most is the deficit of political dialogue \xe2\x80\x94 our ability to address modern conflicts as they are , to go to the source of what they 're all about and to understand the key players and to deal with them ."
 b'we who are diplomats , we are trained to deal with conflicts between states and issues between states .'
 b'and i can tell you , our agenda is full .'], shape=(3,), dtype=string)


In [35]:
print(en_examples.numpy())

[b"amongst all the troubling deficits we struggle with today \xe2\x80\x94 we think of financial and economic primarily \xe2\x80\x94 the ones that concern me most is the deficit of political dialogue \xe2\x80\x94 our ability to address modern conflicts as they are , to go to the source of what they 're all about and to understand the key players and to deal with them ."
 b'we who are diplomats , we are trained to deal with conflicts between states and issues between states .'
 b'and i can tell you , our agenda is full .']


In [36]:
token_batch = en_tokenizer.tokenize(en_examples)
print(token_batch.shape)
token_batch

(3, None, None)


<tf.RaggedTensor [[[2568], [101], [71], [56, 1548, 4593], [2159, 6437, 2364], [78], [2003],
  [93], [208], [67], [78], [133], [74], [1332], [72], [638], [4039], [67],
  [71], [615], [75], [3458], [114], [190], [80], [71], [2159, 6437, 893],
  [74], [730], [2654], [67], [108], [859], [73], [1510], [832], [2725],
  [100], [83], [86], [13], [73], [164], [73], [71], [948], [74], [90], [83],
  [9], [105], [101], [95], [72], [73], [281], [71], [679], [6258], [72],
  [73], [725], [93], [124], [15]]                                           ,
 [[78], [136], [86], [40, 2423, 2425, 1361, 803], [13], [78], [86], [1876],
  [73], [725], [93], [2725], [284], [451], [72], [1127], [284], [451], [15]],
 [[72], [45], [94], [224], [79], [13], [108], [3432], [80], [636], [15]]]>

In [37]:
# 由于最后一维没有用，拼接起来
token_batch = token_batch.merge_dims(-2,-1)
# merge_dims将倒数第二维和倒数第一维拼起来
print(token_batch.shape)
token_batch

(3, None)


<tf.RaggedTensor [[2568, 101, 71, 56, 1548, 4593, 2159, 6437, 2364, 78, 2003, 93, 208, 67,
  78, 133, 74, 1332, 72, 638, 4039, 67, 71, 615, 75, 3458, 114, 190, 80, 71,
  2159, 6437, 893, 74, 730, 2654, 67, 108, 859, 73, 1510, 832, 2725, 100,
  83, 86, 13, 73, 164, 73, 71, 948, 74, 90, 83, 9, 105, 101, 95, 72, 73,
  281, 71, 679, 6258, 72, 73, 725, 93, 124, 15]                             ,
 [78, 136, 86, 40, 2423, 2425, 1361, 803, 13, 78, 86, 1876, 73, 725, 93,
  2725, 284, 451, 72, 1127, 284, 451, 15]                               ,
 [72, 45, 94, 224, 79, 13, 108, 3432, 80, 636, 15]]>

In [38]:
words = en_tokenizer.detokenize(token_batch)
words
# detokenize的结果 假设这就是一个翻译的结果 现在要进行clean处理

<tf.RaggedTensor [[b'amongst', b'all', b'the', b'troubling', b'deficits', b'we', b'struggle',
  b'with', b'today', b'\xe2\x80\x94', b'we', b'think', b'of', b'financial',
  b'and', b'economic', b'primarily', b'\xe2\x80\x94', b'the', b'ones',
  b'that', b'concern', b'me', b'most', b'is', b'the', b'deficit', b'of',
  b'political', b'dialogue', b'\xe2\x80\x94', b'our', b'ability', b'to',
  b'address', b'modern', b'conflicts', b'as', b'they', b'are', b',', b'to',
  b'go', b'to', b'the', b'source', b'of', b'what', b'they', b"'", b're',
  b'all', b'about', b'and', b'to', b'understand', b'the', b'key',
  b'players', b'and', b'to', b'deal', b'with', b'them', b'.']               ,
 [b'we', b'who', b'are', b'diplomats', b',', b'we', b'are', b'trained',
  b'to', b'deal', b'with', b'conflicts', b'between', b'states', b'and',
  b'issues', b'between', b'states', b'.']                              ,
 [b'and', b'i', b'can', b'tell', b'you', b',', b'our', b'agenda', b'is',
  b'full', b'.']              

In [39]:
print(cleanup_text(words).numpy())
# 使用cleanup_text进行处理 可见还是很有效果的 变成了可读性很高的结果

[b"amongst all the troubling deficits we struggle with today \xe2\x80\x94 we think of financial and economic primarily \xe2\x80\x94 the ones that concern me most is the deficit of political dialogue \xe2\x80\x94 our ability to address modern conflicts as they are , to go to the source of what they ' re all about and to understand the key players and to deal with them ."
 b'we who are diplomats , we are trained to deal with conflicts between states and issues between states .'
 b'and i can tell you , our agenda is full .']


In [40]:
# 自定义一个customTokenizer实现

class CustomTokenizer(tf.Module):
  def __init__(self, reserved_tokens, vocab_path):
    super(CustomTokenizer,self).__init__()
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    # 初始化一个BertTokenizer
    self._reserved_tokens = reserved_tokens
    # 保存保留词
    self._vocab_path = tf.saved_model.Asset(vocab_path)
    # vocab_path

    vocab = pathlib.Path(vocab_path).read_text().splitlines()
    self.vocab = tf.Variable(vocab)
    # vocab的tf对象

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shapes [tokens] and [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
          tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    enc = self.tokenizer.tokenize(strings)
    # 先使用bertTokenizer完成
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    # 由于bertTokenizer处理过后是单个token最后一维没有意义 合并一下
    # enc = add_start_end(enc)
    # 加上start和end
    return enc

  @tf.function
  def detokenize(self, tokenized):
    words = self.tokenizer.detokenize(tokenized)
    # 先使用bert解码
    return cleanup_text(self._reserved_tokens, words)
    # 使用cleanup_text函数后续处理

  @tf.function
  def lookup(self, token_ids):
    return tf.gather(self.vocab, token_ids)
    # 找到token_ids对应的文字

  @tf.function
  def get_vocab_size(self):
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    return tf.constant(self._reserved_tokens)

In [None]:
tokenizers = tf.Module()
tokenizers.pt = CustomTokenizer(reserved_tokens, data_dir+'pt_vocab.txt')
tokenizers.en = CustomTokenizer(reserved_tokens, data_dir+'en_vocab.txt')

In [None]:
# 保存模型
model_name = 'tensorflow_study/tensorflow-text/model_dir/ted_hrlr_translate_pt_en_converter'
tf.saved_model.save(tokenizers, model_name)

In [None]:
#加载模型
reloaded_tokenizers = tf.saved_model.load(model_name)
reloaded_tokenizers.en.get_vocab_size().numpy()

In [None]:
tokens = reloaded_tokenizers.en.tokenize(['Hello TensorFlow!'])
tokens.numpy()

## 自定义lookup对照表
tf.lookup

In [4]:
pt_lookup = tf.lookup.StaticVocabularyTable(
    num_oov_buckets=1,
    initializer=tf.lookup.TextFileInitializer(
        filename=data_dir+'pt_vocab.txt',
        key_dtype=tf.string,
        key_index = tf.lookup.TextFileIndex.WHOLE_LINE,
        value_dtype = tf.int64,
        value_index=tf.lookup.TextFileIndex.LINE_NUMBER))
pt_tokenizer = text.BertTokenizer(pt_lookup)
# 使用tf.lookup.StaticVocabularyTable创建一个vocab table进行lookup
# 这个方式是使用vocab文件进行创建

In [5]:
pt_lookup.lookup(tf.constant(['é', 'um', 'uma', 'para', 'não']))

<tf.Tensor: shape=(5,), dtype=int64, numpy=array([7765,   85,   86,   87, 7765])>

In [6]:
pt_lookup = tf.lookup.StaticVocabularyTable(
    num_oov_buckets=1,
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=pt_vocab,
        values=tf.range(len(pt_vocab), dtype=tf.int64)))
pt_tokenizer = text.BertTokenizer(pt_lookup)
# 这个是使用内存里的变量进行创建

NameError: name 'pt_vocab' is not defined