# Text preprocessing to BERT model using TF.Text
通过TF.Text的API将文本进行preprocessing处理，变为整型向量输入

In [1]:
import tensorflow as tf
import tensorflow_text as text
import functools
import os
import tensorflow.keras as keras

In [2]:
# Runtime setup: silence TF info/warning logs and pin this kernel to one GPU.
tf.get_logger().setLevel('ERROR')
os.environ['CUDA_VISIBLE_DEVICES'] = '1' # expose only GPU 1 to this process
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
physical_devices = tf.config.list_physical_devices("GPU")
# The list is empty on CPU-only hosts or when no device matches
# CUDA_VISIBLE_DEVICES; indexing [0] unconditionally would raise IndexError
# and stop the notebook before any preprocessing cell runs.
if physical_devices:
    tf.config.experimental.set_memory_growth(physical_devices[0],True)
logical_devices = tf.config.list_logical_devices("GPU")

## 创建一个demo dataset

In [3]:
# Toy data shaped like BERT pretraining pairs: text_a[i] and text_b[i] are
# joined into one input, so (text_a[0], text_b[0]) is the first example and
# (text_a[1], text_b[1]) is the second.
examples = dict(
    text_a=[
        "Sponge bob Squarepants is an Avenger",
        "Marvel Avengers",
    ],
    text_b=[
        "Barack Obama is the President.",
        "President is the highest office",
    ],
)

# Slice along the first axis: one dataset element per (text_a, text_b) pair.
dataset = tf.data.Dataset.from_tensor_slices(examples)
next(iter(dataset))

{'text_a': <tf.Tensor: shape=(), dtype=string, numpy=b'Sponge bob Squarepants is an Avenger'>,
 'text_b': <tf.Tensor: shape=(), dtype=string, numpy=b'Barack Obama is the President.'>}

## Tokenizing
Tokenizing的方法可以使用tokenize_strings内的多种方法，最直接的就是用BertTokenizer 可以自动地将sentence -> subwords/wordpieces
BertTokenizer的初始化需要一个vocab文件 这个可以下载 在这里尝试一个创建一个toy vocabulary

In [56]:
# Toy vocabulary. A token's position in this list is its integer id, so the
# order must not change.
_VOCAB = [
    # Special tokens
    b"[UNK]", b"[MASK]", b"[RANDOM]", b"[CLS]", b"[SEP]",
    # Wordpiece suffixes (the "##" entries), followed by whole words and
    # word-initial pieces
    b"##ack", b"##ama", b"##ger", b"##gers", b"##onge", b"##pants",  b"##uare",
    b"##vel", b"##ven", b"an", b"A", b"Bar", b"Hates", b"Mar", b"Ob",
    b"Patrick", b"President", b"Sp", b"Sq", b"bob", b"box", b"has", b"highest",
    b"is", b"office", b"the",
]
_VOCAB_SIZE = len(_VOCAB)

In [5]:
# Special-token ids are their positions in _VOCAB.
_START_TOKEN = _VOCAB.index(b"[CLS]")
_END_TOKEN = _VOCAB.index(b"[SEP]")
_MASK_TOKEN = _VOCAB.index(b"[MASK]")
_RANDOM_TOKEN = _VOCAB.index(b"[RANDOM]")
_UNK_TOKEN = _VOCAB.index(b"[UNK]")
# Wordpiece budget handed to the trimmer in the walkthrough below.
_MAX_SEQ_LEN = 8
# Cap used by the random item selector and as the padded width of the masked outputs.
_MAX_PREDICTIONS_PER_BATCH = 5

In [6]:
# Ids 0 .. len(_VOCAB) - 1; the same expression supplies the values of the lookup table below.
tf.range(tf.size(_VOCAB, out_type=tf.int64),dtype=tf.int64,)

<tf.Tensor: shape=(31,), dtype=int64, numpy=
array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30])>

In [8]:
# String -> int64 vocabulary table: every token in _VOCAB maps to its list
# index (0 .. len(_VOCAB) - 1); one OOV bucket is configured.
lookup_table = tf.lookup.StaticVocabularyTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=_VOCAB,
        values=tf.range(len(_VOCAB), dtype=tf.int64),
        key_dtype=tf.string,
        value_dtype=tf.int64,
    ),
    num_oov_buckets=1,
)

In [10]:
# Build a BertTokenizer from the lookup table, emitting string tokens.
# The output shows each word split into its wordpieces.
bert_tokenizer_str = text.BertTokenizer(lookup_table, token_out_type=tf.string)
bert_tokenizer_str.tokenize(examples["text_a"])

<tf.RaggedTensor [[[b'Sp', b'##onge'], [b'bob'], [b'Sq', b'##uare', b'##pants'], [b'is'],
  [b'an'], [b'A', b'##ven', b'##ger']]                                  ,
 [[b'Mar', b'##vel'], [b'A', b'##ven', b'##gers']]]>

In [13]:
# Raw text_a sentences, shown for comparison with the tokenized output.
examples['text_a']

['Sponge bob Squarepants is an Avenger', 'Marvel Avengers']

In [11]:
# Same tokenizer, but emitting int64 vocabulary ids instead of strings.
# The output shows the same word -> wordpiece split, now as ids.
bert_tokenizer_int = text.BertTokenizer(lookup_table, token_out_type=tf.int64)
bert_tokenizer_int.tokenize(examples['text_a'])

<tf.RaggedTensor [[[22, 9], [24], [23, 11, 10], [28], [14], [15, 13, 7]],
 [[18, 12], [15, 13, 8]]]>

In [24]:
# Tokenize both sides of every pair. Each result is ragged with axes
# [batch, num_words, num_wordpieces].
segment_a_pieces = bert_tokenizer_int.tokenize(examples['text_a'])
segment_b_pieces = bert_tokenizer_int.tokenize(examples['text_b'])
print(segment_a_pieces)
print(segment_b_pieces)

<tf.RaggedTensor [[[22, 9], [24], [23, 11, 10], [28], [14], [15, 13, 7]],
 [[18, 12], [15, 13, 8]]]>
<tf.RaggedTensor [[[16, 5], [19, 6], [28], [30], [21], [0]], [[21], [28], [30], [27], [29]]]>


In [35]:
# Merge the last two axes with merge_dims: whether an id is a sub-word piece no
# longer matters, only the flat per-sentence sequence of ids does.
segment_a = segment_a_pieces.merge_dims(-2, -1)
segment_b = segment_b_pieces.merge_dims(-2, -1)
print(segment_a)
print(segment_b)

<tf.RaggedTensor [[22, 9, 24, 23, 11, 10, 28, 14, 15, 13, 7], [18, 12, 15, 13, 8]]>
<tf.RaggedTensor [[16, 5, 19, 6, 28, 30, 21, 0], [21, 28, 30, 27, 29]]>


## 长度裁剪
BERT论文里输入的是两个句子的连接，两个句子的长度和的上限是固定的 所以长于这个的上限要被裁减掉
使用text.Trimmer进行裁剪
text.RoundRobinTrimmer是为每一段分配平均份额 有可能裁剪句子结尾
text.WaterfallTrimmer从最后一个句子的末尾开始裁剪
注意trimmer方法是在多个数据间最后一个轴上进行的裁剪

In [51]:
# The trimmer receives a list of segments, each [batch, num_wordpieces]; here
# three segments with batch = 2. For every batch row the max_seq_length budget
# is shared across the segments: in the result, row 0 keeps 3 + 3 + 2 = 8 ids.
trimmer_test = text.RoundRobinTrimmer(max_seq_length=8)
trimmed_test = trimmer_test.trim([segment_a,segment_b, segment_a])
trimmed_test

[<tf.RaggedTensor [[22, 9, 24],
  [18, 12, 15]]>,
 <tf.RaggedTensor [[16, 5, 19],
  [21, 28, 30]]>,
 <tf.RaggedTensor [[22, 9],
  [18, 12]]>]

In [47]:
# Trim a single, un-flattened segment ([batch, num_words, num_wordpieces]) with
# a budget of 9: in the result the last word of row 0 is cut down to one piece,
# while row 1 (5 pieces) is left untouched.
trimmer_pieces = text.RoundRobinTrimmer(max_seq_length=9)
trimmed_pieces = trimmer_pieces.trim([segment_a_pieces])
trimmed_pieces

[<tf.RaggedTensor [[[22, 9], [24], [23, 11, 10], [28], [14], [15]], [[18, 12], [15, 13, 8]]]>]

In [52]:
# Trim the sentence pair used in the rest of the walkthrough: with a budget of
# _MAX_SEQ_LEN (8) each row keeps 4 ids from segment_a and 4 from segment_b.
trimmer = text.RoundRobinTrimmer(max_seq_length=_MAX_SEQ_LEN)
trimmed = trimmer.trim([segment_a, segment_b])
trimmed

[<tf.RaggedTensor [[22, 9, 24, 23],
  [18, 12, 15, 13]]>,
 <tf.RaggedTensor [[16, 5, 19, 6],
  [21, 28, 30, 27]]>]

## combine segment
使用combine_segments方法将裁剪后的在seq_len相同的维度拼接起来 并加上SOS EOS(SEP) 并且得到句子的ids
这一步其实就是为BERT任务的Next Sentence Prediction构造输入

In [53]:
# Join the trimmed segments of each row as [CLS] + segment_a + [SEP] + segment_b + [SEP]
# and return matching segment ids (0 for [CLS]/segment_a/its [SEP], 1 for segment_b/its [SEP]).
# Result is [batch, seq_len]. The trimmer budget covered only the wordpieces, so
# seq_len here is 4 + 4 + 3 = 11, i.e. larger than _MAX_SEQ_LEN.
segments_combined, segment_ids = text.combine_segments(
    trimmed,
    start_of_sequence_id=_START_TOKEN,
    end_of_segment_id=_END_TOKEN
)
segments_combined, segment_ids

(<tf.RaggedTensor [[3, 22, 9, 24, 23, 4, 16, 5, 19, 6, 4],
  [3, 18, 12, 15, 13, 4, 21, 28, 30, 27, 4]]>,
 <tf.RaggedTensor [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
  [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]>)

## MLM Masked Language Model
1. 选择被mask的tokens item selection text.RandomItemSelector
2. 将被mask的tokens选择处理方法 [MASK] [RANDOM]等 Choosing the Masked Value text.MaskValuesChooser
3. 使用RandomItemSelector和MaskValueChooser 生成mask_language_model

In [54]:
# Item selection: randomly choose which tokens will be masked.
random_selector = text.RandomItemSelector(
    max_selections_per_batch=_MAX_PREDICTIONS_PER_BATCH,   # upper bound on how many tokens get selected
    selection_rate=0.2,                                     # fraction of tokens to select
    unselectable_ids=[_START_TOKEN, _END_TOKEN, _UNK_TOKEN] # special tokens that must never be selected
)

In [55]:
# Draw a boolean selection mask over the combined sequences (axis 1 = token axis).
selected = random_selector.get_selection_mask(
    segments_combined,
    axis=1
)

In [58]:
# True marks the positions chosen for masking.
selected

<tf.RaggedTensor [[False, False, False, False, False, False, True, False, True, False,
  False],
 [False, True, False, False, False, False, False, False, False, True,
  False]]>

In [None]:
# How a selected token is replaced: [MASK], a random word, or left unchanged.
# MaskValuesChooser: with probability 0.8 the token becomes [MASK] (id 1); the
# remaining 0.2 is split between a random token and keeping the original.

In [60]:
# Positional arguments: vocabulary size, [MASK] id, and the [MASK] rate (0.8).
# get_mask_values proposes a replacement id for every position; in the output
# most of them are 1 ([MASK]).
mask_values_chooser = text.MaskValuesChooser(_VOCAB_SIZE, _MASK_TOKEN, 0.8)
mask_values_chooser.get_mask_values(segments_combined)

<tf.RaggedTensor [[1, 1, 6, 1, 1, 1, 1, 1, 1, 16, 1],
 [29, 1, 1, 1, 13, 1, 21, 28, 1, 1, 1]]>

In [61]:
# Apply selection and replacement in one call. Returns the masked sequences, the
# masked positions, and the original ids at those positions. The selection is
# random per call, so it need not match `selected` from the earlier cell.
masked_token_ids, masked_pos, masked_lm_ids = text.mask_language_model(
  segments_combined,
  item_selector=random_selector, mask_values_chooser=mask_values_chooser)

In [63]:
# Compare the ids before and after masking.
print('raw: ', segments_combined)
print('after mask',masked_token_ids)

raw:  <tf.RaggedTensor [[3, 22, 9, 24, 23, 4, 16, 5, 19, 6, 4],
 [3, 18, 12, 15, 13, 4, 21, 28, 30, 27, 4]]>
after mask <tf.RaggedTensor [[3, 22, 1, 25, 23, 4, 16, 5, 19, 6, 4],
 [3, 18, 12, 15, 13, 4, 1, 28, 30, 0, 4]]>


In [65]:
# Map the masked ids back to their vocabulary strings for inspection.
after_mask_ragged = tf.gather(_VOCAB, masked_token_ids)
after_mask_ragged

<tf.RaggedTensor [[b'[CLS]', b'Sp', b'[MASK]', b'box', b'Sq', b'[SEP]', b'Bar', b'##ack',
  b'Ob', b'##ama', b'[SEP]'],
 [b'[CLS]', b'Mar', b'##vel', b'A', b'##ven', b'[SEP]', b'[MASK]', b'is',
  b'the', b'[UNK]', b'[SEP]']]>

In [76]:
# Show the original sentence pairs next to the masked token strings.
print('raw sentence: ')
print(examples['text_a'][0] + ' ' + examples['text_b'][0])
print(examples['text_a'][1] + ' ' + examples['text_b'][1])
# Join the tokens of each row into one space-separated string.
after_mask_string = tf.strings.reduce_join(after_mask_ragged,axis=-1,separator=' ')
print('masked language model sentence: ')
print(after_mask_string.numpy()[0])
print(after_mask_string.numpy()[1])

raw sentence: 
Sponge bob Squarepants is an Avenger Barack Obama is the President.
Marvel Avengers President is the highest office
masked language model sentence: 
b'[CLS] Sp [MASK] box Sq [SEP] Bar ##ack Ob ##ama [SEP]'
b'[CLS] Mar ##vel A ##ven [SEP] [MASK] is the [UNK] [SEP]'


In [77]:
# Positions (indices into each combined sequence) that were masked.
masked_pos

<tf.RaggedTensor [[2, 3],
 [6, 9]]>

In [78]:
# Original ids at the masked positions; these serve as y_true.
masked_lm_ids

<tf.RaggedTensor [[9, 24],
 [21, 27]]>

In [81]:
# Map the masked ids back to their tokens.
print(tf.gather(_VOCAB, masked_lm_ids).numpy())

[[b'##onge' b'bob']
 [b'President' b'highest']]


## padding
将句子padding到固定的长度变成Tensor而非RaggedTensor
text.pad_model_inputs方法

In [83]:
# Pad the masked token ids to a dense [batch, NEW_MAK_SEQ_LEN] tensor.
# NOTE(review): NEW_MAK_SEQ_LEN looks like a typo for NEW_MAX_SEQ_LEN; the name
# is kept because the following cell references it.
NEW_MAK_SEQ_LEN = 20
print('masked_token_ids: ', masked_token_ids)
input_word_ids, input_mask = text.pad_model_inputs(
    input=masked_token_ids,
    max_seq_length=NEW_MAK_SEQ_LEN,
    pad_value=0
)
print('input_word_ids: ', input_word_ids)
print('input_mask: ', input_mask)
# In effect this appends zeros on the right; a row longer than max_seq_length
# would be truncated. input_mask is 1 on real tokens and 0 on padding.

masked_token_ids:  <tf.RaggedTensor [[3, 22, 1, 25, 23, 4, 16, 5, 19, 6, 4],
 [3, 18, 12, 15, 13, 4, 1, 28, 30, 0, 4]]>
input_word_ids:  tf.Tensor(
[[ 3 22  1 25 23  4 16  5 19  6  4  0  0  0  0  0  0  0  0  0]
 [ 3 18 12 15 13  4  1 28 30  0  4  0  0  0  0  0  0  0  0  0]], shape=(2, 20), dtype=int64)
input_mask:  tf.Tensor(
[[1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0]], shape=(2, 20), dtype=int64)


In [84]:
# Pad the segment ids (the vector that tells the two sentences apart) to get
# input_type_ids; the mask returned alongside is not needed here.
print('segments_ids: ', segment_ids)
input_type_ids, _ = text.pad_model_inputs(
    input = segment_ids,
    max_seq_length=NEW_MAK_SEQ_LEN
)
print('input_type_ids: ', input_type_ids)

segments_ids:  <tf.RaggedTensor [[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1],
 [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1]]>
input_type_ids:  tf.Tensor(
[[0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0]], shape=(2, 20), dtype=int64)


In [86]:
# Pad the targets, starting with the masked positions.
# The second return value is kept as masked_lm_weights: a 0 there marks a
# padded slot for which nothing has to be predicted.
print('masked_position: ', masked_pos)
masked_lm_positions, masked_lm_weights = text.pad_model_inputs(
    input = masked_pos,
    max_seq_length=_MAX_PREDICTIONS_PER_BATCH
)
print('masked_lm_positions: ', masked_lm_positions)
print('masked_lm_weights: ', masked_lm_weights)

masked_position:  <tf.RaggedTensor [[2, 3],
 [6, 9]]>
masked_lm_positions:  tf.Tensor(
[[2 3 0 0 0]
 [6 9 0 0 0]], shape=(2, 5), dtype=int64)
masked_lm_weights:  tf.Tensor(
[[1 1 0 0 0]
 [1 1 0 0 0]], shape=(2, 5), dtype=int64)


In [88]:
# Pad the true ids of the masked tokens.
# NOTE(review): this rebinds masked_lm_ids from the ragged tensor to the padded
# dense tensor, so cells above that display masked_lm_ids show the ragged value
# only before this cell has run.
print('masked_ids: ', masked_lm_ids)
masked_lm_ids, _ = text.pad_model_inputs(
    input=masked_lm_ids,
    max_seq_length=_MAX_PREDICTIONS_PER_BATCH
)
print('masked_lm_ids: ', masked_lm_ids)

masked_ids:  <tf.RaggedTensor [[9, 24],
 [21, 27]]>
masked_lm_ids:  tf.Tensor(
[[ 9 24  0  0  0]
 [21 27  0  0  0]], shape=(2, 5), dtype=int64)


In [89]:
# Collect every padded tensor under the key names used for the BERT pretraining inputs.
model_inputs = {
    "input_word_ids": input_word_ids,
    "input_mask": input_mask,
    "input_type_ids": input_type_ids,
    "masked_lm_ids": masked_lm_ids,
    "masked_lm_positions": masked_lm_positions,
    "masked_lm_weights": masked_lm_weights,
}
model_inputs

{'input_word_ids': <tf.Tensor: shape=(2, 20), dtype=int64, numpy=
 array([[ 3, 22,  1, 25, 23,  4, 16,  5, 19,  6,  4,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [ 3, 18, 12, 15, 13,  4,  1, 28, 30,  0,  4,  0,  0,  0,  0,  0,
          0,  0,  0,  0]])>,
 'input_mask': <tf.Tensor: shape=(2, 20), dtype=int64, numpy=
 array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>,
 'input_type_ids': <tf.Tensor: shape=(2, 20), dtype=int64, numpy=
 array([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]])>,
 'masked_lm_ids': <tf.Tensor: shape=(2, 5), dtype=int64, numpy=
 array([[ 9, 24,  0,  0,  0],
        [21, 27,  0,  0,  0]])>,
 'masked_lm_positions': <tf.Tensor: shape=(2, 5), dtype=int64, numpy=
 array([[2, 3, 0, 0, 0],
        [6, 9, 0, 0, 0]])>,
 'masked_lm_weights': <tf.Tensor: shape=(2, 5), dtype=int64, numpy=
 

# 总结方法
1. Input是tf.string类型的Tensor 其中有text_a text_b 每个text_a和text_b的维度是[batch,1]
2. 将Input进行Tokenizer 按照vocab初始化BertTokenizer 将每个text_a text_b 维度变成[batch, num_words, wordpieces] 即将一个句子分成word 再把word分成wordpieces
3. 本任务无需wordpieces单独处理 所以将每个text_a text_b 变成[batch, num_wordpieces]
4. 将两个句子裁剪为MAX_SEQ_LEN以内 trim方法 变成[batch, num_wordpieces_a] [batch, num_wordpiece_b] 其中num_wordpieces_a + num_wordpiece_b <= MAX_SEQ_LEN
5. 将text_a text_b 进行拼接 变成[batch, seq_len] seq_len = 3 + num_wordpieces_a + num_wordpiece_b（3个特殊token：[CLS]、[SEP]、[SEP]），其中num_wordpieces_a + num_wordpiece_b <= MAX_SEQ_LEN 这一步之后得到一个句子和区分text a和b的type_ids
6. 进行mask 随机选择mask的位置和mask的value 进行mask 得到masked_input被mask后的输入 mask_pos即mask的位置 mask_ids 被mask的原始token id
7. 进行pad到MAX_SEQ_LEN 和 MAX_PREDICT_LEN
8. 用上面所有东西作为inputs

In [147]:
# NOTE(review): same vocabulary as the _VOCAB cell near the top of the notebook,
# repeated here so the summary section reads on its own.
# A token's position in this list is its integer id.
_VOCAB = [
    # Special tokens
    b"[UNK]", b"[MASK]", b"[RANDOM]", b"[CLS]", b"[SEP]",
    # Wordpiece suffixes (the "##" entries), followed by whole words and
    # word-initial pieces
    b"##ack", b"##ama", b"##ger", b"##gers", b"##onge", b"##pants",  b"##uare",
    b"##vel", b"##ven", b"an", b"A", b"Bar", b"Hates", b"Mar", b"Ob",
    b"Patrick", b"President", b"Sp", b"Sq", b"bob", b"box", b"has", b"highest",
    b"is", b"office", b"the",
]
_VOCAB_SIZE = len(_VOCAB)

In [158]:
# Special-token ids are their positions in _VOCAB.
_START_TOKEN = _VOCAB.index(b"[CLS]")
_END_TOKEN = _VOCAB.index(b"[SEP]")
_MASK_TOKEN = _VOCAB.index(b"[MASK]")
_RANDOM_TOKEN = _VOCAB.index(b"[RANDOM]")
_UNK_TOKEN = _VOCAB.index(b"[UNK]")
# NOTE(review): these two rebind the notebook-level limits (8 and 5 in the
# walkthrough) to larger values; walkthrough cells re-run after this cell will
# pick up the new values.
_MAX_SEQ_LEN = 128
_MAX_PREDICTIONS_PER_BATCH = 20

In [159]:

def bert_pretrain_preprocess(vocab_table, features, max_seq_length=None,
                             max_predictions_per_batch=None):
    """
    Convert a batch of raw sentence pairs into BERT pretraining inputs.

    Steps: tokenize -> flatten wordpieces -> trim -> combine with [CLS]/[SEP]
    -> randomly mask -> pad to fixed-size dense tensors.

    :param vocab_table: vocabulary lookup table handed to text.BertTokenizer
    :param features: {'text_a': ..., 'text_b': ...}, string tensors holding the
        two sentences of every pair (one pair per batch row)
    :param max_seq_length: padded length of the token-level outputs, counting
        the three special tokens; defaults to the module-level _MAX_SEQ_LEN
    :param max_predictions_per_batch: cap on selected tokens and padded width
        of the masked outputs; defaults to _MAX_PREDICTIONS_PER_BATCH
    :return: dict with input_word_ids, input_mask, input_type_ids (each padded
        to max_seq_length) and masked_lm_ids, masked_lm_positions,
        masked_lm_weights (each padded to max_predictions_per_batch)
    :raises ValueError: if max_seq_length leaves no room for any wordpiece
    """
    # Resolve the limits at call time so the module-level constants remain the defaults.
    seq_len = _MAX_SEQ_LEN if max_seq_length is None else max_seq_length
    max_predictions = (_MAX_PREDICTIONS_PER_BATCH
                       if max_predictions_per_batch is None
                       else max_predictions_per_batch)

    # combine_segments adds [CLS] + [SEP] + [SEP] around the two segments.
    num_special_tokens = 3
    if seq_len <= num_special_tokens:
        raise ValueError(
            "max_seq_length must be greater than %d to leave room for "
            "[CLS]/[SEP] tokens, got %d" % (num_special_tokens, seq_len))

    # step1: read the inputs
    text_a = features['text_a']
    text_b = features['text_b']

    # step2: tokenize text_a / text_b into wordpiece ids
    bert_tokenizer = text.BertTokenizer(vocab_lookup_table=vocab_table,token_out_type=tf.int64)
    text_a_tokenized = bert_tokenizer.tokenize(text_a)
    text_b_tokenized = bert_tokenizer.tokenize(text_b)
    # [batch, num_words, wordpieces]

    # step3: merge the last two axes
    text_a_merged = text_a_tokenized.merge_dims(-2,-1)
    text_b_merged = text_b_tokenized.merge_dims(-2,-1)
    # [batch, num_wordpieces]

    # step4: trim each pair so that, together with the special tokens added in
    # step5, it fits into seq_len. Trimming to the full seq_len would let the
    # combined row grow to seq_len + 3, and the padding in step7 would then cut
    # off the final [SEP] and could leave masked positions pointing past the end.
    trimmer = text.RoundRobinTrimmer(max_seq_length=seq_len - num_special_tokens)
    text_a_trimmed, text_b_trimmed = trimmer.trim([text_a_merged, text_b_merged])
    # [batch, num_wordpieces_a] [batch, num_wordpieces_b]
    # num_wordpieces_a + num_wordpieces_b <= seq_len - 3

    # step5: concatenate the trimmed segments and add the special tokens
    combined_segments, segment_ids = text.combine_segments(
        segments=[text_a_trimmed, text_b_trimmed],
        start_of_sequence_id=_START_TOKEN,
        end_of_segment_id=_END_TOKEN
    )
    # [batch, 3+num_wordpieces_a+num_wordpieces_b] 3 = [CLS] + [SEP] + [SEP]

    # step6: masking
    random_item_selector = text.RandomItemSelector(
        max_selections_per_batch=max_predictions,
        selection_rate=0.2,
        unselectable_ids=[_START_TOKEN, _END_TOKEN, _UNK_TOKEN]
    )

    masked_values_chooser = text.MaskValuesChooser(vocab_size=_VOCAB_SIZE,
                                                   mask_token=_MASK_TOKEN,
                                                   mask_token_rate=0.8,
                                                   random_token_rate=0.1)

    masked_input_ids, masked_positions, masked_ids = (text.mask_language_model(
        combined_segments,
        item_selector=random_item_selector,
        mask_values_chooser=masked_values_chooser,
    ))
    # masked_input_ids [batch, 3+num_wordpieces_a+num_wordpieces_b]
    # masked_positions [batch, masked_len]
    # masked_ids [batch, masked_len]

    # step7: pad the token-level tensors to seq_len and the masked tensors to
    # max_predictions
    input_word_ids, input_mask = text.pad_model_inputs(
        input=masked_input_ids,
        max_seq_length=seq_len
    )
    # input_mask: 1 marks a real token, 0 marks padding

    input_type_ids, _ = text.pad_model_inputs(
        input=segment_ids,
        max_seq_length=seq_len
    )
    # 0 = sentence a and padding, 1 = sentence b; padding just appends zeros

    masked_lm_ids, _ = text.pad_model_inputs(
        input=masked_ids,
        max_seq_length=max_predictions
    )
    # padded ids the model is expected to predict

    masked_lm_positions, masked_lm_weights = text.pad_model_inputs(
        input=masked_positions,
        max_seq_length=max_predictions
    )
    # masked_lm_weights: 1 marks a real masked position, 0 marks padding

    # step8: bundle everything into the dict the BERT model consumes
    model_inputs = {
        "input_word_ids" : input_word_ids,
        "input_mask" : input_mask,
        "input_type_ids" : input_type_ids,
        "masked_lm_ids" : masked_lm_ids,
        "masked_lm_positions" : masked_lm_positions,
        "masked_lm_weights" : masked_lm_weights
    }
    return model_inputs

In [160]:
# Vocabulary table for the tokenizer: token string -> its index in _VOCAB,
# with one OOV bucket configured.
lookup_table = tf.lookup.StaticVocabularyTable(
    initializer=tf.lookup.KeyValueTensorInitializer(
        keys=_VOCAB,
        values=tf.range(len(_VOCAB), dtype=tf.int64),
        key_dtype=tf.string,
        value_dtype=tf.int64,
    ),
    num_oov_buckets=1,
)

In [161]:
# Bind lookup_table as the first argument of bert_pretrain_preprocess with
# functools.partial, then apply it to the dataset through map.
# from_tensors wraps the whole examples dict as a single element, so the
# function receives both sentence pairs as one batch.
dataset = (
    tf.data.Dataset.from_tensors(examples)
    .map(functools.partial(bert_pretrain_preprocess, lookup_table))
)

In [162]:
# Pull the single preprocessed batch to inspect the model inputs.
next(iter(dataset))

{'input_word_ids': <tf.Tensor: shape=(2, 128), dtype=int64, numpy=
 array([[ 3, 22,  1, 24, 23, 11, 10, 28, 14, 15, 13,  7,  4, 16,  5, 19,
          1, 28, 30,  1,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 3,  1, 12, 15, 13,  8,  4, 21, 28, 30, 27, 29,  4,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,