# 使用TF TEXT进行tokenize

In [1]:
import requests
import tensorflow as tf
import tensorflow_text as tf_text
import functools
import os
import tensorflow.keras as keras

In [2]:
# Runtime configuration: quiet TensorFlow logging and set up GPU memory growth.
tf.get_logger().setLevel('ERROR')
os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # expose only GPU 1 to this process
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
physical_devices = tf.config.list_physical_devices("GPU")
# Enable memory growth on every visible GPU. Iterating instead of indexing
# physical_devices[0] keeps this cell from raising IndexError when no GPU is
# visible (e.g. a single-GPU or CPU-only machine with the setting above).
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
logical_devices = tf.config.list_logical_devices("GPU")

## tokenize的API
主要的接口是Splitter和SplitterWithOffsets 它们各自只有一个方法：split 和 split_with_offsets
Tokenizer和TokenizerWithOffsets是Splitter的特化版本(子类) 提供了便捷的tokenize和tokenize_with_offsets方法
输入是N维矩阵的话输出是N+1维的RaggedTensor 最内层是tokenize的结果
同时还有Detokenizer接口 通过tokenizer接口tokenize的N维RaggedTensor会变为N-1维的tensor或者RaggedTensor

## Whole word tokenizers 整个单词进行tokenize
1. WhitespaceTokenizer
空白符号tokenizer使用ICU定义的空白字符(空格 制表符 换行符)进行分割
空白符号tokenizer将标点符号和字母连接在了一起 同时不能处理类似于汉字没有空格分隔的句子
2. UnicodeScriptTokenizer
UnicodeScriptTokenizer按Unicode文字系统(script)的边界进行分割 因此会把标点符号和文字分开
实际效果与WhitespaceTokenizer类似 同样不能分割没有空白符的连续汉字 但标点符号会被切分为单独的token

In [18]:
# WhitespaceTokenizer demo.
# Splits on ICU-defined whitespace (space, tab, newline). Punctuation stays
# glued to the neighbouring word, and text written without spaces (such as
# Chinese) is returned as a single token.
ws_inputs = [
    ["How I can study tensorflow well?", "sorry, maybe next sentence knows the answer.", "我不知道呀。"],
    ["Just study the tutorials carefully.", "come on!", "加油!"],
]
WSTokenizer = tf_text.WhitespaceTokenizer()
WStokens = WSTokenizer.tokenize(ws_inputs)
# The result is a RaggedTensor with one more dimension than the input:
#   input  [batch=2, sentence_count=3]
#   output [batch=2, sentence_count=3, seq_len]  (innermost axis = tokens)
WStokens

<tf.RaggedTensor [[[b'How', b'I', b'can', b'study', b'tensorflow', b'well?'],
  [b'sorry,', b'maybe', b'next', b'sentence', b'knows', b'the', b'answer.'],
  [b'\xe6\x88\x91\xe4\xb8\x8d\xe7\x9f\xa5\xe9\x81\x93\xe5\x91\x80\xe3\x80\x82']],
 [[b'Just', b'study', b'the', b'tutorials', b'carefully.'],
  [b'come', b'on!'], [b'\xe5\x8a\xa0\xe6\xb2\xb9!']]]>

In [19]:
# Walk down the RaggedTensor one axis at a time: batch -> sentence -> token.
first_batch = WStokens[0]
first_sentence = first_batch[0]
for caption, value in (
    ('第一个batch内的内容的表示：', first_batch),
    ('第一个batch内,第一个句子的表示:', first_sentence),
    ('这个句子对应string tensor的维度：', first_sentence.shape),
    ('第一个batch内,第一个句子的第一个token的表示:', first_sentence[0]),
):
    print(caption)
    print(value)

第一个batch内的内容的表示：
<tf.RaggedTensor [[b'How', b'I', b'can', b'study', b'tensorflow', b'well?'],
 [b'sorry,', b'maybe', b'next', b'sentence', b'knows', b'the', b'answer.'],
 [b'\xe6\x88\x91\xe4\xb8\x8d\xe7\x9f\xa5\xe9\x81\x93\xe5\x91\x80\xe3\x80\x82']]>
第一个batch内,第一个句子的表示:
tf.Tensor([b'How' b'I' b'can' b'study' b'tensorflow' b'well?'], shape=(6,), dtype=string)
这个句子对应string tensor的维度：
(6,)
第一个batch内,第一个句子的第一个token的表示:
tf.Tensor(b'How', shape=(), dtype=string)


In [21]:
# UnicodeScriptTokenizer demo.
# Behaves much like the whitespace tokenizer, but punctuation is emitted as
# its own token. Runs of Chinese characters without spaces are still kept
# together (see the saved output below).
us_inputs = [
    ["How I can study tensorflow well?", "sorry, maybe next sentence knows the answer.", "我不知道呀。"],
    ["Just study the tutorials carefully.", "come on!", "加油!"],
]
USTokenizer = tf_text.UnicodeScriptTokenizer()
UStokens = USTokenizer.tokenize(us_inputs)
UStokens

<tf.RaggedTensor [[[b'How', b'I', b'can', b'study', b'tensorflow', b'well', b'?'],
  [b'sorry', b',', b'maybe', b'next', b'sentence', b'knows', b'the',
   b'answer', b'.']                                                 ,
  [b'\xe6\x88\x91\xe4\xb8\x8d\xe7\x9f\xa5\xe9\x81\x93\xe5\x91\x80',
   b'\xe3\x80\x82']                                                ],
 [[b'Just', b'study', b'the', b'tutorials', b'carefully', b'.'],
  [b'come', b'on', b'!'], [b'\xe5\x8a\xa0\xe6\xb2\xb9', b'!']]]>

## Subword tokenizers
subword tokenizers可以配合较小的词汇表使用 并且允许模型通过组成生词的子词获得关于该生词的一些信息
1. WordpieceTokenizer
WordpieceTokenizer是一种数据驱动的tokenization方法 其生成了一组sub-tokens 和语言本身的语素有关
WordpieceTokenizer期望接受已经分割好的tokens作为输入 所以一般先用WhitespaceTokenizer或者UnicodeScriptTokenizer进行切分
本质就是对vocab文件的映射
2. BertTokenizer
BertTokenizer实现了BERT论文中的实现方法，本质是由WordPieceTokenizer支持的 但是还执行其他任务 如单词规范化和标记化
本质还是对vocab文件的映射
3. SentencepieceTokenizer
SentencepieceTokenizer是基于sentencepiece库的 这个方法是根据输入数据快速迭代的 有很明显的sub-word的效果 见ML_tools仓库

In [42]:
# WordpieceTokenizer, step 1: pre-tokenize.
# WordpieceTokenizer is fed already-split tokens, so the text is split first
# with the UnicodeScriptTokenizer instance (USTokenizer) created earlier.
wp_inputs = [[
    ["How I can study tensorflow well?", "sorry, maybe next sentence knows the answer.", "我不知道呀。"],
    ["Just study the tutorials carefully.", "come on!", "加油!"],
]]
tokens = USTokenizer.tokenize(wp_inputs)
print(tokens.to_list())

[[[[b'How', b'I', b'can', b'study', b'tensorflow', b'well', b'?'], [b'sorry', b',', b'maybe', b'next', b'sentence', b'knows', b'the', b'answer', b'.'], [b'\xe6\x88\x91\xe4\xb8\x8d\xe7\x9f\xa5\xe9\x81\x93\xe5\x91\x80', b'\xe3\x80\x82']], [[b'Just', b'study', b'the', b'tutorials', b'carefully', b'.'], [b'come', b'on', b'!'], [b'\xe5\x8a\xa0\xe6\xb2\xb9', b'!']]]]


In [24]:
# Download the English WordPiece test vocabulary used by the Wordpiece/Bert
# tokenizer cells and store it at `filepath`.
url = "https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/test_data/test_wp_en_vocab.txt?raw=true"
filepath = "tensorflow_study/tensorflow-text/data_dir/vocab.txt"
r = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of saving an error page as the vocab.
r.raise_for_status()
# The target directory may not exist on a fresh checkout.
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# Context manager guarantees the file is flushed and closed before later
# cells read it.
with open(filepath, 'wb') as vocab_file:
    bytes_written = vocab_file.write(r.content)
# Display the number of bytes written, as the original cell did.
bytes_written

52382

In [43]:
# WordpieceTokenizer, step 2: build the tokenizer from the downloaded vocab
# file and apply it to the pre-split `tokens`.
WPTokenizer = tf_text.WordpieceTokenizer(filepath)
wordpiecetokens = WPTokenizer.tokenize(tokens)
wordpiecetokens
# In essence this is a lookup through vocab.txt: each byte-string token is
# mapped to one or more integer ids (see the saved output below).

<tf.RaggedTensor [[[[[1], [1], [94], [574], [2358, 687, 1192, 2365], [157], [30]],
   [[1504], [13], [285], [261], [1757], [996], [71], [430], [15]],
   [[1], [1]]],
  [[[1], [574], [71], [56, 1804, 687, 6452], [1927], [15]],
   [[211], [92], [4]], [[1], [4]]]]]>

In [33]:
# Map the wordpiece ids back to strings.
# NOTE(review): the saved output below looks like it came from an earlier run
# (In [33], before In [43] rebuilt `wordpiecetokens`) on a different sentence;
# re-run this cell to refresh it.
detokenized_wordpieces = WPTokenizer.detokenize(wordpiecetokens)
detokenized_wordpieces

<tf.RaggedTensor [[[b'[UNK]'],
  [b'you'],
  [b'know'],
  [b'you'],
  [b'can'],
  [b"'"],
  [b't'],
  [b'explain'],
  [b','],
  [b'but'],
  [b'you'],
  [b'feel'],
  [b'it'],
  [b'.']]]>

In [36]:
# BertTokenizer demo.
# Implements the tokenization described in the BERT paper: it is backed by
# WordpieceTokenizer but also performs extra steps such as normalisation and
# splitting raw text into words first.
bert_inputs = [
    ["How I can study tensorflow well?", "sorry, maybe next sentence knows the answer.", "我不知道呀。"],
    ["Just study the tutorials carefully.", "come on!", "加油!"],
]
BTokenizer = tf_text.BertTokenizer(filepath, lower_case=True)
BTokens = BTokenizer.tokenize(bert_inputs)
# As with WordpieceTokenizer, the result is a lookup into the vocab file.
BTokens

<tf.RaggedTensor [[[[119], [45], [94], [574], [2358, 687, 1192, 2365], [157], [30]],
  [[1504], [13], [285], [261], [1757], [996], [71], [430], [15]],
  [[1], [1], [1], [1], [1], [1]]],
 [[[112], [574], [71], [56, 1804, 687, 6452], [1927], [15]],
  [[211], [92], [4]], [[1], [1], [4]]]]>

In [37]:
# SentencepieceTokenizer
# Backed by the SentencePiece library. Like BertTokenizer it covers both
# normalisation and sub-token splitting (see also the ML_tools notes).

# Fetch the serialized SentencePiece test model; the raw bytes are handed
# directly to the tokenizer in the next cell.
url = "https://github.com/tensorflow/text/blob/master/tensorflow_text/python/ops/test_data/test_oss_model.model?raw=true"
sp_response = requests.get(url, timeout=30)
# Fail loudly on an HTTP error instead of passing an error page on as a model.
sp_response.raise_for_status()
sp_model = sp_response.content

In [39]:
# Tokenize one sentence with the downloaded SentencePiece model, asking for
# string pieces rather than ids.
sp_sentence = "And it's truly a great honor to have the opportunity to come to this stage twice ; I'm extremely grateful ."
tokenizer = tf_text.SentencepieceTokenizer(sp_model, out_type=tf.string)
tokens = tokenizer.tokenize([sp_sentence])
print(tokens.to_list())
# b'\xe2\x96\x81' in the output is a special marker token.
# The sub-word behaviour is visible in the output, e.g. truly -> tru + ly.

[[b'\xe2\x96\x81And', b'\xe2\x96\x81it', b"'", b's', b'\xe2\x96\x81tru', b'ly', b'\xe2\x96\x81a', b'\xe2\x96\x81great', b'\xe2\x96\x81honor', b'\xe2\x96\x81to', b'\xe2\x96\x81have', b'\xe2\x96\x81the', b'\xe2\x96\x81', b'op', b'p', b'or', b't', b'un', b'ity', b'\xe2\x96\x81to', b'\xe2\x96\x81come', b'\xe2\x96\x81to', b'\xe2\x96\x81this', b'\xe2\x96\x81', b'st', b'age', b'\xe2\x96\x81', b't', b'w', b'ic', b'e', b'\xe2\x96\x81', b';', b'\xe2\x96\x81I', b"'", b'm', b'\xe2\x96\x81ex', b't', b're', b'm', b'e', b'ly', b'\xe2\x96\x81gr', b'ate', b'ful', b'\xe2\x96\x81', b'.']]


## 其他Tokenizer
1. UnicodeCharTokenizer
按照UTF-8进行分隔 对没有空格的语言很有用
本质和[word_with_unicode]中一样 用unicode编号进行tokenize
2. HubModuleTokenizer
这是对部署在TF Hub上的模型的一层封装 因为TF Hub目前并不支持RaggedTensor 这种由模型来分词的方式对没有空格可作为启发式切分依据的语言(如中文)很有效果
这个很棒！https://hub.tensorflow.google.cn/google/zh_segmentation/1 提供了一个应用于中文的按照语义进行分割的tokenizer
3. SplitMergeTokenizer
SplitMergeTokenizer 和 SplitMergeFromLogitsTokenizer 通过对分割点的显式提供进行分割
SplitMergeTokenizer 需要提供由0和1组成的向量：0代表分割点(即新token的开头) 1代表不分割(与前一个字符合并)
SplitMergeFromLogitsTokenizer 是通过得分进行分割 第一维大于第二维则代表0 否则代表1
4. RegexSplitter
通过正则表达式进行确定分割点

In [44]:
# UnicodeCharTokenizer demo: every character becomes one integer Unicode
# codepoint, so text without spaces (e.g. Chinese) is handled as well.
uc_inputs = [
    ["How I can study tensorflow well?", "sorry, maybe next sentence knows the answer.", "我不知道呀。"],
    ["Just study the tutorials carefully.", "come on!", "加油!"],
]
UCTokenizer = tf_text.UnicodeCharTokenizer()
UCTokens = UCTokenizer.tokenize(uc_inputs)
UCTokens

<tf.RaggedTensor [[[72, 111, 119, 32, 73, 32, 99, 97, 110, 32, 115, 116, 117, 100, 121, 32,
   116, 101, 110, 115, 111, 114, 102, 108, 111, 119, 32, 119, 101, 108, 108,
   63]                                                                      ,
  [115, 111, 114, 114, 121, 44, 32, 109, 97, 121, 98, 101, 32, 110, 101, 120,
   116, 32, 115, 101, 110, 116, 101, 110, 99, 101, 32, 107, 110, 111, 119,
   115, 32, 116, 104, 101, 32, 97, 110, 115, 119, 101, 114, 46]              ,
  [25105, 19981, 30693, 36947, 21568, 12290]],
 [[74, 117, 115, 116, 32, 115, 116, 117, 100, 121, 32, 116, 104, 101, 32,
   116, 117, 116, 111, 114, 105, 97, 108, 115, 32, 99, 97, 114, 101, 102,
   117, 108, 108, 121, 46]                                               ,
  [99, 111, 109, 101, 32, 111, 110, 33], [21152, 27833, 33]]]>

In [55]:
# HubModuleTokenizer demo: segment Chinese text with a model hosted on TF Hub.
MODEL_HANDLE = "https://hub.tensorflow.google.cn/google/zh_segmentation/1"
hub_inputs = ["新华社北京今天报道了一个新闻"]
segmenter = tf_text.HubModuleTokenizer(MODEL_HANDLE)
tokens = segmenter.tokenize(hub_inputs)
# The raw output is UTF-8 byte strings, which are hard to read as-is; the
# following cells decode them.
tokens

<tf.RaggedTensor [[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe', b'\xe5\x8c\x97\xe4\xba\xac',
  b'\xe4\xbb\x8a\xe5\xa4\xa9', b'\xe6\x8a\xa5\xe9\x81\x93',
  b'\xe4\xba\x86', b'\xe4\xb8\x80', b'\xe4\xb8\xaa',
  b'\xe6\x96\xb0\xe9\x97\xbb']]>

In [56]:
# Decode the first token of the first sentence from UTF-8 bytes to text.
first_token_bytes = tokens.to_list()[0][0]
first_token_bytes.decode('utf-8')

'新华社'

In [57]:
def decode_list(x):
  """Recursively decode UTF-8 byte strings inside arbitrarily nested lists.

  Args:
    x: either a `list` (whose items are handled recursively) or an object
      with a `.decode` method, such as `bytes`.

  Returns:
    A structure of the same nesting with every leaf decoded as UTF-8.
  """
  # Leaf case first: anything that is not exactly a list gets decoded.
  if type(x) is not list:
    return x.decode("UTF-8")
  return [decode_list(item) for item in x]

def decode_utf8_tensor(x):
  """Convert a tensor exposing `.to_list()` into nested lists of `str`."""
  return [decode_list(row) for row in x.to_list()]

# The model has cut one unbroken run of characters into several tokens
# according to meaning.
decoded_tokens = decode_utf8_tensor(tokens)
print(decoded_tokens)

[['新华社', '北京', '今天', '报道', '了', '一', '个', '新闻']]


In [61]:
# SplitMergeTokenizer demo: the split points are supplied explicitly, one
# label per character (0 = start a new token, 1 = merge with the previous one).
strings_ = ["新华社北京今天报道了一个新闻"]
labels = [[
    0, 1, 1,  # 新华社
    0, 1,     # 北京
    0, 1,     # 今天
    0, 1, 1,  # 报道了
    0, 1,     # 一个
    0, 1,     # 新闻
]]
tokenizer = tf_text.SplitMergeTokenizer()
tokens = tokenizer.tokenize(strings_, labels)
decoded_split_merge = decode_utf8_tensor(tokens)
print(decoded_split_merge)

[['新华社', '北京', '今天', '报道了', '一个', '新闻']]


In [63]:
# SplitMergeFromLogitsTokenizer demo: the split points are given as a pair of
# scores per character; a larger first score means "split here", otherwise
# the character is merged into the current token.
strings = [["新华社北京"]]
labels = [[
    [5.0, -3.2],   # split
    [0.2, 12.0],   # merge
    [0.0, 11.0],   # merge
    [2.2, -1.0],   # split
    [-3.0, 3.0],   # merge
]]
tokenizer = tf_text.SplitMergeFromLogitsTokenizer()
tokens = tokenizer.tokenize(strings, labels)
decoded_from_logits = decode_utf8_tensor(tokens)
print(decoded_from_logits)

[['新华社', '北京']]


In [64]:
# RegexSplitter demo: split wherever the delimiter regex matches.
# A raw string passes `\s` to the regex engine unchanged without relying on an
# invalid Python string escape (which emits a DeprecationWarning/SyntaxWarning).
splitter = tf_text.RegexSplitter(r"\s")
tokens = splitter.split(["What you know you can't explain, but you feel it."])
print(tokens.to_list())
# `\s` matches a whitespace character, so the sentence is cut at each space;
# punctuation stays attached to its word.

[[b'What', b'you', b'know', b'you', b"can't", b'explain,', b'but', b'you', b'feel', b'it.']]


## offset
如果要知道每个被分隔的token在原始句子的位置 可以使用tokenize_with_offsets方法(几乎所有tokenizer都提供)
返回的偏移量以字节为单位 区间是左闭右开的[start, end)

In [71]:
# tokenize_with_offsets returns the tokens plus where each one starts and
# ends in the original string. In the saved output each Chinese character
# advances the offset by 3, i.e. the offsets count UTF-8 bytes.
offset_inputs = ['新华社北京今天报道了一个新闻，令大家大为吃惊！']
tokens_, start_offsets, end_offsets = segmenter.tokenize_with_offsets(offset_inputs)
print(tokens_.to_list())
print(decode_utf8_tensor(tokens_))
for offsets in (start_offsets, end_offsets):
    print(offsets.to_list())

[[b'\xe6\x96\xb0\xe5\x8d\x8e\xe7\xa4\xbe', b'\xe5\x8c\x97\xe4\xba\xac', b'\xe4\xbb\x8a\xe5\xa4\xa9', b'\xe6\x8a\xa5\xe9\x81\x93', b'\xe4\xba\x86', b'\xe4\xb8\x80', b'\xe4\xb8\xaa', b'\xe6\x96\xb0\xe9\x97\xbb', b'\xef\xbc\x8c', b'\xe4\xbb\xa4', b'\xe5\xa4\xa7\xe5\xae\xb6', b'\xe5\xa4\xa7\xe4\xb8\xba', b'\xe5\x90\x83\xe6\x83\x8a', b'\xef\xbc\x81']]
[['新华社', '北京', '今天', '报道', '了', '一', '个', '新闻', '，', '令', '大家', '大为', '吃惊', '！']]
[[0, 9, 15, 21, 27, 30, 33, 36, 42, 45, 48, 54, 60, 66]]
[[9, 15, 21, 27, 30, 33, 36, 42, 45, 48, 54, 60, 66, 69]]


In [70]:
# Same offsets API on an English sentence with the UnicodeScriptTokenizer.
tokenizer = tf_text.UnicodeScriptTokenizer()
tokens, start_offsets, end_offsets = tokenizer.tokenize_with_offsets(
    ['Everything not saved will be lost.']
)
for ragged_part in (tokens, start_offsets, end_offsets):
    print(ragged_part.to_list())

[[b'Everything', b'not', b'saved', b'will', b'be', b'lost', b'.']]
[[0, 11, 15, 21, 26, 29, 33]]
[[10, 14, 20, 25, 28, 33, 34]]


## detokenization
tokenize的逆操作 但是并不是所有的tokenizer都有这个方法
同时tokenize和detokenization可能是有损的 并不一定完全还原

In [74]:
# Round trip with UnicodeCharTokenizer: tokenize into codepoints (with
# offsets), then detokenize back into the original string.
tokenizer = tf_text.UnicodeCharTokenizer()
tokens, start_offsets, end_offsets = tokenizer.tokenize_with_offsets(
    ['Everything not saved will be lost.']
)
for ragged_part in (tokens, start_offsets, end_offsets):
    print(ragged_part.to_list())
strings = tokenizer.detokenize(tokens)
print(strings)

[[69, 118, 101, 114, 121, 116, 104, 105, 110, 103, 32, 110, 111, 116, 32, 115, 97, 118, 101, 100, 32, 119, 105, 108, 108, 32, 98, 101, 32, 108, 111, 115, 116, 46]]
[[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33]]
[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34]]
tf.Tensor([b'Everything not saved will be lost.'], shape=(1,), dtype=string)


## 对tf.data.Dataset对象使用tokenizer
使用map(lambda x: tokenizer.tokenize(x))方法

In [78]:
# Apply a tokenizer to a tf.data.Dataset through Dataset.map; each dataset
# element is one row of three sentences.
dataset_rows = [
    ["How I can study tensorflow well?", "sorry, maybe next sentence knows the answer.", "我不知道呀。"],
    ["Just study the tutorials carefully.", "come on!", "加油!"],
]
docs = tf.data.Dataset.from_tensor_slices(dataset_rows)
tokenizer = tf_text.UnicodeCharTokenizer()
tokenized_docs = docs.map(lambda x: tokenizer.tokenize(x))
iterator = iter(tokenized_docs)
# Pull and show the two elements of the dataset.
for _element_index in range(2):
    print(next(iterator).to_list())

[[72, 111, 119, 32, 73, 32, 99, 97, 110, 32, 115, 116, 117, 100, 121, 32, 116, 101, 110, 115, 111, 114, 102, 108, 111, 119, 32, 119, 101, 108, 108, 63], [115, 111, 114, 114, 121, 44, 32, 109, 97, 121, 98, 101, 32, 110, 101, 120, 116, 32, 115, 101, 110, 116, 101, 110, 99, 101, 32, 107, 110, 111, 119, 115, 32, 116, 104, 101, 32, 97, 110, 115, 119, 101, 114, 46], [25105, 19981, 30693, 36947, 21568, 12290]]
[[74, 117, 115, 116, 32, 115, 116, 117, 100, 121, 32, 116, 104, 101, 32, 116, 117, 116, 111, 114, 105, 97, 108, 115, 32, 99, 97, 114, 101, 102, 117, 108, 108, 121, 46], [99, 111, 109, 101, 32, 111, 110, 33], [21152, 27833, 33]]
