# unicode NLP
NLP模型经常需要处理不同的语言，而不同的语言使用不同的字符集
Unicode是一种标准编码体系，几乎所有语言的文字都可以用它来表示
unicode character（code point）是取值范围为0到0x10FFFF的int值
unicode string是由零个或多个unicode character（code point）组成的序列

In [3]:
import os

import numpy as np
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras

In [2]:
# Only show TensorFlow log records at ERROR level or above.
tf.get_logger().setLevel('ERROR')
# The environment variables are set before the first device query below.
os.environ['CUDA_VISIBLE_DEVICES'] = '1'  # use GPU 1
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
physical_devices = tf.config.list_physical_devices("GPU")
# Enable memory growth on every visible GPU. Looping (instead of indexing
# physical_devices[0]) keeps the cell from raising IndexError when no GPU is
# visible, e.g. on a CPU-only machine.
for gpu_device in physical_devices:
    tf.config.experimental.set_memory_growth(gpu_device, True)
logical_devices = tf.config.list_logical_devices("GPU")

## tf.string data type

In [4]:
# The basic tf.string dtype holds byte strings, i.e. b'...'.
# A unicode string is encoded as UTF-8 by default.
tf.constant(u"Thanks θ_θ!")

<tf.Tensor: shape=(), dtype=string, numpy=b'Thanks \xce\xb8_\xce\xb8!'>

In [114]:
# tf.string treats each byte string as one atomic unit, so strings of different lengths can be stored together;
# the length of the strings is therefore not part of the tensor shape.
tf.constant([u"You're", u"welcome!"]).shape
# A shape of 2 means there are two tf.string objects; there is no second dimension for the string length.

<tf.Tensor: shape=(2,), dtype=string, numpy=array([b"You're", b'welcome!'], dtype=object)>

## 原生unicode的表示
tensorflow中unicode的表示有两种
1. string scalar：string类型的标量，其中的code point序列按照某种已知的字符编码（如UTF-8）编码成字节串
2. int32 vector：int32类型的向量，每个位置存放一个character对应的code point（int值）

In [11]:
# UTF-8 is used here because it is the default encoding for strings.
text_utf8 = tf.constant("自然语言处理")
text_utf8

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\x87\xaa\xe7\x84\xb6\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [12]:
# Encode the same text as UTF-16-BE; the bytes stored in the tensor change accordingly.
text_utf16be = tf.constant(u"自然语言处理".encode("UTF-16-BE"))
text_utf16be

<tf.Tensor: shape=(), dtype=string, numpy=b'\x81\xeaq6\x8b\xed\x8a\x00Y\x04t\x06'>

In [13]:
# ord() gives the integer value of each character; together they form the vector representation.
text_chars = tf.constant([ord(s) for s in u"自然语言处理"])
text_chars

<tf.Tensor: shape=(6,), dtype=int32, numpy=array([33258, 28982, 35821, 35328, 22788, 29702], dtype=int32)>

## tf.strings方法进行表示unicode
1. tf.strings.unicode_decode:将string scalar转为code points的向量
2. tf.strings.unicode_encode:将code points向量转为 string scalar
3. tf.strings.unicode_transcode:将string scalar转为其他形式的编码 如utf-8 -> utf-16-be

In [14]:
# Decode the UTF-8 string scalar into a vector of code points.
tf.strings.unicode_decode(text_utf8, input_encoding='UTF-8')

<tf.Tensor: shape=(6,), dtype=int32, numpy=array([33258, 28982, 35821, 35328, 22788, 29702], dtype=int32)>

In [15]:
# Encode the code point vector back into a UTF-8 string scalar.
tf.strings.unicode_encode(text_chars, output_encoding='UTF-8')

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\x87\xaa\xe7\x84\xb6\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

In [16]:
# Convert the UTF-16-BE string scalar to UTF-8.
tf.strings.unicode_transcode(text_utf16be, input_encoding='UTF-16-BE', output_encoding='UTF-8')

<tf.Tensor: shape=(), dtype=string, numpy=b'\xe8\x87\xaa\xe7\x84\xb6\xe8\xaf\xad\xe8\xa8\x80\xe5\xa4\x84\xe7\x90\x86'>

## tf.strings方法对带有batch的内容处理
对于不同长度的string进行处理 生成的是tf.RaggedTensor

In [17]:
# Build a batch of UTF-8 encoded byte strings of different lengths.
batch_utf8 = [s.encode('UTF-8') for s in [u'hÃllo', u'What is the weather tomorrow', u'Göödnight', u'😊']]
batch_utf8

[b'h\xc3\x83llo',
 b'What is the weather tomorrow',
 b'G\xc3\xb6\xc3\xb6dnight',
 b'\xf0\x9f\x98\x8a']

In [20]:
# Decode the whole batch into code points in one call.
batch_chars_ragged = tf.strings.unicode_decode(batch_utf8, input_encoding='UTF-8')
batch_chars_ragged
# The result is a RaggedTensor: its rows have different lengths.

<tf.RaggedTensor [[104, 195, 108, 108, 111],
 [87, 104, 97, 116, 32, 105, 115, 32, 116, 104, 101, 32, 119, 101, 97, 116,
  104, 101, 114, 32, 116, 111, 109, 111, 114, 114, 111, 119]               ,
 [71, 246, 246, 100, 110, 105, 103, 104, 116], [128522]]>

In [21]:
# Print the rows of the ragged batch one at a time.
for each in batch_chars_ragged.numpy():
    print(each)

[104 195 108 108 111]
[ 87 104  97 116  32 105 115  32 116 104 101  32 119 101  97 116 104 101
 114  32 116 111 109 111 114 114 111 119]
[ 71 246 246 100 110 105 103 104 116]
[128522]


In [23]:
# A RaggedTensor can be turned into a padded dense tensor with to_tensor, or into a sparse tensor with to_sparse.
batch_chars_ragged.to_tensor().shape
# to_tensor pads every row to a length of 28.

TensorShape([4, 28])

In [31]:
# Show the values stored in the sparse representation of the ragged batch.
batch_chars_ragged.to_sparse().values

<tf.Tensor: shape=(43,), dtype=int32, numpy=
array([   104,    195,    108,    108,    111,     87,    104,     97,
          116,     32,    105,    115,     32,    116,    104,    101,
           32,    119,    101,     97,    116,    104,    101,    114,
           32,    116,    111,    109,    111,    114,    114,    111,
          119,     71,    246,    246,    100,    110,    105,    103,
          104,    116, 128522], dtype=int32)>

In [37]:
tf.strings.unicode_encode(tf.ragged.constant([[99,97,116],[100,111,103],[99,111]]), output_encoding='UTF-8')
# unicode_encode accepts a RaggedTensor as input and encodes it into a regular Tensor of strings.

<tf.Tensor: shape=(3,), dtype=string, numpy=array([b'cat', b'dog', b'co'], dtype=object)>

In [39]:
tf.strings.unicode_encode(batch_chars_ragged.to_tensor(),output_encoding='UTF-8')
# Caveat: calling unicode_encode directly on the padded tensor also encodes the padding, leaving many 0 values in the strings.

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'h\xc3\x83llo\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
       b'What is the weather tomorrow',
       b'G\xc3\xb6\xc3\xb6dnight\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00',
       b'\xf0\x9f\x98\x8a\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00'],
      dtype=object)>

In [42]:
# Convert the padded tensor back into a RaggedTensor (treating 0 as padding) before encoding.
tf.strings.unicode_encode(tf.RaggedTensor.from_tensor(batch_chars_ragged.to_tensor(),padding=0), output_encoding='UTF-8')

<tf.Tensor: shape=(4,), dtype=string, numpy=
array([b'h\xc3\x83llo', b'What is the weather tomorrow',
       b'G\xc3\xb6\xc3\xb6dnight', b'\xf0\x9f\x98\x8a'], dtype=object)>

## unicode对象的操作
1. character length             tf.strings.length
2. character substrings         tf.strings.substr
3. split unicode strings        tf.strings.unicode_split
4. byte offset for characters   tf.strings.unicode_decode_with_offsets

In [43]:
# UTF-8 bytes of a short text that ends with an emoji; used by the following cells.
thanks = u'Thanks 😊'.encode('UTF-8')
thanks

b'Thanks \xf0\x9f\x98\x8a'

In [46]:
# tf.strings.length counts the string in a chosen unit: without `unit` it counts bytes ('BYTE'),
# with unit='UTF8_CHAR' it counts UTF-8 encoded characters.
num_bytes = tf.strings.length(thanks).numpy()
num_chars = tf.strings.length(thanks, unit='UTF8_CHAR').numpy()
print(f'{num_bytes} bytes, {num_chars} UTF-8 characters')

11 bytes, 8 UTF-8 characters


In [54]:
# tf.strings.substr accepts a `unit` argument that decides how the start position `pos` and the length `len` are counted.
# No `unit` is passed here, so pos=9 is counted in bytes: it falls inside the four-byte emoji at the end of `thanks`,
# and the call returns only the emoji's trailing bytes instead of a whole character.
# NOTE(review): pass unit='UTF8_CHAR' with character-based pos/len if a whole character is intended — confirm.
tf.strings.substr(thanks, pos=9, len=10)

<tf.Tensor: shape=(), dtype=string, numpy=b'\x98\x8a'>

In [55]:
# tf.strings.unicode_split cuts a unicode string into one substring per character.
tf.strings.unicode_split(thanks, input_encoding='UTF-8')

<tf.Tensor: shape=(8,), dtype=string, numpy=
array([b'T', b'h', b'a', b'n', b'k', b's', b' ', b'\xf0\x9f\x98\x8a'],
      dtype=object)>

In [56]:
# tf.strings.unicode_decode_with_offsets decodes like unicode_decode and additionally returns offsets:
# the byte position at which each character starts.
codepoints, offsets = tf.strings.unicode_decode_with_offsets(u'🎈🎉🎊', 'UTF-8')
for (codepoint, offset) in zip(codepoints.numpy(), offsets.numpy()):
  print('At byte offset {}: codepoint {}'.format(offset, codepoint))

At byte offset 0: codepoint 127880
At byte offset 4: codepoint 127881
At byte offset 8: codepoint 127882


## unicode scripts
tf.strings.unicode_script方法可以返回每个code point所属的文字系统(script)，可以据此推断character可能所在的语言

In [68]:
# Code point 33464 is the Chinese character 芸; 1041 is the Cyrillic letter Б.
# A plain Python list can be passed in directly.
uscript = tf.strings.unicode_script([33464, 1041])  # ['芸', 'Б']
print(uscript.numpy())  # [17, 8] == [USCRIPT_HAN, USCRIPT_CYRILLIC]
# unicode_script returns 17 for Han and 8 for Cyrillic.

[17  8]


In [69]:
# RaggedTensor and Tensor inputs are handled as well.
tf.strings.unicode_script(batch_chars_ragged)

<tf.RaggedTensor [[25, 25, 25, 25, 25],
 [25, 25, 25, 25, 0, 25, 25, 0, 25, 25, 25, 0, 25, 25, 25, 25, 25, 25, 25,
  0, 25, 25, 25, 25, 25, 25, 25, 25]                                      ,
 [25, 25, 25, 25, 25, 25, 25, 25, 25], [0]]>

# 实践：分词
分词就是把一个句子切分成word-like units
在英语这类语言中，可以直接按空格分词
但是在汉语或者日语中，词与词之间没有空格，无法按空格分词，这里直接按字（文字系统的变化）进行分割
而在类似于德语的语言中，可能需要对较长的复合词进行分割
最后，一段文本中可能混有多种语言，如“我的名字是Bob”

In [74]:
# Input text: a list of unicode strings.
sentence_texts = [u'Hello, world. 你好鸭~', u'你好，世界。Yes, well Done.', u'世界こんにちは']

In [75]:
# Turn each string into a vector of integer code points.
sentence_texts_codepoint = tf.strings.unicode_decode(sentence_texts, input_encoding='UTF-8')
sentence_texts_codepoint

<tf.RaggedTensor [[72, 101, 108, 108, 111, 44, 32, 119, 111, 114, 108, 100, 46, 32, 20320,
  22909, 40493, 126]                                                     ,
 [20320, 22909, 65292, 19990, 30028, 12290, 89, 101, 115, 44, 32, 119, 101,
  108, 108, 32, 68, 111, 110, 101, 46]                                     ,
 [19990, 30028, 12371, 12435, 12395, 12385, 12399]]>

In [77]:
# Look up the script code of every code point with unicode_script.
sentence_texts_language = tf.strings.unicode_script(sentence_texts_codepoint)
sentence_texts_language

<tf.RaggedTensor [[25, 25, 25, 25, 25, 0, 0, 25, 25, 25, 25, 25, 0, 0, 17, 17, 17, 0],
 [17, 17, 0, 17, 17, 0, 25, 25, 25, 0, 0, 25, 25, 25, 25, 0, 25, 25, 25, 25,
  0]                                                                        ,
 [17, 17, 20, 20, 20, 20, 20]]>

In [92]:
# Accumulator for the per-word code point arrays collected by method1.
all_sentence_list = list()
# Number of rows (sentences) in the script tensor; not referenced again in the visible part of the notebook.
sentence_count = sentence_texts_language.shape[0]

In [93]:
# Split every sentence into runs of consecutive characters that share the same script code.
# method1: the naive approach, splitting one sentence at a time ------------------------------------------

# Rebuild the accumulator inside this cell so that re-running the cell does not
# append the same words a second time.
all_sentence_list = list()

for index, sentence in enumerate(sentence_texts_language):
    # Pull the row out of TensorFlow once; comparing NumPy arrays avoids running
    # one eager tensor op per character.
    sentence_codepoint = sentence_texts_codepoint[index].numpy()
    sentence_script = sentence.numpy()
    # A new word starts at every position whose script differs from the previous position.
    word_boundaries = np.flatnonzero(sentence_script[1:] != sentence_script[:-1]) + 1
    # np.split cuts before each boundary index; a row without any boundary
    # (including an empty row) yields a single chunk, exactly one entry per row.
    all_sentence_list.extend(np.split(sentence_codepoint, word_boundaries))

In [94]:
# Display the per-word code point arrays collected so far.
all_sentence_list

[array([ 72, 101, 108, 108, 111], dtype=int32),
 array([44, 32], dtype=int32),
 array([119, 111, 114, 108, 100], dtype=int32),
 array([46, 32], dtype=int32),
 array([20320, 22909, 40493], dtype=int32),
 array([126], dtype=int32),
 array([20320, 22909], dtype=int32),
 array([65292], dtype=int32),
 array([19990, 30028], dtype=int32),
 array([12290], dtype=int32),
 array([ 89, 101, 115], dtype=int32),
 array([44, 32], dtype=int32),
 array([119, 101, 108, 108], dtype=int32),
 array([32], dtype=int32),
 array([ 68, 111, 110, 101], dtype=int32),
 array([46], dtype=int32),
 array([19990, 30028], dtype=int32),
 array([12371, 12435, 12395, 12385, 12399], dtype=int32)]

In [96]:
# Pack the list of per-word arrays into a single RaggedTensor (one row per word).
word_char_codepoint = tf.ragged.constant(all_sentence_list)
word_char_codepoint
# method1: the naive sliding-window approach (end) --------------------------------------------------------

<tf.RaggedTensor [[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46, 32],
 [20320, 22909, 40493], [126], [20320, 22909], [65292], [19990, 30028],
 [12290], [89, 101, 115], [44, 32], [119, 101, 108, 108], [32],
 [68, 111, 110, 101], [46], [19990, 30028],
 [12371, 12435, 12395, 12385, 12399]]>

In [100]:
# method2: the more advanced approach from the official guide ---------------------------------------------
tf.fill([sentence_texts_language.nrows(), 1], True)
# One True per sentence: the beginning of every sentence is a split point.

<tf.Tensor: shape=(3, 1), dtype=bool, numpy=
array([[ True],
       [ True],
       [ True]])>

In [101]:
tf.not_equal(sentence_texts_language[:, 1:], sentence_texts_language[:, :-1])
# Compare each row with itself shifted by one position (one copy starts at the first element, the other at the second):
# entry i is True when the script codes at positions i and i+1 differ, so the flags sit one position before the split point.
# Expressing this with a single not_equal call is quite elegant.

<tf.RaggedTensor [[False, False, False, False, True, False, True, False, False, False, False,
  True, False, True, False, False, True]                                    ,
 [False, True, True, False, True, True, False, False, True, False, True,
  False, False, False, True, True, False, False, False, True]           ,
 [False, True, False, False, False, False]]>

In [102]:
# Flag, for every character, whether it starts a new word.
sentence_char_starts_word = tf.concat(
    [tf.fill([sentence_texts_language.nrows(), 1], True),
     tf.not_equal(sentence_texts_language[:, 1:], sentence_texts_language[:, :-1])],
    axis=1)
sentence_char_starts_word
# The not_equal flags are offset by one position; prepending the sentence-initial True lines each flag up with its character.

<tf.RaggedTensor [[True, False, False, False, False, True, False, True, False, False, False,
  False, True, False, True, False, False, True]                            ,
 [True, False, True, True, False, True, True, False, False, True, False,
  True, False, False, False, True, True, False, False, False, True]     ,
 [True, False, True, False, False, False, False]]>

In [103]:
sentence_char_starts_word.values
# The .values attribute of a RaggedTensor flattens it into a single vector.

<tf.Tensor: shape=(46,), dtype=bool, numpy=
array([ True, False, False, False, False,  True, False,  True, False,
       False, False, False,  True, False,  True, False, False,  True,
        True, False,  True,  True, False,  True,  True, False, False,
        True, False,  True, False, False, False,  True,  True, False,
       False, False,  True,  True, False,  True, False, False, False,
       False])>

In [104]:
# Indices of the True entries in the flattened flags.
tf.where(sentence_char_starts_word.values)

<tf.Tensor: shape=(18, 1), dtype=int64, numpy=
array([[ 0],
       [ 5],
       [ 7],
       [12],
       [14],
       [17],
       [18],
       [20],
       [21],
       [23],
       [24],
       [27],
       [29],
       [33],
       [34],
       [38],
       [39],
       [41]])>

In [105]:
# Drop the trailing axis of size 1 so the indices form a plain vector.
word_starts = tf.squeeze(tf.where(sentence_char_starts_word.values), axis=1)
word_starts
# These are the positions of the True entries.

<tf.Tensor: shape=(18,), dtype=int64, numpy=
array([ 0,  5,  7, 12, 14, 17, 18, 20, 21, 23, 24, 27, 29, 33, 34, 38, 39,
       41])>

In [106]:
# word_starts refers to positions in the flattened values, so RaggedTensor.from_row_starts can slice them into rows.
word_char_codepoint_ = tf.RaggedTensor.from_row_starts(
    values=sentence_texts_codepoint.values,
    row_starts=word_starts
)
word_char_codepoint_
# method2: the more advanced approach from the official guide (end) ---------------------------------------

<tf.RaggedTensor [[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46, 32],
 [20320, 22909, 40493], [126], [20320, 22909], [65292], [19990, 30028],
 [12290], [89, 101, 115], [44, 32], [119, 101, 108, 108], [32],
 [68, 111, 110, 101], [46], [19990, 30028],
 [12371, 12435, 12395, 12385, 12399]]>

In [107]:
# Cast the True/False flags to integers.
tf.cast(sentence_char_starts_word, tf.int64)

<tf.RaggedTensor [[1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1],
 [1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1],
 [1, 0, 1, 0, 0, 0, 0]]>

In [109]:
# Sum the word-start flags of each row.
sentence_num_words = tf.reduce_sum(
    tf.cast(sentence_char_starts_word, tf.int64),
    axis=1)
sentence_num_words
# Each entry is the number of word starts, i.e. the number of words, in the corresponding sentence.

<tf.Tensor: shape=(3,), dtype=int64, numpy=array([ 6, 10,  2])>

In [110]:
# from_row_lengths can group the rows of an existing RaggedTensor into a new RaggedTensor.
# word_char_codepoint is the RaggedTensor that is already split into words.
sentence_word_char_codepoint = tf.RaggedTensor.from_row_lengths(
    values=word_char_codepoint,
    row_lengths=sentence_num_words)
print(sentence_word_char_codepoint)
# The words are grouped according to the per-sentence counts: a RaggedTensor made of RaggedTensors.

<tf.RaggedTensor [[[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46, 32],
  [20320, 22909, 40493], [126]]                                           ,
 [[20320, 22909], [65292], [19990, 30028], [12290], [89, 101, 115],
  [44, 32], [119, 101, 108, 108], [32], [68, 111, 110, 101], [46]] ,
 [[19990, 30028], [12371, 12435, 12395, 12385, 12399]]]>


In [112]:
# Words (as code point rows) of the first sentence.
sentence_word_char_codepoint[0]

<tf.RaggedTensor [[72, 101, 108, 108, 111], [44, 32], [119, 111, 114, 108, 100], [46, 32],
 [20320, 22909, 40493], [126]]>

In [113]:
# Encode every word's code points back to UTF-8 and show the result as nested Python lists.
tf.strings.unicode_encode(sentence_word_char_codepoint, 'UTF-8').to_list()

[[b'Hello',
  b', ',
  b'world',
  b'. ',
  b'\xe4\xbd\xa0\xe5\xa5\xbd\xe9\xb8\xad',
  b'~'],
 [b'\xe4\xbd\xa0\xe5\xa5\xbd',
  b'\xef\xbc\x8c',
  b'\xe4\xb8\x96\xe7\x95\x8c',
  b'\xe3\x80\x82',
  b'Yes',
  b', ',
  b'well',
  b' ',
  b'Done',
  b'.'],
 [b'\xe4\xb8\x96\xe7\x95\x8c',
  b'\xe3\x81\x93\xe3\x82\x93\xe3\x81\xab\xe3\x81\xa1\xe3\x81\xaf']]