# word embedding
自然语言处理需要先把文本数字化，变为用数字表示的向量，才可以交给神经网络进行处理。
有一些方法如：
1. one-hot 很明显太过于稀疏
2. word with a unique number 很难找到一个好的编码 同时无法衡量两个词之间的关系
3. word embeddings：使用浮点数向量来表示每个词，这些数值是通过学习得到的；向量的维度可以自行指定，一般从 8 维到 1024 维不等

In [1]:
import io
import os
import re
import shutil
import string
import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D
from tensorflow.keras.layers import TextVectorization

In [2]:
# Only show TensorFlow log messages at ERROR level.
tf.get_logger().setLevel('ERROR')
# NOTE(review): these variables are set after `import tensorflow`; presumably
# they still take effect because no GPU has been initialised yet — confirm.
os.environ['CUDA_VISIBLE_DEVICES'] = '1' # use GPU 1
os.environ['TF_FORCE_GPU_ALLOW_GROWTH'] = 'true'
physical_devices = tf.config.list_physical_devices("GPU")
# Enable memory growth on every GPU that was found. With no visible GPU the
# loop is a no-op, where indexing physical_devices[0] raised IndexError.
for gpu in physical_devices:
    tf.config.experimental.set_memory_growth(gpu, True)
logical_devices = tf.config.list_logical_devices("GPU")

In [3]:
# Show the current working directory; data_dir below is a relative path.
os.getcwd()

'/home/wy'

## 数据部分

In [4]:
# Relative path of the folder under which the aclImdb dataset folder is looked up.
data_dir = 'tensorflow_study/tensorflow-text/data_dir/'

In [9]:
# Path of the aclImdb folder inside data_dir; the bare name below displays it.
Imdb_data_dir = os.path.join(data_dir, 'aclImdb')
Imdb_data_dir

'tensorflow_study/tensorflow-text/data_dir/aclImdb'

In [10]:
# Dataset: list the contents of the training folder (aclImdb/train)
train_dir = os.path.join(Imdb_data_dir, 'train')
os.listdir(train_dir)

['urls_pos.txt',
 'urls_unsup.txt',
 'neg',
 'pos',
 'unsupBow.feat',
 'labeledBow.feat',
 'urls_neg.txt']

In [13]:
# Create a tf.data.Dataset object with text_dataset_from_directory.
# 20% of the files are held out (validation_split=0.2); this call takes the
# 'training' subset.
batch_size = 64
seed = 123
train_ds = keras.utils.text_dataset_from_directory(
    train_dir,
    subset='training',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 20000 files for training.


In [14]:
# Validation subset: reads the same directory with the same batch_size, seed
# and validation_split that the training cell uses, but selects 'validation'.
val_ds = keras.utils.text_dataset_from_directory(
    train_dir,
    subset='validation',
    validation_split=0.2,
    seed=seed,
    batch_size=batch_size,
)

Found 25000 files belonging to 2 classes.
Using 5000 files for validation.


In [16]:
# Peek at the first two (label, text) pairs of the first training batch.
first_batch = train_ds.take(1)
for text_batch, label_batch in first_batch:
    for sample_idx in (0, 1):
        label_value = label_batch[sample_idx].numpy()
        text_value = text_batch[sample_idx].numpy()
        print(label_value, text_value)

1 b'Jason Bourne sits in a dusty room in with blood on his hands, trying to make sense of what he\'s just done. Meanwhile, a CIA chief in NYC outlines the agency\'s response to what\'s just happened on screen. An American flag stands proudly on the centre of his desk in the foreground of the shot, but as he speaks, it slips out of focus as his plan veers into morally dubious territory, as if it doesn\'t want to be associated with the course of action the government man decides is necessary in the interests of national security.<br /><br />This shot effectively captures the mood of the film. As well as portraying Bourne\'s quest to find out how he became Jason Bourne, Ultimatum is also an examination of the human costs of the measures taken to protect us in the interests of stability and security.<br /><br />It is also probably the best film you\'ll see in the cinema this year. <br /><br />It\'s just so intense. Bourne says to Simon Ross (Considine) "This isn\'t some newspaper story, th

In [17]:
# Let tf.data pick the prefetch buffer size.
AUTOTUNE = tf.data.AUTOTUNE

# Cache both datasets and prefetch upcoming batches.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

## Embedding层 将int->vector

In [None]:
# Use an Embedding layer to perform the embedding.
# An Embedding maps an integer index to a dense vector; both dimensions are chosen by the user.

In [18]:
# Embedding table for 1000 integer ids, each mapped to a 16-dimensional vector.
embedding_layer = keras.layers.Embedding(1000, 16)

In [19]:
result = embedding_layer(tf.constant([1, 2, 3]))
result.numpy()
# embedding_layer turns the three int constants 1, 2, 3 into their 16-dimensional embeddings

array([[-0.00936151,  0.03384093, -0.02365005, -0.00131559,  0.04822022,
         0.0353758 , -0.0362368 ,  0.03327969, -0.02155138, -0.03955991,
         0.01556111,  0.04476155,  0.03947124, -0.01426164,  0.00703765,
        -0.02079763],
       [ 0.0073679 ,  0.02831913, -0.01844732,  0.02355484,  0.02193354,
        -0.04042045, -0.01318113, -0.03431385,  0.03917738, -0.04211224,
         0.04126408,  0.02613726,  0.01594051,  0.04059455,  0.00451093,
         0.02150061],
       [-0.04095035,  0.03070948, -0.03090088, -0.01668663, -0.0235302 ,
        -0.00186567, -0.0433871 ,  0.04784938,  0.00201153, -0.04484543,
         0.00599276,  0.02481134,  0.01584301, -0.0457539 , -0.04185405,
        -0.04755386]], dtype=float32)

In [None]:
# For typical text tasks the input to embedding_layer has shape [samples, seq_len]

In [20]:
result_batched = embedding_layer(tf.constant([[1,2,3], [6,7,8]]))
result_batched
# input [batch=2, seq_len=3] -> output [batch=2, seq_len=3, feature=16]

<tf.Tensor: shape=(2, 3, 16), dtype=float32, numpy=
array([[[-0.00936151,  0.03384093, -0.02365005, -0.00131559,
          0.04822022,  0.0353758 , -0.0362368 ,  0.03327969,
         -0.02155138, -0.03955991,  0.01556111,  0.04476155,
          0.03947124, -0.01426164,  0.00703765, -0.02079763],
        [ 0.0073679 ,  0.02831913, -0.01844732,  0.02355484,
          0.02193354, -0.04042045, -0.01318113, -0.03431385,
          0.03917738, -0.04211224,  0.04126408,  0.02613726,
          0.01594051,  0.04059455,  0.00451093,  0.02150061],
        [-0.04095035,  0.03070948, -0.03090088, -0.01668663,
         -0.0235302 , -0.00186567, -0.0433871 ,  0.04784938,
          0.00201153, -0.04484543,  0.00599276,  0.02481134,
          0.01584301, -0.0457539 , -0.04185405, -0.04755386]],

       [[ 0.02101446, -0.04563413, -0.03810685, -0.02001367,
          0.04243615, -0.0263909 , -0.00751717, -0.03385391,
         -0.04545246, -0.04952151, -0.04655317,  0.03153383,
          0.01096991, -0.003

In [21]:
result_batched_ragged = embedding_layer(tf.ragged.constant([[1,2,3],[2,3],[1,5,6,7]]))
result_batched_ragged
# A RaggedTensor can be processed as well

<tf.RaggedTensor [[[-0.0093615055, 0.03384093, -0.02365005, -0.0013155937, 0.04822022,
   0.0353758, -0.0362368, 0.03327969, -0.021551384, -0.039559912,
   0.015561115, 0.04476155, 0.039471235, -0.014261641, 0.007037651,
   -0.020797634],
  [0.007367898, 0.028319132, -0.018447317, 0.023554835, 0.021933544,
   -0.04042045, -0.013181128, -0.034313846, 0.039177384, -0.042112242,
   0.041264083, 0.026137259, 0.015940513, 0.04059455, 0.004510928,
   0.021500614],
  [-0.040950347, 0.030709479, -0.030900884, -0.01668663, -0.023530198,
   -0.0018656738, -0.043387104, 0.047849383, 0.0020115264, -0.04484543,
   0.005992759, 0.024811339, 0.015843008, -0.045753896, -0.041854046,
   -0.047553863]]                                                      ,
 [[0.007367898, 0.028319132, -0.018447317, 0.023554835, 0.021933544,
   -0.04042045, -0.013181128, -0.034313846, 0.039177384, -0.042112242,
   0.041264083, 0.026137259, 0.015940513, 0.04059455, 0.004510928,
   0.021500614],
  [-0.040950347, 0.03070947

## TextVectorization string->int

In [25]:
def custom_standardization(input_data):
    """Normalise raw review text before tokenisation.

    Applies three steps in order: lower-case the text, replace every
    ``<br />`` tag with a single space, and delete all characters listed in
    ``string.punctuation``.

    Args:
        input_data: string tensor passed in by the TextVectorization layer.

    Returns:
        The cleaned string tensor.
    """
    # Character class matching any single punctuation character.
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)

    cleaned = tf.strings.lower(input_data)
    # The reviews contain many HTML <br /> tags; turn them into spaces.
    cleaned = tf.strings.regex_replace(cleaned, '<br />', ' ')
    cleaned = tf.strings.regex_replace(cleaned, punctuation_pattern, '')
    return cleaned

In [26]:
'[%s]' % re.escape(string.punctuation) # this should be the set of all punctuation characters

'[!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~]'

In [27]:
vocab_size = 10000
sequence_length = 100
# Set the vocabulary size and the number of words kept per sentence

In [28]:
# Layer that turns raw strings into fixed-length sequences of integer token ids.
vectorize_layer = TextVectorization(
    standardize = custom_standardization,
    max_tokens = vocab_size,
    output_mode = 'int',                     # emit integer token ids
    output_sequence_length = sequence_length # maximum seq_len of the output
)

In [29]:
text_ds = train_ds.map(lambda x,y :x) # train_ds yields (text, label) pairs; map + lambda keeps only the text

In [33]:
vectorize_layer.adapt(text_ds)
# Fit the TextVectorization layer on the text in text_ds

In [41]:
vectorize_layer('I know who you are, I love tensorflow.') # tensorflow -> 1, which should be the unknown token

<tf.Tensor: shape=(100,), dtype=int64, numpy=
array([ 10, 118,  36,  22,  23,  10, 116,   1,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0])>

In [53]:
# Show one raw sentence from the first batch next to its vectorized form.
for text_batch in text_ds.take(1):
    first_sentence = text_batch[0]
    print('raw_sentence:')
    print(first_sentence.numpy())
    print('after vectorize_layer:')
    print(vectorize_layer(first_sentence))

# The sentence has been converted into a vector of ints

raw_sentence:
b'It\'s always nice to see Angela Bassett getting to do a role that she can really sink her teeth into. She is at times intense, funny and even sexy in her role as Lena, a "colored" woman forced to make a home on a desolate mudbank just outside of Cape Town, South Africa. Danny Glover is also good in a not entirely sympathetic role as her partner, Boesman. Willie Jonah gives a finely nuanced performance as the stranger that discovers Boesman and Lena\'s new living area. It\'s not often that you get a chance to see an intelligent film dealing with mature themes. Although it is based on a play, the late director John Berry (who also directed Claudine) opens the material up by having the film shot in the widescreen Cinemascope format. He also keeps things visually interesting through the creative blocking of actors and by showing us things only mentioned in the play. Just like Diahann Carroll in Claudine, John Berry may have directed Angela Bassett into an Academy Award nomi

## 模型部分
vectorize_layer
Embedding
GlobalAveragePooling1D
Dense
Dense

In [42]:
embedding_dim = 16

# Stack the layers one by one: strings -> token ids -> embeddings ->
# average over the sequence axis -> two Dense layers.
model = Sequential()
model.add(vectorize_layer)
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, name='Embedding'))
model.add(GlobalAveragePooling1D())
model.add(Dense(16, activation='relu'))
model.add(Dense(1))  # single output unit, no activation

In [43]:
# TensorBoard callback that writes its logs under the given log_dir.
tensorboard_callback = keras.callbacks.TensorBoard(log_dir='tensorflow_study/tensorflow-text/log_dir/')

In [44]:
model.compile(
    optimizer=keras.optimizers.Adam(),
    # Binary classification: the last Dense layer has no sigmoid, so the loss
    # is told to expect raw logits.
    loss=keras.losses.BinaryCrossentropy(from_logits=True),
    # The string 'accuracy' would threshold the raw logits at 0.5. A logit of 0
    # corresponds to probability 0.5, so the correct threshold is 0.0. The
    # metric keeps the name 'accuracy' so history/log keys stay the same.
    metrics=[keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)]
)

In [45]:
# Train for 20 epochs on train_ds, validating on val_ds and logging through
# the TensorBoard callback.
model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=20,
    callbacks=[tensorboard_callback]
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x7f7f3c1b7fd0>

In [46]:
# Print the per-layer output shapes and parameter counts.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 text_vectorization (TextVec  (None, 100)              0         
 torization)                                                     
                                                                 
 Embedding (Embedding)       (None, 100, 16)           160000    
                                                                 
 global_average_pooling1d (G  (None, 16)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dense (Dense)               (None, 16)                272       
                                                                 
 dense_1 (Dense)             (None, 1)                 17        
                                                                 
Total params: 160,289
Trainable params: 160,289
Non-trai

In [48]:
# Keep the Embedding layer's parameters: a [vocab_size, embedding_dim] matrix
weights = model.get_layer('Embedding').get_weights()[0]
weights

array([[ 0.04382979, -0.00486052,  0.03430449, ..., -0.06809052,
        -0.03361597,  0.02136711],
       [ 0.07515999, -0.01429346,  0.06753578, ..., -0.06306358,
        -0.00482039, -0.0447018 ],
       [ 0.03830411, -0.08916663,  0.02888909, ..., -0.12979527,
        -0.04456586, -0.04959089],
       ...,
       [ 0.4870867 ,  0.46741113,  0.4835528 , ..., -0.48341328,
         0.5170282 ,  0.5436005 ],
       [ 0.19580714,  0.21112469,  0.15403022, ..., -0.17184824,
         0.14987457,  0.149738  ],
       [-0.36725155, -0.3218097 , -0.35055414, ...,  0.263168  ,
        -0.33849916, -0.33139604]], dtype=float32)

In [49]:
# Vocabulary learned by vectorize_layer (index 0 is '' and index 1 is '[UNK]').
vocab = vectorize_layer.get_vocabulary()
vocab

['',
 '[UNK]',
 'the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'in',
 'it',
 'i',
 'this',
 'that',
 'was',
 'as',
 'with',
 'for',
 'movie',
 'but',
 'film',
 'on',
 'not',
 'you',
 'are',
 'his',
 'have',
 'be',
 'he',
 'one',
 'its',
 'at',
 'all',
 'by',
 'an',
 'they',
 'from',
 'who',
 'so',
 'like',
 'her',
 'just',
 'or',
 'about',
 'has',
 'out',
 'if',
 'some',
 'there',
 'what',
 'good',
 'more',
 'when',
 'very',
 'even',
 'she',
 'up',
 'my',
 'no',
 'would',
 'time',
 'only',
 'which',
 'really',
 'story',
 'their',
 'see',
 'were',
 'had',
 'can',
 'me',
 'we',
 'than',
 'much',
 'well',
 'been',
 'will',
 'get',
 'people',
 'bad',
 'also',
 'other',
 'do',
 'into',
 'great',
 'first',
 'because',
 'how',
 'most',
 'him',
 'dont',
 'made',
 'movies',
 'then',
 'them',
 'way',
 'films',
 'make',
 'could',
 'any',
 'after',
 'too',
 'characters',
 'think',
 'watch',
 'being',
 'two',
 'many',
 'seen',
 'character',
 'plot',
 'little',
 'never',
 'acting',
 'where',
 'best',
 '

In [50]:
# Write the embedding parameters to disk as TSV files, which can be loaded
# into http://projector.tensorflow.org/ for visualisation.
vectors_path = 'tensorflow_study/tensorflow-text/model_dir/vectors.tsv'
metadata_path = 'tensorflow_study/tensorflow-text/model_dir/metadata.tsv'

# Opening a file in 'w' mode does not create missing parent folders.
os.makedirs(os.path.dirname(vectors_path), exist_ok=True)

# `with` closes both files even if a write fails part-way through.
with io.open(vectors_path, 'w', encoding='utf-8') as out_v, \
        io.open(metadata_path, 'w', encoding='utf-8') as out_m:
    for index, word in enumerate(vocab):
        if index == 0:
            continue  # skip 0, it's padding.
        vec = weights[index]
        # One tab-separated row of floats per word, with the word itself on
        # the matching row of the metadata file.
        out_v.write('\t'.join(str(x) for x in vec) + "\n")
        out_m.write(word + "\n")