## 使用TextProcessor类处理文本，使用gensim导入已有的预训练词向量

In [1]:
import numpy as np
from gensim.models import KeyedVectors
from text_preprocess import TextProcessor

In [2]:
# Toy training corpus: three whitespace-tokenised sentences.
sent_lst = ['a b c d e', 'a d e', 'c b']

# Fit the preprocessor on the corpus; the calls below must run in this order.
preprocessor = TextProcessor(sent_lst)
preprocessor.build_word_freq_dct()
# Build the vocabulary. A maximum word count may be passed to truncate it;
# with no argument every word is kept.
preprocessor.build_word2id()
# Show the sentence-length statistics.
preprocessor.view_sent_length_freq()

# The first four vocabulary entries are the predefined placeholder tokens.
print('\n单词表前四个是预设定的占位符')
print('vocabulary:', preprocessor.vocab)

Original 5 words in vocabulary.
After truncated low frequent word:
words num: 5/5; words freq: 1.000
length of sentence: length : freq
2 1
3 1
5 1

单词表前四个是预设定的占位符
vocabulary: ['<pad>', '<s>', '<\\s>', '<unk>', 'a', 'b', 'c', 'd', 'e']


## 这是一个w2v格式的词向量文件，首行表示词数量，维度

In [3]:
# Dump the raw word2vec text file so its header line and vectors can be inspected.
with open('../data/1-test_w2v.txt', mode='r', encoding='utf-8') as file_open:
    w2v_text = file_open.read()
print(w2v_text)

5 3
a 0.418 0.24968 -0.41242
b 0.013441 0.23682 -0.16899
c 0.70853 0.57088 -0.4716
d 0.68047 -0.039263 0.30186
e 0.26818 0.14346 -0.27877


In [4]:
# Load the text-format word2vec file into a gensim KeyedVectors object.
key_weights = KeyedVectors.load_word2vec_format('../data/1-test_w2v.txt')

# Build the embedding matrix from the pretrained vectors; it is read back
# below through preprocessor.weights.
preprocessor.build_weights(keyed_vectors=key_weights)

# Id 1 is reserved for out-of-vocabulary words; vocabulary entries get their
# vectors from id 2 onwards.
print('\n1留给未登录词，从2开始对词表中单词建立词向量')
print('word | id | embedding')
# One line per vocabulary entry: token, its id, and its row of the weight matrix.
for token in preprocessor.vocab:
    token_id = preprocessor.word2id[token]
    embedding_row = preprocessor.weights[token_id]
    print(token, token_id, embedding_row)

Words exit in w2v file: 5/9, rate: 55.555556%
Shape of weight matrix: (11, 3)

1留给未登录词，从2开始对词表中单词建立词向量
word | id | embedding
<pad> 2 [ 0.08079735  0.08735236 -0.08746586]
<s> 3 [-0.00064265 -0.0525261   0.01152341]
<\s> 4 [-0.07846112  0.01590114 -0.02880822]
<unk> 5 [ 0.06748111  0.02766662 -0.09496042]
a 6 [ 0.418    0.24968 -0.41242]
b 7 [ 0.013441  0.23682  -0.16899 ]
c 8 [ 0.70853  0.57088 -0.4716 ]
d 9 [ 0.68047  -0.039263  0.30186 ]
e 10 [ 0.26818  0.14346 -0.27877]


In [5]:
print('将句子序列padding成长度为7的id序列')

# Convert the training sentences into fixed-width id rows plus a length per sentence.
# NOTE(review): the printed results show the length capped at truncated_len
# rather than the raw word count - confirm against TextProcessor.
id_array, sent_len_arr = preprocessor.get_truncate_id_list(sent_lst, truncated_len=7)
for sentence, id_row, sent_len in zip(sent_lst, id_array, sent_len_arr):
    print(sentence)
    print(id_row, sent_len)

print('-'*100)
# Repeat the experiment on unseen sentences that contain out-of-vocabulary words.
test_sent_lst = [
    'a c e bbb a as',
    'a b de ab df as d bb c ad',
    'ab bc cd de',
]
test_id_array, test_sent_len_arr = preprocessor.get_truncate_id_list(test_sent_lst, truncated_len=7)
for sentence, id_row, sent_len in zip(test_sent_lst, test_id_array, test_sent_len_arr):
    print(sentence)
    print(id_row, sent_len)

将句子序列padding成长度为7的id序列
a b c d e
[ 6  7  8  9 10  0  0] 5
a d e
[ 6  9 10  0  0  0  0] 3
c b
[8 7 0 0 0 0 0] 2
----------------------------------------------------------------------------------------------------
a c e bbb a as
[ 6  8 10  1  6  1  0] 6
a b de ab df as d bb c ad
[6 7 1 1 1 1 9] 7
ab bc cd de
[1 1 1 1 0 0 0] 4


## 观察keras中Embedding层是怎么运行的

In [6]:
from keras.models import Sequential
from keras.layers import Embedding


model = Sequential()
# Read both sizes from the weight matrix instead of hard-coding them, so the
# layer always matches whatever pretrained vectors were loaded.
vocab_size, embedding_dim = preprocessor.weights.shape
# Embedding layer arguments: vocabulary size; embedding dimension; whether
# id 0 is masked; initial weights (note they must be wrapped in a list).
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, mask_zero=True, weights=[preprocessor.weights]))
embedding_result = model.predict(id_array)
print('句子列表：')
print(sent_lst)
print('-'*100)
print('词向量列表：')
print(embedding_result)

Using TensorFlow backend.


句子列表：
['a b c d e', 'a d e', 'c b']
----------------------------------------------------------------------------------------------------
词向量列表：
[[[ 0.418     0.24968  -0.41242 ]
  [ 0.013441  0.23682  -0.16899 ]
  [ 0.70853   0.57088  -0.4716  ]
  [ 0.68047  -0.039263  0.30186 ]
  [ 0.26818   0.14346  -0.27877 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]]

 [[ 0.418     0.24968  -0.41242 ]
  [ 0.68047  -0.039263  0.30186 ]
  [ 0.26818   0.14346  -0.27877 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]]

 [[ 0.70853   0.57088  -0.4716  ]
  [ 0.013441  0.23682  -0.16899 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]]]


In [7]:
# Run the test id sequences through the same embedding model.
test_embedding_result = model.predict(test_id_array)
print('测试集句子列表：', test_sent_lst, sep='\n')
print('-'*100)
print('测试集词向量列表：', test_embedding_result, sep='\n')
# Closing remark: every out-of-vocabulary word gets the same vector.
print('可以看到所有未登录词的词向量是一样的。')

测试集句子列表：
['a c e bbb a as', 'a b de ab df as d bb c ad', 'ab bc cd de']
----------------------------------------------------------------------------------------------------
测试集词向量列表：
[[[ 0.418       0.24968    -0.41242   ]
  [ 0.70853     0.57088    -0.4716    ]
  [ 0.26818     0.14346    -0.27877   ]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.418       0.24968    -0.41242   ]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.          0.          0.        ]]

 [[ 0.418       0.24968    -0.41242   ]
  [ 0.013441    0.23682    -0.16899   ]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.68047    -0.039263    0.30186   ]]

 [[ 0.09229596  0.06816354 -0.01181613]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.09229596  0.06816354 -0.01181613]
  [ 0.          0.          0.        ]
  [ 0.          0.          0.        ]
  [ 0.          0.          0.        ]]]
可以看到所有未登录词的词向量是一样的。

## TensorFlow中如何使用Embedding

In [8]:
import tensorflow as tf

# Placeholder for a batch of id sequences (sentences x sequence length);
# both dimensions are left unspecified.
seq_input = tf.placeholder(tf.int32, shape=(None, None))
# The embedding table is a variable initialised from the pretrained weight matrix.
embedding_param = tf.Variable(initial_value=preprocessor.weights, dtype=tf.float32)
embedding_layer = tf.nn.embedding_lookup(embedding_param, seq_input)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print('句子列表：', sent_lst, sep='\n')
    print('-'*100)
    print('词向量列表：')
    train_lookup = sess.run(embedding_layer, feed_dict={seq_input: id_array})
    print(train_lookup)
    print('-'*100)
    print('测试集句子列表：', test_sent_lst, sep='\n')
    print('-'*100)
    print('测试集词向量列表：')
    test_lookup = sess.run(embedding_layer, feed_dict={seq_input: test_id_array})
    print(test_lookup)

句子列表：
['a b c d e', 'a d e', 'c b']
----------------------------------------------------------------------------------------------------
词向量列表：
[[[ 0.418     0.24968  -0.41242 ]
  [ 0.013441  0.23682  -0.16899 ]
  [ 0.70853   0.57088  -0.4716  ]
  [ 0.68047  -0.039263  0.30186 ]
  [ 0.26818   0.14346  -0.27877 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]]

 [[ 0.418     0.24968  -0.41242 ]
  [ 0.68047  -0.039263  0.30186 ]
  [ 0.26818   0.14346  -0.27877 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]]

 [[ 0.70853   0.57088  -0.4716  ]
  [ 0.013441  0.23682  -0.16899 ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]
  [ 0.        0.        0.      ]]]
----------------------------------------------------------------------------------------------------
测试集句子列表：
['a c e bbb a as', 'a b de

## 通过程序，我们学习了如何用TextProcessor类快速fit训练集  
* 首先 build_word_freq_dct 函数构建词频统计字典，告知总共有多少个词。  
* 然后 build_word2id 函数构造词汇表和 word2id 字典。可以限制词表大小，按词频做截断。如不设置则保留所有词。
* view_sent_length_freq 函数输出句子长度（分词之后的词数量）分布。
* get_truncate_id_list 返回一组句子截断/补齐后的id序列，以及每个句子的长度序列（超过截断长度的句子，其长度记为截断长度，例如10个词的句子在 truncated_len=7 时长度为7）。也可以用来fit测试集句子序列。
* build_weights 构建词向量矩阵并将其作为返回值返回，同时保存至 self.weights
### **序号0留给padding，1留给未登录词，2-5留给4个预设值占位符，所以词表的第一个词序号是6。keras的Embedding层需要输入字典长度，直接用len(weights)就好了，已经把padding用的0算在里头了。**  