In [1]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense


# --- Hyperparameters -------------------------------------------------------
# Passed to the Tokenizer as num_words. Only words whose index is strictly
# below this value survive texts_to_sequences, so with 9 the indices 1-8
# (the 8 most frequent words) are kept; index 0 is never assigned to a word.
max_words = 9
# Common length every sequence is padded / truncated to.
max_len = 5
# Size of each embedding vector.
embedding_dims = 3

# --- Toy corpus ------------------------------------------------------------
docs = [
    "The cat sat on the mat.",
    "I love green eggs and ham.",
]

# --- Tokenisation ----------------------------------------------------------
# Build the vocabulary from the corpus.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(docs)

# word -> integer index dictionary. It lists every word seen during fitting,
# even those beyond the num_words cut-off.
word_index = tokenizer.word_index
print(word_index)

{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'i': 6, 'love': 7, 'green': 8, 'eggs': 9, 'and': 10, 'ham': 11}


In [2]:
# Turn each sentence into a list of word indices. Words whose index is not
# below the tokenizer's num_words are dropped at this step.
sequences = tokenizer.texts_to_sequences(docs)
print(sequences)

# Bring every sequence to exactly max_len entries. 'pre' is the default for
# both options and is spelled out here for clarity: short sequences are
# left-padded with 0, long ones lose their leading entries.
data = pad_sequences(
    sequences,
    maxlen=max_len,
    padding='pre',
    truncating='pre',
)
print(data)

[[1, 2, 3, 4, 1, 5], [6, 7, 8]]
[[2 3 4 1 5]
 [0 0 6 7 8]]


In [4]:
# A model consisting of a single Embedding layer.
# Embedding needs at least the vocabulary size (input_dim, here max_words, so
# indices 0..max_words-1 are valid) and the vector size (output_dim, here
# embedding_dims); input_length fixes the expected sequence length.
model = Sequential([
    Embedding(
        max_words,
        embedding_dims,
        input_length=max_len,
        name='Embedding',
    ),
])
model.compile(optimizer='rmsprop', loss='mse')

# Look up the embedding vector of every index in the padded batch.
# The result has shape (number of docs, max_len, embedding_dims).
out = model.predict(data)
print(out)
print(out.shape)

# Inspect the layer's weight matrix: one row per index, including row 0,
# which is the vector returned for the padding value 0.
# NOTE(review): no random seed is set and the model is never trained, so the
# printed numbers will presumably differ from run to run.
layer = model.get_layer('Embedding')
print(layer.get_weights())

[[[-0.00292559 -0.04992212 -0.04366062]
  [ 0.01076051 -0.03242324  0.02076508]
  [-0.00455843 -0.03653208 -0.00788287]
  [-0.0149475   0.04119522  0.03264712]
  [ 0.04683765 -0.00638938 -0.04281424]]

 [[ 0.04675872  0.015482   -0.0307904 ]
  [ 0.04675872  0.015482   -0.0307904 ]
  [-0.0479466  -0.02100326 -0.02366737]
  [-0.02635015  0.0478604   0.02456227]
  [ 0.04056449 -0.02963911  0.03667369]]]
(2, 5, 3)
[array([[ 0.04675872,  0.015482  , -0.0307904 ],
       [-0.0149475 ,  0.04119522,  0.03264712],
       [-0.00292559, -0.04992212, -0.04366062],
       [ 0.01076051, -0.03242324,  0.02076508],
       [-0.00455843, -0.03653208, -0.00788287],
       [ 0.04683765, -0.00638938, -0.04281424],
       [-0.0479466 , -0.02100326, -0.02366737],
       [-0.02635015,  0.0478604 ,  0.02456227],
       [ 0.04056449, -0.02963911,  0.03667369]], dtype=float32)]
