In [4]:
# Word-level one-hot encoding

import numpy as np

# Each element of the list is one sample (here, a sentence).
samples = ['The cat sat on the mat.', 'The dog ate my homework']

# Build the vocabulary: every distinct whitespace-separated token gets a
# unique integer index starting at 1, so index 0 is never assigned.
token_index = {}
for sample in samples:
    for word in sample.split():
        # setdefault only inserts when the word is new; len() is evaluated
        # before the insertion, so indices run 1, 2, 3, ...
        token_index.setdefault(word, len(token_index) + 1)

# Only the first max_length tokens of each sample are encoded.
max_length = 10

# results[i, j, k] == 1 means token j of sample i has vocabulary index k.
vocabulary_depth = max(token_index.values()) + 1
results = np.zeros(shape=(len(samples), max_length, vocabulary_depth))

for i, sample in enumerate(samples):
    leading_words = sample.split()[:max_length]
    for j, word in enumerate(leading_words):
        index = token_index[word]
        results[i, j, index] = 1

In [8]:
# Character-level one-hot encoding

import string

samples = ['The cat sat on the mat.', 'The dog ate my homework']
characters = string.printable  # all printable ASCII characters

# Map character -> index (1-based, index 0 is never assigned).
# The mapping must be keyed by character: with the keys and values swapped,
# token_index.get(character) returns None for every character, and NumPy
# treats a None index as np.newaxis, which sets the whole row to 1.
token_index = dict(zip(characters, range(1, len(characters) + 1)))

# Only the first max_length characters of each sample are encoded.
max_length = 50

# results[i, j, k] == 1 means character j of sample i has index k.
results = np.zeros((len(samples), max_length, max(token_index.values()) + 1))
for i, sample in enumerate(samples):
    # Truncate so samples longer than max_length do not index past axis 1.
    for j, character in enumerate(sample[:max_length]):
        index = token_index.get(character)
        if index is None:
            # Character outside string.printable: leave its row all zeros
            # rather than letting None broadcast over the whole row.
            continue
        results[i, j, index] = 1

In [11]:
# Word-level one-hot encoding with Keras

from keras.preprocessing.text import Tokenizer

samples = [
    'The cat sat on the mat.',
    'The dog ate my homework',
]

# Tokenizer created with num_words=1000, then fitted on the samples so it
# can build its word index.
tokenizer = Tokenizer(num_words=1000)
tokenizer.fit_on_texts(samples)

# The word index computed during fitting.
word_index = tokenizer.word_index

# Each sample converted to a sequence of integer word indices.
sequences = tokenizer.texts_to_sequences(samples)

# The samples encoded directly as a matrix, using mode='binary'.
one_hot_results = tokenizer.texts_to_matrix(samples, mode='binary')

print('Found %s unique tokens.' % len(word_index))

Found 9 unique tokens.


In [13]:
# Word-level one-hot encoding with the hashing trick
import zlib

samples = ['The cat sat on the mat.', 'The dog ate my homework']
dimensionality = 1000  # each word is stored as a vector of length 1000
max_length = 10        # only the first max_length words of a sample are encoded

results = np.zeros((len(samples), max_length, dimensionality))

for i, sample in enumerate(samples):
    for j, word in enumerate(sample.split()[:max_length]):
        # Hash the word to an integer index in [0, dimensionality - 1].
        # zlib.crc32 is used instead of the built-in hash() because str
        # hashes are salted per process, which would make the encoding
        # differ between kernel restarts. crc32 is unsigned, so no abs()
        # is needed. Distinct words may still collide on the same index.
        index = zlib.crc32(word.encode('utf-8')) % dimensionality
        results[i, j, index] = 1

In [14]:
# Instantiate an Embedding layer
from keras.layers import Embedding

# 1000 possible tokens, each mapped to a 64-dimensional embedding vector.
embedding_layer = Embedding(input_dim=1000, output_dim=64)

In [9]:
from keras.datasets import imdb
from keras.preprocessing import sequence

max_features = 10000  # passed to imdb.load_data as num_words
maxlen = 20           # passed to pad_sequences as the per-review length

# Load the IMDB reviews as (sequences, labels) pairs for train and test.
(raw_train, y_train), (raw_test, y_test) = imdb.load_data(num_words=max_features)

# Bring every review to maxlen entries; the raw sequences are kept under
# their own names instead of being overwritten.
x_train = sequence.pad_sequences(raw_train, maxlen=maxlen)
x_test = sequence.pad_sequences(raw_test, maxlen=maxlen)

In [11]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

model = Sequential()

# The embedding's vocabulary size reuses max_features, the same value given
# to imdb.load_data(num_words=...), instead of repeating the literal 10000.
# This keeps the layer in step with the data if the cap is changed.
# Each token is embedded in 8 dimensions: output shape (samples, maxlen, 8).
model.add(Embedding(max_features, 8, input_length=maxlen))

# Flatten the (maxlen, 8) embeddings into one vector of maxlen * 8 values.
model.add(Flatten())

# Single sigmoid unit on top for binary classification.
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

# validation_split=0.2 holds out a fifth of the training data for validation.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=30,
    validation_split=0.2,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________
Instructions for updating:
Use tf.cast instead.
Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
