In [1]:
import os

#### 1 读取文件内容

In [2]:
def _read_lines(path):
    """Return the content of the UTF-8 text file at `path` as a list of lines.

    The file is opened inside a `with` block so the handle is closed as soon
    as the content has been read (the previous bare `open(...).read()` calls
    left closing to the garbage collector). The text is split on the newline
    character exactly as before, so a file that ends with a newline still
    yields a trailing empty string.
    """
    with open(path, encoding='utf-8') as handle:
        return handle.read().split('\n')


# One document per line in the *_contents files, one label per line in the
# *_labels files (line i of a labels file is paired with line i of the
# matching contents file by position only).
train_texts = _read_lines("train_contents.txt")
train_labels = _read_lines("train_labels.txt")
test_texts = _read_lines("test_contents.txt")
test_labels = _read_lines("test_labels.txt")

# Train and test are concatenated here; the notebook re-splits them later.
all_texts = train_texts + test_texts
all_labels = train_labels + test_labels

#### 2 对文本进行向量化处理

In [3]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [4]:
# Length every token sequence is padded/truncated to (passed as `maxlen` to
# pad_sequences and as `input_length` to the Embedding layer).
MAX_SEQUENCE_LENGTH = 100
# Width of each word vector; used as the column count of the embedding matrix.
# NOTE(review): presumably must match the vector size stored in 'vectors.bin' — confirm.
EMBEDDING_DIM = 200
# Fraction of all documents held out for validation.
VALIDATION_SPLIT = 0.16
# Fraction of all documents held out for the final test evaluation.
TEST_SPLIT = 0.2

In [5]:
# 2.1 Create the Tokenizer. Tokenizer is a class that vectorizes text, i.e. turns
#     each text into a sequence (a list of the words' indices in the vocabulary,
#     counted from 1).
# NOTE(review): Tokenizer splits on whitespace by default, so the input texts are
#     presumably already word-segmented — confirm against the data files.
tokenizer = Tokenizer()
# 2.1.1 Build the vocabulary from the list of texts (train and test combined).
tokenizer.fit_on_texts(all_texts)
# 2.1.2 Convert every input text into its numeric form: a list of sequences,
#       one sequence per input text.
sequences = tokenizer.texts_to_sequences(all_texts)
# 2.1.3 Attributes available on the tokenizer:
# word_counts: dict mapping each word (string) to the number of times it appeared during fitting. Only set after fit_on_texts is called.
# word_docs: dict mapping each word (string) to the number of documents/texts it appeared in during fitting. Only set after fit_on_texts is called.
# word_index: dict mapping each word (string) to its rank/index. Only set after fit_on_texts is called.
# document_count: int, the number of documents (texts or sequences) the tokenizer was fitted on. Only set after fit_on_texts or fit_on_sequences is called.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
# Bring every sequence to exactly MAX_SEQUENCE_LENGTH entries so the result is a
# single 2-D array (Keras defaults decide which side is padded/truncated).
data = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)

Found 65604 unique tokens.


In [6]:
# 2.2.1 使用to_categorical将labels进行二值化处理
labels = to_categorical(np.asarray(all_labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (21924, 100)
Shape of label tensor: (21924, 12)


#### 3 分配数据集

In [7]:
# 3.1 打乱次序
# 3.1 Shuffle documents and labels with the same permutation.
# Inputs and labels are paired by row position only, so a row-count mismatch
# would silently misalign them under the fancy indexing below; fail loudly.
if data.shape[0] != labels.shape[0]:
    raise ValueError(
        'data and labels must have the same number of rows, got %d and %d'
        % (data.shape[0], labels.shape[0])
    )

# A dedicated, seeded generator makes the train/val/test split reproducible
# after a kernel restart and leaves NumPy's global random state untouched.
SPLIT_SEED = 42
split_rng = np.random.RandomState(SPLIT_SEED)
indices = split_rng.permutation(data.shape[0])
# NOTE: `data` and `labels` are overwritten with their shuffled versions, so
# re-running this cell without re-running the cells above shuffles them again.
data = data[indices]
labels = labels[indices]

# 3.2 Split into train / validation / test.
# p1 ends the training slice, p2 ends the validation slice; everything after
# p2 is the test set.
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]

print ('train docs: ' + str(len(x_train)))
print ('val docs: '+str(len(x_val)))
print ('test docs: '+str(len(x_test)))

train docs: 14031
val docs: 3508
test docs: 4385


#### 4 加载word2vec中的Embedding

In [8]:
import gensim
from keras.utils import plot_model   # 绘制模型的结构图

# Load the pre-trained word2vec vectors stored in binary format.
w2v_model = gensim.models.KeyedVectors.load_word2vec_format('vectors.bin', binary = True)

# Row i of the matrix holds the vector of the word whose tokenizer index is i.
# The matrix has one more row than there are words; rows never assigned below
# (including those of words missing from the word2vec model) stay all-zero.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

in_model = 0
not_in_model = 0
for word, i in word_index.items():
    token = str(word)
    if token not in w2v_model:
        # No pre-trained vector for this word: keep its zero row.
        not_in_model += 1
        continue
    in_model += 1
    embedding_matrix[i] = np.asarray(w2v_model[token], dtype='float32')

print (str(not_in_model)+' words not in w2v model')



13822 words not in w2v model


#### 5 构建词嵌入模块

In [9]:
from keras.layers import Embedding

# Embedding layer initialised from the pre-built embedding matrix and frozen
# (trainable=False), so the word vectors are not updated during training.
vocab_size = len(word_index) + 1  # same row count as embedding_matrix
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

#### 6 构建模型

In [10]:
from keras.models import Sequential
from keras.layers import Dense, Input, Dropout, Flatten
from keras.layers import LSTM

In [11]:
# Stack: frozen embeddings -> single LSTM -> dropout -> softmax classifier.
# The output layer has one unit per column of the one-hot label matrix.
model = Sequential([
    embedding_layer,
    LSTM(200, dropout=0.5, recurrent_dropout=0.2),
    Dropout(0.2),
    Dense(labels.shape[1], activation='softmax'),
])

# Print the layer table and also write a diagram of the architecture to disk.
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 200)          13121000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 12)                2412      
Total params: 13,444,212
Trainable params: 323,212
Non-trainable params: 13,121,000
_________________________________________________________________


In [12]:
# Configure training: RMSprop optimiser, categorical cross-entropy loss
# (labels are one-hot encoded), and accuracy reported as 'acc'.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# Train for two epochs in mini-batches of 128, scoring the validation split
# after each epoch.
model.fit(
    x_train,
    y_train,
    batch_size=128,
    epochs=2,
    validation_data=(x_val, y_val),
)

# Persist the trained model to an HDF5 file.
model.save('word_vector_lstm.h5')

Train on 14031 samples, validate on 3508 samples
Epoch 1/2
Epoch 2/2

In [None]:
# Score the held-out test split and print whatever evaluate() returns
# (presumably the loss followed by the 'acc' metric configured at compile time).
test_metrics = model.evaluate(x_test, y_test)
print (test_metrics)