In [1]:
import pandas as pd
import re
from gensim.models import Word2Vec
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

# --- Load the training set and build the samples ---
def split_sentence(sentence):
    """Remove punctuation from ``sentence`` and return its whitespace-separated tokens.

    Every run of punctuation characters matched by the pattern below is deleted
    (not replaced by a space), so ``it's`` becomes ``its`` and ``well-known``
    becomes ``wellknown``. The remaining text is split on whitespace; an empty
    or punctuation-only string yields ``[]``.
    """
    punctuation_run = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+'
    return re.sub(punctuation_run, '', sentence).split()

# Tab-separated training file; the `review` column is read below.
data = pd.read_csv('data/labeledTrainData.tsv', sep='\t')
# For a quick smoke test, uncomment to keep only the first 100 rows:
# data = data[:100]
# Tokenize every review into a list of words.
sentences = data['review'].apply(split_sentence)

# --- Train word2vec ---
# Dimensionality of each word vector.
embedding_vector_size = 10
w2v_model = Word2Vec(
    sentences=sentences,
    size=embedding_vector_size,
    min_count=1,  # keep every token, including words that occur only once
    window=3,
    workers=4,
)
# Vocabulary listed in the same order as the rows of `w2v_model.wv.vectors`:
# `wv.index2word[i]` is the word whose vector is `wv.vectors[i]`.
# Enumerating `wv.vocab.keys()` instead yields words in dict insertion order,
# which differs from the row order because gensim sorts the vocabulary by
# frequency, so indices built that way select the wrong embedding rows.
# (gensim >= 4 renames these to `vector_size=` and `wv.index_to_key`.)
vocab_list = list(w2v_model.wv.index2word)
# word -> row index into the embedding matrix.
word_index = {word: index for index, word in enumerate(vocab_list)}
# Convert a tokenized sentence into a sequence of vocabulary indices.
def get_index(sentence):
    """Map each token of ``sentence`` to its index in the module-level ``word_index``.

    Tokens that are missing from ``word_index`` are skipped silently, so the
    returned list may be shorter than the input.
    """
    return [word_index[token] for token in sentence if token in word_index]

# Encode every tokenized review as a list of vocabulary indices.
X_data = [get_index(tokens) for tokens in sentences]

# Bring every sequence to a fixed length. With the pad_sequences defaults,
# short sequences are left-padded with 0 and over-long sequences lose their
# leading tokens.
# NOTE(review): the padding value 0 is also a valid index in word_index
# (enumerate starts at 0) — confirm that overlap is acceptable.
max_len = 150
X_pad = pad_sequences(X_data, maxlen=max_len)

# Sentiment labels.
Y = data['sentiment'].values

# Hold out 20% of the samples; random_state fixes the shuffle seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    X_pad, Y, test_size=0.2, random_state=42
)

# Word vectors learned by word2vec, used as the embedding weights.
embedding_matrix = w2v_model.wv.vectors
print(embedding_matrix.shape)

(158940, 10)


In [2]:
# --- Build and train the classifier ---
model = Sequential([
    # Frozen embedding layer initialised from the word2vec vectors.
    Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        input_length=max_len,
        weights=[embedding_matrix],
        trainable=False,
    ),
    Flatten(),
    # NOTE(review): no activation is given, so this layer is linear.
    Dense(5),
    Dense(1, activation='sigmoid'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

# The held-out split is passed as validation data during training.
history = model.fit(
    x=X_train,
    y=Y_train,
    validation_data=(X_test, Y_test),
    batch_size=4,
    epochs=10,
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
