In [1]:
import os
imdb_dir = './data/aclImdb'
train_dir = os.path.join(imdb_dir, 'train')

labels = []
texts = []

# Read the raw reviews: every .txt file under train/neg and train/pos is one
# sample; the label is 0 for 'neg' and 1 for 'pos'.
for label_type in ['neg', 'pos']:
    label = 0 if label_type == 'neg' else 1
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore any seeded split of it) reproducible across runs.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
                labels.append(label)

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 1000 # every review is padded/truncated to this many tokens
training_samples = 200 # number of training samples (not referenced by the visible cells)
validation_samples = 10000 # number of validation samples (not referenced by the visible cells)
max_words = 10000 # only the most frequent words are kept

# Convert the texts into integer sequences:
# fit the tokenizer on all texts, map each text to a list of word indices,
# then bring every sequence to length `maxlen`.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)
# Rebinds `labels` from a Python list to a NumPy array.
labels = np.asarray(labels)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [3]:
# Word -> integer index mapping built by the tokenizer; iterated later as
# (word, index) pairs when filling the embedding matrix.
word_index = tokenizer.word_index

In [4]:
# Number of padded samples available before splitting.
len(data)

25000

In [5]:
from sklearn.model_selection import train_test_split
# Both splits use the same settings: hold out 20%, shuffle, fixed seed.
split_options = dict(test_size=0.2, shuffle=True, random_state=42)

# First carve the test set off the full data ...
x_train_val, x_test, y_train_val, y_test = train_test_split(
    data, labels, **split_options)
# ... then divide the remainder into training and validation sets.
x_train, x_val, y_train, y_val = train_test_split(
    x_train_val, y_train_val, **split_options)

In [6]:
# Parse the GloVe embeddings file: each line is a word followed by its
# vector components, separated by whitespace.
embedding_dim = 100
glove_path = './data/glove.6B/glove.6B.%dd.txt' % embedding_dim
embeddings_index = {}
with open(glove_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line (e.g. a trailing newline at end of file) has no
            # word token; skip it instead of failing on values[0].
            continue
        word = values[0]
        embeddings_index[word] = np.asarray(values[1:], dtype='float32')
print('Found %d word vectors.' % len(embeddings_index))

Found 400000 word vaectors.


In [7]:
# Build the embedding matrix: row i receives the GloVe vector of the word whose
# tokenizer index is i. Rows for words missing from GloVe stay all-zero, and
# words with an index of max_words or more are ignored.
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in word_index.items():
    if i >= max_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

In [8]:
# Define the model
import keras
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense, LSTM, Dropout, Bidirectional

# Binary sentiment classifier:
#   Embedding -> bidirectional LSTM -> Dropout -> Dense(64, relu) -> Dropout
#   -> single sigmoid unit.
model = Sequential()
model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
model.add(Bidirectional(LSTM(128, recurrent_dropout=0.1)))
model.add(Dropout(0.25))
# The hidden layer needs a non-linearity: without an activation, this layer
# and the output layer would collapse into a single linear map.
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1000, 100)         1000000   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 256)               234496    
_________________________________________________________________
dropout_1 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                16448     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 1,251,009
Trainable params: 1,251,009
Non-trainable params: 0
_________________________________________________________________


In [9]:
# Load the pre-trained GloVe matrix into the first layer (the Embedding layer)
# and freeze it so that training leaves the vectors untouched.
# This runs before model.compile in the training cell.
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False

In [None]:
# Training
# Checkpoint callback: monitors val_loss and, with save_best_only=True,
# keeps only the best model in 'pre_trained_glove_model.h5'.
best_model_checkpoint = keras.callbacks.ModelCheckpoint(
    filepath='pre_trained_glove_model.h5',
    monitor='val_loss',
    save_best_only=True,
)
callbacks_list = [best_model_checkpoint]

model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])

# The returned History object is used later to plot the learning curves.
history = model.fit(
    x_train, y_train,
    epochs=20,
    batch_size=128,
    callbacks=callbacks_list,
    validation_data=(x_val, y_val),
)

Train on 16000 samples, validate on 4000 samples
Epoch 1/20
  128/16000 [..............................] - ETA: 3:18:08 - loss: 0.7168 - acc: 0.4922

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sb
import matplotlib.pyplot as plt
def plot_confusion_matrix(y_true, y_pred, ax, class_names,
                          vmax=None, normed=True, title='Confusion matrix'):
    """
    Draw a confusion matrix as an annotated heatmap on ``ax``.

    Parameters
    ----------
    y_true, y_pred : array-like
        True and predicted class labels, passed to ``confusion_matrix``.
    ax : matplotlib Axes
        Axes to draw on.
    class_names : sequence of str
        Tick labels for both axes, in ascending order of the class labels
        (the order in which ``confusion_matrix`` arranges rows and columns).
    vmax : float, optional
        Upper bound of the colour scale, forwarded to ``heatmap``.
    normed : bool, default True
        If True, divide every row by its sum so each row shows the fraction
        of that true class assigned to each predicted class.
    title : str
        Axes title.
    """
    matrix = confusion_matrix(y_true, y_pred)
    if normed:
        row_totals = matrix.sum(axis=1, keepdims=True)
        # A class with no true samples has a zero row sum; divide by 1 instead
        # so the row stays all-zero rather than turning into NaN.
        row_totals[row_totals == 0] = 1
        matrix = matrix.astype('float') / row_totals
    # `linewidths` is the keyword heatmap documents for the cell borders.
    sb.heatmap(matrix, vmax=vmax, annot=True, square=True, ax=ax,
               cmap=plt.cm.Blues_r, cbar=False, linecolor='black',
               linewidths=1, xticklabels=class_names)
    ax.set_title(title, y=1.20, fontsize=16)
    ax.set_xlabel('Predicted labels', y=1.10, fontsize=12)
    # Rows of the confusion matrix are the true classes.
    ax.set_ylabel('True labels', fontsize=12)
    ax.set_yticklabels(class_names, rotation=0)

In [None]:
# Predict class labels for the training and test sets.
# Sequential.predict_classes was removed in later Keras/TensorFlow releases;
# for a single sigmoid output it thresholded the predicted probability at 0.5,
# which is reproduced here with the portable model.predict.
y_train_pred = (model.predict(x_train) > 0.5).astype('int32')
y_test_pred = (model.predict(x_test) > 0.5).astype('int32')

In [None]:
# Plot the confusion matrices for the training and test sets.
# Labels are encoded as 0 = negative, 1 = positive, and confusion_matrix
# orders classes ascending, so the tick labels must be listed in that order.
class_names = ['Negative', 'Positive']
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2)
plot_confusion_matrix(y_train, y_train_pred, ax=axis1,
                      title='Confusion matrix (train data)',
                      class_names=class_names)
plot_confusion_matrix(y_test, y_test_pred, ax=axis2,
                      title='Confusion matrix (test data)',
                      class_names=class_names)

In [None]:
# Plot the learning curves: accuracy on the left panel, loss on the right.
fig, (axis1, axis2) = plt.subplots(nrows=1, ncols=2, figsize=(16,6))

# (axes, history key, panel title, legend position) for each panel
panels = [
    (axis1, 'acc', 'Model accuracy', 'upper left'),
    (axis2, 'loss', 'Model loss', 'upper right'),
]
for panel_ax, metric, panel_title, legend_loc in panels:
    # Training curve first, then the matching 'val_'-prefixed validation curve.
    panel_ax.plot(history.history[metric], label='Train', linewidth=3)
    panel_ax.plot(history.history['val_' + metric], label='Validation',
                  linewidth=3)
    panel_ax.set_title(panel_title, fontsize=16)
    panel_ax.set_ylabel(metric)
    panel_ax.set_xlabel('epoch')
    panel_ax.legend(loc=legend_loc)
plt.show()