# Keras建立MLP、RNN、LSTM模型进行IMDb情感分析

## 1 建立MLP模型进行IMDb情感分析

### 1.1 数据预处理

In [1]:

#导入模块
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
#读取IMDb数据
#创建rm_tag函数删除文字中的HTML标签
import re                                  #导入Regular Expression模块
def rm_tags(text):                        #创建rm_tags函数，输入参数text文字
    re_tag = re.compile(r'<[^>]+>')        #创建re_tag为正则表达式变量
    return re_tag.sub('',text)             #将text文字中符合正则表达式条件的字符替换成空字符串

In [3]:
#创建read_files函数读取IMDb文件目录
import os
def read_files(filetype):           #读取训练数据时传入“train”;读取测试数据时传入“test”
    path = "data/aclImdb/"
    file_list=[]
    
    positive_path=path+filetype+"/pos/"        #设置正面评价的文件目录
    for f in os.listdir( positive_path):      #for循环将positive_path目录下所有的文件加入file_list
        file_list+=[ positive_path+f]
    
    negative_path=path+filetype+"/neg/"        #设置负面评价的文件目录
    for f in os.listdir( negative_path):      #for循环将negative_path目录下所有的文件加入file_list
        file_list+=[ negative_path+f]
    
    print('read',filetype,'files:',len(file_list))   #显示当前读取的filetype("train"或“test”)目录下的文件个数
    
    all_labels = ([1]*12500+[0]*12500)   #前12500项是正面，产生12500项1的列表；后12500项是负面，产生12500项0的列表。
    
    all_texts = []
    for fi in file_list:                #读取所有文件
        with open(fi,encoding='utf8') as file_input:         #打开文件
#使用file_input.readlines()读取文件，并使用join连接所有文件的内容，然后使用rm_tags删除tag,最后加入all_texts list
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
    return all_labels,all_texts

In [4]:
#读取训练数据
y_train,train_text=read_files("train")

read train files: 25000


In [5]:
#读取测试数据
y_test,test_text=read_files("test")

read test files: 25000


In [6]:
#建立token
token = Tokenizer(num_words=2000)
token.fit_on_texts(train_text)

In [7]:
#将“影评文字”转换成“数字列表”
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq = token.texts_to_sequences(test_text)

In [8]:
#截长补短让所有“数字列表”的长度都为100
x_train = sequence.pad_sequences(x_train_seq,maxlen=100)
x_test = sequence.pad_sequences(x_test_seq,maxlen=100)

### 1.2 加入嵌入层

Keras提供了嵌入层可以将“数字列表”转换为“向量列表”。

In [9]:
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Flatten
from keras.layers.embeddings import Embedding

In [10]:
#建立模型
modelMPL = Sequential()
#将“嵌入层”加入模型
modelMPL.add(Embedding(output_dim=32,   #输出的维数为32，将“数字列表”转换为32维的向量
                      input_dim=2000,   #输入的维数是2000，建立的字典有2000个单词
                      input_length=100))  #“数值列表”每一项有100个数字
#加入Dropout以避免过拟合
modelMPL.add(Dropout(0.2))

### 1.3 建立MLP模型

In [11]:
#将“平坦层”加入模型
modelMPL.add(Flatten())

In [12]:
#将“隐藏层”加入模型
modelMPL.add(Dense(units=256,            #隐藏层共有256个单元
                   activation='relu'))     #定义激活函数ReLU
modelMPL.add(Dropout(0.35))

In [13]:
#将“输出层”加入模型
modelMPL.add(Dense(units=1,                 #输出层只有1个神经元，输出1代表正面评价，0代表负面评价
                  activation='sigmoid'))    #定义激活函数sigmoid

In [14]:
modelMPL.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


### 1.4 训练模型

In [15]:
#定义训练方式
modelMPL.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [16]:
train_history = modelMPL.fit(x_train,y_train,batch_size=100,
                            epochs=10,verbose=2,
                            validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 8s - loss: 0.4905 - acc: 0.7488 - val_loss: 0.4572 - val_acc: 0.7886
Epoch 2/10
 - 8s - loss: 0.2709 - acc: 0.8884 - val_loss: 0.6275 - val_acc: 0.7236
Epoch 3/10
 - 8s - loss: 0.1641 - acc: 0.9378 - val_loss: 0.6444 - val_acc: 0.7498
Epoch 4/10
 - 8s - loss: 0.0859 - acc: 0.9714 - val_loss: 0.7551 - val_acc: 0.7702
Epoch 5/10
 - 8s - loss: 0.0486 - acc: 0.9836 - val_loss: 1.1073 - val_acc: 0.7300
Epoch 6/10
 - 7s - loss: 0.0362 - acc: 0.9869 - val_loss: 1.0342 - val_acc: 0.7706
Epoch 7/10
 - 8s - loss: 0.0305 - acc: 0.9893 - val_loss: 1.2062 - val_acc: 0.7426
Epoch 8/10
 - 8s - loss: 0.0262 - acc: 0.9902 - val_loss: 1.1010 - val_acc: 0.7816
Epoch 9/10
 - 9s - loss: 0.0248 - acc: 0.9908 - val_loss: 1.2973 - val_acc: 0.7578
Epoch 10/10
 - 8s - loss: 0.0233 - acc: 0.9914 - val_loss: 1.5052 - val_acc: 0.7320


### 1.5 评估模型准确率

In [17]:
scores = modelMPL.evaluate(x_test,y_test,verbose=1)
scores[1]



0.808

### 1.6 进行预测

In [18]:
#执行预测
predict=modelMPL.predict_classes(x_test)

In [19]:
#预测结果
predict[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [20]:
#使用一维数组查看预测结果
predict_classes=predict.reshape(-1)
predict_classes[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

### 1.7 查看测试数据预测结果

之前的预测结果是0与1，创建display_test_Sentiment函数，可以显示负面评价或正面评价。

In [21]:
#创建display_test_Sentiment函数。
SentimentDict={1:'正面的',0:'负面的'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('label真实值：',SentimentDict[y_test[i]],
         '预测结果：',SentimentDict[predict_classes[i]])

In [22]:
#显示第3项数据
display_test_Sentiment(3)

I saw this film in a sneak preview, and it is delightful. The cinematography is unusually creative, the acting is good, and the story is fabulous. If this movie does not do well, it won't be because it doesn't deserve to. Before this film, I didn't realize how charming Shia Lebouf could be. He does a marvelous, self-contained, job as the lead. There's something incredibly sweet about him, and it makes the movie even better. The other actors do a good job as well, and the film contains moments of really high suspense, more than one might expect from a movie about golf. Sports movies are a dime a dozen, but this one stands out. This is one I'd recommend to anyone.
label真实值： 正面的 预测结果： 正面的


In [23]:
#显示第12502项数据
display_test_Sentiment(12502)

First of all I hate those moronic rappers, who could'nt act if they had a gun pressed against their foreheads. All they do is curse and shoot each other and acting like cliché'e version of gangsters.The movie doesn't take more than five minutes to explain what is going on before we're already at the warehouse There is not a single sympathetic character in this movie, except for the homeless guy, who is also the only one with half a brain.Bill Paxton and William Sadler are both hill billies and Sadlers character is just as much a villain as the gangsters. I did'nt like him right from the start.The movie is filled with pointless violence and Walter Hills specialty: people falling through windows with glass flying everywhere. There is pretty much no plot and it is a big problem when you root for no-one. Everybody dies, except from Paxton and the homeless guy and everybody get what they deserve.The only two black people that can act is the homeless guy and the junkie but they're actors by 

### 1.8 查看《美女与野兽》的影评

之前的预测使用的是IMDb数据集的影评文字，接下来使用热门电影《美女与野兽》的影评文字进行预测

《美女与野兽》影评查看网址：http://www.imdb.com/title/tt2771200/reviews

In [24]:
input_text=''' I have re-watched this in theaters this weekend, so I come fresh with this movie in mind. Having said that, my perception of this movie has not changed. I will also add that this story was my favorite Disney story growing up. Having watched it twice now, my experience has remained the same. I still got lost in the story, the imagery, the music, and the singing. The plot was almost completely the same as that of the cartoon version, with a few additions. I very much loved these new additions as they added depth to the story and closed some plot holes. It also helped to better establish the relationship between Belle and the Beast.P.S. Loved the gay millisecond! I don't know what all the fuss was about. '''

In [25]:
#转换成“数字列表”
input_seq = token.texts_to_sequences([input_text])

In [26]:
print(input_seq[0])

[9, 24, 791, 292, 10, 7, 10, 34, 9, 212, 1472, 15, 10, 16, 7, 326, 256, 296, 11, 57, 4, 10, 16, 43, 20, 1189, 9, 76, 77, 758, 11, 10, 61, 12, 57, 510, 909, 61, 1789, 52, 256, 292, 8, 1447, 146, 57, 581, 43, 1, 168, 9, 127, 184, 412, 7, 1, 61, 1, 1, 224, 2, 1, 1115, 1, 110, 12, 216, 336, 1, 168, 13, 11, 4, 1, 1068, 306, 15, 3, 167, 9, 51, 72, 443, 130, 157, 13, 32, 1280, 1133, 5, 1, 61, 2, 45, 110, 1507, 8, 77, 1668, 5, 124, 1, 644, 196, 2, 1, 1683, 586, 443, 1, 987, 9, 88, 120, 47, 28, 1, 12, 40]


In [27]:
len(input_seq[0])

119

In [28]:
#截取“数字列表”使其长度为100
pad_input_seq = sequence.pad_sequences(input_seq,maxlen=100)

In [29]:
len(pad_input_seq[0])

100

In [30]:
#使用多层感知器进行预测
predict_result=modelMPL.predict_classes(pad_input_seq)

In [31]:
#查看预测结果
predict_result

array([[1]])

In [32]:
#读取预测结果中的元素
predict_result[0][0]

1

In [33]:
SentimentDict[predict_result[0][0]]

'正面的'

### 1.9 使用较大字典提高准确率

建立字典的单词数：原本为1000个单词的字典，增加为建立有3800个单词的字典

“数字列表”截长补短的长度：原本“数字列表”的长度都是100个数字，现在改为380个数字。

In [52]:
#读取所有文章建立字典，限制字典单词数为3800
tokenLage = Tokenizer(num_words=3800)

In [53]:
tokenLage.fit_on_texts(train_text)

In [54]:
#将文字转为数字序列
xLage_train_seq = tokenLage.texts_to_sequences(train_text)
xLage_test_seq = tokenLage.texts_to_sequences(test_text)

In [55]:
#截长补短，让所有影评所产生的数字序列长度一样
xLage_train=sequence.pad_sequences(xLage_train_seq,maxlen=380)
xLage_test=sequence.pad_sequences(xLage_test_seq,maxlen=380)

In [56]:
modelLage = Sequential()

In [57]:
modelLage.add(Embedding(output_dim=32,
                       input_dim=3800,
                       input_length=380))
modelLage.add(Dropout(0.2))

In [58]:
modelLage.add(Flatten())

In [59]:
modelLage.add(Dense(units=256,
                   activation='relu'))
modelLage.add(Dropout(0.2))

In [60]:
modelLage.add(Dense(units=1,
                   activation='sigmoid'))

In [61]:
modelLage.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_5 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 12160)             0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               3113216   
_________________________________________________________________
dropout_6 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 257       
Total params: 3,235,073
Trainable params: 3,235,073
Non-trainable params: 0
_________________________________________________________________


In [62]:
#整理为函数
padLage_input_seq=sequence.pad_sequences(input_seq,maxlen=380)
def predictLage_review(input_text):
    input_seq=tokenLage.texts_to_sequences({input_text})
    padLage_input_seq = sequence.pad_sequences(input_seq,maxlen=380)
    predictLage_result = modelLage.predict_classes(padLage_input_seq)
    print(SentimentDict[predictLage_result[0][0]])

In [65]:
#定义训练方式
modelLage.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])

In [66]:
trainLage_history = modelLage.fit(xLage_train,y_train,batch_size=100,
                            epochs=10,verbose=2,
                            validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 26s - loss: 0.4704 - acc: 0.7624 - val_loss: 0.4493 - val_acc: 0.8086
Epoch 2/10
 - 27s - loss: 0.1868 - acc: 0.9289 - val_loss: 0.6750 - val_acc: 0.7314
Epoch 3/10
 - 28s - loss: 0.0668 - acc: 0.9783 - val_loss: 0.7776 - val_acc: 0.7736
Epoch 4/10
 - 26s - loss: 0.0265 - acc: 0.9922 - val_loss: 1.0905 - val_acc: 0.7346
Epoch 5/10
 - 25s - loss: 0.0161 - acc: 0.9952 - val_loss: 1.2954 - val_acc: 0.7354
Epoch 6/10
 - 25s - loss: 0.0147 - acc: 0.9955 - val_loss: 1.1779 - val_acc: 0.7662
Epoch 7/10
 - 26s - loss: 0.0160 - acc: 0.9942 - val_loss: 1.0140 - val_acc: 0.7846
Epoch 8/10
 - 26s - loss: 0.0163 - acc: 0.9942 - val_loss: 1.2774 - val_acc: 0.7626
Epoch 9/10
 - 25s - loss: 0.0162 - acc: 0.9944 - val_loss: 1.1075 - val_acc: 0.8016
Epoch 10/10
 - 25s - loss: 0.0144 - acc: 0.9950 - val_loss: 1.1448 - val_acc: 0.8116


In [67]:
#评估模型准确率
scores = modelLage.evaluate(xLage_test,y_test,verbose=1)
scores[1]



0.85072

由以上结果可知，字典数增加为3800，并且“数字列表”的长度增加为380。训练时间变长，但是准确率有所提高

## 2 建立RNN模型进行IMDb情感分析

为什么使用RNN模型？

MNIST数据集（识别数字图形）、Cifar数据集（识别照片）图像并不会随着时间而改变，所以使用MPL或CNN都具有较好的效果。而人工智能所要解决的问题很多是顺序性的，例如自然语言处理（同一时间只能听到一个字，之前的语言会影响之后语言的含义）、视频图像处理（视频是一张张照片，依照时间顺序组成）、气象观测数据（信息随时间不断改变）和股票交易数据（股市开盘后，股价随着时间不断变动）。

MLP或CNN都只能依照当前的状态进行识别，如果要处理时间序列的问题，就必须使用RNN与LSTM模型。

RNN模型的原理是将神经元的输出再接回神经元的输入。这样的设计使神经网络具备“记忆”功能。

In [84]:
#建立模型
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import SimpleRNN

In [85]:
modelRNN = Sequential()

In [86]:
#将“嵌入层”加入模型
modelRNN.add(Embedding(output_dim=32,   #输出的维数为32，将“数字列表”转换为32维的向量
                      input_dim=2000,   #输入的维数是2000，建立的字典有2000个单词
                      input_length=100))  #“数值列表”每一项有100个数字
#加入Dropout以避免过拟合
modelRNN.add(Dropout(0.2))

In [87]:
#RNN层
modelRNN.add(SimpleRNN(units=16))

In [88]:
modelRNN.add(Dense(units=256,activation='relu'))
modelRNN.add(Dropout(0.35))

In [89]:
modelRNN.add(Dense(units=1,activation='sigmoid'))

In [90]:
modelRNN.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_10 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 16)                784       
_________________________________________________________________
dense_9 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_11 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 257       
Total params: 69,393
Trainable params: 69,393
Non-trainable params: 0
_________________________________________________________________


In [91]:
#定义训练方式
modelRNN.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
trainRNN_history = modelRNN.fit(x_train,y_train,batch_size=100,
                            epochs=10,verbose=2,
                            validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 7s - loss: 0.5349 - acc: 0.7331 - val_loss: 0.5267 - val_acc: 0.7564
Epoch 2/10
 - 6s - loss: 0.3646 - acc: 0.8437 - val_loss: 0.6378 - val_acc: 0.7280
Epoch 3/10
 - 6s - loss: 0.3166 - acc: 0.8696 - val_loss: 0.4076 - val_acc: 0.8226
Epoch 4/10
 - 6s - loss: 0.2794 - acc: 0.8854 - val_loss: 0.3549 - val_acc: 0.8604
Epoch 5/10
 - 7s - loss: 0.2469 - acc: 0.9006 - val_loss: 0.6791 - val_acc: 0.7198
Epoch 6/10
 - 7s - loss: 0.2125 - acc: 0.9163 - val_loss: 0.5516 - val_acc: 0.7864
Epoch 7/10
 - 6s - loss: 0.1866 - acc: 0.9274 - val_loss: 0.6771 - val_acc: 0.7564
Epoch 8/10
 - 6s - loss: 0.1569 - acc: 0.9387 - val_loss: 0.6065 - val_acc: 0.7984
Epoch 9/10
 - 6s - loss: 0.1349 - acc: 0.9480 - val_loss: 0.8218 - val_acc: 0.7720
Epoch 10/10
 - 6s - loss: 0.1110 - acc: 0.9590 - val_loss: 1.4831 - val_acc: 0.6412


In [92]:
scores = modelRNN.evaluate(x_test,y_test,verbose=1)
scores[1]



0.77768

## 3 建立LSTM模型进行IMDb情感分析

长短期记忆（Long Short Term Memory,LSTM）也是一种时间递归神经网络，专门设计用来解决RNN的长期依赖问题。

长期依赖问题就是在每一个时间的间隔不断增大时，RNN会丧失学习到连接到远处的信息的能力。

简单的说，RNN只有短期的记忆，没有长期的记忆。在LSTM神经网络中，每一个神经元相当于一个记忆细胞

In [93]:
#建立模型
from keras.models import Sequential
from keras.layers.core import Dense,Dropout,Activation,Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

In [94]:
modelLSTM = Sequential()

In [95]:
#将“嵌入层”加入模型
modelLSTM.add(Embedding(output_dim=32,   #输出的维数为32，将“数字列表”转换为32维的向量
                      input_dim=2000,   #输入的维数是2000，建立的字典有2000个单词
                      input_length=100))  #“数值列表”每一项有100个数字
#加入Dropout以避免过拟合
modelLSTM.add(Dropout(0.2))

In [96]:
#LSTM层
modelLSTM.add(LSTM(32))

In [98]:
modelLSTM.add(Dense(units=256,activation='relu'))
modelLSTM.add(Dropout(0.35))

In [99]:
modelLSTM.add(Dense(units=1,activation='sigmoid'))

In [100]:
modelLSTM.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_12 (Dropout)         (None, 100, 32)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_11 (Dense)             (None, 256)               8448      
_________________________________________________________________
dropout_13 (Dropout)         (None, 256)               0         
_________________________________________________________________
dense_12 (Dense)             (None, 256)               65792     
_________________________________________________________________
dropout_14 (Dropout)         (None, 256)               0         
__________

In [101]:
#定义训练方式
modelLSTM.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=['accuracy'])
trainLSTM_history = modelLSTM.fit(x_train,y_train,batch_size=100,
                            epochs=10,verbose=2,
                            validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 18s - loss: 0.4802 - acc: 0.7607 - val_loss: 0.4982 - val_acc: 0.7426
Epoch 2/10
 - 15s - loss: 0.3300 - acc: 0.8612 - val_loss: 0.5539 - val_acc: 0.7070
Epoch 3/10
 - 15s - loss: 0.3093 - acc: 0.8711 - val_loss: 0.3143 - val_acc: 0.8600
Epoch 4/10
 - 15s - loss: 0.2860 - acc: 0.8829 - val_loss: 0.5366 - val_acc: 0.7394
Epoch 5/10
 - 15s - loss: 0.2713 - acc: 0.8892 - val_loss: 0.5807 - val_acc: 0.7584
Epoch 6/10
 - 15s - loss: 0.2606 - acc: 0.8919 - val_loss: 0.5601 - val_acc: 0.7782
Epoch 7/10
 - 15s - loss: 0.2490 - acc: 0.9006 - val_loss: 0.4166 - val_acc: 0.8190
Epoch 8/10
 - 15s - loss: 0.2374 - acc: 0.9029 - val_loss: 0.5222 - val_acc: 0.7658
Epoch 9/10
 - 15s - loss: 0.2273 - acc: 0.9070 - val_loss: 0.5245 - val_acc: 0.7634
Epoch 10/10
 - 15s - loss: 0.2211 - acc: 0.9120 - val_loss: 0.5138 - val_acc: 0.7946


In [103]:
scores = modelLSTM.evaluate(x_test,y_test,verbose=1)
scores[1]



0.83392

由上述结果可知，LSTM模型的准确率较RNN模型有所提升。