# 跟着项目学机器学习--IMDB影评数据情感分析

In [10]:
from tensorflow import keras

## 数据集导入
导入IMDB数据集，并且简单分析数据，以及样本的分布情况。主要从以下几点分析数据：
1. vocab 字典的大小
2. 训练集和测试集的数据量，以及数据是否等长
3. 标记样本的分布情况

In [1]:
from tensorflow.keras.datasets import imdb

In [8]:
# 加载词索引文件
# 词索引是一个字典，将每个单词转换为一个正整数
# key：word
# value：index
# imdb.get_word_index()

word_index = imdb.get_word_index()
print("打印出字典的元素总数：", len(word_index))

打印出字典的元素总数： 88584


In [9]:
# 加载训练集和测试集
max_features = 20000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [10]:
# 查看数据维度
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((25000,), (25000,), (25000,), (25000,))

In [11]:
# 查看每个数据是否等长
len(x_train[5]), len(x_train[7]), len(x_train[100])

(43, 562, 158)

In [12]:
# 查看样本数据
x_train[:2], y_train[:2]

(array([list([1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32]),
        list([1, 194, 1153, 194, 

In [13]:
# 查看标签的分布情况
import pandas as pd

print(f"训练样本的标签分布：\n {pd.Series(y_train).value_counts()}")
print(f"测试样本的标签分布：\n {pd.Series(y_test).value_counts()}")

训练样本的标签分布：
 1    12500
0    12500
dtype: int64
测试样本的标签分布：
 1    12500
0    12500
dtype: int64


## 查看样本的数据形态
由于IMDB数据已经做了word to index， 转为正整数了。我们想查看数据，需要根据字典“翻译”成原始文本。
1. 我们需要将word_index 转换为 index_word
2. 根据index_word 将样本的转为文本数据

In [14]:
# 根据word_index 得到index_word
index_word = {v: k for k, v in word_index.items()}

In [15]:
# 定义函数，将制定的样本idx 转为文本（英文需要空格分割）
def transform2text(idx: int):
    data = x_train[idx]
    return " ".join([index_word[i] for i in data])

In [21]:
transform2text(100), y_train[100]

("the was rather is him completely br english send to one dvd for kind way are year type but tired talent of am stories slightest coop on her no was although some has as was garbage che's that to to when it as if is herself br bloodsuckers door simply to picture 25 for he silent to holy dramatically to bigger reason was then does sorry very not reason as it out is herself br film's for with and are of tension 4 of human br english send in could is again outrageous movies episode we could that elements for was nothing laugh has of holy laughing lot not me in perfect and of totally most only dreary 2 one an this an as it is fight harry storyline to action much one out will half this of and setting place movie is guide was fight wonderful have then zombies man sense are as am some br didn't",
 0)

## 构建模型
这是一个典型的NLP任务，根据序列预测分类。通常处理序列问题都采用RNN（循环神经网络）。当然读者也可以尝试别的算法，如Text-CNN也常常用于文本分类问题。或者更高级的预训练等模型。这些内容在后续的项目中都会涉及到。

In [22]:
from tensorflow.keras import layers, Model, Input

In [39]:
class Net:
    def __init__(self):
        self.embedd = layers.Embedding(max_features, 50)
        self.lstm1 = layers.LSTM(units=64, return_sequences=True, activation='tanh')
        self.lstm2 = layers.LSTM(units=64, activation='tanh')
        self.dense = layers.Dense(64, activation='tanh')
        self.sigmoid = layers.Dense(1, activation='sigmoid')
        
        self.model: Model = None
            
    def build_model(self):
        x_in = Input(shape=(None,))
        x = self.embedd(x_in)
        x = self.lstm1(x)
        x = self.lstm2(x)
        x = self.dense(x)
        out = self.sigmoid(x)
        return Model(x_in, out)
    
model = Net().build_model()
model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding_5 (Embedding)      (None, None, 50)          1000000   
_________________________________________________________________
lstm_10 (LSTM)               (None, None, 64)          29440     
_________________________________________________________________
lstm_11 (LSTM)               (None, 64)                33024     
_________________________________________________________________
dense_10 (Dense)             (None, 64)                4160      
_________________________________________________________________
dense_11 (Dense)             (None, 1)                 65        
Total params: 1,066,689
Trainable params: 1,066,689
Non-trainable params: 0
_________________________________________________

In [25]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [26]:
# 预处理数据集
x_train = pad_sequences(x_train, maxlen=200)
x_test = pad_sequences(x_test, maxlen=200)

In [43]:
x_train.shape

(25000, 200)

In [36]:
from tensorflow.keras.optimizers import Adam

In [40]:
# 编译模型
model.compile(optimizer=Adam(learning_rate=0.0001), loss=keras.losses.BinaryCrossentropy(), metrics=['accuracy'])

In [41]:
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=32)

# Train on 25000 samples, validate on 25000 samples
# Epoch 1/20
# 25000/25000 [==============================] - 6s 251us/sample - loss: 0.4965 - accuracy: 0.7460 - val_loss: 0.4395 - val_accuracy: 0.7946
# Epoch 2/20
# 25000/25000 [==============================] - 6s 227us/sample - loss: 0.3138 - accuracy: 0.8690 - val_loss: 0.4258 - val_accuracy: 0.8057
# Epoch 3/20
# 25000/25000 [==============================] - 6s 232us/sample - loss: 0.1754 - accuracy: 0.9342 - val_loss: 0.6682 - val_accuracy: 0.7586
# Epoch 4/20
# 25000/25000 [==============================] - 6s 230us/sample - loss: 0.0663 - accuracy: 0.9775 - val_loss: 0.9378 - val_accuracy: 0.7436
# Epoch 5/20
# 25000/25000 [==============================] - 6s 231us/sample - loss: 0.0177 - accuracy: 0.9941 - val_loss: 1.5410 - val_accuracy: 0.7570
# Epoch 6/20
# 25000/25000 [==============================] - 6s 229us/sample - loss: 0.0064 - accuracy: 0.9981 - val_loss: 2.0868 - val_accuracy: 0.7553
# Epoch 7/20
# 25000/25000 [==============================] - 6s 234us/sample - loss: 0.0010 - accuracy: 0.9996 - val_loss: 2.8559 - val_accuracy: 0.7558
# Epoch 8/20
# 25000/25000 [==============================] - 6s 232us/sample - loss: 8.3974e-04 - accuracy: 0.9998 - val_loss: 3.3243 - val_accuracy: 0.7618

Train on 25000 samples, validate on 25000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f87db10d610>

In [None]:
## 模型已经过拟合了
## 在训练集上达到97%的准确率，而在测试集上只有85%