## Sentiment analysis layer

In [1]:
import os
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers

In [2]:
# 对全局随机数生成种子的设置
tf.random.set_seed(22)
# 使用相同的参数，每次生成的随机数都相同
np.random.seed(22)
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
# startwith('2.') 这个函数用于判断tf.__version__的版本信息是否以'2.0'返回，返回True或者False
# assert 关键字用于判断该关键字后面的表达式返回值，True则不报错，返回False则报错‘AssertionError: ’
assert tf.__version__.startswith('2.')
assert np.__version__.startswith('1.16.2')

In [3]:
batchsz = 512
total_words = 10000    # 设定常用的单位数目为 10000
max_review_len = 80    # 设定每个句子中单词个数的最大值，即可以统一padding为这样的长度
# max_review_len = 100
embedding_len = 100    # 每个单词的编码维度，即用100维的向量表示一个单词

(x_train, y_train), (x_test, y_test) = keras.datasets.imdb.load_data(num_words=total_words)
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_review_len)
x_test = keras.preprocessing.sequence.pad_sequences(x_test, maxlen=max_review_len)

# 对数据集进行切片处理
db_train = tf.data.Dataset.from_tensor_slices((x_train, y_train))
# batch()的参数 drop_remainer 设置为 True 是丢弃最末尾的 batch可能出现不为整数的batch
db_train = db_train.shuffle(1000).batch(batchsz, drop_remainder=True)
db_test = tf.data.Dataset.from_tensor_slices((x_test, y_test))
db_test = db_test.batch(batchsz, drop_remainder=True)
# 打印显示
print('x_train shape: ', x_train.shape)
print('y_train_max: ', tf.reduce_max(y_train))
print('y_train_min: ', tf.reduce_min(y_train))
print('x_test: ', x_test.shape)

x_train shape:  (25000, 80)
y_train_max:  tf.Tensor(1, shape=(), dtype=int64)
y_train_min:  tf.Tensor(0, shape=(), dtype=int64)
x_test:  (25000, 80)


In [4]:
class MyRNN(keras.Model):
    def __init__(self, units):
        super(MyRNN, self).__init__()
        self.embedding = layers.Embedding(total_words, embedding_len, input_length=max_review_len)
        
        self.rnn = keras.Sequential([
            layers.SimpleRNN(units, dropout=0.5, return_sequences=True, unroll=True),
            layers.SimpleRNN(units, dropout=0.5, unroll=True)
        ])
        
        self.outlayer = layers.Dense(1)
        
    def call(self, inputs, training=None):
        x = inputs
        x = self.embedding(x)
        x = self.rnn(x)
        x = self.outlayer(x)
        prob = tf.sigmoid(x)
        
        return prob
    
    

In [5]:
units = 64
# units = 150
epochs = 5
model = MyRNN(units)    # 调用 call 方法
model.compile(optimizer = keras.optimizers.Adam(0.001),
             loss = tf.losses.BinaryCrossentropy(),
             metrics = ['accuracy'])
# 训练
model.fit(db_train, epochs=epochs, validation_data=db_test)
# 测试评估
model.evaluate(db_test)

W0625 20:58:44.233396 139909149611840 deprecation.py:506] From /home/kukafee/environments/tf2_py3/lib/python3.6/site-packages/tensorflow/python/keras/backend.py:4081: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.4924506489187479, 0.82747394]