In [None]:
from tensorflow.keras.datasets import imdb
# Load the IMDB reviews already encoded as integer word IDs; num_words=10000
# limits the vocabulary to the 10,000 most frequent words (see the Keras
# imdb.load_data documentation).
(X_train, y_train), (X_test, y_test) = imdb.load_data(num_words=10000)


Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


In [None]:
# Peek at the first 10 word IDs of the second training review.
X_train[1][:10]

[1, 194, 1153, 194, 8255, 78, 228, 5, 6, 1463]

In [None]:
from tensorflow import keras

# Build the reverse mapping (ID -> word). IDs from the word index are shifted
# by 3 so that IDs 0, 1 and 2 stay free for the special tokens added below.
word_index = keras.datasets.imdb.get_word_index()
id_to_word = {index + 3: word for word, index in word_index.items()}
# enumerate() yields (0, "<pad>"), (1, "<sos>"), (2, "<unk>") pairs.
id_to_word.update(enumerate(("<pad>","<sos>","<unk>")))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json


In [None]:
# Decode the first 10 word IDs of the second training review back into text.
" ".join(id_to_word[word_id] for word_id in X_train[1][:10])

'<sos> big hair big boobs bad music and a giant'

In [None]:
import tensorflow_datasets as tfds

# Fetch the raw-text IMDB reviews. as_supervised=True requests (text, label)
# pairs and with_info=True additionally returns the dataset metadata object.
datasets, info = tfds.load(
    "imdb_reviews",
    as_supervised=True,
    with_info=True,
)
# Number of training examples as reported by the dataset metadata.
train_size = info.splits["train"].num_examples

Downloading and preparing dataset 80.23 MiB (download: 80.23 MiB, generated: Unknown size, total: 80.23 MiB) to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0...


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Generating splits...:   0%|          | 0/3 [00:00<?, ? splits/s]

Generating train examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK4HKE9/imdb_reviews-train.tfrecord…

Generating test examples...:   0%|          | 0/25000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK4HKE9/imdb_reviews-test.tfrecord*…

Generating unsupervised examples...:   0%|          | 0/50000 [00:00<?, ? examples/s]

Shuffling /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0.incompleteK4HKE9/imdb_reviews-unsupervised.t…

Dataset imdb_reviews downloaded and prepared to /root/tensorflow_datasets/imdb_reviews/plain_text/1.0.0. Subsequent calls will reuse this data.


In [None]:
import tensorflow as tf

def preprocess(X_batch, y_batch):
  """Turn a batch of raw review strings into a padded tensor of words.

  Steps, applied to every review in the batch:
    1. keep only the first 300 units of the string (tf.strings.substr),
    2. replace HTML line-break tags with a space,
    3. replace every character other than ASCII letters and apostrophes
       with a space,
    4. split on whitespace,
    5. densify the ragged result, padding short reviews with b"<pad>".

  Args:
    X_batch: batch of review strings (assumed to be a 1-D string tensor --
      TODO confirm against the dataset element spec).
    y_batch: labels for the batch; returned untouched.

  Returns:
    A `(padded_words, y_batch)` tuple.
  """
  truncated = tf.strings.substr(X_batch, 0, 300)
  without_breaks = tf.strings.regex_replace(truncated, b"<br\\s*/?>", b" ")
  letters_only = tf.strings.regex_replace(without_breaks, b"[^a-zA-Z']", b" ")
  words = tf.strings.split(letters_only)
  padded_words = words.to_tensor(default_value=b"<pad>")
  return padded_words, y_batch

In [None]:
# Preview the preprocessing on a single batch of 3 reviews: print the whole
# padded batch first, then each review on its own after a separator line.
preview_batches = datasets["train"].batch(3).map(preprocess)
for sample_batch, _ in preview_batches:
  print(sample_batch)
  for review_words in sample_batch:
    print('*'*30)
    print(review_words)
  break  # only the first batch is needed for the preview

接下来我们需要构建一个词汇表：这需要完整遍历一次训练集，并使用 `Counter` 对每个单词出现的次数进行计数。

In [None]:
from collections import Counter
import numpy
# Count how often every word (including the b"<pad>" filler added by
# preprocess) occurs across one full pass over the training set.
vocabullary = Counter()
for word_batch, _ in datasets["train"].batch(32).map(preprocess):
  # Iterating the batch's NumPy array yields one row of words per review.
  for review_words in word_batch.numpy():
    vocabullary.update(review_words)

In [None]:
# Show only the number of distinct tokens; echoing the whole Counter would
# flood the cell output with one entry per token.
len(vocabullary)

In [None]:
# The three most frequent tokens together with their counts.
vocabullary.most_common(3)

[(b'<pad>', 214309), (b'the', 61137), (b'a', 38564)]

In [None]:
# Keep only the `vocab_size` most frequent tokens, ordered from most to
# least common; the counts themselves are dropped.
vocab_size = 10000
truncated_vocabulary = [word for word, _ in vocabullary.most_common(vocab_size)]

In [None]:
# Display only the head of the list; the full vocabulary has up to
# `vocab_size` entries and would swamp the cell output.
truncated_vocabulary[:10]

现在我们需要添加一个预处理步骤，以把每个单词替换为其ID（即其在词汇表中的索引）

In [None]:
# Number of extra hash buckets used for words missing from the vocabulary.
num_oov_buckets = 1000

# Keys are the vocabulary words, values are their positions in the list.
words = tf.constant(truncated_vocabulary)
word_ids = tf.range(len(truncated_vocabulary), dtype=tf.int64)

# Initializer describing the word -> ID key/value mapping.
vocab_init = tf.lookup.KeyValueTensorInitializer(keys=words, values=word_ids)

# Lookup table: known words map to their ID, unknown words are assigned to
# one of the out-of-vocabulary buckets.
table = tf.lookup.StaticVocabularyTable(
    vocab_init,
    num_oov_buckets=num_oov_buckets,
)

In [None]:
# Spot-check: the word stored at vocabulary index 1055.
words[1055]

<tf.Tensor: shape=(), dtype=string, numpy=b'setting'>

In [None]:
# Display the ID tensor: consecutive int64 values, one per vocabulary word.
word_ids

<tf.Tensor: shape=(10000,), dtype=int64, numpy=array([   0,    1,    2, ..., 9997, 9998, 9999])>

In [None]:
# NOTE(review): this cell repeats the previous `word_ids` display verbatim.
word_ids

<tf.Tensor: shape=(10000,), dtype=int64, numpy=array([   0,    1,    2, ..., 9997, 9998, 9999])>

In [None]:
# Look up a tiny batch of one sentence; the deliberately misspelled last word
# is not in the vocabulary, so it falls into an out-of-vocabulary bucket.
sample_words = tf.constant([b"This movie was faatastic".split()])
table.lookup(sample_words)

<tf.Tensor: shape=(1, 4), dtype=int64, numpy=array([[   22,    12,    11, 10592]])>

现在我们准备创建最终的训练集。我们先对评论进行批处理，然后使用 preprocess() 函数将它们转换为单词的短序列，最后使用简单的 encode_words() 函数对这些单词进行编码。

In [None]:
def encode_words(X_batch, y_batch):
  """Replace every word in the batch with its ID from the lookup `table`.

  Args:
    X_batch: batch of words, as produced by `preprocess`.
    y_batch: labels for the batch; returned untouched.

  Returns:
    A `(word_ids, y_batch)` tuple.
  """
  encoded_batch = table.lookup(X_batch)
  return encoded_batch, y_batch

In [None]:
# Intermediate check: reviews batched by 32 and split into padded words (not
# yet encoded as IDs). The trailing expression displays the dataset spec.
batched_reviews = datasets["train"].batch(32)
train_set = batched_reviews.map(preprocess)
train_set

<_MapDataset element_spec=(TensorSpec(shape=(None, None), dtype=tf.string, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
# Final training pipeline: batch -> split into padded words -> word IDs,
# prefetching one batch ahead.
train_set = (
    datasets["train"]
    .batch(32)
    .map(preprocess)
    .map(encode_words)
    .prefetch(1)
)

In [None]:
import keras

# Dimensionality of the learned word embeddings.
embed_size = 128

# Baseline sentiment classifier without masking: embedding -> two stacked GRU
# layers -> single sigmoid unit. The embedding needs one row per vocabulary
# word plus one per out-of-vocabulary bucket.
model = keras.models.Sequential()
model.add(
    keras.layers.Embedding(
        vocab_size + num_oov_buckets,
        embed_size,
        input_shape=[None],  # variable-length sequences
    )
)
# The first GRU returns the full sequence so the second GRU can consume it.
model.add(keras.layers.GRU(128, return_sequences=True))
model.add(keras.layers.GRU(128))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


掩码屏蔽（Masking）是一种在神经网络中处理可变长度输入数据的技术。在自然语言处理任务中，输入的文本序列的长度可能不同，为了能够将其输入到神经网络中，需要对所有句子进行填充，使得它们的长度相等。但是这样做会导致模型计算了填充部分的权重，从而影响了模型的训练效果。因此，可以通过掩码屏蔽将填充部分的权重屏蔽掉，只计算有意义的部分。

掩码屏蔽的本质是对于每个填充值增加一层掩码，让神经网络在计算时忽略这些填充值，只考虑有意义的数据。在 TensorFlow 中，可以通过 Masking 层或 mask 参数实现掩码屏蔽，其原理是将填充值对应的元素的 mask 标记设置为 0，即不参与计算。

掩码屏蔽的好处是可以使神经网络更加有效地处理可变长度的数据，同时减少计算量，提高模型的训练效果。在自然语言处理任务中，掩码屏蔽通常被广泛应用于文本分类、情感分析等任务中。

In [None]:
K = keras.backend

# Functional-API variant of the GRU classifier with an explicit padding mask.
inputs = keras.layers.Input(shape=[None])

# True wherever the word ID differs from 0. This assumes ID 0 is the padding
# token (b"<pad>" being the most frequent vocabulary entry) -- TODO confirm.
mask = keras.layers.Lambda(lambda token_ids: K.not_equal(token_ids, 0))(inputs)

embedded = keras.layers.Embedding(vocab_size + num_oov_buckets, embed_size)(inputs)
# Both recurrent layers receive the same mask.
sequence_states = keras.layers.GRU(128, return_sequences=True)(embedded, mask=mask)
final_state = keras.layers.GRU(128)(sequence_states, mask=mask)
outputs = keras.layers.Dense(1, activation="sigmoid")(final_state)

model = keras.Model(inputs=[inputs], outputs=[outputs])

In [None]:
# Train the masked model with the same loss, optimizer and metric as the
# baseline model.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(train_set, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


## 重用预训练的嵌入

In [None]:
import tensorflow_hub as hub

# Classifier built on a sentence-embedding module loaded from TF Hub: the hub
# layer takes raw strings (scalar string input per example) and is declared to
# output 50-dimensional vectors, followed by a small dense head.
model = keras.Sequential()
model.add(
    hub.KerasLayer(
        "https://tfhub.dev/google/tf2-preview/nnlm-en-dim50/1",
        dtype=tf.string,
        input_shape=[],
        output_shape=[50],
    )
)
model.add(keras.layers.Dense(128, activation="relu"))
model.add(keras.layers.Dense(1, activation="sigmoid"))

model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["accuracy"],
)

In [None]:
batch_size = 32

# Reload the raw-text dataset; the hub model consumes the review strings
# directly, so no word-level preprocessing is applied here.
datasets, info = tfds.load("imdb_reviews", as_supervised=True, with_info=True)
train_size = info.splits['train'].num_examples

raw_train = datasets['train']
train_set = raw_train.batch(batch_size).prefetch(1)
history = model.fit(train_set, epochs=20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [None]:
# Evaluate on the held-out test split, batched the same way as the training
# data, then report both metrics.
test_set = datasets['test'].batch(batch_size).prefetch(1)
test_loss, test_accuracy = model.evaluate(test_set)
for label, value in (('Test Loss:', test_loss), ('Test Accuracy:', test_accuracy)):
  print(label, value)


Test Loss: 0.5097341537475586
Test Accuracy: 0.7498800158500671
