# Colab Practice
这个notebook大部分代码来自: https://www.tensorflow.org/tutorials/keras/basic_text_classification

In [1]:
! pip install tensorflow-gpu==2.0.0-alpha0

Collecting tensorflow-gpu==2.0.0-alpha0
[?25l  Downloading https://files.pythonhosted.org/packages/1a/66/32cffad095253219d53f6b6c2a436637bbe45ac4e7be0244557210dc3918/tensorflow_gpu-2.0.0a0-cp36-cp36m-manylinux1_x86_64.whl (332.1MB)
[K    100% |████████████████████████████████| 332.1MB 79kB/s  eta 0:00:01 1% |▋                               | 6.1MB 68.9MB/s eta 0:00:05    25% |████████                        | 83.2MB 54.1MB/s eta 0:00:05████                  | 145.7MB 43.0MB/s eta 0:00:05    44% |██████████████▎                 | 147.8MB 10.9MB/s eta 0:00:17    44% |██████████████▍                 | 148.9MB 11.2MB/s eta 0:00:17    52% |████████████████▊               | 173.3MB 45.7MB/s eta 0:00:04    97% |███████████████████████████████ | 322.3MB 48.9MB/s eta 0:00:01
Collecting google-pasta>=0.1.2 (from tensorflow-gpu==2.0.0-alpha0)
[?25l  Downloading https://files.pythonhosted.org/packages/d0/33/376510eb8d6246f3c30545f416b2263eee461e40940c2a4413c711bdf62d/google_pasta-0.1.7-py3-none

# import相关库


In [2]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.utils import get_file
import numpy as np

# 下载IMDB电影评论数据

注意,在Kaggle Kernel里面numpy版本为1.16.3, `allow_pickle`的默认值被修改了, 会导致错误, 因此需要rework

Ref: https://github.com/keras-team/keras/pull/12714

In [3]:
def load_data(path='imdb.npz', num_words=None, skip_top=0,
              maxlen=None, seed=113,
              start_char=1, oov_char=2, index_from=3, **kwargs):
    """Loads the IMDB dataset.
    # Arguments
        path: where to cache the data (relative to `~/.keras/dataset`).
        num_words: max number of words to include. Words are ranked
            by how often they occur (in the training set) and only
            the most frequent words are kept
        skip_top: skip the top N most frequently occurring words
            (which may not be informative).
        maxlen: sequences longer than this will be filtered out.
        seed: random seed for sample shuffling.
        start_char: The start of a sequence will be marked with this character.
            Set to 1 because 0 is usually the padding character.
        oov_char: words that were cut out because of the `num_words`
            or `skip_top` limit will be replaced with this character.
        index_from: index actual words with this index and higher.
    # Returns
        Tuple of Numpy arrays: `(x_train, y_train), (x_test, y_test)`.
    # Raises
        ValueError: in case `maxlen` is so low
            that no input sequence could be kept.
    Note that the 'out of vocabulary' character is only used for
    words that were present in the training set but are not included
    because they're not making the `num_words` cut here.
    Words that were not seen in the training set but are in the test set
    have simply been skipped.
    """
    # Legacy support
    if 'nb_words' in kwargs:
        warnings.warn('The `nb_words` argument in `load_data` '
                      'has been renamed `num_words`.')
        num_words = kwargs.pop('nb_words')
    if kwargs:
        raise TypeError('Unrecognized keyword arguments: ' + str(kwargs))

    path = get_file(path,
                    origin='https://s3.amazonaws.com/text-datasets/imdb.npz',
                    file_hash='599dadb1135973df5b59232a0e9a887c')
    with np.load(path, allow_pickle=True) as f:
        x_train, labels_train = f['x_train'], f['y_train']
        x_test, labels_test = f['x_test'], f['y_test']

    rng = np.random.RandomState(seed)
    indices = np.arange(len(x_train))
    rng.shuffle(indices)
    x_train = x_train[indices]
    labels_train = labels_train[indices]

    indices = np.arange(len(x_test))
    rng.shuffle(indices)
    x_test = x_test[indices]
    labels_test = labels_test[indices]

    xs = np.concatenate([x_train, x_test])
    labels = np.concatenate([labels_train, labels_test])

    if start_char is not None:
        xs = [[start_char] + [w + index_from for w in x] for x in xs]
    elif index_from:
        xs = [[w + index_from for w in x] for x in xs]

    if maxlen:
        xs, labels = _remove_long_seq(maxlen, xs, labels)
        if not xs:
            raise ValueError('After filtering for sequences shorter than maxlen=' +
                             str(maxlen) + ', no sequence was kept. '
                             'Increase maxlen.')
    if not num_words:
        num_words = max([max(x) for x in xs])

    # by convention, use 2 as OOV word
    # reserve 'index_from' (=3 by default) characters:
    # 0 (padding), 1 (start), 2 (OOV)
    if oov_char is not None:
        xs = [[w if (skip_top <= w < num_words) else oov_char for w in x]
              for x in xs]
    else:
        xs = [[w for w in x if skip_top <= w < num_words]
              for x in xs]

    idx = len(x_train)
    x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
    x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])

    return (x_train, y_train), (x_test, y_test)

In [4]:
# get data

(train_data, train_labels), (test_data, test_labels) = load_data(num_words=10000)

print(train_data[0], train_labels[0])
print('Number of training instances: {0}, number of testing instances: {1}'.format(train_data.shape[0], test_data.shape[0]))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
[1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 2, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 2, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 2, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 2, 8, 4, 107, 117, 5952, 15, 256, 4, 2, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 2, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19,

我们看到的是词索引, 想要看到原本的词需要用词表找回原来的词语

In [5]:
# get vocab
word_to_id = keras.datasets.imdb.get_word_index()
index_from=3
word_to_id = {k:(v+index_from) for k,v in word_to_id.items()}
word_to_id["<PAD>"] = 0
word_to_id["<START>"] = 1
word_to_id["<UNK>"] = 2
id_to_word = {value:key for key,value in word_to_id.items()}

print(' '.join([id_to_word[i] for i in train_data[0]]))

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb_word_index.json
<START> this film was just brilliant casting location scenery story direction everyone's really suited the part they played and you could just imagine being there robert <UNK> is an amazing actor and now the same being director <UNK> father came from the same scottish island as myself so i loved the fact there was a real connection with this film the witty remarks throughout the film were great it was just brilliant so much that i bought the film as soon as it was released for <UNK> and would recommend it to everyone to watch and the fly fishing was amazing really cried at the end it was so sad and you know what they say if you cry at a film it must have been good and this definitely was also <UNK> to the two little boy's that played the <UNK> of norman and paul they were just brilliant children are often left out of the <UNK> list i think because the stars that play them all grown up

接下来我们要对输入进行补全(padding)

补全会导致一部分无用计算, 但是更加方便处理(思考题: 怎样减少无用计算?)

In [6]:
train_data = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_to_id["<PAD>"],
                                                        padding='post',
                                                        maxlen=256)

test_data = keras.preprocessing.sequence.pad_sequences(test_data,
                                                       value=word_to_id["<PAD>"],
                                                       padding='post',
                                                       maxlen=256)

In [7]:
train_data.shape

(25000, 256)

In [8]:
# Problem: Use tf.data to implement input pipeline

# placeholder for implementing dataset
def create_dataset_from_tensor_slices(X, y):
    return tf.data.Dataset.from_tensor_slices((np.array(X), np.array(y)))

def create_dataset_from_generator(X, y):
    def create_gen():
        for single_x, single_y in zip(X, y):
            yield (single_x, single_y)
    output_types = (tf.int32, tf.int32)
    output_shapes = ([256], [])
    return tf.data.Dataset.from_generator(create_gen, output_types=output_types, output_shapes=output_shapes)

def create_dataset_tfrecord(X, y, mode='train'):
    file_name = '{0}.tfrecord'.format(mode)
    
    # serialize features
    # WARNING: DO NOT WRITE MULTITPLE TIMES IN PRACTICE!!! IT'S SLOW!!!
    def _int64_list_feature(value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=value))
    def _int64_feature(value):
        """Returns an int64_list from a bool / enum / int / uint."""
        return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
    def serialize_fn(single_x, single_y):
        feature_tuples = {'feature': _int64_list_feature(single_x), 'label': _int64_feature(single_y)}
        example_proto = tf.train.Example(
            features=tf.train.Features(feature=feature_tuples))
        return example_proto.SerializeToString()
    # write to file
    with tf.io.TFRecordWriter(file_name) as writer:
        for single_x, single_y in zip(X, y):
            example = serialize_fn(single_x, single_y)
            writer.write(example)
            
    # read file
    dataset = tf.data.TFRecordDataset(file_name)
    def parse_fn(example_proto):
        feature_description = {'feature': tf.io.FixedLenFeature([256], tf.int64), 'label': tf.io.FixedLenFeature([], tf.int64)}
        feature_tuple = tf.io.parse_single_example(
            example_proto, feature_description)
        return feature_tuple['feature'], feature_tuple['label']
    dataset = dataset.map(parse_fn)
    return dataset

# train_dataset = create_dataset_from_generator(train_data, train_labels)
# test_dataset = create_dataset_from_generator(test_data, test_labels)

train_dataset = create_dataset_tfrecord(train_data, train_labels)
test_dataset = create_dataset_tfrecord(test_data, test_labels, mode='test')

train_dataset = train_dataset.shuffle(10000).batch(256).prefetch(100).repeat()
test_dataset = test_dataset.batch(256).prefetch(100)
    

# 构建模型

下面就是激动人心的时候了: 写一个文本分类模型!

你需要改写一下下面的模型,让其准确率更高

你可以尝试使用Dropout, CudnnGRU等更加fancy的方法

In [9]:
# Problem: Implement a custom keras layer which has the identical effects of dense, but print the mean
#   of the variables if the mean value is greater than zero. Print for maximum 10 times.

# placeholder for implementing using Functional API or Model Subclassing
class WeirdDense(tf.keras.layers.Layer):

    def __init__(self, output_dim, activation):
        super(WeirdDense, self).__init__()
        self.output_dim = output_dim
        self.activation = activation
        self.print_times = tf.Variable(0, dtype=tf.int32, trainable=False)
        

    def build(self, input_shape):
        # Create a trainable weight variable for this layer.
        self.w = self.add_weight(shape=(input_shape[-1], self.output_dim),
                                 initializer='random_normal',
                                 trainable=True)
        self.b = self.add_weight(shape=(self.output_dim,),
                                 initializer='random_normal',
                                 trainable=True)
    @tf.function
    def call(self, x):
        mean_val = tf.reduce_mean(self.w)
        if tf.greater(mean_val, 0):
            if tf.less_equal(self.print_times, 10):
                tf.print(mean_val)
                self.print_times.assign_add(1)

        return_tensor = self.activation(tf.matmul(x, self.w) + self.b)
        return return_tensor
            

    def compute_output_shape(self, input_shape):
        return (input_shape[0], self.output_dim)

In [12]:
# input shape is the vocabulary count used for the movie reviews (10,000 words)
vocab_size = 10000

model = keras.Sequential()
model.add(keras.layers.Embedding(vocab_size, 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(WeirdDense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 16)                0         
_________________________________________________________________
weird_dense_1 (WeirdDense)   (None, 16)                273       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 160,290
Trainable params: 160,289
Non-trainable params: 1
_________________________________________________________________


如果要使用Keras提供的训练、预测API, 你需要先compile模型, 然后调用该API

In [13]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(train_dataset,
                    epochs=1,
                    steps_per_epoch=100,
                    validation_data=test_dataset,
                    validation_steps=100,
                    verbose=1)

5.81007916e-05
0.000122097204
0.000183319906
0.000183319906
0.000183319906
0.000183319906
0.000183319906
0.000183319906


W0617 08:00:08.996987 139913903105408 training_generator.py:228] Your dataset ran out of data; interrupting training. Make sure that your dataset can generate at least `validation_steps * epochs` batches (in this case, 100 batches). You may need to use the repeat() function when building your dataset.


