In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf

from tensorflow import keras

!pip install -q tensorflow-datasets
import tensorflow_datasets as tfds
tfds.disable_progress_bar()

import numpy as np


In [4]:
# Load the IMDB reviews, pre-encoded with an ~8k subword vocabulary.
#   split         -> return the train and test datasets together as a tuple
#   as_supervised -> yield (example, label) pairs instead of a dictionary
#   with_info     -> also return the dataset 'info' structure
datasets, info = tfds.load(
    'imdb_reviews/subwords8k',
    split=(tfds.Split.TRAIN, tfds.Split.TEST),
    as_supervised=True,
    with_info=True,
)
train_data, test_data = datasets

[1mDownloading and preparing dataset imdb_reviews (80.23 MiB) to /home/george/tensorflow_datasets/imdb_reviews/subwords8k/0.1.0...[0m


KeyboardInterrupt: 

In [None]:
# Try the encoder.
# The dataset `info` includes the text encoder
# (a tfds.features.text.SubwordTextEncoder).

encoder = info.features['text'].encoder
# The attribute is spelled `vocab_size` (the model cell reads the same
# attribute); the previous `vacab_size` typo raised AttributeError.
print('Vocabulary size:{}'.format(encoder.vocab_size))


In [None]:
# This text encoder will reversibly encode any string.
sample_string = 'Hello TensorFlow'

encoded_string = encoder.encode(sample_string)
print('Encoded string is {}'.format(encoded_string))

original_string = encoder.decode(encoded_string)
print('The original string: "{}"'.format(original_string))

# Round-trip check: decoding the encoded ids must give back the input.
# (Previously misspelled as `aser`, which is a SyntaxError.)
assert original_string == sample_string

In [None]:
# Show the text fragment that each encoded id decodes to on its own.
for token_id in encoded_string:
    piece = encoder.decode([token_id])
    print('{}----->{}'.format(token_id, piece))

In [None]:
# Explore the data: look at the first training review.
# The loop variable names are kept because `train_example` is read again
# by the decode cell that follows.
first_review = train_data.take(1)
for train_example, train_label in first_review:
    # Only the first 10 encoded ids are shown to keep the output short.
    print('Encoded text:', train_example[:10].numpy())
    print('label:', train_label.numpy())

In [None]:
# The (info) structure contains the encoder/decoder. The
# encoder can be used to recover the original text.
# `train_example` is the loop variable left over from the preceding
# `train_data.take(1)` loop, so that cell has to run before this one.
encoder.decode(train_example)

In [None]:
# PREPARE THE DATA FOR TRAINING
# The reviews have different lengths, so batch with `padded_batch`, which
# zero-pads the sequences while batching.

BUFFER_SIZE = 1000
BATCH_SIZE = 32
# Each element is an (encoded_text, label) pair (loaded with
# as_supervised=True): pad the variable-length text ([None]) and leave the
# label unpadded ([]) -- assumes the label is a scalar, as in IMDB.
# Spelling the shapes out avoids `train_data.output_shapes`, which is not
# available on TF 2.x `tf.data.Dataset` objects, and stops the test pipeline
# from borrowing the *training* dataset's shapes.
PADDED_SHAPES = ([None], [])

train_batches = (
    train_data
    .shuffle(BUFFER_SIZE)
    .padded_batch(BATCH_SIZE, padded_shapes=PADDED_SHAPES)
)

test_batches = (
    test_data
    .padded_batch(BATCH_SIZE, padded_shapes=PADDED_SHAPES)
)

In [None]:
# Every batch is shaped (batch_size, sequence_length). Padding is applied
# per batch, so sequence_length can differ from one batch to the next.
sample_batches = train_batches.take(2)
for example_batch, label_batch in sample_batches:
    print('Batch shape: ', example_batch.shape)
    print('label shape: ', label_batch.shape)

In [None]:
# Build the model
model = keras.Sequential([
    # One 16-dimensional embedding vector per vocabulary entry.
    keras.layers.Embedding(encoder.vocab_size, 16),
    # Average the embeddings over the sequence dimension so reviews of any
    # length produce a fixed-size vector.
    keras.layers.GlobalAveragePooling1D(),
    # Single sigmoid unit for the binary label.
    keras.layers.Dense(1, activation='sigmoid'),
])

# `summary()` prints the table itself and returns None, so wrapping it in
# print() only added a stray "None" line to the output.
model.summary()

model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# TRAIN THE MODEL
# Ten epochs over the training batches; validation after each epoch uses
# 30 batches drawn from `test_batches`.
fit_options = dict(
    epochs=10,
    validation_data=test_batches,
    validation_steps=30,
)
history = model.fit(train_batches, **fit_options)

In [None]:
# Evaluate the model on the full set of test batches.
# The model was compiled with a single 'accuracy' metric, so the result
# unpacks into exactly two values.
evaluation = model.evaluate(test_batches)
loss, accuracy = evaluation

In [None]:
# Create a graph of accuracy and loss over time.
# `history` is the object returned by `model.fit`; the plotting cells look up
# entries of its `.history` attribute by name ('accuracy', 'loss', ...).
history_dict = history.history
# Bare last expression, so the available keys are shown as the cell output.
history_dict.keys()

In [None]:
import matplotlib.pyplot as plt

# Per-epoch curves recorded by `model.fit`.
# Note: `loss` here rebinds the name assigned by the evaluation cell.
acc, val_acc = history_dict['accuracy'], history_dict['val_accuracy']
loss, val_loss = history_dict['loss'], history_dict['val_loss']

# One x value per epoch, counted from 1.
epochs = range(1, len(acc) + 1)

# 'bo' draws blue dots (training); 'b' draws a solid blue line (validation).
for curve, fmt, name in (
    (loss, 'bo', 'Training loss'),
    (val_loss, 'b', 'Validation loss'),
):
    plt.plot(epochs, curve, fmt, label=name)

plt.xlabel('Epochs')
plt.ylabel('loss')
plt.title('Training and Validation loss')
plt.legend()

plt.show()

In [None]:
plt.clf()  # start from an empty figure

# Same styling as the loss plot: dots for training, solid line for validation.
for curve, fmt, name in (
    (acc, 'bo', 'Training acc'),
    (val_acc, 'b', 'Validation acc'),
):
    plt.plot(epochs, curve, fmt, label=name)

plt.xlabel('Epochs')
plt.ylabel('Accuracy')
plt.title('Training and Validation accuracy')
plt.legend(loc='lower right')

plt.show()