In [3]:
# how to use (tf.data.TextLineDataset) to load examples from 
# text files
#=> Its designed to create a dataset from a text file,
# in which each example is a line of text from the original 
# file
# => Is potentialy useful for any text data that is primarily 
# line-based (eg, poetry or error logs)

#=> HYPOTHESIS here
#> We'll use three different English translations of the same
# work, Homer's llliad, and train a model to identify the 
# translator given a single line of text


In [4]:
# Setup
from __future__ import absolute_import, division, print_function, unicode_literals

import tensorflow as tf
import tensorflow_datasets as tfds
import os
tf.enable_eager_execution()

In [5]:
DIRECTORY_URL = 'https://storage.googleapis.com/download.tensorflow.org/data/illiad/'
FILE_NAMES = ['cowper.txt','derby.txt','butler.txt']

for name in FILE_NAMES:
    text_dir = tf.keras.utils.get_file(name, origin=DIRECTORY_URL+name)
    
parent_dir = os.path.dirname(text_dir)
parent_dir

'/home/george/.keras/datasets'

In [6]:
# Load text into datasets
# Iter through the files, loading each one into its own dataset
def labeler(example, index):
    return example, tf.cast(index, tf.int64)

labeled_data_sets = []
for i, file_name in enumerate(FILE_NAMES):
    lines_dataset = tf.data.TextLineDataset(os.path.join(parent_dir,file_name))
    print('lines_dataset',lines_dataset)
    labeled_dataset = lines_dataset.map(lambda ex: labeler(ex,i))
    print('labeled_dataset',labeled_dataset)
    labeled_data_sets.append(labeled_dataset)

print('#'*50)
print(labeled_data_sets)

lines_dataset <TextLineDatasetV1 shapes: (), types: tf.string>
labeled_dataset <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>
lines_dataset <TextLineDatasetV1 shapes: (), types: tf.string>
labeled_dataset <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>
lines_dataset <TextLineDatasetV1 shapes: (), types: tf.string>
labeled_dataset <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>
##################################################
[<DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>, <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>, <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>]


In [7]:
# Combine these labeled datasets into a single dataset and 
# shuffle it
BUFFER_SIZE = 50000
BATCH_SIZE = 64
TAKE_SIZE = 5000

In [8]:
all_labeled_data = labeled_data_sets[0]
print('all_labeled_data0',all_labeled_data)
for labeled_dataset in labeled_data_sets[1:]:
    all_labeled_data = all_labeled_data.concatenate(labeled_dataset)
    print('all_labeled_data1',all_labeled_data)
    
all_labeled_data = all_labeled_data.shuffle(
    BUFFER_SIZE,
    reshuffle_each_iteration=False
)

all_labeled_data0 <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>
all_labeled_data1 <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>
all_labeled_data1 <DatasetV1Adapter shapes: ((), ()), types: (tf.string, tf.int64)>


In [9]:
# you can use (tf.data.Dataset.take) and print to see  what 
# the (example, label) pairs look like. The (numpy) 
# property shows each Tensor's value

for ex in all_labeled_data.take(5):
    print(ex)

(<tf.Tensor: id=74, shape=(), dtype=string, numpy=b'There issued flames of fiercely-burning fire:'>, <tf.Tensor: id=75, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=78, shape=(), dtype=string, numpy=b'Sarpedon. Juno and Minerva descend to resist Mars; the latter incites'>, <tf.Tensor: id=79, shape=(), dtype=int64, numpy=1>)
(<tf.Tensor: id=82, shape=(), dtype=string, numpy=b'of the Danaans. If all-seeing Jove will not send you this messenger,'>, <tf.Tensor: id=83, shape=(), dtype=int64, numpy=2>)
(<tf.Tensor: id=86, shape=(), dtype=string, numpy=b'In thy own field, at thy own fragrant shrine.'>, <tf.Tensor: id=87, shape=(), dtype=int64, numpy=0>)
(<tf.Tensor: id=90, shape=(), dtype=string, numpy=b'The son of Atreus, and Meriones,'>, <tf.Tensor: id=91, shape=(), dtype=int64, numpy=1>)


In [10]:
# Encode text lines as numbers
#. Build a vocabulary
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in all_labeled_data:
    some_tokens = tokenizer.tokenize(text_tensor.numpy())
    vocabulary_set.update(some_tokens)
    
vocab_size = len(vocabulary_set)
vocab_size

17178

In [20]:
# vocabulary_set

In [18]:
# Encode examples
# create an encoder by passing the vocabulary_set to 
# tfds.features.test.TokenTextEncoder.
# The encoder's (encode) method takes in a string of text 
# and returns a list of intergers.

encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)
encoder

<TokenTextEncoder vocab_size=17180>

In [16]:
# Try this on a single line to see what the output looks
# like
example_text = next(iter(all_labeled_data))[0].numpy()
print(example_text)

b'There issued flames of fiercely-burning fire:'


In [17]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[5953, 4677, 8205, 16481, 12197, 9924, 7045]


In [21]:
# now run the encoder on the dataset by wrapping it in 
# (tf.py_function) and passing that to the dataset's (map)
# method
def encode(text_tensor, label):
    encoded_text = encoder.encode(text_tensor.numpy())
    return encoded_text, label
def encode_map_fn(text, label):
    return tf.py_function(
        encode,
        inp=[text,label],
        Tout=(tf.int64, tf.int64)
    )

all_encoded_data = all_labeled_data.map(encode_map_fn)
all_encoded_data

<DatasetV1Adapter shapes: (<unknown>, <unknown>), types: (tf.int64, tf.int64)>

In [23]:
# Split the data into train and test batches
# Use (tf.data.Dataset.take) and (tf.data.Dataset.skip)
# to create a small test dataset and a larger training set
# To pad the sentenses/words to same length use
# (tf.data.Dataset.padded_batch)

train_data = all_encoded_data.skip(TAKE_SIZE).shuffle(BUFFER_SIZE)
train_data = train_data.padded_batch(
    BATCH_SIZE,
    padded_shapes=([-1],[])
)
test_data = all_encoded_data.take(TAKE_SIZE)
test_data = test_data.padded_batch(
    BATCH_SIZE,
    padded_shapes=([-1],[])
)

In [24]:
sample_text, sample_labels = next(iter(test_data))
sample_text[0], sample_labels[0]



(<tf.Tensor: id=149246, shape=(16,), dtype=int64, numpy=
 array([ 5953,  4677,  8205, 16481, 12197,  9924,  7045,     0,     0,
            0,     0,     0,     0,     0,     0,     0])>,
 <tf.Tensor: id=149250, shape=(), dtype=int64, numpy=1>)

In [25]:
# Since we have introduced a new token encoding 
# (the zero used for padding), the vocabulary 
# size has increased by one
vocab_size +=1

In [26]:
# BUILD THE MODEL
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))

# One or more dense layers
# Edit the list int the `for` line to experiment with layer
# sizes
for units in [64,64]:
    model.add(tf.keras.layers.Dense(units, activation='relu'))
    
# Output layer. The first argument is the number of labels
model.add(tf.keras.layers.Dense(3, activation='softmax'))

model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
# TRAIN THE MODEL
model.fit(train_data, epochs=3, validation_data=test_data)

Epoch 1/3


In [None]:
# evaluate
eval_loss, eval_acc = model.evaluate(test_data)
print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(
    eval_loss,
    eval_acc
))