## Keras Datasets cleanups

In [42]:
import tensorflow_datasets as tfds

# Name of the TensorFlow Datasets (TFDS) corpus to load. Swap in any dataset
# listed in the catalog: https://www.tensorflow.org/datasets/catalog/overview
DATASET_NAME = 'anli'

# Load the train and test splits together with the dataset metadata.
ds_splits, ds_info = tfds.load(
    DATASET_NAME,
    split=['train', 'test'],
    shuffle_files=True,
    with_info=True,
)
ds_train, ds_test = ds_splits



In [26]:
# Fraction of the training split that is kept for the rest of the notebook.
SAMPLE_FRACTION = 0.25

# Number of examples to keep, computed from the split size recorded in the
# dataset metadata (same rounding as slicing the full list to a quarter).
num_samples = round(ds_info.splits['train'].num_examples * SAMPLE_FRACTION)

# Replace 'context' with the text field of your dataset (see the dataset's
# feature documentation in the TFDS catalog).
# take() stops after num_samples examples, so the remaining three quarters of
# the split are never iterated or decoded.
text = [example['context'].numpy().decode() for example in ds_train.take(num_samples)]

In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the vocabulary from the selected paragraphs.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text)

# Vocabulary size plus one: the extra slot leaves room for index 0, which the
# padding step later uses as filler.
total_words = 1 + len(tokenizer.word_index)
print('Total words: ', total_words)

Total words:  14204


In [28]:
# Expand every paragraph into its growing token prefixes: for the tokens
# [a, b, c, d] this yields [a, b], [a, b, c] and [a, b, c, d].
input_sequences = []
for token_list in tokenizer.texts_to_sequences(text):
    # Prefixes start at length 2 so that each one holds at least one input
    # token followed by the token to predict.
    for end in range(2, len(token_list) + 1):
        input_sequences.append(token_list[:end])

# Show the first five sequences as words, then as raw token ids.
print(tokenizer.sequences_to_texts(input_sequences[:5]))
input_sequences[:5]

['joey heindle', 'joey heindle born', 'joey heindle born 14', 'joey heindle born 14 may', 'joey heindle born 14 may 1993']


[[11624, 11625],
 [11624, 11625, 22],
 [11624, 11625, 22, 551],
 [11624, 11625, 22, 551, 78],
 [11624, 11625, 22, 551, 78, 488]]

In [29]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
# The sequences have different lengths; pad them all to the length of the
# longest one so they form a single rectangular array.

# Length of the longest sequence.
max_sequence_len = max(map(len, input_sequences))

# Pad with zeros at the beginning so the last token of every sequence stays in
# the final column. pad_sequences already returns a NumPy array, so it is not
# wrapped in np.array(), which would only make a second full copy.
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0, 11624, 11625])

In [30]:
# Split every padded row into model input and target:
#   predictors - all columns except the last one
#   labels     - the last column, i.e. the word to predict
predictors, labels = input_sequences[:, :-1], input_sequences[:, -1]

In [31]:
from tensorflow.keras import utils

# The targets are categorical: we are predicting one word out of the whole
# vocabulary. Instead of a scalar id, the network predicts a binary category
# vector of length total_words, for example:
# 13810 ----> array([0., 0., 0., ..., 1., 0., 0.], dtype=float32)
#
# The integer ids are read from the last column of input_sequences rather than
# from `labels` itself, so re-running this cell does not one-hot encode labels
# that are already one-hot.
labels = utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [32]:
# That's all. Now just create a model and run:
# model.fit(predictors, labels, epochs=20, verbose=1)