Load data

In [1]:
file = open("usable_data.txt", "r", encoding="utf-8")
data = file.read()

Imports

In [56]:
import tensorflow as tf
import numpy as np
import os

Vocab

In [35]:
stop_words = [',', '.', '!', '?', ':', ';', '(', ')', '[', ']', '{', '}', '-', '–', '—', '“', '”']
# remove stop words/characters
data = ''.join([char for char in data if char not in stop_words])

# split on space or new line
words = data.split(' ') + data.split('\n')

# remove empty strings
words = [word for word in words if word != '' and word not in stop_words]

# remove duplicates
vocab = sorted(set(words))

In [36]:
print('vocab size: ', len(vocab))

vocab size:  77194


word to id convertor

In [37]:
ids_from_words = tf.keras.layers.StringLookup(
    vocabulary=list(vocab), mask_token=None)

id to word convertor

In [38]:
words_from_ids = tf.keras.layers.StringLookup(
    vocabulary=ids_from_words.get_vocabulary(), invert=True, mask_token=None)

Text from ids

In [39]:
def text_from_ids(ids):
  return tf.strings.reduce_join(words_from_ids(ids), axis=-1)

Creating sequences

In [40]:
all_ids = ids_from_words(words)
all_ids

<tf.Tensor: shape=(314453,), dtype=int64, numpy=array([ 2015, 30370,  2049, ..., 13260, 33042, 14993], dtype=int64)>

Creating ids dataset

In [41]:
ids_dataset = tf.data.Dataset.from_tensor_slices(all_ids)

show ten first words in the dataset

In [42]:
for ids in ids_dataset.take(10):
    print(words_from_ids(ids).numpy().decode('utf-8'))

0
To
100


199L
SPRG
Blue
Flame
Gang
Hey
petit
hey
Rentre
pas


define sequence length

In [43]:
seq_length = 100

Create sequences

In [96]:
sequences = ids_dataset.batch(seq_length+1, drop_remainder=True)

In [46]:
for seq in sequences.take(1):
  print(words_from_ids(seq))

tf.Tensor(
[b'0' b'To' b'100' b'\n\n199L\nSPRG' b'Blue' b'Flame' b'Gang\nHey'
 b'petit' b'hey\nRentre' b'pas' b'dans' b"l'Rap" b'le' b'bail' b'est'
 b'crade' b"c'est" b'un' b'panier' b"d'crabes\nPas" b"l'temps"
 b"d'regarder" b'les' b'autres' b'bailler\nMoi' b'quand' b"j'\xc3\xa9cris"
 b'le' b'cahier' b'crame\nTu' b'sais' b'que' b"j'suis" b'un' b'des'
 b'vrais\nSi' b"quelqu'un" b"s'occupe" b'bien' b'des' b'frais\nAvec'
 b'mon' b'\xc3\xa9quipe' b'on' b'atteint' b"l'Everest\nC'que" b'dit'
 b'Flingue' b'est' b"vrai\nJ'voulais" b'faire' b'un' b'gros' b'son' b'sur'
 b'mon' b'Blunt' b'Phillies' b'mais\nIci' b'les' b'chanteurs' b'\xc3\xa0'
 b'la' b'JeanPhilippe' b'Smet' b'finissent' b'ma\xc3\xaetre\nLe' b'fait'
 b"qu'ils" b'fassent' b'encore' b'du' b'biff' b"m'\xc3\xa9gare" b'la'
 b'France' b'est\nAu' b'ralenti' b'comme' b'Keanu' b'Reeves' b'quand'
 b'il' b'esquive' b'les' b"balles\nJ'suis" b"l'Neo" b"d'cette" b'matrice'
 b'crois' b'pas' b'que' b"j'blague" b'mamen\nLa' b'France' b'est'
 b'con

Define sequence spliting method

In [97]:
def split_input_target(sequence):
    input_text = sequence[:-1]
    target_text = sequence[1:]
    print(input_text.shape, input_text.dtype)
    print(target_text.shape, target_text.dtype)
    return input_text, target_text

create dataset

In [98]:
dataset = sequences.map(split_input_target)
# print(type(dataset[2]))

(100,) <dtype: 'int64'>
(100,) <dtype: 'int64'>


In [95]:
for input_example, target_example in dataset.take(2):
    print("Input :", text_from_ids(input_example).numpy())
    print("Target:", text_from_ids(target_example).numpy())

Input : b"0To100\n\n199L\nSPRGBlueFlameGang\nHeypetithey\nRentrepasdansl'Raplebailestcradec'estunpanierd'crabes\nPasl'tempsd'regarderlesautresbailler\nMoiquandj'\xc3\xa9crislecahiercrame\nTusaisquej'suisundesvrais\nSiquelqu'uns'occupebiendesfrais\nAvecmon\xc3\xa9quipeonatteintl'Everest\nC'queditFlingueestvrai\nJ'voulaisfaireungrossonsurmonBluntPhilliesmais\nIcileschanteurs\xc3\xa0laJeanPhilippeSmetfinissentma\xc3\xaetre\nLefaitqu'ilsfassentencoredubiffm'\xc3\xa9garelaFranceest\nAuralenticommeKeanuReevesquandilesquivelesballes\nJ'suisl'Neod'cettematricecroispasquej'blaguemamen\nLaFranceestconservatricesonRapn'\xc3\xa9chappe"
Target: b"To100\n\n199L\nSPRGBlueFlameGang\nHeypetithey\nRentrepasdansl'Raplebailestcradec'estunpanierd'crabes\nPasl'tempsd'regarderlesautresbailler\nMoiquandj'\xc3\xa9crislecahiercrame\nTusaisquej'suisundesvrais\nSiquelqu'uns'occupebiendesfrais\nAvecmon\xc3\xa9quipeonatteintl'Everest\nC'queditFlingueestvrai\nJ'voulaisfaireungrossonsurmonBluntPhilliesmais\nIcilescha

Create data batch 

In [99]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

dataset = (
    dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE),
)

dataset

(<PrefetchDataset element_spec=(TensorSpec(shape=(64, 100), dtype=tf.int64, name=None), TensorSpec(shape=(64, 100), dtype=tf.int64, name=None))>,)

build the model

In [51]:
vocab_size = len(vocab)

In [100]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, 50, input_length=seq_length),
    tf.keras.layers.LSTM(100, return_sequences=True),
    tf.keras.layers.LSTM(100),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(vocab_size, activation='softmax'),
])

In [101]:
print(model.summary())
[print(i.shape, i.dtype) for i in model.inputs]
[print(o.shape, o.dtype) for o in model.outputs]
[print(l.name, l.input_shape, l.dtype) for l in model.layers]

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 100, 50)           3859700   
                                                                 
 lstm_10 (LSTM)              (None, 100, 100)          60400     
                                                                 
 lstm_11 (LSTM)              (None, 100)               80400     
                                                                 
 dense_10 (Dense)            (None, 100)               10100     
                                                                 
 dense_11 (Dense)            (None, 77194)             7796594   
                                                                 
Total params: 11,807,194
Trainable params: 11,807,194
Non-trainable params: 0
_________________________________________________________________
None
(None, 100) <dtype: 'float32'>
(None,

[None, None, None, None, None]

Config callbacks

In [102]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [103]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [104]:
model.fit(dataset, batch_size=128, epochs=100, callbacks=[checkpoint_callback])

ValueError: Failed to find data adapter that can handle input: (<class 'tuple'> containing values of types {"<class 'tensorflow.python.data.ops.dataset_ops.PrefetchDataset'>"}), <class 'NoneType'>