In [0]:
import numpy as np
import tensorflow as tf
import os

In [0]:
# Path to the plain-text file used as the training corpus.
# NOTE(review): this looks like a placeholder -- point it at a real file before running.
path_to_file = "path/to/dataset.txt"

In [0]:
# Read the raw bytes and decode them as UTF-8. The context manager closes the
# file handle as soon as the read finishes instead of leaving it open for the
# lifetime of the kernel.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# Length of text = number of characters in it
print('Length of text: {} characters'.format(len(text)))

In [0]:
# Preview the first 250 characters of the decoded corpus
print(text[:250])

In [0]:
# Vocabulary: every distinct character in the corpus, in sorted order so that
# each character's position in the list is deterministic.
vocab = list(set(text))
vocab.sort()
print('{} unique characters'.format(len(vocab)))

In [0]:
# Forward lookup: character -> its position in the sorted vocabulary
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: indexing this array with an id gives back the character
idx2char = np.array(vocab)

# Encode the whole corpus as an array of vocabulary ids, one per character
text_as_int = np.array([char2idx[ch] for ch in text])

In [0]:
# Number of characters in a single model input sequence
seq_length = 100

# Chunks are seq_length + 1 characters long (the extra character is needed
# because the target is the input shifted by one), so this is the number of
# whole chunks the corpus yields.
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset over the encoded corpus, sliced along its first axis
# (one element per character id)
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [0]:
# Group the character stream into chunks of seq_length+1 ids each;
# drop_remainder=True discards a trailing chunk that would be shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [0]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so target[i] is the element that
    follows input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input_text, target_text) pair
dataset = sequences.map(split_input_target)

In [0]:
# Number of (input, target) sequence pairs per training batch
BATCH_SIZE = 64

# Size of the buffer that shuffle() draws elements from
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` rather than reassigning
# `dataset` from itself, so re-running this cell does not shuffle and batch an
# already-batched dataset. drop_remainder=True keeps every batch at exactly
# BATCH_SIZE examples.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

In [0]:
# Size of the vector each character is embedded into
embedding_dim = 256

# Number of units in the recurrent (LSTM) layer
rnn_units = 1024

# Number of distinct characters in the vocabulary
vocab_size = len(vocab)

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """
  Assemble a Sequential model: Embedding -> stateful LSTM -> Dense.

  input params:
  vocab_size: number of unique characters; used both as the embedding's input
    dimension and as the number of outputs of the final Dense layer
  embedding_dim: number of dimensions of each embedding vector
  rnn_units: number of units in the LSTM layer
  batch_size: fixed batch dimension of the model input
    (batch_input_shape=[batch_size, None]; the sequence length is left open)

  output:
  the (uncompiled) tf.keras.Sequential model
  """
  embedding_layer = tf.keras.layers.Embedding(
      vocab_size,
      embedding_dim,
      batch_input_shape=[batch_size, None],
  )
  # return_sequences=True yields an output for every time step rather than only
  # the last one; stateful=True makes the layer keep its state between batches.
  lstm_layer = tf.keras.layers.LSTM(
      rnn_units,
      return_sequences=True,
      stateful=True,
      recurrent_initializer='glorot_uniform',
  )
  # No activation is passed, so this layer outputs unnormalised scores.
  output_layer = tf.keras.layers.Dense(vocab_size)

  return tf.keras.Sequential([embedding_layer, lstm_layer, output_layer])

In [0]:
# Instantiate the network from the hyperparameters configured earlier in the
# notebook. vocab_size is used here (instead of recomputing len(vocab)) so the
# model is built from the same constant the configuration cell defines.
model = build_model(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)

In [0]:
# Print a layer-by-layer summary of the model
model.summary()

In [0]:
# The final Dense layer has no activation, so the loss is told to interpret the
# model outputs as logits. A built-in loss object is used instead of an
# anonymous lambda: it computes the same sparse categorical cross-entropy but
# can be serialized with the model, whereas a lambda cannot be restored by name
# when a saved model is loaded back.
model.compile(
    optimizer='adam',
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)

In [0]:
# Number of training epochs (passed to model.fit as `epochs`)
EPOCHS=42

In [0]:
# Train on the batched dataset for EPOCHS epochs; the value returned by fit()
# is kept in `history`.
history = model.fit(dataset, epochs=EPOCHS)

In [0]:
# Write the trained model to disk.
# NOTE(review): the path looks like a placeholder -- replace it before running.
model.save('path/to/save/model')