In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import functools
import os
import random
import time

import numpy as np
import tensorflow as tf
from pandas import read_csv

tf.enable_eager_execution()

In [2]:
path_to_file = "./names.csv"

In [3]:
# Load the "name" column of the CSV; each entry is one name.
# Missing values are dropped and entries coerced to str because str.join
# raises TypeError on non-string items (pandas parses empty cells as float NaN).
names = read_csv(path_to_file)["name"].dropna().astype(str)

# Separator placed between consecutive names in the training text.
end_of_name_character = "\n"

# Join every name into one whole string.
text = end_of_name_character.join(names)
# length of text is the number of characters in it
print('Length of text: {} characters'.format(len(text)))

Length of text: 46987 characters


In [4]:
# Take a look at the first 250 characters in text
print(text[:250])

John
William
James
Charles
George
Frank
Joseph
Thomas
Henry
Robert
Edward
Harry
Walter
Arthur
Fred
Albert
Samuel
David
Louis
Joe
Charlie
Clarence
Richard
Andrew
Daniel
Ernest
Will
Jesse
Oscar
Lewis
Peter
Benjamin
Frederick
Willie
Alfred
Sam
Roy
Herbe


In [5]:
# The unique characters in the file
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

53 unique characters


In [6]:
# Lookup tables between characters and integer ids (an id is the character's
# position in vocab).
idx2char = np.array(vocab)
char2idx = dict(zip(vocab, range(len(vocab))))

# Encode the whole text as an array of character ids.
text_as_int = np.array([char2idx[ch] for ch in text])

In [7]:
# Print the first 20 entries of the character -> index table
print('{')
for char in list(char2idx)[:20]:
    index = char2idx[char]
    print('  {:4s}: {:3d},'.format(repr(char), index))
print('  ...\n}')

{
  '\n':   0,
  'A' :   1,
  'B' :   2,
  'C' :   3,
  'D' :   4,
  'E' :   5,
  'F' :   6,
  'G' :   7,
  'H' :   8,
  'I' :   9,
  'J' :  10,
  'K' :  11,
  'L' :  12,
  'M' :  13,
  'N' :  14,
  'O' :  15,
  'P' :  16,
  'Q' :  17,
  'R' :  18,
  'S' :  19,
  ...
}


In [8]:
# Show how the first 13 characters from the text are mapped to integers
print ('{} ---- characters mapped to int ---- > {}'.format(repr(text[:13]), text_as_int[:13]))

'John\nWilliam\n' ---- characters mapped to int ---- > [10 41 34 40  0 23 35 38 38 35 27 39  0]


In [9]:
# The maximum length sentence we want for a single input in characters
seq_length = 60
# Each training example consumes seq_length + 1 characters (the input plus its
# one-step-shifted target), so divide by that chunk size; dividing by
# seq_length alone overcounts the number of examples.
examples_per_epoch = len(text) // (seq_length + 1)

# Create training examples / targets
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Preview the first five elements decoded back to characters
for i in char_dataset.take(5):
  print(idx2char[i.numpy()])

J
o
h
n




In [10]:
# Group the character stream into fixed-size chunks of seq_length + 1
# characters; drop_remainder=True discards a final chunk that is shorter.
sequences = char_dataset.batch(seq_length + 1, drop_remainder = True)

# Preview the first five chunks decoded back to text
for item in sequences.take(5):
  print(repr(''.join(idx2char[item.numpy()])))
# TODO: Fix names being cut in half in the batches
# (chunks are cut at fixed character offsets, not at name boundaries)

'John\nWilliam\nJames\nCharles\nGeorge\nFrank\nJoseph\nThomas\nHenry\nR'
'obert\nEdward\nHarry\nWalter\nArthur\nFred\nAlbert\nSamuel\nDavid\nLou'
'is\nJoe\nCharlie\nClarence\nRichard\nAndrew\nDaniel\nErnest\nWill\nJes'
'se\nOscar\nLewis\nPeter\nBenjamin\nFrederick\nWillie\nAlfred\nSam\nRoy'
'\nHerbert\nJacob\nTom\nElmer\nCarl\nLee\nHoward\nMartin\nMichael\nBert\n'


In [11]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element following input[i].
    """
    return chunk[:-1], chunk[1:]

dataset = sequences.map(split_input_target)

In [12]:
# Visualize the behaviour we are expecting from our RNN
for input_example, target_example in  dataset.take(1):
  print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
  print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  'John\nWilliam\nJames\nCharles\nGeorge\nFrank\nJoseph\nThomas\nHenry\n'
Target data: 'ohn\nWilliam\nJames\nCharles\nGeorge\nFrank\nJoseph\nThomas\nHenry\nR'


In [13]:
# Visualize the behaviour we are expecting from our RNN on each step
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

Step    0
  input: 10 ('J')
  expected output: 41 ('o')
Step    1
  input: 41 ('o')
  expected output: 34 ('h')
Step    2
  input: 34 ('h')
  expected output: 40 ('n')
Step    3
  input: 40 ('n')
  expected output: 0 ('\n')
Step    4
  input: 0 ('\n')
  expected output: 23 ('W')


In [14]:
# Batch size
BATCH_SIZE = 64
steps_per_epoch = examples_per_epoch // BATCH_SIZE

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Rebuild the pipeline from `sequences` rather than re-wrapping `dataset`, so
# re-running this cell does not batch an already-batched dataset.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<DatasetV1Adapter shapes: ((64, 60), (64, 60)), types: (tf.int32, tf.int32)>

In [15]:
# Length of the vocabulary in chars
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [16]:
# Choose the recurrent layer constructor: the cuDNN GRU layer when a GPU is
# available, otherwise the generic GRU layer.
if tf.test.is_gpu_available():
  rnn = tf.keras.layers.CuDNNGRU
else:
  # NOTE(review): recurrent_activation='sigmoid' presumably matches CuDNNGRU's
  # gate activation so weights stay interchangeable -- confirm.
  rnn = functools.partial(
    tf.keras.layers.GRU, recurrent_activation='sigmoid')

In [17]:
# Build the model
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
  """Assemble the model: Embedding -> recurrent layer -> Dense.

  Args:
    vocab_size: size of the vocabulary; used as the embedding input size and
      as the width of the final Dense layer.
    embedding_dim: size of each embedding vector.
    rnn_units: number of units in the recurrent layer.
    batch_size: fixed batch size used in the model's input shape (the
      recurrent layer is created with stateful=True).

  Returns:
    A tf.keras.Sequential model made of the three layers above.
  """
  embedding = tf.keras.layers.Embedding(
      vocab_size, embedding_dim, batch_input_shape=[batch_size, None])
  recurrent = rnn(
      rnn_units,
      return_sequences=True,
      recurrent_initializer='glorot_uniform',
      stateful=True)
  projection = tf.keras.layers.Dense(vocab_size)
  return tf.keras.Sequential([embedding, recurrent, projection])

In [18]:
model = build_model(
  vocab_size = len(vocab),
  embedding_dim = embedding_dim,
  rnn_units = rnn_units,
  batch_size = BATCH_SIZE)

In [19]:
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(64, 60, 53) # (batch_size, sequence_length, vocab_size)


In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 256)           13568     
_________________________________________________________________
cu_dnngru (CuDNNGRU)         (64, None, 1024)          3938304   
_________________________________________________________________
dense (Dense)                (64, None, 53)            54325     
Total params: 4,006,197
Trainable params: 4,006,197
Non-trainable params: 0
_________________________________________________________________


In [21]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis = -1).numpy()

In [22]:
sampled_indices

array([15,  3, 40, 37, 25, 28, 10,  3, 10, 51, 47, 22,  3, 29, 40, 17, 30,
       49, 40,  3, 39, 41, 30,  6, 40, 51, 44, 34, 29, 46,  1,  8, 12, 22,
       52, 34, 26, 39, 30, 49, 13, 38, 13, 17, 23, 52, 45, 21, 17, 38, 10,
       41, 35, 22, 38, 42, 26, 10, 43, 15], dtype=int64)

In [23]:
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices ])))

Input: 
 'Devyn\nSavon\nDeondre\nDarrian\nDallin\nJordy\nMikal\nAdonis\nKelton'

Next Char Predictions: 
 'OCnkYbJCJyuVCcnQdwnCmodFnyrhctAHLVzhZmdwMlMQWzsUQlJoiVlpZJqO'


In [24]:
def loss(labels, logits):
  """Sparse categorical cross-entropy between integer labels and raw logits."""
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy(
      labels, logits, from_logits=True)
  return crossentropy

example_batch_loss  = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("scalar_loss:      ", example_batch_loss.numpy().mean())

Prediction shape:  (64, 60, 53)  # (batch_size, sequence_length, vocab_size)
scalar_loss:       3.97137


In [25]:
model.compile(
    optimizer = tf.train.AdamOptimizer(),
    loss = loss)

In [26]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

In [27]:
EPOCHS = 50

In [28]:
history = model.fit(dataset.repeat(), epochs=EPOCHS, steps_per_epoch=steps_per_epoch, callbacks=[checkpoint_callback])

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [29]:
tf.train.latest_checkpoint(checkpoint_dir)

'./training_checkpoints\\ckpt_50'

In [30]:
# Rebuild the same architecture with a batch size of 1 for generation
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

# Restore the weights from the most recent checkpoint in checkpoint_dir
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))

model.build(tf.TensorShape([1, None]))

In [31]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 256)            13568     
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       (1, None, 1024)           3938304   
_________________________________________________________________
dense_1 (Dense)              (1, None, 53)             54325     
Total params: 4,006,197
Trainable params: 4,006,197
Non-trainable params: 0
_________________________________________________________________


In [32]:
# Low temperatures result in more predictable text.
# Higher temperatures result in more surprising text.
# Experiment to find the best setting.
def generate_name(model, start_string = None, temperature = 1, max_length = 100):
  """Sample one name from the model, one character at a time.

  Args:
    model: stateful model (batch size 1) mapping a batch of character ids to
      per-position logits over the vocabulary.
    start_string: characters that seed the name; each must be a key of
      char2idx. When None, a random uppercase ASCII letter is used.
    temperature: positive number the logits are divided by before sampling.
    max_length: upper bound on the number of generated characters (not
      counting start_string), so generation terminates even if the model
      never emits end_of_name_character.

  Returns:
    start_string followed by the generated characters, without the
    end-of-name character.

  Raises:
    ValueError: if start_string is empty or temperature is not positive.
    KeyError: if start_string contains a character missing from char2idx.
  """
  if start_string is None:
    start_string = chr(random.randrange(65, 65 + 26))
  if not start_string:
    raise ValueError("start_string must contain at least one character")
  if temperature <= 0:
    raise ValueError("temperature must be positive")

  # Converting our start string to numbers (vectorizing), with a batch dimension
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  # Characters generated so far
  text_generated = []

  # Here batch size == 1; clear any hidden state left over from a previous call
  model.reset_states()
  for _ in range(max_length):
    predictions = model(input_eval)
    # remove the batch dimension
    predictions = tf.squeeze(predictions, 0)

    # Scale the logits by the temperature, then sample the next character id
    # from the distribution at the last position
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1,0].numpy()

    character_generated = idx2char[predicted_id]
    if character_generated == end_of_name_character:
      break
    text_generated.append(character_generated)

    # We pass the predicted character as the next input to the model;
    # the stateful model keeps the previous hidden state
    input_eval = tf.expand_dims([predicted_id], 0)

  return start_string + ''.join(text_generated)

In [37]:
for i in range(0, 5):
    print(generate_name(model, start_string = "Ge"))

Ger
Gerta
Gedrie
Getti
Gerrice
