In [1]:
import tensorflow as tf
tf.enable_eager_execution()

import numpy as np
import os
import time

In [2]:
# Read the raw corpus as bytes and decode it as UTF-8. The `with` block
# guarantees the file handle is closed as soon as the contents are read.
with open('tweets.txt', 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')
print ('Length of text: {} characters'.format(len(text)))

Length of text: 850909 characters


In [3]:
# Every distinct character that occurs in the corpus, in sorted order.
vocab = list(set(text))
vocab.sort()
print ('{} unique characters'.format(len(vocab)))

106 unique characters


In [4]:
# Forward lookup: character -> integer id (its position in the sorted vocab).
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, stored as an array so that ids
# (or arrays of ids) can index it directly.
idx2char = np.array(vocab)

# The whole corpus re-expressed as an array of character ids.
text_as_int = np.array([char2idx[c] for c in text])

In [5]:
# Preview the first 20 entries of the character -> id table.
print('{')
for position, char in enumerate(char2idx):
    if position >= 20:
        break
    print('  {:4s}: {:3d},'.format(repr(char), char2idx[char]))
print('  ...\n}')

{
  u'\u064f': 101,
  u' ':   1,
  u'$':   5,
  u'(':   8,
  u',':  12,
  u'0':  16,
  u'4':  20,
  u'8':  24,
  u'@':  30,
  u'D':  34,
  u'\u03c9':  92,
  u'H':  38,
  u'L':  42,
  u'P':  46,
  u'T':  50,
  u'X':  54,
  u'd':  63,
  u'h':  67,
  u'l':  71,
  u'p':  75,
  ...
}


In [6]:
# Number of characters in each input sequence.
seq_length = 100
# How many (seq_length + 1)-character chunks fit in the corpus.
examples_per_epoch = len(text) // (seq_length + 1)

# Dataset that yields the corpus one character id at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode and show the first five characters.
for char_id in char_dataset.take(5):
  print(idx2char[char_id.numpy()])

T
o
d
a
y


In [7]:
# Group the character stream into chunks of seq_length + 1 ids; a trailing
# chunk that comes up short is dropped (drop_remainder=True).
chunk_size = seq_length + 1
sequences = char_dataset.batch(chunk_size, drop_remainder=True)

# Show the first five chunks decoded back into text.
for chunk in sequences.take(5):
  chunk_chars = idx2char[chunk.numpy()]
  print(repr(''.join(chunk_chars)))

u'Today we express our deepest gratitude to all those who have served in our armed forces. #ThankAVet h'
u'ttps://t.co/wPk7QWpK8Z\nBusy day planned in New York. Will soon be making some very important decision'
u's on the people who will be running our government!\nLove the fact that the small groups of protesters'
u' last night have passion for our great country. We will all come together and be proud!\nJust had a ve'
u'ry open and successful presidential election. Now professional protesters, incited by the media, are '


In [8]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[i] is the element that follows
    input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair via split_input_target.
dataset = sequences.map(split_input_target)

In [9]:
# Decode and display the first (input, target) pair. The loop variables are
# intentionally left bound afterwards; a later cell reads them.
for input_example, target_example in dataset.take(1):
  input_chars = idx2char[input_example.numpy()]
  print ('Input data: ', repr(''.join(input_chars)))
  target_chars = idx2char[target_example.numpy()]
  print ('Target data:', repr(''.join(target_chars)))

('Input data: ', "u'Today we express our deepest gratitude to all those who have served in our armed forces. #ThankAVet '")
('Target data:', "u'oday we express our deepest gratitude to all those who have served in our armed forces. #ThankAVet h'")


In [10]:
# Walk through the first five time steps of the example pair bound by the
# previous cell, showing each input id/character next to the id/character
# the model is expected to produce at that step.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print("Step {:4d}".format(i))
    print("  input: {} ({:s})".format(input_idx, repr(idx2char[input_idx])))
    print("  expected output: {} ({:s})".format(target_idx, repr(idx2char[target_idx])))

# Number of sequences per training batch.
BATCH_SIZE = 64

# Size of the buffer handed to Dataset.shuffle.
BUFFER_SIZE = 10000

# NOTE: `dataset` is rebound from itself here, so re-running this cell
# without re-running the cells above would batch already-batched data.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Call-style print: same output as the old `print dataset` statement on
# Python 2, and also valid syntax on Python 3.
print(dataset)

# Number of distinct characters; sizes the embedding input and the output layer.
vocab_size = len(vocab)
print("Vocab size = " + str(vocab_size))

# Width of the character embedding vectors.
embedding_dim = 256

# Number of units in the recurrent (GRU) layer.
rnn_units = 1024

Step    0
  input: 50 (u'T')
  expected output: 74 (u'o')
Step    1
  input: 74 (u'o')
  expected output: 63 (u'd')
Step    2
  input: 63 (u'd')
  expected output: 60 (u'a')
Step    3
  input: 60 (u'a')
  expected output: 84 (u'y')
Step    4
  input: 84 (u'y')
  expected output: 1 (u' ')
<DatasetV1Adapter shapes: ((64, 100), (64, 100)), types: (tf.int64, tf.int64)>
Vocab size = 106


In [11]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """Assemble the character model: Embedding -> stateful GRU -> Dense.

    Args:
        vocab_size: Size of the embedding's input vocabulary and the number
            of units in the final Dense layer.
        embedding_dim: Width of the embedding vectors.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch dimension of the model input; the sequence
            dimension is left as None.

    Returns:
        A tf.keras.Sequential model whose GRU returns the full output
        sequence and whose Dense layer has no activation.
    """
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None])
    recurrent = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform')
    projection = tf.keras.layers.Dense(vocab_size)
    return tf.keras.Sequential([embedding, recurrent, projection])

# Instantiate the training model with the training batch size.
model = build_model(len(vocab), embedding_dim, rnn_units, batch_size=BATCH_SIZE)

In [12]:
# Push one batch through the not-yet-trained model to check the output shape.
for input_example_batch, target_example_batch in dataset.take(1):
  example_batch_predictions = model(input_example_batch)
  print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")


# Sample one character id per time step from the first example's predictions
# (sampling from the distribution rather than taking the argmax).
first_example_predictions = example_batch_predictions[0]
sampled_indices = tf.squeeze(
    tf.random.categorical(first_example_predictions, num_samples=1),
    axis=-1).numpy()

print(sampled_indices)

input_chars = idx2char[input_example_batch[0]]
print("Input: \n", repr("".join(input_chars)))
print()
sampled_chars = idx2char[sampled_indices ]
print("Next Char Predictions: \n", repr("".join(sampled_chars)))

def loss(labels, logits):
  """Sparse categorical cross-entropy of `logits` against `labels`.

  `from_logits=True` is passed so the predictions are treated as raw
  scores rather than probabilities.
  """
  crossentropy = tf.keras.losses.sparse_categorical_crossentropy
  return crossentropy(labels, logits, from_logits=True)

# Evaluate the loss on the sample batch and report its mean.
example_batch_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
mean_loss = example_batch_loss.numpy().mean()
print("scalar_loss:      ", mean_loss)

# Configure training: Adam optimizer with the `loss` function from this cell.
model.compile(optimizer='adam', loss=loss)

(TensorShape([Dimension(64), Dimension(100), Dimension(106)]), '# (batch_size, sequence_length, vocab_size)')
[ 90  29   6  17  17  83  83 105 102  50  11  88  37  43 105  73   9  50
  12  23  36  36  98  80  46  74  70  93 105  25 101  23  60  94  68  91
  63  38  10  39  28   6  68  20  58  70  71  43  77  69  52  11  66  92
 100  21  33  18  18   3  70  80  98  42  76  52  47  69  65  45  58  19
  41  84   3  42  30  86  42  70  94  25  54  11  70 104   1  35  51   1
  39  26  92  70  57  97  60  19  31   8]
('Input: \n', 'u\' wouldnt be watching at all!!! Honestly!"\\n"@antSTACKSgrieco: @realDonaldTrump you were great in it!!\'')
()
('Next Char Predictions: \n', 'u\'\\u0289?%11xx\\u06ea\\u066aT+}GM\\u06ean)T,7FF\\u0565uPok\\u044f\\u06ea9\\u064f7a\\u04d5i\\u0331dH*I=%i4]klMrjV+g\\u03c9\\u057b5C22"ku\\u0565LqVQjfO]3Ky"L@{Lk\\u04d59X+k\\u06e2 EU I:\\u03c9k[\\u0520a3A(\'')
('Prediction shape: ', TensorShape([Dimension(64), Dimension(100), Dimension(106)]), ' # (batch_size, sequence_len

In [13]:
# Directory that receives the weight checkpoints.
checkpoint_dir = './training_checkpoints'
# Checkpoint file name template; the {epoch} placeholder is left for the
# ModelCheckpoint callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save weights only (not the full model) through the callback.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix, save_weights_only=True)

# Number of passes over the dataset.
EPOCHS = 10

history = model.fit(
    dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback])

Epoch 1/10
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [14]:
# Directory the training run wrote its weight checkpoints to.
checkpoint_dir = './training_checkpoints'

# Resolve the newest checkpoint once and reuse the result. latest_checkpoint
# returns None when nothing is found, so fail here with a clear message
# instead of handing None to load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise IOError('No checkpoint found in {}'.format(checkpoint_dir))

# Rebuild the same architecture with a batch size of 1 so a single seed
# sequence can be fed in, then restore the trained weights into it.
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)

model.load_weights(latest_checkpoint)

model.build(tf.TensorShape([1, None]))

def generate_text(model, start_string, num_generate=280, temperature=1.0):
  """Generate text one character at a time, seeded with `start_string`.

  Args:
    model: Model called as `model(ids)` on a batch of one id sequence; it
      must support `reset_states()`. Its output is treated as per-step
      logits over the vocabulary.
    start_string: Non-empty seed text. Every character must be a key of
      the module-level `char2idx` table.
    num_generate: Number of characters to generate (default 280).
    temperature: Positive divisor applied to the logits before sampling
      (default 1.0). Lower values make the sampling more predictable,
      higher values make it more random.

  Returns:
    `start_string` followed by the generated characters.

  Raises:
    ValueError: If `start_string` is empty or `temperature` is not positive.
    KeyError: If `start_string` contains a character missing from `char2idx`.
  """
  if not start_string:
    raise ValueError('start_string must contain at least one character')
  if temperature <= 0:
    raise ValueError('temperature must be positive')

  # Encode the seed as a batch holding a single sequence of character ids.
  input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

  text_generated = []

  # Clear any recurrent state left over from an earlier call.
  model.reset_states()
  for _ in range(num_generate):
    # Remove the batch dimension so each row is one time step.
    predictions = tf.squeeze(model(input_eval), 0)

    # Scale by the temperature, sample one id per time step, and keep only
    # the sample drawn for the last time step.
    predictions = predictions / temperature
    predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

    # Only the newly sampled character is fed back in; earlier context is
    # presumably carried by the model's recurrent state (states are reset
    # only once, before the loop).
    input_eval = tf.expand_dims([predicted_id], 0)

    text_generated.append(idx2char[predicted_id])

  return start_string + ''.join(text_generated)

In [17]:
# Generate and print text seeded with "I ".
print(generate_text(model, u"I "))

I WONT GOT #KetertyFlion
.@Fiverryan, If (Faxter/EZSJFWAN to will come to the U Saide Hay agree. So Crooked Jonans), no action). https://t.co/oWVxLQUNMo
John Kasich,  MA 7! 2 2513.22 https://t.co/X39HTfns48
Going to MAKE AMERICA GREAT AGAIN!
Frand lase night about by South Carolina


In [18]:
# Persist the current `model` to a .h5 file.
# NOTE(review): the file name says "shakespeare", but this notebook reads its
# corpus from tweets.txt — confirm the intended output name.
model.save('shakespeare.h5')