# Text Generation with RNNs

### Load Data

In [1]:
import pandas as pd

In [2]:
# The CSV is read with header=None, so pandas labels the columns 0-5;
# column 5 holds the tweet text (see the preview below).
data = pd.read_csv('./data/training.1600000.processed.noemoticon.csv', 
                   encoding='latin-1',
                   header=None)
data.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# Concatenate every tweet (column 5) into one long string, with a single
# space between consecutive tweets.
text = ' '.join(data[5])
# Preview the first 300 characters of the corpus.
text[:300]

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D is upset that he can't update his Facebook by texting it... and might cry as a result  School today also. Blah! @Kenichan I dived many times for the ball. Managed to save 50%  The rest"

### Data Preprocessing

In [4]:
import numpy as np
# Vectorization: build a character-level vocabulary from the corpus.
# Sorting gives every distinct character a stable integer id.
vocab = sorted(set(text))
print(f'{len(vocab)} unique characters')

# char2idx maps a character to its id; idx2char is the inverse lookup,
# stored as an array so it can be indexed by an id or an array of ids.
char2idx = {u:i for i, u in enumerate(vocab)}
idx2char = np.array(vocab)

# Encode the whole corpus as an array of character ids.
text_as_int = np.array([char2idx[c] for c in text])

193 unique characters


In [5]:
import tensorflow as tf
# The maximum length sentence we want for a single input in characters
seq_length = 128
# Number of (seq_length + 1)-character chunks that fit in the corpus.
# NOTE(review): not referenced again in the visible cells.
examples_per_epoch = len(text) // (seq_length+1)

# Create training examples / targets: a dataset that yields one character id
# at a time.
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

# Sanity check: decode the first five ids back to characters.
for i in char_dataset.take(5):
    print(idx2char[i.numpy()])

@
s
w
i
t


In [6]:
# Group the character stream into chunks of seq_length + 1 ids; the trailing
# chunk is dropped if it is shorter than that.
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

# Decode the first five chunks back to text to check them by eye.
for item in sequences.take(5):
    print(repr(''.join(idx2char[item.numpy()])))

"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D is upset that"
" he can't update his Facebook by texting it... and might cry as a result  School today also. Blah! @Kenichan I dived many times f"
'or the ball. Managed to save 50%  The rest go out of bounds my whole body feels itchy and like its on fire  @nationwideclass no, '
"it's not behaving at all. i'm mad. why am i here? because I can't see you all over there.  @Kwesidei not the whole crew  Need a h"
"ug  @LOLTrish hey  long time no see! Yes.. Rains a bit ,only a bit  LOL , I'm fine thanks , how's you ? @Tatiana_K nope they didn"


In [7]:
def split_input_target(chunk):
    """Split one sequence into an (input, target) pair offset by one step.

    The input is ``chunk`` without its last element and the target is
    ``chunk`` without its first element, so ``target[i]`` is the element
    that follows ``input[i]`` in the original sequence.
    """
    return chunk[:-1], chunk[1:]

# Turn every (seq_length + 1)-id chunk into an (input, target) pair.
dataset = sequences.map(split_input_target)

In [8]:
# Decode the first (input, target) pair back to text. The loop variables
# input_example / target_example stay bound after the loop and are reused
# by the next cell.
for input_example, target_example in  dataset.take(1):
    print ('Input data: ', repr(''.join(idx2char[input_example.numpy()])))
    print ('Target data:', repr(''.join(idx2char[target_example.numpy()])))

Input data:  "@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D is upset tha"
Target data: "switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer.  You shoulda got David Carr of Third Day to do it. ;D is upset that"


In [9]:
# Walk through the first five timesteps of the example pair: at each step the
# model sees the input id and is expected to predict the target id.
for i, (input_idx, target_idx) in enumerate(zip(input_example[:5], target_example[:5])):
    print(f"Step {i:4d}")
    print(f"  input: {input_idx} ({idx2char[input_idx]!r:s})")
    print(f"  expected output: {target_idx} ({idx2char[target_idx]!r:s})")

Step    0
  input: 30 ('@')
  expected output: 81 ('s')
Step    1
  input: 81 ('s')
  expected output: 85 ('w')
Step    2
  input: 85 ('w')
  expected output: 71 ('i')
Step    3
  input: 71 ('i')
  expected output: 82 ('t')
Step    4
  input: 82 ('t')
  expected output: 65 ('c')


In [10]:
# Batch size
BATCH_SIZE = 64

# Buffer size to shuffle the dataset
# (TF data is designed to work with possibly infinite sequences,
# so it doesn't attempt to shuffle the entire sequence in memory. Instead,
# it maintains a buffer in which it shuffles elements).
BUFFER_SIZE = 10000

# Build the training pipeline from `sequences` instead of rebinding `dataset`
# onto itself, so re-running this cell does not batch an already-batched
# dataset. drop_remainder=True keeps every batch at exactly BATCH_SIZE, which
# the fixed batch dimension of the stateful model requires.
dataset = (
    sequences
    .map(split_input_target)
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

dataset

<BatchDataset shapes: ((64, 128), (64, 128)), types: (tf.int64, tf.int64)>

## Text Generation

In [18]:
def get_model(batch_size, vocab, embedding_dim=256, rnn_units=512):
    """Build the character-level model: Embedding -> stateful LSTM -> Dense.

    Args:
        batch_size: Fixed batch dimension baked into the input shape.
        vocab: Sized collection of characters; its length sets both the
            embedding input size and the number of output logits.
        embedding_dim: Width of the character embedding vectors.
        rnn_units: Number of LSTM units.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model that produces one vector
        of ``len(vocab)`` logits per input timestep.
    """
    vocab_size = len(vocab)

    # The sequence length is left as None so the same architecture accepts
    # inputs of any length.
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # No activation: the layer emits raw logits.
    to_logits = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, to_logits])

# Build the training model and push a single batch through the untrained
# network as a quick check that the pipeline and model fit together.
model = get_model(BATCH_SIZE, vocab)
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)

In [19]:
# Sample one character id per timestep from the logits of the first sequence
# in the batch.
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
# Drop the trailing num_samples axis and convert to a NumPy array of ids.
sampled_indices = tf.squeeze(sampled_indices,axis=-1).numpy()
sampled_indices

array([ 29,  10, 185,  44, 136, 127,  16, 155, 177, 182,  25, 114,  66,
        39,  92,  27,  11, 119,  72,  27, 120,  43, 155, 186,  70, 148,
        86,  94,  93, 168, 186,  11, 169, 177, 107, 147, 147, 152,  42,
        65,  26, 185, 143,  19, 170, 149, 116,  95, 111, 138, 105,  92,
        74, 153, 160,  97,  10, 170,  34, 141,  46,  66,  19,  24, 126,
       188,  60,  59,  70, 161,  77,  87, 159,  61, 107, 174,  68, 114,
        38, 185,  30,  26, 149,  30,  82,  13, 144, 122, 186,  50,  61,
        71,  67,  30,  64, 105,  17,  85, 192, 132, 164,  65,  19,  12,
       154,  37,  20, 138, 173,  99,  78,   2,  56, 179, 182, 136,  45,
        96,  41,  17, 100,  73, 130,  85,  43, 144,  91, 109])

In [20]:
# Decode the first input sequence and the ids sampled from the untrained
# model; the sampled text is expected to be gibberish before training.
print("Input: \n", repr("".join(idx2char[input_example_batch[0]])))
print()
print("Next Char Predictions: \n", repr("".join(idx2char[sampled_indices])))

Input: 
 'y bed just feels too amazing. Dont make me get up!  Twitter, you wound me  @RyanSchartz nay my friend.. i must apologize..  @gon'

Next Char Predictions: 
 '?*èN¯¦0Äàå9\x98dI~;+\x9ej;\x9fMÄéh»x\x80\x7fÑé+Òà\x91ºº¿Lc:è¶3Ó¼\x9a\x82\x95±\x8c~lÂÉ\x84*ÓD´Pd38¥ë^]hÊoyÈ_\x91Úf\x98Hè@:¼@t-·¡éT_ie@b\x8c1wï«Íc3,ÃG4±Ù\x86p!Zâå¯O\x83K1\x87k©wM·}\x93'


In [21]:
import os
# Number of passes over the training dataset.
EPOCHS = 20

# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Name of the checkpoint files; the "{epoch}" placeholder is presumably filled
# in by Keras so that each epoch gets its own file (confirm against the
# ModelCheckpoint documentation).
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Save only the weights, not the full model.
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)

# from_logits=True because the model's final Dense layer has no activation
# and therefore outputs raw logits; targets are integer character ids.
model.compile(optimizer='adam', 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))
model.fit(dataset, 
          epochs=EPOCHS,
          callbacks=[checkpoint_callback],
          verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7ff58e4a7b10>

In [22]:
def generate_text(model, start_string, num_generate=1000, temperature=1.0):
    """Sample text from a trained character-level model, one character at a time.

    Args:
        model: Stateful model built for batch size 1. It is called with a
            ``(1, seq_len)`` tensor of character ids and must return logits
            with a leading batch axis of size 1.
        start_string: Non-empty seed text. Every character must be a key of
            the notebook-level ``char2idx`` mapping.
        num_generate: Number of characters to generate after the seed.
        temperature: Positive divisor applied to the logits before sampling.
            Low temperatures result in more predictable text; higher
            temperatures result in more surprising text.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            strictly positive.
        KeyError: If ``start_string`` contains a character that is not in
            ``char2idx``.
    """
    # Validate up front: an empty seed gives the model a zero-length input,
    # and a non-positive temperature produces inf/NaN or inverted logits.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be a positive number")

    unknown_chars = sorted({s for s in start_string if s not in char2idx})
    if unknown_chars:
        raise KeyError(
            f"start_string contains characters outside the vocabulary: {unknown_chars!r}"
        )

    # Converting our start string to numbers (vectorizing) and adding the
    # batch dimension (batch size == 1).
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters, joined once at the end.
    text_generated = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(num_generate):
        # Remove the batch dimension and apply the temperature.
        predictions = tf.squeeze(model(input_eval), 0) / temperature

        # Sample from the categorical distribution at every timestep and keep
        # the sample for the last one: that is the next character.
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted character as the next input to the model;
        # the stateful model carries the previous hidden state forward.
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [23]:
# rebuild model with batch size = 1 for generating
generating_model = get_model(1, vocab)

# tf.train.latest_checkpoint returns None when the directory holds no
# checkpoint; fail with a clear message instead of passing None on to
# load_weights.
latest_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
if latest_checkpoint is None:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; run the training cell first."
    )
generating_model.load_weights(latest_checkpoint)

generating_model.build(tf.TensorShape([1, None]))

In [24]:
# Generate a sample from the batch-size-1 model, seeded with "Well, ".
# Sampling is random, so the text differs from run to run.
print(generate_text(generating_model, start_string="Well, "))

Well,  Wear her own umitt little saunda sample with RIS a bits and I've been greeted here - looking forward to being at home want to look out!!  @nickumy27 grape thing with iraq wait and we all love seeing....but does my embarrassing life? I have all the chance of minute life's kidding  Looks like you can, I'm like the BT and Mac, Proper Master Lamen. They will eat. And loving it for me at the store!  sooo yea U like we posted a place I do put them on! Just yet time  @Puppinox. lol I'm not yet ending thright I'm awake today... Just off at 330 today and leg tuar would be stopping faired!  Coffee Doctor Teeee My Maree-Labor Lunch!  @unhine Lol, you just had spstupfit, too!  Ilsometic is really favorite shirt!!  going to TYS teaching me  AM, was heaven and use this apart with inside football/coment...more.  - Music, corner, swim proper punch up in me places to &quot;we are an urge don't change to  Just am new to realization, its been awle too??  @sashejezz hahaha 12!!!!!!!!! Yeah, photo s