# Code generation with an RNN
Modified from https://www.tensorflow.org/tutorials/text/text_generation

In [29]:
import os
import time
import json

import tensorflow as tf
import numpy as np

In [30]:
dataset = tf.keras.utils.get_file(
    "insults.txt",
    "https://gist.githubusercontent.com/MythicManiac/dc9e1216105ff317b7dd14014896b8a4/raw/623f22945918f7b1ab66dab80fcff91a56184cf3/messages.txt"
)

In [31]:
text = open(dataset, "r").read()
print(text[:250])

lucian push first lane fast pls
so we get lvl 2
i start on blue
and kill them
sebi da?
lux no ward ? :S
ok ill try
jo
you have 1
was mit skxype los?
lux supp itens
-.-
no supp
ap
supp ap
nah
just don't farm ok ?
is sometnhin
ap
supp ap pls
no farm
no


In [32]:
vocabulary = sorted(set(text))
print(f"{len(vocabulary)} unique characters in dataset")
print(vocabulary)

104 unique characters in dataset
['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '\\', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '~', '£', '¤', '¨', '¬', '´', '¹', 'º', '¼', 'Â', 'Ã', 'ã', '‚', '„', '€']


In [33]:
character_to_index = {
    character: index
    for index, character
    in enumerate(vocabulary)
}
index_to_character = np.array(vocabulary)

vectorized_dataset = np.array([
    character_to_index[character]
    for character in text
])

In [34]:
print(json.dumps(character_to_index, indent=4)[:100] + "...")

{
    "\n": 0,
    " ": 1,
    "!": 2,
    "\"": 3,
    "#": 4,
    "$": 5,
    "%": 6,
    "&": 7,
...


In [35]:
print("Character to integer mapping example")
print(text[:13])
print(vectorized_dataset[:13])

Character to integer mapping example
lucian push f
[74 83 65 71 63 76  1 78 83 81 70  1 68]


In [36]:
maximum_sequence_length = 30
examples_per_epoch = len(text) // (maximum_sequence_length + 1)
print(f"Training with {examples_per_epoch} examples per epoch")

Training with 1222 examples per epoch


In [37]:
dataset_helper = tf.data.Dataset.from_tensor_slices(vectorized_dataset)
for i in dataset_helper.take(5):
    print(index_to_character[i.numpy()])

l
u
c
i
a


In [38]:
sequences = dataset_helper.batch(
    maximum_sequence_length + 1,
    drop_remainder=True
)
for item in sequences.take(5):
    print(repr("".join(index_to_character[item.numpy()])))

'lucian push first lane fast pls'
'\nso we get lvl 2\ni start on blu'
'e\nand kill them\nsebi da?\nlux no'
' ward ? :S\nok ill try\njo\nyou ha'
've 1\nwas mit skxype los?\nlux su'


In [39]:
def split_input_target(sequence):
    input_data = sequence[:-1]
    target_data = sequence[1:]
    return input_data, target_data

prepared_dataset = sequences.map(split_input_target)
prepared_dataset

<MapDataset shapes: ((30,), (30,)), types: (tf.int32, tf.int32)>

In [40]:
for input_example, target_example in prepared_dataset.take(1):
    print(f"Input data:", repr("".join(index_to_character[input_example.numpy()])))
    print(f"Target data:", repr("".join(index_to_character[target_example.numpy()])))

Input data: 'lucian push first lane fast pl'
Target data: 'ucian push first lane fast pls'


In [41]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

shuffled_dataset = (
    prepared_dataset
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE, drop_remainder=True)
)

shuffled_dataset

<BatchDataset shapes: ((64, 30), (64, 30)), types: (tf.int32, tf.int32)>

In [42]:
vocabulary_size = len(vocabulary)
# Tutorial had the embedding dimension at 256, but after looking up some
# metrics and what it should be based on, I decided to drop it down to 64.
# See https://en.wikipedia.org/wiki/Word2vec#Dimensionality
# Also https://datascience.stackexchange.com/a/48194
embedding_dimension = 64
rnn_units = 1024

In [43]:
checkpoint_dir = "./training-checkpoints/lol-toxicity-generation-with-an-rnn"
def build_model(vocabulary_size, embedding_dimension, rnn_units, batch_size):
    model =  tf.keras.Sequential([
        tf.keras.layers.Embedding(
            vocabulary_size,
            embedding_dimension,
            batch_input_shape=[batch_size, None]
        ),
        tf.keras.layers.GRU(
            rnn_units,
            return_sequences=True,
            stateful=True,
            recurrent_initializer="glorot_uniform",
        ),
        tf.keras.layers.Dense(vocabulary_size),
    ])
    return model

model = build_model(
    vocabulary_size=vocabulary_size,
    embedding_dimension=embedding_dimension,
    rnn_units=rnn_units,
    batch_size=BATCH_SIZE,
)
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest:
    model.load_weights(latest)
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (64, None, 64)            6656      
_________________________________________________________________
gru_2 (GRU)                  (64, None, 1024)          3348480   
_________________________________________________________________
dense_2 (Dense)              (64, None, 104)           106600    
Total params: 3,461,736
Trainable params: 3,461,736
Non-trainable params: 0
_________________________________________________________________


In [44]:
for input_batch, target_batch in shuffled_dataset.take(1):
    predictions = model(input_batch)
    print(predictions.shape, "# (batch_size, sequence_length, vocabulary_size)")

(64, 30, 104) # (batch_size, sequence_length, vocabulary_size)


In [45]:
# Apparently random sampling should be used rather than argmax to avoid loops.
# So this piece of code uses a the output value as a probability, rather
# than just choosing the one that's highest.
sampled_indices = tf.random.categorical(predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()
sampled_indices

array([ 75,  47,   8,  81,  81,  15, 102,  53,  65,   3,  31,  95,  53,
         8,  93,  64,  46,  59,  44, 102, 103,  44,  30,  11, 100,  24,
        88,  17,  34,  81], dtype=int64)

In [47]:
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(
        labels,
        logits,
        from_logits=True,
    )

batch_loss = loss(target_batch, predictions)
print("Predictions shape (batch_size, sequence_length, vocabulary_size)")
print(predictions.shape, "\n")
print("scalar_loss:", batch_loss.numpy().mean())

Predictions shape (batch_size, sequence_length, vocabulary_size)
(64, 30, 104) 

scalar_loss: 4.644359


In [48]:
model.compile(optimizer="adam", loss=loss)

In [49]:
checkpoint_prefix = os.path.abspath(
    os.path.join(checkpoint_dir, "ckpt_{epoch}")
)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

In [50]:
EPOCHS = 90

In [51]:
history = model.fit(
    shuffled_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback],
)

Train for 19 steps
Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 

In [52]:
model = build_model(
    vocabulary_size,
    embedding_dimension,
    rnn_units,
    batch_size=1,
)
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (1, None, 64)             6656      
_________________________________________________________________
gru_3 (GRU)                  (1, None, 1024)           3348480   
_________________________________________________________________
dense_3 (Dense)              (1, None, 104)            106600    
Total params: 3,461,736
Trainable params: 3,461,736
Non-trainable params: 0
_________________________________________________________________


In [53]:
def generate_text(model, start_string):
    characters_to_generate = 1000
    
    input_eval = [
        character_to_index[character]
        for character in start_string
    ]
    # tf.expand_dims inserts a dimension at the specified index.
    # In this case it converts our shape from (n,) to (1, n,)
    input_eval = tf.expand_dims(input_eval, 0)
    
    generated_output = []
    
    temperature = 1.0
    
    model.reset_states()
    for i in range(characters_to_generate):
        predictions = model(input_eval)
        # tf.squeeze here does the opposite of tf.expand_dims
        predictions = tf.squeeze(predictions, 0)
        
        predictions /= temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)
        predicted_id = predicted_id[-1, 0].numpy()
        
        # Pass in the predicted character as input on the next round
        input_eval = tf.expand_dims([predicted_id], 0)
        generated_output.append(index_to_character[predicted_id])
    
    return f"{start_string}{''.join(generated_output)}"

In [62]:
print(generate_text(model, start_string="banned "))

banned that 2 vayne
7vayne take red
draven
dont you noob
we losing the tower and sthite
next time you don'trck
ide
u suck,too
wrtaa
told u
nah
u dont even red dreed
alp and --- win
yes i then then push
NS then uwer think i shot ult ?
u suck,too
what happend
lol?
well
okai need
hamadceam
como veas. . . omg ma her
Absolutly not
fking riven dmg
i'm comming top
muchroo.
play come and j4
pla -.-
why ult ?
baiooob
needs teemo veas. . .
omg go nn and shen unt ge
tryn xD
heacand ge and stfu
back
relax en y us the lee sin
qq piece of sheit e eluse
lol
xDDooooooo
im only nk
ranged
he halich back for wards
orianna
is here tart
gg
thx
me car the one rag nice team communication
you're the one or not takign inhib
reprot me alon
n
bat we are lori dives, they dive us
other lanes cull tryndafed
u suck,too
wrtaa
told u
nah
u dont kno
om -.-
non
np
sry
cant lux too low and riven feed riven
8 tryn
and die 8 time
REPORT EZAL ME WA work
bl dont run
OOO
gj
WH to work
bot soon
wp
ty
YIN got ot xtt's easy to p