# Code generation with an RNN
Modified from https://www.tensorflow.org/tutorials/text/text_generation

In [1]:
import os
import time
import json

import tensorflow as tf
import numpy as np

In [2]:
# Read the training corpus as raw bytes and decode it explicitly, so the
# result depends neither on the platform's default encoding nor on newline
# translation. The context manager closes the file handle right away.
with open("./data/legacy.py", "rb") as corpus_file:
    text = corpus_file.read().decode("utf-8")
print(text[:250])

SKU_PREFIX = "ID#"
# -- encoding: UTF-8 --
import logging
import sys

from django.core.management import BaseCommand
from django.db.transaction import atomic

from apps.id_integration.product_importer import ShoopIDProductJsonImporter

LOG = logging.


In [3]:
# Every distinct character of the corpus, ordered by code point. A character's
# position in this list is what the following cells use as its integer id.
vocabulary = list(set(text))
vocabulary.sort()
print(f"{len(vocabulary)} unique characters in dataset")
print(vocabulary)

122 unique characters in dataset
['\t', '\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '|', '}', '~', '©', '³', '½', 'Ä', 'Å', 'Ö', 'á', 'ä', 'æ', 'ï', 'ó', 'ö', 'ø', 'ú', 'ý', '̈', 'ω', 'Ṕ', '–', '’', '“', '”', '€', '⅔', '\ue1c0']


In [4]:
# Forward lookup: character -> integer id (its position in `vocabulary`).
character_to_index = dict(zip(vocabulary, range(len(vocabulary))))
# Reverse lookup: integer id -> character. Stored as an array so it can be
# indexed with a whole array of ids at once.
index_to_character = np.array(vocabulary)

# The complete corpus encoded as one flat array of integer ids.
vectorized_dataset = np.array([character_to_index[symbol] for symbol in text])

In [5]:
# Show only the first 100 characters of the pretty-printed mapping.
print(json.dumps(character_to_index, indent=4)[:100], end="...\n")

{
    "\t": 0,
    "\n": 1,
    " ": 2,
    "!": 3,
    "\"": 4,
    "#": 5,
    "$": 6,
    "%": 7,...


In [6]:
# The same 13 leading characters, first as text and then as integer ids.
print(
    "Character to integer mapping example",
    text[:13],
    vectorized_dataset[:13],
    sep="\n",
)

Character to integer mapping example
SKU_PREFIX = 
[53 45 55 65 50 52 39 40 43 58  2 31  2]


In [7]:
maximum_sequence_length = 100
# Each example is cut from a chunk of `maximum_sequence_length + 1` characters,
# so that is the divisor for the number of examples the corpus yields.
examples_per_epoch = len(text) // (maximum_sequence_length + 1)
print("Training with", examples_per_epoch, "examples per epoch")

Training with 35117 examples per epoch


In [8]:
# A dataset that yields the corpus one character id at a time.
dataset_helper = tf.data.Dataset.from_tensor_slices(vectorized_dataset)
for character_id in dataset_helper.take(5):
    decoded_character = index_to_character[character_id.numpy()]
    print(decoded_character)

S
K
U
_
P


In [9]:
# Group the character stream into chunks of `maximum_sequence_length + 1` ids;
# a trailing chunk that would be shorter than that is dropped.
sequences = dataset_helper.batch(maximum_sequence_length + 1, drop_remainder=True)
for chunk in sequences.take(5):
    decoded_chunk = "".join(index_to_character[chunk.numpy()])
    print(repr(decoded_chunk))

'SKU_PREFIX = "ID#"\n# -- encoding: UTF-8 --\nimport logging\nimport sys\n\nfrom django.core.management imp'
'ort BaseCommand\nfrom django.db.transaction import atomic\n\nfrom apps.id_integration.product_importer i'
'mport ShoopIDProductJsonImporter\n\nLOG = logging.getLogger()\nLOG.setLevel(logging.INFO)\n\nSTDOUT_HANDLE'
'R = logging.StreamHandler(sys.stdout)\nSTDOUT_HANDLER.setLevel(logging.INFO)\nLOG.addHandler(STDOUT_HAN'
'DLER)\n\n\nclass Command(BaseCommand):\n    """\n    Import ID product data from the JSON format they prov'


In [10]:
def split_input_target(sequence):
    """Split one chunk into an (input, target) training pair.

    The input is the chunk without its last element and the target is the
    chunk without its first element, so the target at every position is the
    element that follows the input at that position.
    """
    return sequence[:-1], sequence[1:]

# Turn every chunk into an (input, target) pair via split_input_target.
prepared_dataset = sequences.map(split_input_target)
prepared_dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

In [11]:
# Decode one training pair back to text. The target is the input advanced by
# one character, as produced by split_input_target.
for input_example, target_example in prepared_dataset.take(1):
    # Plain string labels: the original used f-strings without placeholders.
    print("Input data:", repr("".join(index_to_character[input_example.numpy()])))
    print("Target data:", repr("".join(index_to_character[target_example.numpy()])))

Input data: 'SKU_PREFIX = "ID#"\n# -- encoding: UTF-8 --\nimport logging\nimport sys\n\nfrom django.core.management im'
Target data: 'KU_PREFIX = "ID#"\n# -- encoding: UTF-8 --\nimport logging\nimport sys\n\nfrom django.core.management imp'


In [12]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Size of the buffer passed to Dataset.shuffle.
BUFFER_SIZE = 10_000

# Shuffle the pairs, then group them into full batches only: an incomplete
# final batch is dropped.
shuffled_dataset = prepared_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

shuffled_dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [13]:
# One embedding row and one output logit per distinct character.
vocabulary_size = len(vocabulary)
# The upstream tutorial uses an embedding dimension of 256. After reading up
# on how that value should be chosen, it was reduced to 64 here.
# See https://en.wikipedia.org/wiki/Word2vec#Dimensionality
# Also https://datascience.stackexchange.com/a/48194
embedding_dimension = 64
# Width of the first GRU layer; build_model gives the second GRU half of this.
rnn_units = 1024

In [14]:
# Directory that is searched for the newest checkpoint to restore, and under
# which checkpoints are written during training.
checkpoint_dir = "./training-checkpoints/code-generation-with-an-rnn-2"
def build_model(vocabulary_size, embedding_dimension, rnn_units, batch_size):
    """Assemble the character-level network.

    The layers are, in order: an embedding, a GRU with ``rnn_units`` units, a
    GRU with ``rnn_units // 2`` units, and a dense layer with one output per
    vocabulary entry. Both GRUs are stateful and return full sequences.

    Args:
        vocabulary_size: Number of distinct characters; used for the embedding
            input size and for the width of the final dense layer.
        embedding_dimension: Size of each character embedding vector.
        rnn_units: Units in the first GRU; the second GRU gets half as many.
        batch_size: Fixed batch dimension of the model input. The sequence
            dimension is left unspecified.

    Returns:
        The ``tf.keras.Sequential`` model (not compiled).
    """
    layers = tf.keras.layers
    # Settings shared by both recurrent layers.
    recurrent_options = {
        "return_sequences": True,
        "stateful": True,
        "recurrent_initializer": "glorot_uniform",
    }

    stack = [
        layers.Embedding(
            vocabulary_size,
            embedding_dimension,
            batch_input_shape=[batch_size, None],
        )
    ]
    for width in (rnn_units, rnn_units // 2):
        stack.append(layers.GRU(width, **recurrent_options))
    stack.append(layers.Dense(vocabulary_size))

    return tf.keras.Sequential(stack)

# Training model: its batch dimension is fixed to BATCH_SIZE.
model = build_model(vocabulary_size, embedding_dimension, rnn_units, BATCH_SIZE)

# Continue from the newest checkpoint in `checkpoint_dir` when one is found;
# otherwise keep the freshly initialised weights.
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest:
    model.load_weights(latest)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (64, None, 64)            7808      
_________________________________________________________________
gru (GRU)                    (64, None, 1024)          3348480   
_________________________________________________________________
gru_1 (GRU)                  (64, None, 512)           2362368   
_________________________________________________________________
dense (Dense)                (64, None, 122)           62586     
Total params: 5,781,242
Trainable params: 5,781,242
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Smoke test: push a single batch through the model and confirm the shape of
# the output. `input_batch`, `target_batch` and `predictions` are reused by
# later cells.
for input_batch, target_batch in shuffled_dataset.take(1):
    predictions = model(input_batch)
    print(predictions.shape, "# (batch_size, sequence_length, vocabulary_size)")

(64, 100, 122) # (batch_size, sequence_length, vocabulary_size)


In [16]:
# Apparently random sampling should be used rather than argmax to avoid loops.
# So this piece of code treats the output values as a distribution to draw
# from, rather than just choosing the entry that's highest.
# One id is drawn per position of the first sequence in the batch; the
# trailing sample axis of size one is then squeezed away.
sampled_indices = tf.squeeze(
    tf.random.categorical(predictions[0], num_samples=1),
    axis=-1,
).numpy()
sampled_indices

array([87, 77, 85, 91, 69, 82, 85, 18,  2,  2,  2,  2, 85, 69,  2,  2,  2,
       71, 81, 84, 65, 73, 71, 14, 85, 31,  2, 93, 87, 70, 71, 80, 31, 82,
       73, 14, 56, 84, 81, 73, 87, 69, 86, 52, 80, 70, 71, 84, 85, 28, 45,
       47, 35, 71, 38, 54, 41, 35, 40, 80, 54, 39, 80, 54, 39, 54, 39, 48,
       40, 38, 47, 39, 54, 49,  2,  2,  2,  2,  2,  1,  2,  2,  2,  2, 84,
       71, 86, 81, 78, 85, 31, 31,  2,  2, 81, 87, 84, 84, 91, 80],
      dtype=int64)

In [17]:
# Decode the first input sequence of the batch and the ids sampled for it.
# The tensor is converted with .numpy() before it is used as an index, the
# same way the other cells index `index_to_character`.
print("Input:\n", repr("".join(index_to_character[input_batch[0].numpy()])), "\n")
print("Output:\n", repr("".join(index_to_character[sampled_indices])))

Input:
 'querier(\n        shop=shop, ordering=ProductOrdering.LEAST_EXPENSIVE_FIRST\n    )\n    results = queri' 

Output:
 'uksycps0    sc   eor_ge,s= {uden=pg,VroguctRnders:KMAeDTGAFnTEnTETENFDMETO     \n    retols==  ourryn'


In [18]:
def loss(labels, logits):
    """Sparse categorical cross-entropy for integer labels and raw logits.

    Args:
        labels: Integer class ids (here: target character ids).
        logits: Unnormalised model outputs; ``from_logits=True`` tells Keras
            not to expect probabilities.

    Returns:
        The loss values exactly as returned by
        ``tf.keras.losses.sparse_categorical_crossentropy``.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

# Loss of the smoke-test batch, averaged over all of its values.
batch_loss = loss(target_batch, predictions)
print("Predictions shape (batch_size, sequence_length, vocabulary_size)")
print(predictions.shape, "\n")
print("scalar_loss:", np.mean(batch_loss.numpy()))

Predictions shape (batch_size, sequence_length, vocabulary_size)
(64, 100, 122) 

scalar_loss: 1.9175811


In [19]:
# Train with the Adam optimizer and the `loss` wrapper defined in this
# notebook, which treats the model outputs as logits.
model.compile(optimizer="adam", loss=loss)

In [20]:
from notebooks.tf.scheduler import SGDRScheduler

In [21]:
# Checkpoints are written to <checkpoint_dir>/ckpt_<epoch>; the `{epoch}`
# placeholder is left in the path for the checkpoint callback to fill in.
checkpoint_prefix = os.path.abspath(
    os.path.join(checkpoint_dir, "ckpt_{epoch}")
)

# Save weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True,
)

learningrate_callback = SGDRScheduler(
    min_lr=1e-5,
    max_lr=1e-2,
    # Derived instead of hard-coded: both batching steps drop their remainder,
    # so one epoch is exactly this many batches (548 for the current corpus
    # and batch size). It now follows changes to the data or to BATCH_SIZE.
    steps_per_epoch=examples_per_epoch // BATCH_SIZE,
    lr_decay=0.9,
    cycle_length=5,
    mult_factor=1.5
)

In [22]:
# Number of epochs passed to model.fit.
EPOCHS = 30

In [28]:
# Train on the shuffled batches. The checkpoint callback and the
# learning-rate scheduler are both active; the object returned by fit is
# kept in `history`.
history = model.fit(
    shuffled_dataset,
    epochs=EPOCHS,
    callbacks=[checkpoint_callback, learningrate_callback],
)

Train for 548 steps
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [30]:
# Weights to generate from.
# NOTE(review): this is pinned to epoch 139 of a different run directory
# ("mastermark-code-generation-with-an-rnn"), not to the newest checkpoint in
# `checkpoint_dir`. The original cell assigned `checkpoint` three times and
# only this last value ever took effect, so the dead assignments are removed.
# Confirm the file exists before relying on Restart & Run All; to use the run
# trained above instead, replace this with
# `tf.train.latest_checkpoint(checkpoint_dir)`.
# os.path.join avoids the invalid "\c" escape and the Windows-only separator
# of the original string literal.
checkpoint = os.path.join(
    "./training-checkpoints/mastermark-code-generation-with-an-rnn",
    "ckpt_139",
)

# Same architecture as the training model, rebuilt with a batch size of one so
# that a single seed sequence can be fed in. This rebinds `model`.
model = build_model(
    vocabulary_size,
    embedding_dimension,
    rnn_units,
    batch_size=1,
)
model.load_weights(checkpoint)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (1, None, 64)             7808      
_________________________________________________________________
gru_2 (GRU)                  (1, None, 1024)           3348480   
_________________________________________________________________
gru_3 (GRU)                  (1, None, 512)            2362368   
_________________________________________________________________
dense_1 (Dense)              (1, None, 122)            62586     
Total params: 5,781,242
Trainable params: 5,781,242
Non-trainable params: 0
_________________________________________________________________


In [31]:
def generate_text(model, start_string, characters_to_generate=1900, temperature=1.0):
    """Generate text one character at a time, seeded with ``start_string``.

    Uses the notebook-level ``character_to_index`` and ``index_to_character``
    lookups to convert between characters and integer ids.

    Args:
        model: Callable model that maps a batch of id sequences with a leading
            batch dimension of one to per-position logits, and that provides
            ``reset_states()``.
        start_string: Non-empty seed text. Every character must be a key of
            ``character_to_index``; otherwise the lookup raises ``KeyError``.
        characters_to_generate: How many characters to sample after the seed.
        temperature: Positive divisor applied to the logits before sampling.
            Values below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            positive.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    input_eval = [
        character_to_index[character]
        for character in start_string
    ]
    # tf.expand_dims inserts a dimension at the specified index.
    # In this case it converts our shape from (n,) to (1, n,)
    input_eval = tf.expand_dims(input_eval, 0)

    generated_output = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(characters_to_generate):
        predictions = model(input_eval)
        # tf.squeeze here does the opposite of tf.expand_dims
        predictions = tf.squeeze(predictions, 0)

        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)
        # Only the sample drawn for the last position is the next character.
        predicted_id = predicted_id[-1, 0].numpy()

        # Pass in the predicted character as input on the next round
        input_eval = tf.expand_dims([predicted_id], 0)
        generated_output.append(index_to_character[predicted_id])

    return f"{start_string}{''.join(generated_output)}"

In [32]:
# Generate a sample that starts with the beginning of an import statement.
print(generate_text(model, start_string="import "))

import User
from shoop.identifiers to paramethod.

   :type no: L{Element}
        """

        if (
            "boxes",
            OrderSettingSelect,
                    "quantizer": self.ids_weight),
            "tax_class": {"styles__in=f.number
            str(text)
                Product.ode mutatus and the product array,
                colors=None,
            product=product,
        request,
                _pa"full_process_choices",
    ),
    tax_pee:
        # price internal installs class last in current tuple.
        # no vilidity/LogEntryKind.
        number
       
    :return: Item child address.integrap in PATTR or None)
            return self.get_init__(self, key) from load_byys
    gzeeding_price = formset_key(order)
            except CopTion:
                       mm_shop.org.uabaClasses = AdError("TESTED_REPLY_CACCE
        ),
    )
        current_user.password = request.POST[0]
    p = self.taxless_price
        if package_response:
            return Fa