# Code generation with an RNN
Modified from https://www.tensorflow.org/tutorials/text/text_generation

In [2]:
import os
import time
import json

import tensorflow as tf
import numpy as np

In [3]:
# Download the training corpus (a single Python source file) through the Keras
# file helper; `dataset` holds the local path that is opened in the next cell.
dataset = tf.keras.utils.get_file(
    fname="code.txt",
    origin="https://raw.githubusercontent.com/risk-of-thunder/Thunderstore/master/django/repository/models.py",
)

Downloading data from https://raw.githubusercontent.com/risk-of-thunder/Thunderstore/master/django/repository/models.py


In [4]:
# Read the whole corpus into memory. The context manager closes the file handle
# as soon as the read finishes, and the explicit encoding keeps decoding
# independent of the platform's default locale.
with open(dataset, "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
print(text[:250])

import re
import uuid

from datetime import timedelta
from distutils.version import StrictVersion

from django.core.exceptions import ValidationError
from ipware import get_client_ip

from django.conf import settings
from django.core.files.storage im


In [5]:
# The vocabulary is every distinct character of the corpus, in sorted order.
vocabulary = list(set(text))
vocabulary.sort()
print(f"{len(vocabulary)} unique characters in dataset")
print(vocabulary)

84 unique characters in dataset
['\n', ' ', '!', '"', '#', '%', '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '4', '5', '6', '7', '8', '9', ':', '<', '=', '>', '@', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~']


In [6]:
# Forward lookup: character -> integer id (its position in the sorted vocabulary).
character_to_index = dict(zip(vocabulary, range(len(vocabulary))))
# Reverse lookup: indexing this array with ids gives the characters back.
index_to_character = np.array(vocabulary)

# The entire corpus encoded as one integer id per character.
vectorized_dataset = np.array(list(map(character_to_index.__getitem__, text)))

In [7]:
# Preview only the first 100 characters of the pretty-printed mapping.
print(json.dumps(character_to_index, indent=4)[:100], "...", sep="")

{
    "\n": 0,
    " ": 1,
    "!": 2,
    "\"": 3,
    "#": 4,
    "%": 5,
    "(": 6,
    ")": 7,
...


In [8]:
# Show the same 13-character prefix as raw text and as integer ids.
print(
    "Character to integer mapping example",
    text[:13],
    vectorized_dataset[:13],
    sep="\n",
)

Character to integer mapping example
import re
imp
[63 67 70 69 72 74  1 72 59  0 63 67 70]


In [9]:
# Length, in characters, of each training sequence.
maximum_sequence_length = 100
# Number of whole chunks of (maximum_sequence_length + 1) characters that fit
# in the corpus; the extra character accounts for the one-step-shifted target.
examples_per_epoch = len(text) // (maximum_sequence_length + 1)
print(f"Training with {examples_per_epoch} examples per epoch")

Training with 167 examples per epoch


In [10]:
# Stream of individual character ids; decode the first five as a sanity check.
dataset_helper = tf.data.Dataset.from_tensor_slices(vectorized_dataset)
for character_id in dataset_helper.take(5):
    print(index_to_character[character_id.numpy()])

i
m
p
o
r


In [11]:
# Group the id stream into fixed-size chunks of maximum_sequence_length + 1
# ids, discarding the final chunk if it comes up short.
sequences = dataset_helper.batch(maximum_sequence_length + 1, drop_remainder=True)
# Decode the first five chunks back to text to check the chunking.
for chunk in sequences.take(5):
    print(repr("".join(index_to_character[chunk.numpy()])))

'import re\nimport uuid\n\nfrom datetime import timedelta\nfrom distutils.version import StrictVersion\n\nfr'
'om django.core.exceptions import ValidationError\nfrom ipware import get_client_ip\n\nfrom django.conf i'
'mport settings\nfrom django.core.files.storage import get_storage_class\nfrom django.db import models, '
'transaction\nfrom django.db.models import Case, When, Sum, Q, signals\nfrom django.urls import reverse\n'
'from django.utils import timezone\nfrom django.utils.functional import cached_property\n\nfrom core.cach'


In [12]:
def split_input_target(sequence):
    """Split one chunk into an (input, target) pair offset by one position.

    The input is every element except the last and the target is every
    element except the first, so target[k] is the element following input[k].
    """
    return sequence[:-1], sequence[1:]

# Turn every chunk into an (input, target) pair via split_input_target.
prepared_dataset = sequences.map(split_input_target)
# Bare expression so the notebook displays the dataset's repr.
prepared_dataset

<MapDataset shapes: ((100,), (100,)), types: (tf.int32, tf.int32)>

In [13]:
# Decode the first (input, target) pair to show that the target is the input
# shifted by one character. The labels are plain strings: they contain no
# placeholders, so no f-string prefix is needed.
for input_example, target_example in prepared_dataset.take(1):
    print("Input data:", repr("".join(index_to_character[input_example.numpy()])))
    print("Target data:", repr("".join(index_to_character[target_example.numpy()])))

Input data: 'import re\nimport uuid\n\nfrom datetime import timedelta\nfrom distutils.version import StrictVersion\n\nf'
Target data: 'mport re\nimport uuid\n\nfrom datetime import timedelta\nfrom distutils.version import StrictVersion\n\nfr'


In [14]:
# Number of (input, target) pairs per training batch.
BATCH_SIZE = 64
# Size of the buffer that shuffle() draws elements from.
BUFFER_SIZE = 10000

# Shuffle the pairs, then group them into full batches only.
shuffled_dataset = prepared_dataset.shuffle(buffer_size=BUFFER_SIZE).batch(
    batch_size=BATCH_SIZE,
    drop_remainder=True,
)

shuffled_dataset

<BatchDataset shapes: ((64, 100), (64, 100)), types: (tf.int32, tf.int32)>

In [15]:
# One embedding row and one output logit per distinct character.
vocabulary_size = len(vocabulary)
# The tutorial used an embedding dimension of 256; it is reduced to 64 here
# after reading up on how the dimension should be chosen.
# See https://en.wikipedia.org/wiki/Word2vec#Dimensionality
# Also https://datascience.stackexchange.com/a/48194
embedding_dimension = 64
# Number of units in the recurrent (GRU) layer.
rnn_units = 1024

In [32]:
# Directory that training checkpoints are written to and restored from.
checkpoint_dir = "./training-checkpoints/code-generation-with-an-rnn"
def build_model(vocabulary_size, embedding_dimension, rnn_units, batch_size):
    """Assemble the character-level network: Embedding -> GRU -> Dense.

    Args:
        vocabulary_size: Number of distinct characters; used as the embedding
            input size and as the width of the final Dense layer.
        embedding_dimension: Size of each character embedding vector.
        rnn_units: Number of units in the GRU layer.
        batch_size: Fixed batch size baked into the input shape. The GRU is
            stateful, so the batch dimension is declared up front.

    Returns:
        An uncompiled ``tf.keras.Sequential`` model.
    """
    embedding_layer = tf.keras.layers.Embedding(
        vocabulary_size,
        embedding_dimension,
        batch_input_shape=[batch_size, None],
    )
    recurrent_layer = tf.keras.layers.GRU(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer="glorot_uniform",
    )
    # One output value per vocabulary entry at every time step.
    output_layer = tf.keras.layers.Dense(vocabulary_size)
    return tf.keras.Sequential([embedding_layer, recurrent_layer, output_layer])

# Training model: its batch dimension matches the batched training dataset.
model = build_model(vocabulary_size, embedding_dimension, rnn_units, BATCH_SIZE)

# Resume from the most recent checkpoint when one is available; otherwise the
# freshly initialised weights are kept.
latest = tf.train.latest_checkpoint(checkpoint_dir)
if latest:
    model.load_weights(latest)

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (64, None, 64)            5376      
_________________________________________________________________
gru_4 (GRU)                  (64, None, 1024)          3348480   
_________________________________________________________________
dense_4 (Dense)              (64, None, 84)            86100     
Total params: 3,439,956
Trainable params: 3,439,956
Non-trainable params: 0
_________________________________________________________________


In [33]:
# Run a single batch through the model to confirm the output shape before
# training. input_batch, target_batch and predictions are reused by the
# following cells.
for input_batch, target_batch in shuffled_dataset.take(1):
    predictions = model(input_batch)
    print(predictions.shape, "# (batch_size, sequence_length, vocabulary_size)")

(64, 100, 84) # (batch_size, sequence_length, vocabulary_size)


In [34]:
# Random sampling is used instead of argmax, because always picking the most
# likely character tends to get the output stuck in loops. The model outputs
# for the first sequence of the batch are therefore used as the logits of a
# categorical distribution, and one character id is drawn per time step.
sampled_indices = tf.squeeze(
    tf.random.categorical(predictions[0], num_samples=1),
    axis=-1,
).numpy()
sampled_indices

array([40, 50, 78, 23, 60, 16, 65, 74, 25, 24, 38, 35, 79, 25, 76, 27, 72,
       52, 18, 52, 61, 83, 70, 30, 44, 65, 40, 42, 65, 16,  5, 39, 12, 13,
       51, 16, 24, 24, 81, 26, 51, 28, 81, 24, 35, 81, 55, 23, 31, 59, 22,
       25, 77,  8, 28, 30, 26, 48, 59, 12, 83, 32, 36, 27, 65, 60, 78, 22,
        9, 34,  1, 13, 55, 33, 63, 79, 69, 54, 82, 51, 53, 32, 20, 15, 43,
       62, 18, 39, 83, 77, 65, 80, 80, 20, 44, 33, 28, 20, 35, 80],
      dtype=int64)

In [35]:
# Decode the first input sequence and the ids sampled for it. The tensor is
# converted with .numpy() before indexing, matching how the other cells decode
# ids, instead of relying on implicit tensor-to-index conversion.
print("Input:\n", repr("".join(index_to_character[input_batch[0].numpy()])), "\n")
print("Output:\n", repr("".join(index_to_character[sampled_indices])))

Input:
 'validate_cache(CacheBustCondition.any_package_updated)\n\n    @staticmethod\n    def post_delete(sender' 

Output:
 'NXx:f2kt=<LHy=v@r[5[g~pCRkNPk2%M./Z2<<{>ZA{<H{a:De9=w*AC>Ve.~EI@kfx9+G /aFiyo_}Z]E71Qh5M~wkzz7RFA7Hz'


In [36]:
def loss(labels, logits):
    """Sparse categorical cross-entropy between integer labels and raw logits.

    ``from_logits=True`` is passed because the model's final Dense layer has
    no activation, so its outputs are not yet probabilities.
    """
    return tf.keras.losses.sparse_categorical_crossentropy(
        y_true=labels, y_pred=logits, from_logits=True
    )

# Loss of the sample batch; it is averaged into a single number for display.
batch_loss = loss(target_batch, predictions)
print("Predictions shape (batch_size, sequence_length, vocabulary_size)")
print(predictions.shape, "\n")
print("scalar_loss:", batch_loss.numpy().mean())

Predictions shape (batch_size, sequence_length, vocabulary_size)
(64, 100, 84) 

scalar_loss: 4.429997


In [37]:
# Configure training: Adam optimizer with the logits-based loss defined above.
model.compile(optimizer="adam", loss=loss)

In [38]:
# Absolute path template for checkpoint files; "{epoch}" is left as a
# placeholder in the name handed to the callback.
checkpoint_prefix = os.path.abspath(os.path.join(checkpoint_dir, "ckpt_{epoch}"))

# Save only the weights (not the full model) under the template above.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    checkpoint_prefix, save_weights_only=True
)

In [39]:
# Number of training epochs passed to model.fit.
EPOCHS = 90

In [41]:
# Train on the shuffled batches, writing a checkpoint through the callback.
history = model.fit(shuffled_dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

Train for 2 steps
Epoch 1/90
Epoch 2/90
Epoch 3/90
Epoch 4/90
Epoch 5/90
Epoch 6/90
Epoch 7/90
Epoch 8/90
Epoch 9/90
Epoch 10/90
Epoch 11/90
Epoch 12/90
Epoch 13/90
Epoch 14/90
Epoch 15/90
Epoch 16/90
Epoch 17/90
Epoch 18/90
Epoch 19/90
Epoch 20/90
Epoch 21/90
Epoch 22/90
Epoch 23/90
Epoch 24/90
Epoch 25/90
Epoch 26/90
Epoch 27/90
Epoch 28/90
Epoch 29/90
Epoch 30/90
Epoch 31/90
Epoch 32/90
Epoch 33/90
Epoch 34/90
Epoch 35/90
Epoch 36/90
Epoch 37/90
Epoch 38/90
Epoch 39/90
Epoch 40/90
Epoch 41/90
Epoch 42/90
Epoch 43/90
Epoch 44/90
Epoch 45/90
Epoch 46/90
Epoch 47/90
Epoch 48/90
Epoch 49/90
Epoch 50/90
Epoch 51/90
Epoch 52/90
Epoch 53/90
Epoch 54/90
Epoch 55/90
Epoch 56/90
Epoch 57/90
Epoch 58/90
Epoch 59/90
Epoch 60/90
Epoch 61/90
Epoch 62/90
Epoch 63/90
Epoch 64/90
Epoch 65/90
Epoch 66/90
Epoch 67/90
Epoch 68/90
Epoch 69/90
Epoch 70/90
Epoch 71/90
Epoch 72/90
Epoch 73/90
Epoch 74/90
Epoch 75/90
Epoch 76/90
Epoch 77/90
Epoch 78/90
Epoch 79/90
Epoch 80/90
Epoch 81/90
Epoch 82/90
Epoch 8

In [42]:
# Rebuild the network with a batch size of 1 for text generation: the batch
# dimension is fixed when the model is built, so the training model (built with
# BATCH_SIZE) cannot be fed a single sequence.
model = build_model(
    vocabulary_size,
    embedding_dimension,
    rnn_units,
    batch_size=1,
)
# Fail with a clear message when no checkpoint exists, instead of handing None
# to load_weights.
latest = tf.train.latest_checkpoint(checkpoint_dir)
if not latest:
    raise FileNotFoundError(
        f"No checkpoint found in {checkpoint_dir!r}; train the model first."
    )
model.load_weights(latest)
model.build(tf.TensorShape([1, None]))
model.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (1, None, 64)             5376      
_________________________________________________________________
gru_5 (GRU)                  (1, None, 1024)           3348480   
_________________________________________________________________
dense_5 (Dense)              (1, None, 84)             86100     
Total params: 3,439,956
Trainable params: 3,439,956
Non-trainable params: 0
_________________________________________________________________


In [43]:
def generate_text(model, start_string, characters_to_generate=1000, temperature=1.0):
    """Generate text by sampling from ``model`` one character at a time.

    Args:
        model: Network to sample from. It is called with a (1, n) batch of
            character ids and must provide ``reset_states()``.
        start_string: Non-empty seed text. Every character must be a key of
            ``character_to_index``.
        characters_to_generate: Number of characters to sample after the seed.
        temperature: Positive divisor applied to the model outputs before
            sampling; 1.0 leaves them unchanged.

    Returns:
        ``start_string`` followed by the generated characters.

    Raises:
        ValueError: If ``start_string`` is empty or ``temperature`` is not
            greater than zero.
        KeyError: If ``start_string`` contains a character that is missing
            from ``character_to_index``.
    """
    # Validate up front: an empty seed gives the model nothing to predict from,
    # and a non-positive temperature makes the division below meaningless.
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be greater than zero")

    input_eval = [
        character_to_index[character]
        for character in start_string
    ]
    # tf.expand_dims inserts a dimension at the specified index.
    # In this case it converts our shape from (n,) to (1, n,)
    input_eval = tf.expand_dims(input_eval, 0)

    generated_output = []

    # Start from a clean recurrent state so earlier calls do not leak in.
    model.reset_states()
    for _ in range(characters_to_generate):
        predictions = model(input_eval)
        # tf.squeeze here does the opposite of tf.expand_dims
        predictions = tf.squeeze(predictions, 0)

        predictions /= temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)
        # Only the sample for the last time step is the next character.
        predicted_id = predicted_id[-1, 0].numpy()

        # Pass in the predicted character as input on the next round
        input_eval = tf.expand_dims([predicted_id], 0)
        generated_output.append(index_to_character[predicted_id])

    return f"{start_string}{''.join(generated_output)}"

In [44]:
# Generate a sample, seeding the model with the start of an import statement.
print(generate_text(model, "import "))

import Charecest,
        identitie=Fadse,
        ingtance   modelsilasFarlatt,
            re_urencreremendencies",
        instanc,    re.chertetitt(on_wenge,= ", "  )
    in_ self.lase_download_counter_diwnEoritisy(self):
        return self.latest.depdid = ",
            "name": self.packages.anloadency

    @cached_property
    def d_pkaclase()
        i       rolderValid=True):
        kalkowel_at__ange_vels.CAChan_mipertace(self):
        return (
            "f     "relbsif", self.amon.acle_falef,
        related_name="versions",        PackageQhorntid ".lattagn iendet re  return relf.name.dendid
    def get_version_pn mepackagl",
        package = medere  members",
        PackageRef

    thombel=Tlue,            ", "hownr")sert,
    )
    cachertage=create_torest_ip retatid(uelder identity

    def owner(self):
       returd salf(sedf):
        return self.vanid

    @cached_property
    def active(self):
        return "restemm":

            "dependancicalas__icanat", kwy{