# My RNN Language Model implementation in Keras

In [1]:
import os

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import GRU
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model
print(tf.__version__)

2.0.0


In [2]:
with open('../datasets/username_dataset.txt', 'r') as file:
    data = file.readlines()
    
data = [example.strip().lower() for example in data]

with open('../datasets/username_dataset.txt', 'r') as file:
    data_str = file.read().lower()

vocab = list(set(data_str))
vocab.sort()

vocab.insert(0, '>') # End token: <end>
vocab.insert(0, '<') # Start token: <start>
vocab.insert(0, '*') # Padding

char_to_index = {ch: i for i, ch in enumerate(vocab)}
index_to_char = {i: ch for i, ch in enumerate(vocab)}

print(char_to_index)

longest_word = max(data, key=len)

print(f"Number of characters: {len(data_str)}")
print(f"Number of unique characters: {len(vocab)}")
print(f"Longest word: {longest_word}")
print(f"Length of longest word: {len(longest_word)}") # Needed information to know what length to pad the usernames to.

{'*': 0, '<': 1, '>': 2, '\n': 3, '0': 4, '1': 5, '2': 6, '3': 7, '4': 8, '5': 9, '6': 10, '7': 11, '8': 12, '9': 13, 'a': 14, 'b': 15, 'c': 16, 'd': 17, 'e': 18, 'f': 19, 'g': 20, 'h': 21, 'i': 22, 'j': 23, 'k': 24, 'l': 25, 'm': 26, 'n': 27, 'o': 28, 'p': 29, 'q': 30, 'r': 31, 's': 32, 't': 33, 'u': 34, 'v': 35, 'w': 36, 'x': 37, 'y': 38, 'z': 39}
Number of characters: 401244
Number of unique characters: 40
Longest word: rabbitsreviews
Length of longest word: 14


In [3]:
def print_mapping(mapping):
    print('{')
    for char, _ in zip(mapping, range(20)):
        print('  {:4s}: {:3d},'.format(repr(char), mapping[char]))
    print('  ...\n}')
    
print_mapping(char_to_index)

{
  '*' :   0,
  '<' :   1,
  '>' :   2,
  '\n':   3,
  '0' :   4,
  '1' :   5,
  '2' :   6,
  '3' :   7,
  '4' :   8,
  '5' :   9,
  '6' :  10,
  '7' :  11,
  '8' :  12,
  '9' :  13,
  'a' :  14,
  'b' :  15,
  'c' :  16,
  'd' :  17,
  'e' :  18,
  'f' :  19,
  ...
}


In [4]:
def prepare_dataset(data, username_length):
    """Adds <start> and <end> tokens and pads the usernames."""
    data_X = []
    data_Y = []
    for username in data:
        pad = "*" * (username_length - len(username))
        X = np.array([char_to_index[char] for char in f"<{username}{pad}"]) #.reshape((username_length + 1, 1))
        Y = np.array([char_to_index[char] for char in f"{username}>{pad}"]) #.reshape((username_length + 1, 1))
        data_X.append(X)
        data_Y.append(Y)
    

    return (np.array(data_X), np.array(data_Y))

data = prepare_dataset(data, len(longest_word))

print(data[0].shape)
print(data[1].shape)
dataset = tf.data.Dataset.from_tensor_slices(data)

(54037, 15)
(54037, 15)


In [5]:
for input_example, target_example in dataset.take(1):
    print('Input data: ', repr(''.join([index_to_char[i] for i in input_example.numpy()])))
    print('Target data:', repr(''.join([index_to_char[i] for i in target_example.numpy()])))
    
for input_example, target_example in dataset.take(1):
    print('Input data: ', input_example.numpy())
    print('Target data:', target_example.numpy())

Input data:  '<info**********'
Target data: 'info>**********'
Input data:  [ 1 22 27 19 28  0  0  0  0  0  0  0  0  0  0]
Target data: [22 27 19 28  2  0  0  0  0  0  0  0  0  0  0]


In [6]:
BATCH_SIZE = 128
BUFFER_SIZE = 10000
EPOCHS = 100
VOCAB_SIZE = len(vocab)
RNN_UNITS = 512
EMBEDDING_DIM = 256

dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset shapes: ((128, 15), (128, 15)), types: (tf.int32, tf.int32)>


In [7]:
def build_model(vocab_size, rnn_units, batch_size, embedding_dim, stateful=False):
    model = tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, batch_input_shape=[batch_size, None]),
        tf.keras.layers.Embedding(vocab_size, embedding_dim),
        tf.keras.layers.GRU(rnn_units,
                             return_sequences=True,
                             stateful=stateful,
                             recurrent_initializer='glorot_uniform'),
        tf.keras.layers.Dense(vocab_size, activation=tf.nn.softmax)])
    
    return model

In [8]:
model = build_model(
    vocab_size = VOCAB_SIZE,
    rnn_units=RNN_UNITS,
    embedding_dim=EMBEDDING_DIM,
    batch_size=BATCH_SIZE)

In [9]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
masking (Masking)            (128, None)               0         
_________________________________________________________________
embedding (Embedding)        (128, None, 256)          10240     
_________________________________________________________________
gru (GRU)                    (128, None, 512)          1182720   
_________________________________________________________________
dense (Dense)                (128, None, 40)           20520     
Total params: 1,213,480
Trainable params: 1,213,480
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [11]:
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='loss',
                                               min_delta=0,
                                               patience=5,
                                               verbose=0,
                                               mode='auto',
                                               baseline=None,
                                               restore_best_weights=True)
# MAKES NO SENSE

In [12]:
model.fit(dataset, epochs=EPOCHS, callbacks=[early_stopping])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
 24/422 [>.............................] - ETA: 1:08 - loss: 0.7587

KeyboardInterrupt: 

In [13]:
# model.save_weights('../weights/gru_model_embedding_weights.h5')

In [14]:
model = build_model(VOCAB_SIZE, RNN_UNITS, 1, EMBEDDING_DIM, stateful=True)

In [15]:
model.load_weights('../weights/gru_model_embedding_weights.h5')

In [16]:
def generate_usernames(model):
    num_generate = 7
    generated_usernames = []

    for i in range(num_generate):
        model.reset_states()
        input_eval = np.array([char_to_index['<']]).reshape((1, 1))  # We start with the '<start>' token
        generated_username = []
        done = False
        
        while not done:
            predictions = model.predict(input_eval)
            predictions = tf.squeeze(predictions)

            predicted_id = np.random.choice(range(40), p=predictions.numpy())
            # predicted_id = np.argmax(predictions)

            input_eval = np.array([predicted_id]).reshape((1, 1))
            
            done = index_to_char[predicted_id] in ['>', '*']
            
            if not done:
                generated_username.append(index_to_char[predicted_id])
        generated_usernames.append("".join(generated_username))

    return generated_usernames

In [17]:
for _ in range(20):
    generated_usernames = generate_usernames(model)
    print(generated_usernames)

['casanova', 'lammer', 'breaks', 'recover', 'hinashid', 'unset', 'bluey']
['barry5', 'wilkid', 'burhad', 'tran', 'freew', 'jamie', 'fixit']
['irishra', 'country', 'steve1', 'dave', 'hammy', 'lange', 'lumber']
['starwars', 'garion', 'delray', 'tommyt', 'esquire', 'mand', 'binno']
['bryan', 'stephen', 'sukrate', 'anthonyl', 'brahis', 'conaus', 'chrisb']
['marko', 'sheila', 'orlion', 'thomas28', 'brian6', 'fresh', 'lacey']
['abrico', 'metallic', 'bed', 'chris22', 'tigerboy', 'spawnnn', 'london']
['bike', 'heller', 'leon', 'bath', 'westleherf', 'timoteo', 'gero']
['rasmus', 'odonnell', 'fucker', 'clhall', 'alfhern', 'bubba01', 'myhope']
['giovane', 'mercy', 'corona', 'slaves', 'marklee', 'porny', 'mikee']
['johnnys', 'disbite', 'slloyd', 'drew', 'condem', 'blueey', 'afrodite']
['bmiller', 'boout', 'gordo', 'ericc', 'greg55', 'oldgod', 'david']
['ddave', 'vera', 'smegman', 'perfecto', 'rebel1', 'hardup', 'henry']
['arron', 'toastie', 'darkboy', 'srilank', 'octavia', 'evildave', 'free']
['ca