# `The Neural Network`

In [0]:
# Mount Google Drive under /content/drive; the generated database files are written there later
from google.colab import drive
drive.mount('/content/drive')

In [0]:
%%shell

# Remove artifacts of any previous run so this cell can be re-executed safely
rm -rf training_checkpoints
rm -rf Confessions-Generator
# -f: do not report an error when the file is absent (e.g. on a fresh runtime)
rm -f text_data.txt
# Fetch the repository and extract the training corpus into the working directory
git clone https://github.com/AurumnPegasus/Confessions-Generator.git
unzip Confessions-Generator/DataSet/text_data.txt.zip

In [0]:
# Network Requirements
import numpy as np
import tensorflow as tf

# Utilities
import os
import random
import json
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook

In [0]:
# Neural network settings
# Path to the training text file
path_to_file = "text_data.txt"
# Length, in characters, of a single input sequence
seq_length = 100
# Number of sequences per training batch
BATCH_SIZE = 256
# Size of the buffer used when shuffling the dataset
BUFFER_SIZE = 10000
# Dimension of the character embedding vectors
embedding_dim = 256
# Number of units in the recurrent (LSTM) layer
rnn_units = 1024
# Number of training epochs (full passes over the dataset)
EPOCHS = 5

# Database settings
# Number of confessions to generate for the database
database_size = 10

In [0]:
# Read the corpus as raw bytes and decode it as UTF-8. The context manager
# guarantees the file handle is closed as soon as the read finishes.
with open(path_to_file, 'rb') as corpus_file:
    text = corpus_file.read().decode(encoding='utf-8')

# length of text = number of characters in it
print('Length of text: {} characters'.format(len(text)))

In [0]:
# Take a look at a 500-character sample of the text (characters 500 through 999)
print(text[500:1000])

In [0]:
# The unique characters in the file, sorted so that their order is deterministic
vocab = sorted(set(text))
print ('{} unique characters'.format(len(vocab)))

In [0]:
# Forward lookup: character -> integer id (ids follow the order of `vocab`)
char2idx = dict(zip(vocab, range(len(vocab))))
# Reverse lookup: integer id -> character, stored as an array so it can be indexed by id
idx2char = np.array(vocab)

# Encode the whole text as an array of integer ids
text_as_int = np.array([char2idx[ch] for ch in text])

In [0]:
# Number of (seq_length + 1)-character chunks the text divides into;
# +1 because each chunk yields an input and a target shifted by one character
examples_per_epoch = len(text)//(seq_length+1)

# Dataset whose elements are the individual character ids of the text
char_dataset = tf.data.Dataset.from_tensor_slices(text_as_int)

In [0]:
# Group consecutive character ids into chunks of seq_length + 1;
# drop_remainder=True discards a final chunk that would be shorter
sequences = char_dataset.batch(seq_length+1, drop_remainder=True)

In [0]:
def split_input_target(chunk):
    """Split a chunk into an (input, target) pair offset by one position.

    The input is every element of `chunk` except the last and the target is
    every element except the first, so target[i] is the element that follows
    input[i] in the original chunk.
    """
    return chunk[:-1], chunk[1:]

# Turn every chunk into an (input, target) pair
dataset = sequences.map(split_input_target)

In [0]:
# Shuffle the (input, target) pairs using a BUFFER_SIZE-element buffer, then
# group them into batches of BATCH_SIZE, discarding a final incomplete batch
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

In [0]:
# Number of distinct characters in the vocabulary
vocab_size = len(vocab)

In [0]:
def build_model(vocab_size, embedding_dim, rnn_units, batch_size):
    """
    Assemble the character-level model: Embedding -> stateful LSTM -> Dense.

    input params:
    vocab_size: number of unique characters; used as the embedding's input
        dimension and as the width of the final Dense layer
    embedding_dim: number of dimensions of each embedding vector
    rnn_units: number of units in the LSTM layer
    batch_size: batch dimension fixed into the model's input shape

    output:
    the (uncompiled) tf.keras.Sequential model
    """
    # Maps each character id to a dense vector; the sequence length is left open (None)
    embedding = tf.keras.layers.Embedding(
        vocab_size,
        embedding_dim,
        batch_input_shape=[batch_size, None],
    )
    # Emits an output at every timestep and keeps its state between batches
    recurrent = tf.keras.layers.LSTM(
        rnn_units,
        return_sequences=True,
        stateful=True,
        recurrent_initializer='glorot_uniform',
    )
    # One output per vocabulary entry; no activation is applied here
    projection = tf.keras.layers.Dense(vocab_size)

    return tf.keras.Sequential([embedding, recurrent, projection])

In [0]:
# Build the training model with the batch dimension fixed to BATCH_SIZE
model = build_model(
  vocab_size = len(vocab),
  embedding_dim=embedding_dim,
  rnn_units=rnn_units,
  batch_size=BATCH_SIZE)

In [0]:
# Print the layer-by-layer summary of the training model
model.summary()

In [0]:
def custom_loss(labels, logits):
    """Sparse categorical cross-entropy between `labels` and raw `logits`.

    `from_logits=True` is passed because the model's final Dense layer has no
    activation, so its outputs are unnormalized scores.
    """
    crossentropy = tf.keras.losses.sparse_categorical_crossentropy
    return crossentropy(labels, logits, from_logits=True)

In [0]:
# Configure training: Adam optimizer with the logits-based cross-entropy loss
model.compile(optimizer='adam', loss=custom_loss)

In [0]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints'
# Checkpoint file name pattern; {epoch} is a placeholder in the file name
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# Callback that saves only the model weights (not the full model) during training
checkpoint_callback=tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    save_weights_only=True)


In [0]:
# Train for EPOCHS passes over the dataset with checkpointing enabled.
# Keras documents `callbacks` as a list of Callback instances, so the single
# checkpoint callback is wrapped in a list rather than passed bare.
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [0]:
# Plot the training loss recorded in the fit history, one point per epoch
fig, ax = plt.subplots()
ax.plot(history.history['loss'])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
plt.show()

# `Database Generation`

In [0]:
# The training model has its batch dimension fixed to BATCH_SIZE, so build a
# fresh instance with batch_size=1 for generating one sequence at a time
model = build_model(vocab_size, embedding_dim, rnn_units, batch_size=1)
# Restore the weights from the most recent checkpoint written during training
model.load_weights(tf.train.latest_checkpoint(checkpoint_dir))
# Input shape: batch of 1, sequence length left open
model.build(tf.TensorShape([1, None]))

In [0]:
def generate_text(model, out_size, start_string, temperature=1.0):
    """Generate text character by character, seeded with `start_string`.

    The start string is fed through `model` once; after that, each sampled
    character is fed back as the next input while the model keeps its
    recurrent state between calls. Relies on the module-level `char2idx`
    and `idx2char` mappings.

    Args:
        model: model built with a batch size of 1.
        out_size: number of characters to generate.
        start_string: non-empty seed text; every character must be a key of
            `char2idx`.
        temperature: positive divisor applied to the logits before sampling.
            Low values give more predictable text, high values more
            surprising text. Defaults to 1.0 (logits used unchanged).

    Returns:
        `start_string` followed by the `out_size` generated characters.

    Raises:
        ValueError: if `start_string` is empty or `temperature` is not positive.
        KeyError: if `start_string` contains a character missing from `char2idx`.
    """
    if not start_string:
        raise ValueError("start_string must contain at least one character")
    if temperature <= 0:
        raise ValueError("temperature must be positive")

    # Converting our start string to numbers (vectorizing), with a batch dimension of 1
    input_eval = tf.expand_dims([char2idx[s] for s in start_string], 0)

    # Generated characters, collected in order
    text_generated = []

    # Here batch size == 1; clear any state left over from a previous generation
    model.reset_states()
    for _ in range(out_size):
        predictions = model(input_eval)
        # remove the batch dimension
        predictions = tf.squeeze(predictions, 0)

        # using a categorical distribution to predict the character returned by the model;
        # only the sample drawn for the last timestep is kept
        predictions = predictions / temperature
        predicted_id = tf.random.categorical(predictions, num_samples=1)[-1, 0].numpy()

        # We pass the predicted character as the next input to the model
        # along with the previous hidden state
        input_eval = tf.expand_dims([predicted_id], 0)

        text_generated.append(idx2char[predicted_id])

    return start_string + ''.join(text_generated)

In [0]:
# Quick check: generate 100 characters following the seed "My "
print(generate_text(model, 100, "My "))

In [0]:
def _trim_to_last_sentence(generated):
    """Cut `generated` after its last full stop; keep it whole if it has none."""
    last_period = generated.rfind('.')
    if last_period == -1:
        # rfind returns -1 when no '.' exists; using that as a slice end would
        # silently drop the final character, so keep the whole text instead
        return generated + "."
    return generated[:last_period] + "."


# Three buckets, in order: small, medium and large confessions
database = [[], [], []]

for _ in tqdm_notebook(range(database_size)):

    # random number of characters to generate for this confession
    user_jo_bola = random.randint(60, 420)
    temp_conf_string = generate_text(model, user_jo_bola, start_string="Dear")
    conf_string = _trim_to_last_sentence(temp_conf_string)

    # The bucket is chosen from the requested length, not the trimmed length
    if user_jo_bola < 180:
        # "smallpp" bucket
        database[0].append(conf_string)
    elif user_jo_bola < 300:
        # "mediumpp" bucket
        database[1].append(conf_string)
    else:
        # "largepp" bucket
        database[2].append(conf_string)

In [0]:
# File name prefixes, in the same order as the buckets in `database`
pp_sizes = ["smallpp", "mediumpp", "largepp"]

# open(..., 'w') does not create missing directories, so make sure the target
# folder exists before writing
os.makedirs("/content/drive/My Drive/Database", exist_ok=True)

# Write each bucket to its own JSON file as {"content": [...]};
# ensure_ascii=False keeps non-ASCII characters unescaped in the UTF-8 output
for i, name in enumerate(pp_sizes):
    with open(f"/content/drive/My Drive/Database/{name}_database.json", 'w', encoding='utf-8') as f:
        json.dump({"content":database[i]}, f, ensure_ascii=False)