# Model creation, training, and text generation

In [1]:
#!pip install language_tool_python
#!pip install textstat

In [2]:
import pandas as pd
import os
import time
import tensorflow as tf
import random
import language_tool_python
import textstat

In [3]:
#get the data
# Load the quest table; later cells read its 'Quest Title' and
# 'Quest Description' columns row by row.
df = pd.read_csv('Final Data.csv', engine = 'python')

In [4]:
#need to find max text length for padding
# Build the "Title: ...\nDescription: ..." sample for every usable row, track the
# longest sample (needed for padding later) and collect the full corpus text
# (needed for the character vocabulary).
# Rows whose title or description is not a string (presumably NaN from empty
# CSV cells) are skipped explicitly instead of relying on assert + bare except.
quest_texts = []
max_text_length = 0
for i in range(len(df)):
    title = df['Quest Title'].iloc[i]
    desc = df['Quest Description'].iloc[i]
    if not (isinstance(title, str) and isinstance(desc, str)):
        continue
    new_text = 'Title: '+title+'\nDescription: '+desc
    quest_texts.append(new_text)
    if len(new_text) > max_text_length:
        # Print each new running maximum so the growth is visible in the output.
        print(len(new_text))
        max_text_length = len(new_text)
# Join once at the end: repeated `full_text = full_text + new_text` is quadratic.
full_text = ''.join(quest_texts)

280
468
470
519
526
529
539
545
549
552
554
564
565
569
575
578
579


In [5]:
max_text_length

579

In [6]:
# Character-level vocabulary: the distinct characters of full_text, sorted.
vocab = sorted(set(full_text))

In [7]:
print(vocab, len(vocab))

['\n', '\r', ' ', '!', '"', '#', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '<', '=', '>', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~'] 91


##### Defining the encoding / decoding structure

In [8]:
#layer for turning individual characters to IDs
# mask_token=None: no mask entry is reserved in the lookup table.
# The layer's own vocabulary is one entry larger than `vocab` (92 vs. 91 in the
# recorded outputs) — presumably the out-of-vocabulary "[UNK]" token that
# OneStep masks out during generation.
chars_to_id_layer = tf.keras.layers.StringLookup(
    vocabulary=vocab, mask_token=None
)

In [9]:
#inverted version, IDs to characters
# Built from the forward layer's own vocabulary (not from `vocab`) so that both
# layers use the same table.
id_to_chars_layer = tf.keras.layers.StringLookup(
    vocabulary=chars_to_id_layer.get_vocabulary(), invert=True, mask_token=None
)

In [10]:
#function for encoding
def text_to_id(input_val):
    """Split UTF-8 text into characters and look up the ID of each one.

    Args:
        input_val: text passed to `tf.strings.unicode_split`.

    Returns:
        The result of applying `chars_to_id_layer` to the split characters.
    """
    characters = tf.strings.unicode_split(input_val, input_encoding='UTF-8')
    return chars_to_id_layer(characters)

In [11]:
#function for decoding
def id_to_text(input_val):
    """Map IDs back to characters and join them along the last axis.

    Args:
        input_val: IDs passed to `id_to_chars_layer`.

    Returns:
        The string tensor produced by `tf.strings.reduce_join(..., axis=-1)`.
    """
    characters = id_to_chars_layer(input_val)
    return tf.strings.reduce_join(characters, axis=-1)

In [12]:
#One big string to sample from
# Every quest is right-padded with spaces to exactly max_text_length characters.
# The stream is later cut into chunks of seq_len + 1 == max_text_length
# characters, so this is the width that keeps each chunk aligned with exactly
# one quest. (Padding to max_text_length + 5 made every chunk after the first
# drift by 5 characters.)
padded_quests = []
for i in range(len(df)):
    title = df['Quest Title'].iloc[i]
    desc = df['Quest Description'].iloc[i]
    # Skip rows with a missing (non-string) title or description.
    if not (isinstance(title, str) and isinstance(desc, str)):
        continue
    new_text = 'Title: '+title+'\nDescription: '+desc
    padded_quests.append(new_text.ljust(max_text_length))
# Join once at the end instead of growing the string inside the loop.
padded_text = ''.join(padded_quests)

In [13]:
#Model makes predictions character by character, shift each sample by one to train the model
def split_input_target(seq):
    """Return the (input, target) pair for next-character training.

    Args:
        seq: any sliceable sequence.

    Returns:
        A tuple `(seq[:-1], seq[1:])`: the sequence without its last element
        and the sequence without its first element.
    """
    return seq[:-1], seq[1:]

In [14]:
#one quest = one sample
# seq_len is the length of one model input. The character stream is cut into
# chunks of seq_len + 1 (= max_text_length) characters, and split_input_target
# turns each chunk into an input and a one-step-shifted target of seq_len each.
seq_len = max_text_length-1

In [15]:
# Evaluates as (len(padded_text) // seq_len) + 1.
# NOTE(review): per_epoch is not referenced again in the visible notebook —
# confirm whether it is still needed.
per_epoch = len(padded_text) // seq_len+1

In [16]:
# Encode the whole padded corpus as character IDs, then expose it as a dataset
# that yields one ID at a time.
ids = text_to_id(padded_text)
dataset = tf.data.Dataset.from_tensor_slices(ids)

In [17]:
# Group the ID stream into chunks of seq_len + 1 IDs; a shorter trailing chunk
# is dropped so every chunk has the same length.
sequences = dataset.batch(seq_len + 1, drop_remainder=True)

In [18]:
#example of a quest as it is fed into the model. Note the padding
for seq in sequences.take(1):
    print(id_to_text(seq))

tf.Tensor(b"Title: Sharptalon's Claw\nDescription: The mighty hippogryph Sharptalon has been slain, with the claw of the felled beast serving as a testament to your victory.\n\nSenani Thunderheart at the Silverwind Refuge will no doubt be interested in seeing this trophy as proof of your deeds.                                                                                                                                                                                                                                                                                                           ", shape=(), dtype=string)


In [19]:
# Turn every chunk into an (input, target) pair shifted by one character.
# Note: this rebinds `dataset`, which previously held the per-ID stream.
dataset = sequences.map(split_input_target)

In [20]:
dataset

<_MapDataset element_spec=(TensorSpec(shape=(578,), dtype=tf.int64, name=None), TensorSpec(shape=(578,), dtype=tf.int64, name=None))>

In [21]:
BATCH_SIZE = 64
BUFFER_SIZE = 10000

# Shuffle the (input, target) pairs with a buffer of BUFFER_SIZE elements, keep
# only full batches of BATCH_SIZE, and let tf.data pick the prefetch depth.
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
dataset = dataset.prefetch(tf.data.AUTOTUNE)

dataset

<_PrefetchDataset element_spec=(TensorSpec(shape=(64, 578), dtype=tf.int64, name=None), TensorSpec(shape=(64, 578), dtype=tf.int64, name=None))>

In [22]:
# Length of the vocabulary in chars
# NOTE(review): the model below is built with
# len(chars_to_id_layer.get_vocabulary()) rather than this value (the recorded
# outputs show 92 vs. 91), so vocab_size appears to be unused — confirm.
vocab_size = len(vocab)

# The embedding dimension
embedding_dim = 256

# Number of RNN units
rnn_units = 1024

In [23]:
#simple model structure, we only add three layers
class MyModel(tf.keras.Model):
    """Character-level language model: Embedding -> GRU -> Dense.

    Args:
        vocab_size: number of distinct token IDs; used as the embedding input
            size and as the number of output logits.
        embedding_dim: size of each embedding vector.
        rnn_units: number of GRU units.
    """

    def __init__(self, vocab_size, embedding_dim, rnn_units):
        # super() already binds the instance; passing `self` again hands the
        # Keras base initialiser a stray positional argument, which is at best
        # ignored and may be rejected by newer Keras versions.
        super().__init__()
        #embedding layer
        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        #GRU layer
        self.gru = tf.keras.layers.GRU(
            rnn_units, return_sequences=True, return_state=True
        )
        #Dense layer (no activation: the outputs are raw logits)
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, inputs, states=None, return_state=False, training=False):
        """Run the model on a batch of ID sequences.

        Args:
            inputs: batch of token-ID sequences.
            states: GRU state to continue from, or None to start from the
                layer's initial state.
            return_state: when True, also return the final GRU state.
            training: forwarded to every layer.

        Returns:
            The logits for every position, or `(logits, states)` when
            `return_state` is True.
        """
        x = self.embedding(inputs, training=training)
        #Use the previous state if one was passed in, otherwise start from the initial state
        if states is None:
            states = self.gru.get_initial_state(x)
        x, states = self.gru(x, initial_state=states, training=training)
        x = self.dense(x, training=training)

        if return_state:
            return x, states
        return x

In [24]:
# Size the model from the lookup layer's vocabulary rather than from
# `vocab_size`, so the output layer has one logit per ID the layer can emit.
model = MyModel(
    vocab_size=len(chars_to_id_layer.get_vocabulary()),
    embedding_dim=embedding_dim,
    rnn_units=rnn_units,
)

In [25]:
for input_example_batch, target_example_batch in dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(
        example_batch_predictions.shape,
        "# (batch_size, sequence_length, vocab_size)",
    )

(64, 578, 92) # (batch_size, sequence_length, vocab_size)


In [26]:
model.summary()

Model: "my_model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       multiple                  23552     
                                                                 
 gru (GRU)                   multiple                  3938304   
                                                                 
 dense (Dense)               multiple                  94300     
                                                                 
Total params: 4056156 (15.47 MB)
Trainable params: 4056156 (15.47 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [27]:
# from_logits=True because the model's final Dense layer has no activation and
# therefore outputs raw logits; targets are integer character IDs.
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [28]:
# Configure training with the Adam optimizer and the loss defined above.
model.compile(optimizer="adam", loss=loss)

In [29]:
# Directory where the checkpoints will be saved
checkpoint_dir = "./training_checkpoints"
# Name of the checkpoint files
# The "{epoch}" placeholder is left in the path for the callback to fill in.
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

# save_weights_only=True: checkpoints store the weights only, not the full model.
checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix, save_weights_only=True
)

In [None]:
# Number of passes over the dataset used by model.fit below.
EPOCHS = 30

##### Training the model. Once training is done, we save the weights so the model can be reloaded quickly without retraining.

In [None]:
# Train for EPOCHS epochs; checkpoint_callback writes weight checkpoints along
# the way (presumably once per epoch, the callback's default — confirm).
history = model.fit(dataset, epochs=EPOCHS, callbacks=[checkpoint_callback])

In [None]:
# Save the trained weights; the commented-out load_weights call in the next
# cell reads the same file to skip retraining.
model.save_weights('desctitle_weights.h5')

In [30]:
#model.load_weights('desctitle_weights.h5')

In [31]:
class OneStep(tf.keras.Model):
    """Wraps a trained model so text can be sampled one character at a time.

    Args:
        model: model called with `inputs`, `states` and `return_state=True`,
            returning `(logits, states)`.
        id_to_chars_layer: lookup layer mapping IDs to characters.
        chars_to_id_layer: lookup layer mapping characters to IDs.
        temperature: divisor applied to the logits before sampling.
    """

    def __init__(self, model, id_to_chars_layer, chars_to_id_layer, temperature=0.3):
        super().__init__()
        self.temperature = temperature
        self.model = model
        self.chars_from_ids = id_to_chars_layer
        self.ids_from_chars = chars_to_id_layer

        # Additive logit mask that keeps "[UNK]" from ever being sampled:
        # -inf at the "[UNK]" index, 0 at every other vocabulary position.
        unk_ids = self.ids_from_chars(["[UNK]"])[:, None]
        vocabulary_length = len(chars_to_id_layer.get_vocabulary())
        unk_mask = tf.SparseTensor(
            indices=unk_ids,
            values=[-float("inf")] * len(unk_ids),
            dense_shape=[vocabulary_length],
        )
        self.prediction_mask = tf.sparse.to_dense(unk_mask)

    @tf.function
    def generate_one_step(self, inputs, states=None):
        """Sample the next character for every string in `inputs`.

        Args:
            inputs: batch of UTF-8 strings to condition on.
            states: model state from the previous step, or None.

        Returns:
            `(predicted_chars, states)`: the sampled characters and the model
            state to pass into the next call.
        """
        # Strings -> characters -> dense tensor of IDs.
        char_tensor = tf.strings.unicode_split(inputs, "UTF-8")
        id_tensor = self.ids_from_chars(char_tensor).to_tensor()

        # all_logits is indexed as [batch, char, next_char_logits].
        all_logits, states = self.model(
            inputs=id_tensor, states=states, return_state=True
        )

        # Keep the last position only, apply the temperature, then add the
        # mask so "[UNK]" cannot be drawn.
        last_logits = all_logits[:, -1, :] / self.temperature
        last_logits = last_logits + self.prediction_mask

        # Draw one ID per batch row and drop the sample axis.
        sampled_ids = tf.random.categorical(last_logits, num_samples=1)
        sampled_ids = tf.squeeze(sampled_ids, axis=-1)

        # IDs -> characters, returned together with the updated state.
        return self.chars_from_ids(sampled_ids), states

In [32]:
# Single-step sampler around the model, using OneStep's default temperature (0.3).
one_step_model = OneStep(model, id_to_chars_layer, chars_to_id_layer)

In [33]:
# Grammar checker for US English; generate_quests and quest_examples count the
# matches returned by text_eval.check().
# NOTE(review): the class name suggests this calls a remote public API, so it
# presumably needs network access — confirm.
text_eval = language_tool_python.LanguageToolPublicAPI('en-US')

In [34]:
#function for generating quests
def generate_quests(rep = 5, prompt = 'Title: '):
    """Interactively generate quests and report their average text quality.

    The user is asked whether to keep the defaults. When they decline, they
    are prompted for the number of outputs, a required title and an optional
    description start, which together form the generation prompt.

    Each generated quest is printed with its run time; afterwards the average
    number of grammar matches and the average Flesch reading-ease score over
    all generated quests are printed.

    Args:
        rep: number of quests to generate when the defaults are kept.
        prompt: text generation starts from when the defaults are kept.

    Returns:
        None. All results are printed.
    """
    #By default get no prompt and generate 5 responses.
    decision = ''
    while decision.lower() not in ['y', 'n']:
        decision = input('Use default settings? Y/N\nDefault Settings:\nOutputs: 5\nPrompt: None')

    if decision.lower() == 'n': #option for custom settings
        # Keep asking until a positive integer is given.
        while True:
            answer = input('Please give the desired number of outputs (default=5)')
            try:
                requested = int(answer)
            except ValueError:
                continue
            if requested > 0:
                rep = requested
                break

        # A title must be provided. The check is on the title itself; testing
        # the (already validated) output count here let an empty title through.
        title_decision = ''
        while not title_decision:
            title_decision = input('Please give the desired Title: (default=None, required)')

        # A description prompt is optional; an empty answer is simply ''.
        description_decision = input('Please give the desired Description: (default = None, optional)')
        prompt = 'Title: '+title_decision+'\nDescription: '+description_decision

    total_mistakes = 0      #keep track of grammar mistakes
    total_reading_score = 0 #keep track of reading ease score
    for _ in range(rep):
        start = time.time() #get time taken to generate each response
        states = None
        next_char = tf.constant([prompt])
        result = [next_char]
        # Feed each sampled character back in, carrying the model state along.
        for _step in range(max_text_length):
            next_char, states = one_step_model.generate_one_step(
                next_char, states=states
            )
            result.append(next_char)
        result = tf.strings.join(result)
        end = time.time()

        # Decode once and reuse the text for printing and for both metrics.
        generated_text = result[0].numpy().decode("utf-8")
        print(generated_text, "\n\n" + "_" * 80)
        total_reading_score = total_reading_score + textstat.flesch_reading_ease(generated_text)
        total_mistakes = total_mistakes + len(text_eval.check(generated_text))
        print("\nRun time:", end - start)
    print(f'Average grammar mistakes: {total_mistakes/rep}')
    print(f'Average reading score: {total_reading_score/rep}')


In [39]:
#function for sampling random original quest
def quest_examples():
    """Print randomly chosen original quests and their average text quality.

    The user is asked how many examples to show. Distinct rows that have both
    a string title and a string description are sampled at random, printed
    with a separator line, and scored with the same grammar and reading-ease
    measures that generate_quests uses, so the two can be compared.

    If fewer usable rows exist than were requested, all usable rows are shown
    and the averages are taken over the number actually shown.

    Returns:
        None. All results are printed.
    """
    #Get user input for number of responses; repeat until it is a positive integer
    while True:
        answer = input('How many examples would you like?')
        try:
            ex_count = int(answer)
        except ValueError:
            continue
        if ex_count > 0:
            break

    # Collect the positions of usable rows up front. Drawing positions with
    # random.randint(0, len(df)) could return len(df), which is out of range,
    # and retrying until success never ends when too few usable rows exist.
    titles = df['Quest Title']
    descriptions = df['Quest Description']
    valid_indices = [
        i for i in range(len(df))
        if isinstance(titles.iloc[i], str) and isinstance(descriptions.iloc[i], str)
    ]
    if not valid_indices:
        print('No quests with both a title and a description are available.')
        return

    # random.sample draws without replacement, so no quest is shown twice.
    sample_size = min(ex_count, len(valid_indices))
    chosen_indices = random.sample(valid_indices, sample_size)
    examples = [
        'Title: '+titles.iloc[index]+'\nDescription: '+descriptions.iloc[index]
        for index in chosen_indices
    ]

    total_mistakes = 0  #get the grammar mistakes and reading score to compare with generated quests
    total_reading_score = 0
    for x in examples:
        # The separator is only printed; it is not part of the scored text, so
        # the scores are comparable with those reported by generate_quests.
        print(x+'\n\n' + '_' * 80)
        total_reading_score = total_reading_score + textstat.flesch_reading_ease(x)
        total_mistakes = total_mistakes + len(text_eval.check(x))
    print(f'Average grammar mistakes: {total_mistakes/sample_size}')
    print(f'Average reading score: {total_reading_score/sample_size}')



In [40]:
generate_quests()

Title: The First Ones Fell Kells
Description: The demons seem to be made of a new protoform construct. Present it to an appropriate protoform forge and it may instantiate one of these creatures for you.>                                                                                                                                                                                                                                                                                                                                                                                                

________________________________________________________________________________

Run time: 0.6239538192749023
Title: The Final Forces
Description: I am afraid there were some sort of stronger ships that we were able to help the Horde in the way. We have to stop them before they become the magic of their village.

The power of the arcane magic is strong enough to take us to the south. They will need to be 

In [41]:
quest_examples()

Title: The Path of Anguish
Description: Doom Lord Kazzak commands the Burning Legion in Outland. While we can't get to Kazzak - yet - we can and must get to his lieutenants that command Legion troops from the ground.

They have stationed dreadlords at the Path of Anguish, east of here. They are the ones responsible for these infernal storms that rain down upon our heads. To take them and their minions out would be a great set back to the Burning Legion.

________________________________________________________________________________
Average grammar mistakes: 3.0
Average reading score: 80.21
