Using section 8.1 in Deep Learning with Python as a guide, implement an LSTM text generator. Train the model on the Enron corpus or a text source of your choice. Save the model and generate 20 examples to the results directory of dsc650/assignments/assignment11/.

In [42]:
from tensorflow import keras
import numpy as np
# Fetch the Nietzsche corpus; `get_file` hands back a local path we can open.
path = keras.utils.get_file(
    'nietzsche.txt',
    origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# Use a context manager so the handle is closed deterministically, and pin the
# encoding so the non-ASCII characters in the corpus decode the same way on
# every platform instead of depending on the locale default.
with open(path, encoding='utf-8') as corpus_file:
    # Lower-casing shrinks the character vocabulary the model has to learn.
    text = corpus_file.read().lower()
print('Corpus length:', len(text))

Corpus length: 600893


In [43]:
# Length of extracted character sequences
maxlen = 60
# We sample a new sequence every `step` characters
step = 3
# This holds our extracted sequences
sentences = []
# This holds the targets (the follow-up characters)
next_chars = []
# Slide a `maxlen`-wide window over the corpus; the character immediately
# after each window is the prediction target for that window.
for i in range(0, len(text) - maxlen, step):
    sentences.append(text[i: i + maxlen])
    next_chars.append(text[i + maxlen])
print('Number of sequences:', len(sentences))
# List of unique characters in the corpus
chars = sorted(set(text))
print('Unique characters:', len(chars))
# Dictionary mapping unique characters to their index in `chars`
# (built with enumerate: one pass instead of a linear `index` scan per char).
char_indices = {char: index for index, char in enumerate(chars)}
# Next, one-hot encode the characters into binary arrays.
print('Vectorization...')
# The builtin `bool` replaces the `np.bool` alias, which newer NumPy removed.
x = np.zeros((len(sentences), maxlen, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
for i, sentence in enumerate(sentences):
    for t, char in enumerate(sentence):
        x[i, t, char_indices[char]] = 1
    # The target is set once per sequence, not once per character.
    y[i, char_indices[next_chars[i]]] = 1

Number of sequences: 200278
Unique characters: 57
Vectorization...


In [44]:
# Network: one 128-unit LSTM reading (maxlen, len(chars)) one-hot windows,
# followed by a softmax layer with one output per character in `chars`.
model = keras.models.Sequential([
    keras.layers.LSTM(128, input_shape=(maxlen, len(chars))),
    keras.layers.Dense(len(chars), activation='softmax'),
])

In [45]:
# `learning_rate` is the supported keyword; the old `lr` alias is deprecated
# and rejected by newer Keras releases.
optimizer = keras.optimizers.RMSprop(learning_rate=0.01)
# The targets in `y` are one-hot rows, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer=optimizer)

In [47]:
def sample(preds, temperature=1.0):
    """Draw one index from `preds` after reweighting it by `temperature`.

    The distribution is reweighted as ``p ** (1 / temperature)`` and then
    renormalised, so temperatures below 1 sharpen it and temperatures above 1
    flatten it.

    Args:
        preds: 1-D array-like of non-negative probabilities.
        temperature: Positive reweighting factor.

    Returns:
        The sampled index (as returned by ``np.argmax``).

    Raises:
        ValueError: If `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError('temperature must be > 0, got %r' % (temperature,))
    preds = np.asarray(preds).astype('float64')
    # Floor zero probabilities at the smallest positive float so log() never
    # sees 0 (which emits a divide-by-zero RuntimeWarning); such entries still
    # end up with probability 0 after the exponentiation below.
    floor = np.finfo('float64').tiny
    logits = np.log(np.maximum(preds, floor)) / temperature
    # Shifting by the max leaves the normalised result unchanged but keeps
    # exp() from overflowing at small temperatures.
    logits = logits - np.max(logits)
    exp_preds = np.exp(logits)
    probs = exp_preds / np.sum(exp_preds)
    # A single multinomial trial yields a one-hot row; argmax recovers the index.
    probas = np.random.multinomial(1, probs, 1)
    return np.argmax(probas)

In [48]:
import random
import sys
# Train for 39 passes over (x, y), issuing one `fit` call per pass so that the
# pass number is printed before each one starts.
for epoch in range(1, 40):
    print('epoch', epoch)
    model.fit(x, y, epochs=1, batch_size=128)

epoch 1
epoch 2
epoch 3
epoch 4
epoch 5
epoch 6
epoch 7
epoch 8
epoch 9
epoch 10
epoch 11
epoch 12
epoch 13
epoch 14
epoch 15
epoch 16
epoch 17
epoch 18
epoch 19
epoch 20
epoch 21
epoch 22
epoch 23
epoch 24
epoch 25
epoch 26
epoch 27
epoch 28
epoch 29
epoch 30
epoch 31
epoch 32
epoch 33
epoch 34
epoch 35
epoch 36
epoch 37
epoch 38
epoch 39


In [49]:
# Persist the trained network; the next cell reloads it from this same path.
model.save(
    filepath="/Users/muduo/Documents/GitHub/dsc650/dsc650/assignments/assignment11/LSTMtextgenmodel"
)

2021-11-19 23:33:08.319174: W tensorflow/python/util/util.cc:348] Sets are not currently considered sequences, but this may change in the future, so consider avoiding using them.


INFO:tensorflow:Assets written to: /Users/muduo/Documents/GitHub/dsc650/dsc650/assignments/assignment11/LSTMtextgenmodel/assets


INFO:tensorflow:Assets written to: /Users/muduo/Documents/GitHub/dsc650/dsc650/assignments/assignment11/LSTMtextgenmodel/assets


In [50]:
from tensorflow import keras
# Reload the saved network from disk; the generation cell predicts with `load`.
load = keras.models.load_model(
    filepath="/Users/muduo/Documents/GitHub/dsc650/dsc650/assignments/assignment11/LSTMtextgenmodel"
)

In [51]:
import random
import sys
import os

# Generate 20 example files, each holding one random seed and a 200-character
# continuation of that seed at each sampling temperature.
# Create the output directory up front so open() cannot fail on a fresh checkout.
os.makedirs("results", exist_ok=True)
for n in range(1, 21):
    # Select a text seed at random
    start_index = random.randint(0, len(text) - maxlen - 1)
    seed_text = text[start_index: start_index + maxlen]
    # "w" rather than "a": re-running the cell replaces the example instead of
    # appending a second copy. UTF-8 because the corpus has non-ASCII characters.
    with open("results/"+str(n)+".txt", "w", encoding="utf-8") as a:
        a.write(f"--- Generating with seed: {seed_text}\n")
        for temperature in [0.5, 1.2]:
            # Restart the sliding window from the seed for every temperature.
            # Otherwise the second sample would continue from the tail of the
            # first one while being written out as if it followed the seed.
            generated_text = seed_text
            final_text = seed_text
            # We generate 200 characters
            for i in range(200):
                # One-hot encode the current window as a batch of size 1.
                sampled = np.zeros((1, maxlen, len(chars)))
                for t, char in enumerate(generated_text):
                    sampled[0, t, char_indices[char]] = 1.
                preds = load.predict(sampled, verbose=0)[0]
                next_index = sample(preds, temperature)
                next_char = chars[next_index]
                # Slide the window: drop the oldest character, append the new one.
                generated_text = generated_text[1:] + next_char
                final_text += next_char
            a.write(f"Temperature: {temperature} \n")
            a.write(final_text)
            a.write("\n"*2)
               

  preds = np.log(preds) / temperature
