In [1]:
import string
import re
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils.vis_utils import plot_model
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from random import randint
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
# load doc into memory
def load_doc(filename):
    file = open(filename, 'r', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [3]:
# turn a doc into clean tokens
def clean_doc(doc):
    tokens = doc.split()
    re_punc = re.compile('[%s]' % re.escape(string.punctuation))
    tokens = [re_punc.sub('', w) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [4]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()

In [5]:
# load document
filename = 'Divine Comedy rough.txt'
doc = load_doc(filename)
print(doc[:200])

Midway upon the journey of our life
I found myself within a forest dark,
For the straightforward pathway had been lost.
Ah me! how hard a thing it is to say
What was this forest savage, rough, and ste


In [6]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['midway', 'upon', 'the', 'journey', 'of', 'our', 'life', 'i', 'found', 'myself', 'within', 'a', 'forest', 'dark', 'for', 'the', 'straightforward', 'pathway', 'had', 'been', 'lost', 'ah', 'me', 'how', 'hard', 'a', 'thing', 'it', 'is', 'to', 'say', 'what', 'was', 'this', 'forest', 'savage', 'rough', 'and', 'stern', 'which', 'in', 'the', 'very', 'thought', 'renews', 'the', 'fear', 'so', 'bitter', 'is', 'it', 'death', 'is', 'little', 'more', 'but', 'of', 'the', 'good', 'to', 'treat', 'which', 'there', 'i', 'found', 'speak', 'will', 'i', 'of', 'the', 'other', 'things', 'i', 'saw', 'there', 'i', 'cannot', 'well', 'repeat', 'how', 'there', 'i', 'entered', 'so', 'full', 'was', 'i', 'of', 'slumber', 'at', 'the', 'moment', 'in', 'which', 'i', 'had', 'abandoned', 'the', 'true', 'way', 'but', 'after', 'i', 'had', 'reached', 'a', 'foot', 'at', 'that', 'point', 'where', 'the', 'valley', 'terminated', 'which', 'had', 'with', 'consternation', 'pierced', 'my', 'heart', 'upward', 'i', 'looked', 'and', 

In [7]:
# organize into sequences of tokens
length = 50 + 1
sequences = list()

In [8]:
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)

In [9]:
print('Total Sequences: %d' % len(sequences))

Total Sequences: 20573


In [10]:
# save sequences to file
out_filename = 'divine_comedy_sequences.txt'
save_doc(sequences, out_filename)

In [11]:
# load
in_filename = 'divine_comedy_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [12]:
# integer encode sequences of words
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [13]:
# vocabulary size
vocab_size = len(tokenizer.word_index) + 1

In [14]:
# separate into input and output
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [24]:
# define the model
def define_model(vocab_size, seq_length):
    model = Sequential()
    model.add(Embedding(vocab_size, 50, input_length=seq_length))
    model.add(LSTM(100, return_sequences=True))
    model.add(LSTM(100))
    model.add(Dense(100, activation='relu'))
    model.add(Dense(vocab_size, activation='softmax'))
    
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    
    model.summary()
    #plot_model(model, to_file='model.png', show_shapes=True)
    return model

In [25]:
model = define_model(vocab_size, seq_length)































_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            177850    
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 100)           60400     
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               80400     
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 3557)              359257    
Total params: 688,007
Trainable params: 688,007
Non-trainable params: 0
_________________________________________________________________


In [15]:
model = load_model('model.h5');












Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where




In [None]:
model.fit(X, y, batch_size=128, epochs=10)

Epoch 1/10
Epoch 2/10
 3200/20573 [===>..........................] - ETA: 15:02 - loss: 2.0149 - acc: 0.5244

In [58]:
model.save('model.h5')

In [62]:
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')

thou trustest let not the amplitude deceive and unto him my guide criest thou too do not impede his journey fateordained it is so willed there where is power to do that which is willed and ask no further and now begin the dolesome notes to grow audible unto me now



In [63]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        yhat = model.predict_classes(encoded, verbose=0)
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break

        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)

In [64]:
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 50)
print(generated)

to the trunk you than wast i read to dismal mournings these think from foot and i may entered abandoned the sepulchres felt himself and was beneath my steps and has be to stone and cold even as the flowerets by nocturnal chill upon each approached for if i walked
