In [15]:

def load_doc(filename):
    file = open(filename, 'r',encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [16]:
in_filename = 'adam.txt'
doc = load_doc(in_filename)
print(doc[:200])

﻿Chapter I
The Workshop


With a single drop of ink for a mirror, the Egyptian sorcerer
undertakes to reveal to any chance comer far-reaching visions of the
past. This is what I undertake to do for yo


In [17]:
import string

def clean_doc(doc):
    doc = doc.replace('--', ' ')
    tokens = doc.split()
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
    tokens = [word for word in tokens if word.isalpha()]
    tokens = [word.lower() for word in tokens]
    return tokens

In [18]:
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['i', 'the', 'workshop', 'with', 'a', 'single', 'drop', 'of', 'ink', 'for', 'a', 'mirror', 'the', 'egyptian', 'sorcerer', 'undertakes', 'to', 'reveal', 'to', 'any', 'chance', 'comer', 'farreaching', 'visions', 'of', 'the', 'past', 'this', 'is', 'what', 'i', 'undertake', 'to', 'do', 'for', 'you', 'reader', 'with', 'this', 'drop', 'of', 'ink', 'at', 'the', 'end', 'of', 'my', 'pen', 'i', 'will', 'show', 'you', 'the', 'roomy', 'workshop', 'of', 'mr', 'jonathan', 'burge', 'carpenter', 'and', 'builder', 'in', 'the', 'village', 'of', 'hayslope', 'as', 'it', 'appeared', 'on', 'the', 'eighteenth', 'of', 'june', 'in', 'the', 'year', 'of', 'our', 'lord', 'the', 'afternoon', 'sun', 'was', 'warm', 'on', 'the', 'five', 'workmen', 'there', 'busy', 'upon', 'doors', 'and', 'windowframes', 'and', 'wainscoting', 'a', 'scent', 'of', 'pinewood', 'from', 'a', 'tentlike', 'pile', 'of', 'planks', 'outside', 'the', 'open', 'door', 'mingled', 'itself', 'with', 'the', 'scent', 'of', 'the', 'elderbushes', 'which'

In [19]:
length = 50 + 1
sequences = list()
for i in range(length, len(tokens)):
    seq = tokens[i-length:i]
    line = ' '.join(seq)
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 198685


In [22]:
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w', encoding='utf-8')
    file.write(data)
    file.close()

In [23]:
out_filename = 'adam_sequences.txt'
save_doc(sequences, out_filename)

# Training the model

In [24]:
in_filename = 'adam_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [25]:
from keras.preprocessing.text import Tokenizer

# Encdiing the textual data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

In [26]:
vocab_size = len(tokenizer.word_index) + 1

In [27]:
import numpy as np
from keras.utils import to_categorical

sequences = np.array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [28]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


model = keras.Sequential()
model.add(layers.Embedding(vocab_size, 50, input_length=seq_length))
model.add(layers.LSTM(100, return_sequences=True))
model.add(layers.LSTM(100))
model.add(layers.Dense(100, activation='relu'))
model.add(layers.Dense(vocab_size, activation='softmax'))
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            571700    
                                                                 
 lstm (LSTM)                 (None, 50, 100)           60400     
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 100)               10100     
                                                                 
 dense_1 (Dense)             (None, 11434)             1154834   
                                                                 
Total params: 1,877,434
Trainable params: 1,877,434
Non-trainable params: 0
_________________________________________________________________
None


In [34]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, batch_size=128, epochs=50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x22c8f8f1a30>

In [35]:
from numpy import array
from pickle import dump
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding

model.save('model_adam2.h5')
dump(tokenizer, open('tokenizer_adam2.pkl', 'wb'))