In [52]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import array
from pickle import dump
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import LambdaCallback, EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.optimizers import RMSprop
from keras.utils import to_categorical
import tensorflow as tf
import string

In [53]:
# load text to memory
def load_doc(filename):
    """Read a whole text file into memory and return it as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read; it is opened in text mode ('r').

    Returns
    -------
    str
        The entire contents of the file.
    """
    # the with-block closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()

In [54]:

# Heuristics for turning a document into clean word tokens:
# - replace double dashes ('--') with a white space
# - split the text into words on white space
# - remove punctuation from each word
# - drop every word that is not purely alphabetic
# - reduce all remaining words to lowercase



def clean_doc(doc):
    """Turn raw text into a list of lowercase, purely alphabetic word tokens.

    '--' is treated as a word separator, the text is split on white space,
    punctuation characters are stripped out of every token, and any token
    that is not alphabetic afterwards is discarded.
    """
    # translation table that deletes every character in string.punctuation
    strip_punct = str.maketrans('', '', string.punctuation)
    cleaned = []
    for raw in doc.replace('--', ' ').split():
        bare = raw.translate(strip_punct)
        # keep alphabetic tokens only (this also drops tokens left empty)
        if bare.isalpha():
            cleaned.append(bare.lower())
    return cleaned


In [55]:
# Build a TensorFlow configuration object intended for GPU training.
config = tf.ConfigProto()
# Only allocate GPU memory as the runtime needs it: little at first, growing on demand.
# NOTE(review): `config` is not passed to a session in the cells shown here — confirm it is applied.
config.gpu_options.allow_growth = True


In [None]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    """Write `lines` to `filename`, one entry per line.

    Parameters
    ----------
    lines : iterable of str
        Entries to write; they are joined with '\\n' (no trailing newline).
    filename : str
        Path of the file to create or overwrite (opened with mode 'w').
    """
    data = '\n'.join(lines)
    # the with-block flushes and closes the file even when write() raises
    with open(filename, 'w') as file:
        file.write(data)


In [None]:
# Load the raw text into memory and preview its first 200 characters.
# NOTE(review): in_filename is an absolute path on one machine; adjust it before re-running elsewhere.
in_filename = '/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/canterburytales.txt'
doc = load_doc(in_filename)
print(doc[:200])

In [32]:
# Clean the document into word tokens, preview the first 200 of them,
# and report the corpus size and the number of distinct words.
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))


['the', 'project', 'gutenberg', 'etext', 'of', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'by', 'geoffrey', 'chaucer', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'of', 'geoffrey', 'chaucer', 'for', 'popular', 'perusal', 'by', 'd', 'purves', 'contents', 'preface', 'life', 'of', 'chaucer', 'the', 'canterbury', 'tales', 'the', 'general', 'prologue', 'the', 'knights', 'tale', 'the', 'millers', 'tale', 'the', 'reeves', 'tale', 'the', 'cooks', 'gutenberg', 'etext', 'of', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'by', 'geoffrey', 'chaucer', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'of', 'geoffrey', 'chaucer', 'for', 'popular', 'perusal', 'by', 'd', 'purves', 'contents', 'preface', 'life', 'of', 'chaucer', 'the', 'canterbury', 'tales', 'the', 'general', 'prologue', 'the', 'knights', 'tale', 'the', 'millers', 'tale', 'the', 'reeves', 'tale', 'the', 'cooks', 'tale', 'the', 'of', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'by', 'geof

In [66]:
# Display the distinct tokens (the whole set is rendered, so the output is long).
(set(tokens))

{'carried',
 'bodekins',
 'carthage',
 'jollily',
 'galley',
 'circumstances',
 'avicenna',
 'weenest',
 'guessing',
 'authors',
 'unthank',
 'physiologus',
 'nation',
 'pal',
 'snubnosed',
 'renounce',
 'disposd',
 'readily',
 'blunder',
 'ordered',
 'arras',
 'debased',
 'whip',
 'abandoned',
 'tercel',
 'abie',
 'dishonesty',
 'wore',
 'appaird',
 'spiced',
 'quench',
 'amulius',
 'rey',
 'countenance',
 'katharos',
 'gauren',
 'princples',
 'second',
 'duchess',
 'skilful',
 'loseth',
 'tended',
 'shaft',
 'making',
 'yshewd',
 'lade',
 'invisible',
 'magistrate',
 'proudest',
 'deliverly',
 'england',
 'caitiffs',
 'richness',
 'pore',
 'tonight',
 'troubadours',
 'signe',
 'zech',
 'waileth',
 'fate',
 'visage',
 'se',
 'stot',
 'showed',
 'gnawing',
 'ourselves',
 'forlete',
 'tooles',
 'ronian',
 'holyday',
 'paraventure',
 'present',
 'navarre',
 'leaveless',
 'bookes',
 'orpheus',
 'seedfowl',
 'pound',
 'sinnes',
 'stark',
 'mandements',
 'merite',
 'waist',
 'gnof',
 'defen

In [33]:
def map_book(tokens):
    """Count how often each word occurs in `tokens`.

    Commas and full stops are stripped from every element before it is
    counted. Returns a dict mapping word -> number of occurrences, or
    None when `tokens` is None.
    """
    if tokens is None:
        return None

    counts = {}
    for element in tokens:
        # strip commas and periods so 'word,' and 'word.' count as 'word'
        word = element.replace(",", "").replace(".", "")
        counts[word] = counts.get(word, 0) + 1
    return counts

In [35]:
# Build the word -> occurrence-count dictionary for the cleaned tokens and display it.
hash_of_words = map_book(tokens)
hash_of_words 

{'aback': 50,
 'eagles': 125,
 'cleopatra': 128,
 'sports': 76,
 'anxieties': 51,
 'attentive': 101,
 'doat': 77,
 'descended': 280,
 'ferth': 50,
 'cheekes': 51,
 'misty': 52,
 'amatory': 51,
 'fairly': 151,
 'accidentally': 51,
 'lind': 50,
 'hall': 1177,
 'release': 178,
 'weepeth': 152,
 'wailed': 127,
 'mile': 357,
 'affection': 230,
 'sick': 688,
 'sleepe': 206,
 'kin': 384,
 'novelty': 76,
 'tohewn': 77,
 'stripe': 52,
 'mead': 560,
 'one': 12743,
 'dancing': 282,
 'yblow': 103,
 'holde': 408,
 'swink': 303,
 'squames': 50,
 'lightness': 102,
 'maketh': 1146,
 'ballads': 102,
 'drien': 76,
 'holdeth': 203,
 'liggen': 50,
 'corpse': 232,
 'ytaught': 101,
 'melt': 104,
 'vita': 50,
 'record': 231,
 'contest': 152,
 'fancied': 102,
 'robin': 102,
 'marcia': 52,
 'faire': 1148,
 'desire': 2018,
 'ally': 255,
 'astart': 177,
 'chambermaid': 50,
 'trumpe': 50,
 'interposed': 51,
 'clothes': 865,
 'horror': 103,
 'commune': 305,
 'oliver': 179,
 'lesson': 355,
 'lavender': 78,
 'skin':

In [36]:
# Turn the word-count dictionary into a two-column DataFrame (Word, Count).
# An earlier attempt passed hash_of_words.items() directly; the version below
# wraps the items in list() before handing them to DataFrame.

dwords = pd.DataFrame(list(hash_of_words.items()), columns=['Word', 'Count'])
dwords

Unnamed: 0,Word,Count
0,aback,50
1,eagles,125
2,cleopatra,128
3,sports,76
4,anxieties,51
5,attentive,101
6,doat,77
7,descended,280
8,ferth,50
9,cheekes,51


In [37]:
# Create the array of words to delete: every word whose Count is below 2,
# i.e. words that appear only once in the corpus.
# Author's note: 8220 words appeared only once, so 46% of the training
# vocabulary consists of single-use words.
# NOTE(review): the recorded output of this cell is an empty array, which
# conflicts with the figures above — confirm against the current corpus.
deletedf=dwords[(dwords['Count'] < 2)]

delete_list=deletedf['Word'].values
delete_list

array([], dtype=object)

In [38]:
# Reload the raw document (same file and same 200-character preview as the earlier load cell).
# NOTE(review): in_filename is an absolute path on one machine; adjust it before re-running elsewhere.
in_filename = '/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/canterburytales.txt'
doc = load_doc(in_filename)
print(doc[:200])

The Project Gutenberg Etext of The Canterbury Tales and Other Poems
by Geoffrey Chaucer






                      THE CANTERBURY TALES
                         And other Poems
                      


In [39]:
# Clean the document again; this cell only re-tokenises the text.
# Filtering against delete_list is done separately, after this cell.
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['the', 'project', 'gutenberg', 'etext', 'of', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'by', 'geoffrey', 'chaucer', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'of', 'geoffrey', 'chaucer', 'edited', 'for', 'popular', 'perusal', 'by', 'd', 'laing', 'purves', 'contents', 'preface', 'life', 'of', 'chaucer', 'the', 'canterbury', 'tales', 'the', 'general', 'prologue', 'the', 'knights', 'tale', 'the', 'millers', 'tale', 'the', 'reeves', 'tale', 'the', 'cooks', 'tale', 'the', 'man', 'of', 'laws', 'tale', 'the', 'wife', 'of', 'baths', 'tale', 'the', 'friars', 'tale', 'the', 'sompnours', 'tale', 'the', 'clerks', 'tale', 'the', 'merchants', 'tale', 'the', 'squires', 'tale', 'the', 'franklins', 'tale', 'the', 'doctors', 'tale', 'the', 'pardoners', 'tale', 'the', 'shipmans', 'tale', 'the', 'prioresss', 'tale', 'chaucers', 'tale', 'of', 'sir', 'thopas', 'chaucers', 'tale', 'of', 'meliboeus', 'the', 'monks', 'tale', 'the', 'nuns', 'priests', 'tale', 'the', 'second', 'nuns',

In [40]:
# Remove the words on the delete list.
# Membership is tested against a set: `word not in delete_list` on the NumPy
# array would compare the token against every array element for each token.
delete_set = set(delete_list)
new_words = [word for word in tokens if word not in delete_set]

new_words

['the',
 'project',
 'gutenberg',
 'etext',
 'of',
 'the',
 'canterbury',
 'tales',
 'and',
 'other',
 'poems',
 'by',
 'geoffrey',
 'chaucer',
 'the',
 'canterbury',
 'tales',
 'and',
 'other',
 'poems',
 'of',
 'geoffrey',
 'chaucer',
 'edited',
 'for',
 'popular',
 'perusal',
 'by',
 'd',
 'laing',
 'purves',
 'contents',
 'preface',
 'life',
 'of',
 'chaucer',
 'the',
 'canterbury',
 'tales',
 'the',
 'general',
 'prologue',
 'the',
 'knights',
 'tale',
 'the',
 'millers',
 'tale',
 'the',
 'reeves',
 'tale',
 'the',
 'cooks',
 'tale',
 'the',
 'man',
 'of',
 'laws',
 'tale',
 'the',
 'wife',
 'of',
 'baths',
 'tale',
 'the',
 'friars',
 'tale',
 'the',
 'sompnours',
 'tale',
 'the',
 'clerks',
 'tale',
 'the',
 'merchants',
 'tale',
 'the',
 'squires',
 'tale',
 'the',
 'franklins',
 'tale',
 'the',
 'doctors',
 'tale',
 'the',
 'pardoners',
 'tale',
 'the',
 'shipmans',
 'tale',
 'the',
 'prioresss',
 'tale',
 'chaucers',
 'tale',
 'of',
 'sir',
 'thopas',
 'chaucers',
 'tale',
 

In [41]:
# Author's note: filtering takes the corpus from 264748 total tokens and
# 17712 unique tokens down to 256528 tokens with only 9492 unique tokens.
# NOTE(review): the recorded delete_list output is empty and the recorded model
# summary has a 17713-wide output layer, so the post-filter figures may be stale — confirm.
print(new_words[:200])
print('Total Tokens: %d' % len(new_words))
print('Unique Tokens: %d' % len(set(new_words)))

['the', 'project', 'gutenberg', 'etext', 'of', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'by', 'geoffrey', 'chaucer', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'of', 'geoffrey', 'chaucer', 'edited', 'for', 'popular', 'perusal', 'by', 'd', 'laing', 'purves', 'contents', 'preface', 'life', 'of', 'chaucer', 'the', 'canterbury', 'tales', 'the', 'general', 'prologue', 'the', 'knights', 'tale', 'the', 'millers', 'tale', 'the', 'reeves', 'tale', 'the', 'cooks', 'tale', 'the', 'man', 'of', 'laws', 'tale', 'the', 'wife', 'of', 'baths', 'tale', 'the', 'friars', 'tale', 'the', 'sompnours', 'tale', 'the', 'clerks', 'tale', 'the', 'merchants', 'tale', 'the', 'squires', 'tale', 'the', 'franklins', 'tale', 'the', 'doctors', 'tale', 'the', 'pardoners', 'tale', 'the', 'shipmans', 'tale', 'the', 'prioresss', 'tale', 'chaucers', 'tale', 'of', 'sir', 'thopas', 'chaucers', 'tale', 'of', 'meliboeus', 'the', 'monks', 'tale', 'the', 'nuns', 'priests', 'tale', 'the', 'second', 'nuns',

In [72]:
#delete all words on delete list from corpus and 

#remove word Chapter
#fin = open("/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/canterburytales.txt", encoding='utf-8')
#fout = open("/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/Code/canterburytales_Clean.txt", "w+", encoding='utf-8')

#for line in fin:
#    for word in delete_list:
 #       line = line.replace(word, "")
 #   fout.write(line)
#fin.close()
#fout.close()


In [43]:
# Organize the filtered words into overlapping fixed-length sequences.
# Each sequence holds 20 input words plus 1 target word.
length = 20 + 1
# slide the window forward 2 words at a time
step = 2
sequences = list()
# `i` is the exclusive end of the window, so the range must run up to and
# including len(new_words); otherwise the window ending on the last word
# is never produced.
for i in range(length, len(new_words) + 1, step):
    # select sequence of new_words (previously tokens)
    seq = new_words[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 132364


In [44]:
# Save the sequences to a file in the current working directory,
# one space-joined sequence per line.
out_filename = 'canterburytales_sequences.txt'

save_doc(sequences, out_filename)

In [56]:
def load_doc(filename):
    """Read a whole text file into memory and return it as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read; it is opened in text mode ('r').

    Returns
    -------
    str
        The entire contents of the file.
    """
    # the with-block closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()
 
# Load the saved sequences file and split it into one sequence string per line.
in_filename = 'canterburytales_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [57]:
# The model learns a distributed representation of words:
# - words with similar meaning get a similar representation
# - the representation is learned simultaneously with the model
# The model predicts the probability of the next word from the preceding context words.
# NOTE(review): the original note said "last 100 words", but the sequences are
# built with 20 input words plus one target word — confirm.
# An embedding layer provides the representation; an LSTM predicts words from context.

# Integer-encode the sequences of words:
# map each word to a unique integer and encode the input sequences.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# Vocabulary size:
# add one because the vocabulary indices begin at 1
# while array indexing is zero-offset.
vocab_size = len(tokenizer.word_index) + 1

In [58]:
# Inspect the word -> integer index mapping held by the tokenizer.
tokenizer.word_index

{'candlestick': 16242,
 'aback': 8955,
 'cleopatra': 4328,
 'sports': 5441,
 'anxieties': 9286,
 'attentive': 5091,
 'fremmed': 11069,
 'library': 9984,
 'cheekes': 7599,
 'humanity': 14808,
 'asleepe': 12796,
 'fairly': 3591,
 'personage': 14150,
 'hall': 596,
 'mental': 11825,
 'trothe': 1687,
 'wailed': 3795,
 'garble': 12180,
 'twisted': 13223,
 'comforteth': 7633,
 'thymbra': 16623,
 'redeth': 17266,
 'kin': 1635,
 'novelty': 6265,
 'total': 7948,
 'mead': 1139,
 'blive': 2417,
 'origen': 13540,
 'yreft': 13922,
 'affirmeth': 15749,
 'corporeal': 12935,
 'holde': 1541,
 'retire': 13164,
 'swink': 1982,
 'squames': 7187,
 'lightness': 4716,
 'rampeth': 11554,
 'liggen': 7596,
 'bespread': 7242,
 'contest': 3580,
 'robin': 4707,
 'character': 1702,
 'ally': 2327,
 'chestnut': 14024,
 'thalmighty': 12521,
 'chambermaid': 9022,
 'clothes': 772,
 'vulgate': 10770,
 'horror': 4704,
 'commune': 1942,
 'yonnge': 11023,
 'yew': 4440,
 'sirs': 1272,
 'flying': 9414,
 'enforced': 12171,
 'br

In [59]:

# Separate the word encodings into X (input) and y (output) with array slicing:
# X is every column except the last, y is the last column of each sequence.
# NOTE(review): the 2-D slicing assumes every encoded sequence has the same length — confirm.
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
# convert the target word indices to categorical vectors with vocab_size classes
y = to_categorical(y, num_classes=vocab_size)
# number of input words per sample (second dimension of X)
seq_length = X.shape[1]

In [60]:

# Defining the model.
# Layer stack as built below:
#   Embedding (20 dimensions) -> one LSTM layer with 100 memory cells -> Dropout(0.3)
#   -> Dense(100, relu) -> Dense(100, no activation) -> Dense(vocab_size, softmax)
# The dense layers interpret the features extracted from the sequence.
# The output layer predicts the next word as a single vector the size of the
# vocabulary, with a probability for each word in the vocabulary; the softmax
# activation ensures the outputs behave like normalized probabilities.
model = Sequential()
# embedding size reduced from 50 to 20
model.add(Embedding(vocab_size, 20, input_length=seq_length))
model.add(LSTM(100))
# disabled alternative: a stacked first LSTM layer
#model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.3))
# disabled alternative: a second LSTM layer with its own dropout
#model.add(LSTM(100))
#model.add(Dropout(0.2))
model.add(Dense(100, activation='relu'))
model.add(Dense(100))
model.add(Dense(vocab_size, activation='softmax'))
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line after the table
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 20, 20)            354260    
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               48400     
_________________________________________________________________
dropout_4 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_10 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_11 (Dense)             (None, 100)               10100     
_________________________________________________________________
dense_12 (Dense)             (None, 17713)             1789013   
Total params: 2,211,873
Trainable params: 2,211,873
Non-trainable params: 0
_________________________________________________________________


In [61]:

# Compile the model: RMSprop optimizer with learning rate 0.007,
# categorical cross-entropy loss, and accuracy reported as a metric.
optimizer = RMSprop(lr=0.007)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


In [None]:
# Define the checkpoint so the model can be loaded in future.
filepath = "weights.hdf5"
# Monitor val_loss (the same quantity early stopping watches) so the file on
# disk holds the weights with the lowest validation loss; monitoring the
# training loss would keep overwriting it with the most-fitted weights instead.
checkpoint = ModelCheckpoint(filepath,
                             monitor='val_loss',
                             verbose=1,
                             save_best_only=True,
                             mode='min')
# Stop training once val_loss has not improved for 2 consecutive epochs.
earlystop = EarlyStopping(monitor='val_loss', patience=2,
                              verbose=2)

# Fit the model on the GPU, holding out 10% of the samples for validation.
# Batch size is a particularly important hyperparameter which I intend to
# play around with in future posts.
with tf.device('/gpu:0'):

    history=model.fit(X, y,validation_split=0.1,batch_size=60,epochs=60,verbose=1,callbacks=[ checkpoint,earlystop])


Train on 119127 samples, validate on 13237 samples
Epoch 1/60

In [None]:
# List the metric names recorded in the training history.
print(history.history.keys())

In [None]:
# Show the recorded 'val_loss' values from the training history.
history.history['val_loss']

In [None]:
# Summarize the training history: plot 'loss' and 'val_loss' per epoch.
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
# NOTE(review): the second curve is 'val_loss', so the 'test' legend entry
# refers to validation data rather than a separate test set — confirm the label.
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Save the tokenizer.
# The with-block closes the pickle file once the dump has been written,
# instead of leaving the handle open.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    dump(tokenizer, tokenizer_file)

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
 
# load doc into memory
def load_doc(filename):
    """Read a whole text file into memory and return it as one string.

    Parameters
    ----------
    filename : str
        Path of the file to read; it is opened in text mode ('r').

    Returns
    -------
    str
        The entire contents of the file.
    """
    # the with-block closes the file even when read() raises
    with open(filename, 'r') as file:
        return file.read()
 


In [None]:
# NOTE(review): `n_words` appears only as a parameter of generate_seq in the cells
# shown; no notebook-level assignment is visible, so this cell may raise NameError — confirm.
n_words

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate `n_words` words by repeatedly predicting the next word.

    Parameters
    ----------
    model :
        Trained model; must provide predict_classes(encoded, verbose=0).
    tokenizer :
        Fitted tokenizer; must provide texts_to_sequences() and word_index.
    seq_length : int
        Number of input words the model receives; the encoded text is
        truncated/padded to this length.
    seed_text : str
        Text used as the initial context.
    n_words : int
        Number of words to generate.

    Returns
    -------
    str
        The generated words joined by single spaces. A predicted index
        with no entry in tokenizer.word_index contributes an empty string.
    """
    # build the index -> word lookup once instead of scanning word_index
    # for every generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integers
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length, dropping the oldest words first
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict the class (word index) of the next word
        yhat = model.predict_classes(encoded, verbose=0)
        # a single row is passed in, so the first element is the prediction
        # (assumes predict_classes returns one entry per input row — TODO confirm)
        out_word = index_to_word.get(int(yhat[0]), '')
        # append to input so the next step sees the generated word
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
 
# load cleaned text sequences
# NOTE(review): in_filename is an absolute path on one machine; adjust it before re-running elsewhere.
in_filename = '/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/Code/canterburytales_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
# every line holds the input words plus one target word
seq_length = len(lines[0].split()) - 1

# load the model
model = load_model('weights.hdf5')

# load the tokenizer
# NOTE: unpickling executes code stored in the file; only load a tokenizer.pkl you created yourself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = load(tokenizer_file)

# select a seed text
# randint includes both endpoints, so the upper bound must be the last valid
# index; randint(0, len(lines)) could return len(lines) and raise IndexError
seed_text = lines[randint(0, len(lines) - 1)]
print(seed_text + '\n')

# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 3)
print(generated)

In [None]:
# Display the seed text selected for generation.
seed_text