In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy import array
from pickle import dump
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Embedding
from keras.callbacks import LambdaCallback, EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer
from keras.optimizers import RMSprop
from keras.utils import to_categorical
import tensorflow as tf
import string

Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
# load text to memory
def load_doc(filename): 
# open the file as read only
    file = open(filename, 'r')
# read all text
    text = file.read()
# close the file
    file.close()
    return text

In [3]:

##heuristics for splitting up words
#replace dashes with a white space
#otherwise words based on white space.
#reduce all words to lowercase and remove punctuation from words 
#remove all words that are not alphabetic 
# turn a doc into clean tokens



def clean_doc(doc):
# replace '--' with a space ' '
    doc = doc.replace('--', ' ')
# split into tokens by white space
    tokens = doc.split()
# remove punctuation from each token
    table = str.maketrans('', '', string.punctuation)
    tokens = [w.translate(table) for w in tokens]
# remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
# make lower case
    tokens = [word.lower() for word in tokens]
    return tokens


In [4]:
# use GPU to speed up training time through tensforflow
config = tf.ConfigProto()
#only allocate as much GPU memory based on runtime allocations, initially little but allows memory to be extended
config.gpu_options.allow_growth = True


In [5]:
# save tokens to file, one dialog per line
def save_doc(lines, filename):
    data = '\n'.join(lines)
    file = open(filename, 'w')
    file.write(data)
    file.close()


In [6]:
# load document
in_filename = '/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/Word_LSTM/GoneWithWind.txt'
doc = load_doc(in_filename)
print(doc[:200])



Title: Gone With The Wind
Author: Margaret Mitchell (1900-1949)
eBook No.:  0200161.txt
Character set encoding:     ASCII--7 bit
Date first posted: February 2002
Date most recently updated: December


In [7]:
# clean document
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))


['title', 'gone', 'with', 'the', 'wind', 'author', 'margaret', 'mitchell', 'ebook', 'no', 'character', 'set', 'encoding', 'ascii', 'bit', 'date', 'first', 'posted', 'february', 'date', 'most', 'recently', 'updated', 'december', 'this', 'ebook', 'was', 'produced', 'by', 'don', 'lainson', 'dlainsonsympaticoca', 'project', 'gutenberg', 'of', 'australia', 'ebooks', 'are', 'created', 'from', 'printed', 'editions', 'which', 'are', 'in', 'the', 'public', 'domain', 'in', 'australia', 'unless', 'a', 'copyright', 'notice', 'is', 'included', 'we', 'do', 'not', 'keep', 'any', 'ebooks', 'in', 'compliance', 'with', 'a', 'particular', 'paper', 'edition', 'title', 'gone', 'with', 'the', 'wind', 'author', 'margaret', 'mitchell', 'part', 'one', 'chapter', 'i', 'scarlett', 'ohara', 'was', 'not', 'beautiful', 'but', 'men', 'seldom', 'realized', 'it', 'when', 'caught', 'by', 'her', 'charm', 'as', 'the', 'tarleton', 'twins', 'were', 'in', 'her', 'face', 'were', 'too', 'sharply', 'blended', 'the', 'delicate'

In [8]:
def map_book(tokens):
    hash_map = {}

    if tokens is not None:
        for element in tokens:
            # Remove Punctuation
            word = element.replace(",","")
            word = word.replace(".","")

            # Word Exist?
            if word in hash_map:
                hash_map[word] = hash_map[word] + 1
            else:
                hash_map[word] = 1

        return hash_map
    else:
        return None

In [9]:
hash_of_words = map_book(tokens)
hash_of_words 

{'distinguish': 5,
 'clashed': 1,
 'forgetful': 4,
 'ignorant': 23,
 'halfkilled': 1,
 'aloud': 26,
 'neighbors': 79,
 'lowbacked': 3,
 'noonday': 1,
 'despised': 4,
 'stable': 13,
 'oven': 2,
 'consorter': 1,
 'creeters': 1,
 'organizing': 1,
 'woe': 1,
 'klan': 40,
 'interposed': 6,
 'addressing': 1,
 'crossroads': 2,
 'macassar': 1,
 'sultry': 1,
 'lavish': 2,
 'warrior': 1,
 'battered': 8,
 'veered': 1,
 'references': 1,
 'narrate': 1,
 'squeezing': 5,
 'bursting': 19,
 'gigglings': 1,
 'goodnatured': 3,
 'indignantly': 30,
 'lilac': 1,
 'anyhow': 2,
 'contained': 5,
 'jackknife': 1,
 'wanted': 270,
 'shipwrecked': 1,
 'hundred': 68,
 'convinced': 5,
 'calflike': 1,
 'keys': 1,
 'justice': 5,
 'struggles': 4,
 'acted': 33,
 'resurrection': 2,
 'fashionable': 9,
 'poorer': 1,
 'obstacle': 2,
 'gaping': 3,
 'poteen': 1,
 'venture': 3,
 'vehemence': 3,
 'cry': 120,
 'known': 113,
 'contemplate': 1,
 'pall': 3,
 'leasing': 3,
 'oak': 17,
 'seamed': 1,
 'waggins': 1,
 'sixtyfive': 3,
 '

In [10]:
#create dataframe of all words according to count

dwords = pd.DataFrame(list(hash_of_words.items()), columns=['Word', 'Count'])
dwords

Unnamed: 0,Word,Count
0,distinguish,5
1,clashed,1
2,forgetful,4
3,ignorant,23
4,halfkilled,1
5,aloud,26
6,neighbors,79
7,lowbacked,3
8,noonday,1
9,despised,4


In [11]:
#create delete words array
#words must appear at least 3 times
#there are 17353  unique words
#9283 words appear less than 3 times
deletedf=dwords[(dwords['Count'] < 3)]

delete_list=deletedf['Word'].values


In [12]:
len(delete_list)

9283

In [13]:
# load document
in_filename = '/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/canterburytales.txt'
doc = load_doc(in_filename)
print(doc[:200])

The Project Gutenberg Etext of The Canterbury Tales and Other Poems
by Geoffrey Chaucer






                      THE CANTERBURY TALES
                         And other Poems
                      


In [15]:
# clean document but this time I want remove any words on delete list
tokens = clean_doc(doc)
print(tokens[:200])
print('Total Tokens: %d' % len(tokens))
print('Unique Tokens: %d' % len(set(tokens)))

['the', 'project', 'gutenberg', 'etext', 'of', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'by', 'geoffrey', 'chaucer', 'the', 'canterbury', 'tales', 'and', 'other', 'poems', 'of', 'geoffrey', 'chaucer', 'edited', 'for', 'popular', 'perusal', 'by', 'd', 'laing', 'purves', 'contents', 'preface', 'life', 'of', 'chaucer', 'the', 'canterbury', 'tales', 'the', 'general', 'prologue', 'the', 'knights', 'tale', 'the', 'millers', 'tale', 'the', 'reeves', 'tale', 'the', 'cooks', 'tale', 'the', 'man', 'of', 'laws', 'tale', 'the', 'wife', 'of', 'baths', 'tale', 'the', 'friars', 'tale', 'the', 'sompnours', 'tale', 'the', 'clerks', 'tale', 'the', 'merchants', 'tale', 'the', 'squires', 'tale', 'the', 'franklins', 'tale', 'the', 'doctors', 'tale', 'the', 'pardoners', 'tale', 'the', 'shipmans', 'tale', 'the', 'prioresss', 'tale', 'chaucers', 'tale', 'of', 'sir', 'thopas', 'chaucers', 'tale', 'of', 'meliboeus', 'the', 'monks', 'tale', 'the', 'nuns', 'priests', 'tale', 'the', 'second', 'nuns',

In [None]:
#remove words on delete list
new_words = [word for word in tokens if word not in delete_list]

new_words

In [None]:

print(new_words[:200])
print('Total Tokens: %d' % len(new_words))
print('Unique Tokens: %d' % len(set(new_words)))

In [34]:
# organize into sequences of tokens
#difference between sequences (previous) and sentences is that I fixed issue
#with repeating words
length = 50 + 1
#move forward 3 words
step = 2
sequences = list()
for i in range(length, len(new_words), step):
    # select sequence of new_words (previously tokens)
    seq = new_words[i-length:i]
    # convert into a line
    line = ' '.join(seq)
    # store
    sequences.append(line)
print('Total Sequences: %d' % len(sequences))

Total Sequences: 125393


In [35]:
# save sequences to file
out_filename = 'canterburytales_sequences.txt'

save_doc(sequences, out_filename)

In [36]:
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 
# load
in_filename = 'canterburytales_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')

In [38]:
#representation of the model is distributed
#- i.e words with similar meaning have similar representation
#representation learned simulataneously with model
#probability of next word using context of last 100 words
#embedding layer for representation, LSTM to predict words on context

# integer encode sequences of words
#mapping each word to unique integer and encoding input sequences
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)
sequences = tokenizer.texts_to_sequences(lines)

# vocabulary size
#i add a one since vocabulary begins from index one 
#but indexing arrays is zero offset
vocab_size = len(tokenizer.word_index) + 1

In [39]:
tokenizer.word_index

{'openly': 1653,
 'chin': 5427,
 'trojans': 3174,
 'palm': 5858,
 'close': 1155,
 'temperance': 6282,
 'promises': 1847,
 'savour': 1392,
 'put': 259,
 'wighte': 5644,
 'o': 131,
 'worste': 5093,
 'list': 218,
 'given': 356,
 'guard': 4685,
 'carpenteres': 3461,
 'et': 1179,
 'creon': 4063,
 'simon': 3037,
 'sides': 2082,
 'comfort': 706,
 'selfsame': 5509,
 'guitar': 5268,
 'boethius': 2460,
 'hoard': 3287,
 'just': 1195,
 'riche': 1677,
 'priest': 390,
 'mighte': 378,
 'trespace': 4862,
 'weive': 4565,
 'breaking': 4992,
 'plenty': 2622,
 'principle': 5912,
 'psalm': 1983,
 'direct': 2177,
 'deum': 6207,
 'violent': 5206,
 'cavalry': 5415,
 'sits': 1201,
 'beyond': 3318,
 'jewels': 3262,
 'dung': 2404,
 'abideth': 6327,
 'spiritual': 2710,
 'deny': 4065,
 'churlish': 3896,
 'saiden': 4464,
 'specially': 1092,
 'allusion': 4725,
 'courtiers': 2489,
 'wantonness': 3267,
 'riseth': 4071,
 'craft': 635,
 'griseld': 1723,
 'wight': 147,
 'sheweth': 4198,
 'seemed': 672,
 'lovely': 2714,
 

In [40]:

#need to seperate word encodings into  y (output) and x (input)
# separate output from input with array slicing
sequences = array(sequences)
X, y = sequences[:,:-1], sequences[:,-1]
y = to_categorical(y, num_classes=vocab_size)
seq_length = X.shape[1]

In [48]:

# defining model
# two LSTM hidden layers with 100 memory cells
#dense fully connected layer with 100 neurons connects to the LSTM hidden layers 
#to interpret the features extracted from the sequence
#output layer predicts the next word as a single vector the size of the vocabulary 
#with a probability for each word in the vocabulary
#softmax activation function is used to ensure 
#the outputs have the characteristics of normalized probabilities
model = Sequential()

model.add(Embedding(vocab_size, 10, input_length=seq_length))
model.add(LSTM(200))
#model.add(LSTM(100, return_sequences=True))
model.add(Dropout(0.2))
#model.add(LSTM(100))
#model.add(Dropout(0.2))
model.add(Dense(200, activation='relu'))
model.add(Dense(200))
model.add(Dense(vocab_size, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 10)            66470     
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               168800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_8 (Dense)              (None, 200)               40200     
_________________________________________________________________
dense_9 (Dense)              (None, 6647)              1336047   
Total params: 1,651,717
Trainable params: 1,651,717
Non-trainable params: 0
_________________________________________________________________


In [49]:

# compile model
optimizer = RMSprop(lr=0.007)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])


In [None]:
# define the checkpoint so I can load model in future
filepath = "weights.hdf5"
checkpoint = ModelCheckpoint(filepath, 
                             monitor='loss', 
                             verbose=1, 
                             save_best_only=True, 
                 
                             mode='min')
earlystop = EarlyStopping(monitor='val_loss', patience=2,
                              verbose=2)

# fit model using the gpu
#batch size is a particularly important hyperparamter which I intend to play around within future posts
with tf.device('/gpu:0'):

    history=model.fit(X, y,validation_split=0.1,batch_size=60,epochs=60,verbose=1,callbacks=[ checkpoint,earlystop])


Train on 112853 samples, validate on 12540 samples
Epoch 1/60

Epoch 00001: loss improved from inf to 15.39986, saving model to weights.hdf5
Epoch 2/60

In [None]:
print(history.history.keys())

In [None]:
history.history['val_loss']

In [None]:
# summarize history for loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# save the tokenizer
dump(tokenizer, open('tokenizer.pkl', 'wb'))

In [None]:
from random import randint
from pickle import load
from keras.models import load_model
from keras.preprocessing.sequence import pad_sequences
 
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text
 


In [None]:
n_words

In [None]:
# generate a sequence from a language model
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    result = list()
    in_text = seed_text
    # generate a fixed number of words
    for _ in range(n_words):
        # encode the text as integer
        encoded = tokenizer.texts_to_sequences([in_text])[0]
        # truncate sequences to a fixed length
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        # predict probabilities for each word
        yhat = model.predict_classes(encoded, verbose=0)
        # map predicted word index to word
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == yhat:
                out_word = word
                break
        # append to input
        in_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
 
# load cleaned text sequences
in_filename = '/Users/neilwatt/Documents/BIs/PrWeb/2018Posts/October/Code/canterburytales_sequences.txt'
doc = load_doc(in_filename)
lines = doc.split('\n')
seq_length = len(lines[0].split()) - 1
 
# load the model
model = load_model('weights.hdf5')
 
# load the tokenizer
tokenizer = load(open('tokenizer.pkl', 'rb'))
 
# select a seed text
seed_text = lines[randint(0,len(lines))]
print(seed_text + '\n')
 
# generate new text
generated = generate_seq(model, tokenizer, seq_length, seed_text, 3)
print(generated)

In [None]:
seed_text