In [1]:
def load_file(filename):
    """Read a UTF-8 text file and return its whole content as one string.

    Args:
        filename: Path of the file to read.

    Returns:
        The complete file content as a ``str``.

    Raises:
        OSError: If the file cannot be opened or read.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises,
    # which the manual open/read/close sequence did not guarantee.
    with open(filename, mode='r', encoding='utf-8') as file:
        return file.read()
def clean_text(text):
    """Turn raw text into a list of lower-cased, whitespace-separated tokens.

    Double hyphens, underscores, double quotes and single quotes are each
    replaced by a space before splitting. Any other punctuation stays
    attached to its word (e.g. ``"dumas,"``).

    Args:
        text: The raw input string.

    Returns:
        A list of lower-case token strings, in their original order.
    """
    # Same substitutions, applied in the same order as one replace() per line.
    for unwanted in ("--", "_", "\"", "'"):
        text = text.replace(unwanted, " ")
    # split() with no argument collapses any run of whitespace.
    return [token.lower() for token in text.split()]
def save_file(lines, filename):
    """Write the given strings to a file, one per line, encoded as UTF-8.

    The lines are joined with ``"\\n"``; no newline is written after the last one.
    An existing file is overwritten.

    Args:
        lines: Iterable of strings to write.
        filename: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    data = "\n".join(lines)
    # Write UTF-8 explicitly so the file can always be read back by load_file(),
    # which decodes UTF-8; the platform default encoding may differ.
    # The context manager closes the handle even if write() fails.
    with open(filename, "w", encoding="utf-8") as file:
        file.write(data)
def generate_seq(model, tokenizer, seq_length, seed_text, n_words):
    """Generate `n_words` words, feeding each prediction back in as context.

    On every step the current text is encoded with `tokenizer`, passed through
    `pad_sequences` with ``maxlen=seq_length``, and given to the model. The
    index with the highest predicted score is mapped back to a word, which is
    appended to the running text.

    Args:
        model: Trained model whose ``predict`` returns one score per vocabulary
            index for each input row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (word -> integer index).
        seq_length: Number of context tokens the model expects as input.
        seed_text: Initial text to continue.
        n_words: Number of words to generate.

    Returns:
        The generated words joined by single spaces (without the seed text).
        A predicted index that has no entry in ``word_index`` contributes an
        empty string.
    """
    # Invert the vocabulary once instead of scanning word_index on every step.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}
    result = []
    for _ in range(n_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[0]
        encoded = pad_sequences([encoded], maxlen=seq_length)
        print("Encoded:" + str(encoded))
        # Sequential.predict_classes() no longer exists in recent Keras releases;
        # for a multi-class softmax output it was argmax over predict().
        y_out = model.predict(encoded, verbose=0).argmax(axis=-1)
        print("y_out:" + str(y_out))
        # y_out is a one-element array (batch of one); look up its scalar value.
        out_word = index_to_word.get(int(y_out[0]), '')
        seed_text += " " + out_word
        result.append(out_word)
    return " ".join(result)

In [2]:
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense
from keras.models import load_model
from keras import Model
from keras.preprocessing.sequence import pad_sequences
from pickle import dump
from pickle import load
from random import randint
import numpy as np
import string


Using TensorFlow backend.


In [3]:
# Load the whole training corpus into memory as a single string.
read_filename = "TheCountOfMonteCristo.txt"        # Training text (relative path)
text = load_file(read_filename)                    # Entire file content as one str

# Sanity checks: `text` is a plain string, so indexing yields single characters.
print(text[10]+"\n")                               # The character at index 10
print(text[:200])                                  # The first 200 characters

O

THE COUNT OF MONTE CRISTO

by Alexandre Dumas, Pere




Chapter 1. Marseilles--The Arrival.

On the 24th of February, 1815, the look-out at Notre-Dame de la Garde
signalled the three-master, the Phara


In [4]:
# Tokenize the corpus and report its size.
tokens = clean_text(text)                           # Lower-cased, whitespace-split words
total_tokens=len(tokens)                            # Every occurrence counts
unique_tokens=len(set(tokens))                      # Each distinct token counts once
# clean_text leaves most punctuation attached (see 'dumas,' in the output), so
# "word" and "word," are counted as two different tokens here.
print(tokens[0])
print(tokens[:20])
print("Total Tokens:%d" % total_tokens)
print("Unique Tokens:%d"% unique_tokens)

the
['the', 'count', 'of', 'monte', 'cristo', 'by', 'alexandre', 'dumas,', 'pere', 'chapter', '1.', 'marseilles', 'the', 'arrival.', 'on', 'the', '24th', 'of', 'february,', '1815,']
Total Tokens:464076
Unique Tokens:30673


In [5]:
# Slide a window of `length` consecutive tokens over the corpus, one token per
# step, and store each window as a single space-joined line.
length = 4                            # Window size: 3 input words + 1 output word
sequences = [
    " ".join(tokens[end - length:end])
    for end in range(length, total_tokens)
]
# NOTE(review): `end` stops at total_tokens - 1, so the very last window
# (tokens[-length:]) is not emitted; kept as-is to match the recorded counts.
total_seq = len(sequences)
print("Total Sequences:%d" % total_seq)
print(sequences[0]+"\n")
print(sequences[0:10])

Total Sequences:464072
the count of monte

['the count of monte', 'count of monte cristo', 'of monte cristo by', 'monte cristo by alexandre', 'cristo by alexandre dumas,', 'by alexandre dumas, pere', 'alexandre dumas, pere chapter', 'dumas, pere chapter 1.', 'pere chapter 1. marseilles', 'chapter 1. marseilles the']


In [6]:
# Persist the token windows to disk, one window per line.
write_filename = "monteCristo_sequences.txt"
save_file(sequences, write_filename)

In [7]:
# Read the saved windows back from disk and split them into a list of lines.
file = load_file(write_filename)   # Whole file content as one string (not a file handle)
lines = file.split("\n")    # One list entry per line of the file
(lines[0:2])                # Last expression of the cell: shows the first two entries

['the count of monte', 'count of monte cristo']

In [8]:
# Encoding: map every word to an integer so the windows can be fed to the model.
# NOTE: `sequences` is rebound here, from the list of text lines built earlier
# to integer sequences; re-running earlier cells afterwards resets it.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(lines)                       # Builds the vocabulary: each distinct word gets a unique integer (tokenizer.word_index)
sequences = tokenizer.texts_to_sequences(lines)     # Converts each line from a string of words to a list of integers
sequences = pad_sequences(sequences, maxlen=length) # Forces every row to exactly `length` entries; yields the 2-D array shown below
print(sequences[0:4])

[[    1    47     3    56]
 [   47     3    56    57]
 [    3    56    57    31]
 [   56    57    31 10241]]


In [9]:
# Vocabulary size used for the embedding input and the softmax output.
# +1 because the indices in word_index start at 1 (see the encoded output,
# where 'the' is 1), so the largest index equals len(word_index).
vocab_size = len(tokenizer.word_index) + 1
print("Total Tokens             :%d" % total_tokens)
# unique_tokens was counted on the raw split, where "string," "string!" and
# "string?" are three different tokens.
print("Unique Tokens            :%d" % unique_tokens)
# The Tokenizer vocabulary is much smaller; presumably its default filters strip
# punctuation so those variants collapse into "string" -- confirm in the Keras docs.
print("Actual tokens(vocab_size):%d" % vocab_size)
print(vocab_size)
# NOTE(review): this displays the entire word -> index mapping; consider
# showing only a small slice to keep the output readable.
tokenizer.word_index.items()

Total Tokens             :464076
Unique Tokens            :30673
Actual tokens(vocab_size):15918
15918




In [10]:
# Split every encoded window into model input (all but the last index) and
# target (the last index).
sequences = np.array(sequences)            # Ensure a 2-D array so column slicing works
X = sequences[:,:-1]                       # First length-1 word indices of each row: the input
Y = sequences[:,-1]                        # Last word index of each row: the word to predict
# NOTE(review): Y is rebound from integer labels to a dense one-hot matrix of
# shape (rows, vocab_size) -- (464072, 15918) in the recorded output, which is
# large; re-running this cell alone would one-hot encode it a second time.
Y=to_categorical(Y,num_classes=vocab_size) # One-hot encoding
print(Y[9])
print(X.shape)
print(Y.shape)

[0. 1. 0. ... 0. 0. 0.]
(464072, 3)
(464072, 15918)


In [None]:
# Next-word model: embedding -> two stacked LSTMs -> softmax over the vocabulary.
model = Sequential([
    # Arguments: input_dim, output_dim, input_length (the 3 context words).
    Embedding(vocab_size, 3, input_length=length-1),
    # return_sequences=True hands the full sequence to the second LSTM.
    LSTM(512, return_sequences=True),
    LSTM(512),
    # One output unit per vocabulary index.
    Dense(vocab_size , activation = "softmax"),
])

In [11]:
# Restore the trained network and the tokenizer from disk.
# NOTE(review): no visible cell writes "model_final.h5" or "tokenizer.pkl";
# both must already exist from a previous training run, and this tokenizer
# replaces the one fitted earlier in the notebook.
model = load_model("model_final.h5")
# SECURITY: unpickling can execute arbitrary code -- only load a file you
# produced yourself. The context manager closes the handle, which the
# previous load(open(...)) form left open.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = load(tokenizer_file)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.


In [12]:
# Seed phrase that the generator will continue.
# The commented-out alternative picks a random training line instead.
# NOTE(review): random.randint includes both endpoints, so
# randint(0, len(lines)) can return len(lines) and raise IndexError if this is
# re-enabled; the upper bound should be len(lines) - 1.
#seed_text = lines[randint(0,len(lines))]
#print(seed_text)
seed_text="owner of the "

In [14]:
# Continue the seed phrase by two words. The context length passed is
# length-1, matching the input_length=length-1 used where the model is defined.
generated = generate_seq(model, tokenizer, length-1, seed_text, 2)
print(generated)

Encoded:[[980   3   1]]
y_out:[664]
Encoded:[[  3   1 664]]
y_out:[11]
horse that
