In [19]:
# import dependencies
import numpy
import sys
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint
import nltk
# Fetch the NLTK stop-word corpus that tokenize_words relies on
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\komaldeep
[nltk_data]     kaur\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Load data
# Read the whole input text from a plain-text file.
# Project Gutenberg is one place where suitable texts can be found.
# The context manager closes the file handle as soon as the text has been read.
with open("chapter.txt", "r", encoding="utf8") as text_file:
    file = text_file.read()

In [21]:
# Tokenization: standardise the text and keep only the meaningful words

def tokenize_words(input):
    """Lowercase ``input``, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to clean.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (an empty string when
        no token survives the filtering).
    """
    # lowercase everything to standardize it
    text = input.lower()

    # r'\w+' keeps runs of word characters, so punctuation and whitespace are dropped
    tokens = RegexpTokenizer(r'\w+').tokenize(text)

    # Build the stop-word set once; calling stopwords.words() per token would
    # rebuild the list and scan it linearly for every single token
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

# Preprocess the input data: the result is one lowercase string of
# space-separated tokens with the English stop words removed
processed_inputs = tokenize_words(file)

In [22]:
# Characters to numbers
# Collect every distinct character of the processed text in sorted order,
# then number them with enumerate so each character maps to its position
# in that sorted list.
chars = sorted(set(processed_inputs))
char_to_num = {character: number for number, character in enumerate(chars)}


In [23]:
#check if words to chars or char to num (?!) has worked ?
# just so we get an idea of whether our process of converting words to characters has worked,
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("Total number of characters:", input_len)
print ("Total vocab:", vocab_len)

Total number of characters: 17569
Total vocab: 30


In [24]:
# Sequence length
# Each training sample is a window of seq_length consecutive characters
# (stored as integers); x_data will hold the windows and y_data the
# character that follows each window.
seq_length = 100
x_data, y_data = [], []

In [25]:
# Start from empty lists so that re-running this cell rebuilds the data
# instead of appending a second copy of every pattern
x_data = []
y_data = []

# Slide a window over the text, one character at a time, stopping at the
# last position that still has a following character to predict
for i in range(0, input_len - seq_length, 1):
    # Input: the seq_length characters starting at position i
    in_seq = processed_inputs[i:i + seq_length]

    # Output: the single character immediately after that window
    out_seq = processed_inputs[i + seq_length]

    # Convert the characters to integers with the char_to_num mapping
    # and store them
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])

In [26]:
# Count how many input sequences were built; this is also the first
# dimension used when the data is reshaped for the network
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 17469


In [27]:
# Turn the integer sequences into an array of shape
# (n_patterns, seq_length, 1) for the network and scale the character
# codes by the vocabulary size
X = numpy.reshape(x_data, (n_patterns, seq_length, 1)) / float(vocab_len)

In [28]:
# One-hot encode the label data
# num_classes is pinned to the vocabulary size so every character gets a
# column, even one that never occurs as a target; left to itself,
# to_categorical sizes the output from the largest label it sees
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [29]:
# Build the network as a sequential stack: three LSTM layers, each followed
# by dropout (used to limit overfitting), and a softmax layer with one unit
# per column of the one-hot labels
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [30]:
# Compile the model: categorical cross-entropy pairs with the one-hot labels
# and the softmax output layer; Adam is the optimizer
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [31]:
# Checkpointing: after an epoch, write the model to disk only when the
# training loss is lower than the best value seen so far
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [32]:
# Fit the model: 4 passes over the data in batches of 256 samples, with the
# checkpoint callback saving to disk whenever the loss improves
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.96745, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.96745 to 2.91741, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91741 to 2.91180, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.91180 to 2.90830, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x26ac014e978>

In [33]:
# Load the weights from the checkpoint file written during training, then
# compile the model again with the same loss and optimizer
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [34]:
# Reverse mapping (number -> character) used to turn the model's output
# back into text
num_to_char = dict(enumerate(chars))

In [35]:
# Pick a random training sequence to seed the text generation.
# numpy.random.randint excludes its upper bound, so len(x_data) is passed to
# make every sequence, including the last one, eligible.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that later changes to `pattern` can never alter x_data
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  discoveries modern philosophers always came studies discontented unsatisfied sir isaac newton said  "


In [36]:
# Generate the text one character at a time
for i in range(1000):
    # Shape the current window as (1, len(pattern), 1) and scale it by the
    # vocabulary size, exactly as the training data was scaled
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)

    # Greedy choice: take the most probable next character
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Slide the window forward: drop the oldest character and add the new one.
    # A new list is built instead of appending in place, so a training
    # sequence that `pattern` may still reference is left untouched.
    pattern = pattern[1:] + [index]

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        