In [2]:
# importing dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/keerthana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Using TensorFlow backend.


In [3]:
# loading the data
# Read the whole corpus into memory; the context manager closes the file handle
# as soon as the text has been read instead of leaving it open for the kernel's lifetime.
with open("frankenstein - 2.txt") as text_file:
    file = text_file.read()

In [4]:
# tokenization
# Tokenization: the process of breaking a stream of text up into words, phrases, symbols or other
#               meaningful elements.
# standardisation
def tokenize_words(input):
    """Lowercase `input`, split it into word tokens and drop English stop words.

    Parameters
    ----------
    input : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # lowercase everything to standardize it
    lowered = input.lower()

    # r'\w+' keeps runs of word characters, so punctuation and whitespace are discarded
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(lowered)

    # Fetch the stop-word list once and keep it as a set: asking for
    # stopwords.words('english') per token re-fetches the list and scans it
    # linearly for every single word of the text.
    stop_words = set(stopwords.words('english'))

    # keep only the tokens that are not stop words
    return " ".join(token for token in tokens if token not in stop_words)

# run the raw text through the tokenizer; the result is one cleaned, space-separated string
processed_inputs = tokenize_words(input=file)

In [5]:
# chars to numbers
# The network works on numbers, so every distinct character of the processed text
# gets an integer id: collect the unique characters, sort them so the numbering is
# deterministic, and number them in that order.
chars = sorted(set(processed_inputs))
# lookup table: character -> integer id
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [6]:
# sanity check of the character encoding:
# report how many characters the processed text has and how many distinct
# characters (the vocabulary) were found in it
vocab_len = len(chars)
input_len = len(processed_inputs)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 236349
Total vocab: 42


In [7]:
# seq length
# number of characters in one training input; each input sequence is stored
# as a list of integer character ids
seq_length = 100
# containers for the input sequences (x_data) and their target characters (y_data)
x_data, y_data = [], []

In [8]:
# Build the training pairs with a sliding window over the processed text:
# each input is `seq_length` consecutive characters (as integer ids) and its
# target is the single character that follows that window. Windows start at
# every position from the beginning up to the last one that still has a
# following character.

# Encode the whole text once rather than re-encoding every overlapping window.
encoded_text = [char_to_num[char] for char in processed_inputs]

# Rebuild both lists from scratch so that re-running this cell cannot append a
# second copy of every pattern to lists left over from a previous run.
n_windows = input_len - seq_length
x_data = [encoded_text[i:i + seq_length] for i in range(n_windows)]
y_data = [encoded_text[i + seq_length] for i in range(n_windows)]

# check to see how many input sequences we have
n_patterns = len(x_data)
print ("Total Patterns:", n_patterns)

Total Patterns: 236249


In [9]:
# convert the input sequences to a numpy array of shape
# (n_patterns, seq_length, 1) that our network can use, and scale the
# integer ids by the vocabulary size so every value lies in [0, 1)
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [10]:
# one-hot encoding our label data
# Pass the vocabulary size explicitly: left to itself, to_categorical sizes the
# encoding from the largest label it sees, which would come out narrower than
# the vocabulary if the highest-numbered character never occurs as a target.
y = np_utils.to_categorical(y_data, num_classes=vocab_len)

In [11]:
# creating the model
# A sequential stack of three LSTM layers, each followed by dropout (used to
# prevent overfitting), ending in a softmax layer with one output per column of y.
# The first two LSTMs return their full sequences so the next LSTM gets a
# sequence as input; the last one returns only its final output.
model = Sequential([
    LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [12]:
# compiling the model: categorical cross-entropy matches the one-hot targets,
# optimised with Adam
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [13]:
# saving the weights
# checkpoint callback: monitors the training loss and, because save_best_only
# is set with mode 'min', only writes the file when the loss reaches a new minimum
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [18]:
# fit the model and let it train; the checkpoint callback runs during training
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.78870, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.78870 to 2.46623, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.46623 to 2.29015, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.29015 to 2.15861, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x7fd96d812c50>

In [14]:
# load the weights stored in the checkpoint file into the model, then
# recompile it with the same loss and optimizer as before
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [15]:
# reverse lookup (integer id -> character) to turn the model's output back into text
num_to_char = dict(enumerate(chars))

In [16]:
# random seed to help generate
# Pick one training sequence at random as the starting text.
# randint's upper bound is exclusive, so len(x_data) (not len(x_data) - 1)
# is what makes the last pattern selectable too.
start = numpy.random.randint(0, len(x_data))
# Work on a copy: the generation loop extends/shifts `pattern`, and that must
# not modify the training sequence stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
"  recesses nature show works hiding places ascend heavens discovered blood circulates nature air brea "


In [17]:
# generate the text
# Produce 1000 characters one at a time: feed the current window to the model,
# take the most likely next character (argmax, i.e. greedy decoding), print it,
# then slide the window forward by one character.
for i in range(1000):
    # same shape and scaling as the training input: (1, window length, 1), ids / vocab size
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))
    result = num_to_char[index]

    sys.stdout.write(result)

    # Build a new window instead of appending in place, so the list that
    # `pattern` initially refers to (a row of x_data) is never mutated.
    pattern = pattern[1:] + [index]

ture seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared seared s