In [3]:
# importing some required dependencies
import numpy
import sys
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
# load some data
# loading data and opening it in the form of txt file
# file has been taken from Project Guthenberg/berg
file = open('frankenstein-2.txt').read()

In [5]:
# tokenization
# standardization
# defining tokenization and process of breaking stream of text up into words phrases, symbols
# meaningful elements
def tokenize_words(input):
    # lowercase everything to standardize it
    input = input.lower()
    # instantiating the tokenizer
    tokenizer = RegexpTokenizer(r'\w+')
    # tokenizing text into tokens
    tokens = tokenizer.tokenize(input)
    # filtering stopword using lambda
    filtered = filter(lambda token: token not in stopwords.words('english'),tokens)
    return "".join(filtered)
# preprocess the input data and make it as tokens
processed_inputs = tokenize_words(file)

In [6]:
# characters to numbers
# conert input character into numbers

chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i,c in enumerate(chars))

In [7]:
# check if words to chars or char to num (?!) has worked?
input_len = len(processed_inputs)
vocab_len = len(chars)
print ("total no. of characters:", input_len)
print ("total vocab:", vocab_len)

total no. of characters: 241873
total vocab: 42


In [8]:
# seq length
seq_length = 100
x_data = []
y_data = []

In [9]:
# loop through sequence
for i in range(0, input_len - seq_length,1):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seq])
    y_data.append(char_to_num[out_seq])
    
n_patterns = len(x_data)
print ("total patterns:", n_patterns)

total patterns: 241773


In [10]:
# convert input sequence to np array and so on
X = numpy.reshape(x_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [11]:
# one-hot encoding
y = np_utils.to_categorical(y_data)

In [12]:
# creating the model
model = Sequential()
model.add(LSTM(256, input_shape= (X.shape[1], X.shape[2]), return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences = True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation = 'softmax'))

In [13]:
# compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [14]:
# saving weights
filepath = 'model_weight_saved.hdf5'
checkpoint = ModelCheckpoint(filepath, monitor = 'loss', verbose = 1, save_best_only = True, mode= 'min')
desired_callbacks = [checkpoint]

In [15]:
# fit model and let them trains
model.fit(X,y, epochs= 4, batch_size = 256, callbacks = desired_callbacks)

Epoch 1/4

Epoch 00001: loss improved from inf to 2.93454, saving model to model_weight_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93454 to 2.91349, saving model to model_weight_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91349 to 2.88190, saving model to model_weight_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.88190 to 2.84683, saving model to model_weight_saved.hdf5


<keras.callbacks.History at 0x7f8ec1df3cf8>

In [16]:
# recompile model with saved weights
filename = "model_weight_saved.hdf5"
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [17]:
# output of the model back into characters
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [18]:
# random seed to help generate
start = numpy.random.randint(0, len(x_data) - 1)
pattern = x_data[start] 
print ("Random Seed:")
print ("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" waydyingmomentsshallcursemakerwordsstrangeeffectuponcompassionatedsometimesfeltwishconsolelookedupon "


In [19]:
# generate the text
for i in range(1000):
    x = numpy.reshape(pattern, (1,len(pattern), 1))
    x = x/float(vocab_len)
    prediction= model.predict(x, verbose = 0)
    index = numpy.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee