In [1]:
#import dependencies
import sys
import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils
from keras.callbacks import ModelCheckpoint

In [2]:
#import book file it's in utf-8 format 
file = open("wonderland.txt", encoding="utf-8").read()

In [3]:
#tokenizing the words
def tokenize_words(input):
    input = input.lower() ##making every char to lowercase 
    tokenizer=RegexpTokenizer(r'\w+ ') ##using regexptokenizer to select word char only(A-Z,1-9,_) 
    tokens=tokenizer.tokenize(input) ##tokenizing the input
    filtered=filter(lambda token: token not in stopwords.words("english"),tokens) 
    ##filtering out the words who doesn't have meaning like is,the etc.
    return "".join(filtered) ## returning that data

processed_inputs = tokenize_words(file)

In [4]:
##creating th sorted list of each characters
chars = sorted(list(set(processed_inputs)))
##giving each chars a numeric value
char_to_num = dict((c,i) for i,c in enumerate(chars))

In [5]:
input_len=len(processed_inputs) ##total no. of characters
vocal_len=len(chars) ##total no. of unique characters
print("Total no of chars:",input_len)
print("Total vocab:",vocal_len)

Total no of chars: 97761
Total vocab: 29


In [6]:
seq_len = 100
x_data = []
y_data = []
# creating sequences
for i in range(0, input_len - seq_len,1):
    seq_in=processed_inputs[i:(i+seq_len)] ## 1 to 100 
    seq_out=processed_inputs[(i+seq_len)] ## 101
    ## converting the char to num using dict and adding it x_data and y_data
    x_data.append([char_to_num[x] for x in seq_in])  
    y_data.append(char_to_num[seq_out])
## getting the total no. of patterns
patterns=len(x_data) 
print("Total Patterns:",patterns)

Total Patterns: 97661


In [7]:
# reshaping the x_data to (Samples, time stamps ,features)
X=np.reshape(x_data, (patterns, seq_len, 1))
X=X/float(vocal_len) ##normalizing

In [8]:
y=np_utils.to_categorical(y_data) ##one hot encoding of y_data

In [9]:
##crating LSTM Model
model=Sequential()
model.add(LSTM(256, input_shape=(X.shape[1],X.shape[2]), return_sequences=True))##input and LSTM layer
model.add(Dropout(0.2))##dropout layer
model.add(LSTM(128))## again an LSTM layer
model.add(Dropout(0.2))## dropout layer
model.add(Dense(y.shape[1], activation="softmax"))## final layer

In [10]:
model.compile(loss="categorical_crossentropy", optimizer="adam") ##compiling the model

In [11]:
## Creating Checkpoint for model
backup_file = "model_weights.hdf5"
checkpoint = ModelCheckpoint(backup_file, monitor="loss", verbose=1, save_best_only=True, mode="min")
desired_callbacks=[checkpoint]

In [12]:
model.fit(X, y, epochs=15, batch_size=256, callbacks=desired_callbacks)

Epoch 1/15
Epoch 00001: loss improved from inf to 2.81618, saving model to model_weights.hdf5
Epoch 2/15
Epoch 00002: loss improved from 2.81618 to 2.64925, saving model to model_weights.hdf5
Epoch 3/15
Epoch 00003: loss improved from 2.64925 to 2.40247, saving model to model_weights.hdf5
Epoch 4/15
Epoch 00004: loss improved from 2.40247 to 2.26417, saving model to model_weights.hdf5
Epoch 5/15
Epoch 00005: loss improved from 2.26417 to 2.16762, saving model to model_weights.hdf5
Epoch 6/15
Epoch 00006: loss improved from 2.16762 to 2.09322, saving model to model_weights.hdf5
Epoch 7/15
Epoch 00007: loss improved from 2.09322 to 2.03108, saving model to model_weights.hdf5
Epoch 8/15
Epoch 00008: loss improved from 2.03108 to 1.97909, saving model to model_weights.hdf5
Epoch 9/15
Epoch 00009: loss improved from 1.97909 to 1.93672, saving model to model_weights.hdf5
Epoch 10/15
Epoch 00010: loss improved from 1.93672 to 1.90320, saving model to model_weights.hdf5
Epoch 11/15
Epoch 00011

<tensorflow.python.keras.callbacks.History at 0x2489118a7f0>

In [13]:
## retraining the model
model.load_weights(backup_file)
model.compile(loss="categorical_crossentropy", optimizer="adam")

In [14]:
## opposite of char_to_num
num_to_char = dict((i,c) for i,c in enumerate(chars))

In [15]:
## getting seed value
start=np.random.randint(0,len(x_data)-1)
pattern=x_data[start]
print("Random Seed:")
print("\"",''.join([num_to_char[x] for x in pattern]),"\"")

Random Seed:
" ong and how would feel with all their simple and find a pleasure in their simple remembering her own "


In [16]:
#generating characters
for i in range(500):
    x=np.reshape(pattern,(1,len(pattern),1))
    x=x/float(vocal_len)
    pred = model.predict(x, verbose=0)
    index = np.argmax(pred)
    result = num_to_char[index]
    seq_in = [num_to_char[x] for x in pattern]
    sys.stdout.write(result)
    pattern.append(index)
    pattern=pattern[1:len(pattern)]
print("\nDone")
    

 and she said the she she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was the she was th
Done
