In [40]:
# import dependencies 
import numpy as np 
import sys

In [41]:
# load data 
file = open("Frankeinstein.txt").read()

In [42]:
# tokenize the words from the test file 
# standardizatoin 
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

def tokenize_words(input):
    # making the input word a lower case 
    input = input.lower()
    
    # making tokens using tokenizer 
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input)
    
    # filtering the stop words from the tokens 
    filtered = filter(lambda token : token not in stopwords.words('english'), tokens)
    return "".join(filtered)

processed_inputs = tokenize_words(file)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [43]:
# sorting the char and converting char to num 
chars = sorted(list(set(processed_inputs)))
char_to_num = dict((c,i) for i, c in enumerate(chars))

In [39]:
# determining the length of the variabes
input_len = len(processed_inputs)
vocab_len = len(chars)
print('total number of characters : ', input_len)
print('total number of vocabs : ', vocab_len)

total number of characters :  233236
total number of vocabs :  41


In [44]:
# generating out dataset

seq_length = 100
X_data = []
y_data = []

In [45]:
# creating input sequence and output sequence 

for i in range(0, input_len - seq_length):
    in_seq = processed_inputs[i:i + seq_length]
    out_seq = processed_inputs[i + seq_length]
    # converting the sequences in numbers and adding it to our dataset
    X_data.append([char_to_num[char] for char in in_seq])
    y_data.append([char_to_num[out_seq]])

# note : try printin an inividual in_seq and out_seq for better understanding

In [46]:
# number of patters we have 
n_patterns = len(X_data)
print('total patterns : ', n_patterns)

total patterns :  233136


In [47]:
# converting the input sequence to numpy array 
X = np.reshape(X_data, (n_patterns, seq_length, 1))
X = X/float(vocab_len)

In [49]:
# one_hot encoding 
from keras.utils import np_utils 
y = np_utils.to_categorical(y_data)

In [50]:
# creating a sequencial model 
from keras import Sequential
from keras.layers import Dense, Dropout, LSTM
model = Sequential()
model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2]), return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(256, return_sequences=True))
model.add(Dropout(0.2))
model.add(LSTM(128))
model.add(Dropout(0.2))
model.add(Dense(y.shape[1], activation='softmax'))

W0529 15:13:08.512183 139643002533696 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0529 15:13:08.555622 139643002533696 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0529 15:13:09.183063 139643002533696 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Please use tf.compat.v1.placeholder_with_default instead.

W0529 15:13:09.190980 139643002533696 deprecation.py:506] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_pro

In [51]:
# compiling the model 
model.compile(loss='categorical_crossentropy', optimizer='adam')

W0529 15:13:44.148562 139643002533696 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.

W0529 15:13:44.172680 139643002533696 deprecation_wrapper.py:119] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:3295: The name tf.log is deprecated. Please use tf.math.log instead.



In [52]:
# saving weights
from keras.callbacks import ModelCheckpoint
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=1, save_best_only=True, mode='min')
desired_callbacks = [checkpoint]


In [53]:
# fit the model and let it train 
model.fit(X, y, epochs=4, batch_size=256, callbacks=desired_callbacks)

W0529 15:14:38.230058 139643002533696 deprecation.py:323] From /mnt/disks/user/anaconda3/lib/python3.7/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Epoch 1/4

Epoch 00001: loss improved from inf to 2.93312, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 2.93312 to 2.91078, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.91078 to 2.87822, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.87822 to 2.84592, saving model to model_weights_saved.hdf5


<keras.callbacks.History at 0x7f00aa6fa860>

In [54]:
# recompiling the model with the saved weights 
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [55]:
# output of the model back into the characters 
num_to_char = dict((i, c) for i, c in enumerate(chars))

In [56]:
# generating sequence of characters using the random seed

start = np.random.randint(0, len(X_data) - 1)
pattern = X_data[start]
print("Random Seed:")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" heldresultstudydesirewisestmensincecreationworldwithingrasplikemagicsceneopeneduponinformationobtain "


In [57]:
# generating the text 

for i in range(1000):
    x = np.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)
    prediction = model.predict(x, verbose=0)
    index = np.argmax(prediction)
    result = num_to_char[index]
    seq_in = [num_to_char[value] for value in pattern]

    sys.stdout.write(result)

    pattern.append(index)
    pattern = pattern[1:len(pattern)]

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee