In [111]:
#Importing Dependencies
import numpy as np
import sys
#import nltk  (Not necessary, only if nltk stopwords error arises)
#nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer as rt
from nltk.corpus import stopwords as sw
from keras.models import Sequential as sq
from keras.layers import Dense, Dropout, LSTM
from keras.utils import np_utils as npu
from keras.callbacks import ModelCheckpoint 

In [112]:
#loading data
#(alternative corpus used previously: 'Text Doc.txt')
# Read the whole corpus into memory; the with-block closes the file handle
# as soon as the text has been read instead of leaving it open.
with open('sample.txt') as source_file:
    file = source_file.read()

In [113]:
#tokenization
#standardization
def tokenize_words(input):
    """Lowercase the text, tokenize it, drop English stopwords and join the rest.

    Parameters
    ----------
    input : str
        Raw text to preprocess. (The name shadows the builtin ``input``; it is
        kept unchanged so existing keyword callers keep working.)

    Returns
    -------
    str
        The surviving tokens concatenated into a single string.
    """
    input = input.lower()
    # Split the text into runs of word characters
    tokenizer = rt(r'\w+')
    tokens = tokenizer.tokenize(input)
    # Fetch the stopword list once and keep it in a set, rather than
    # re-fetching it for every token
    stop_words = set(sw.words('english'))
    filtered = [token for token in tokens if token not in stop_words]
    # NOTE(review): tokens are joined with no separator, so word boundaries
    # are not present in the returned string - confirm this is intended.
    return "".join(filtered)

#preprocess the input data, make tokens
processed_inputs = tokenize_words(file)

In [114]:
# Map every distinct character of the processed text to an integer id.
# The characters are sorted first so the ids follow sorted order.
chars = sorted(set(processed_inputs))
chars_to_num = {char: index for index, char in enumerate(chars)}

In [115]:
# Sanity check: report the length of the processed text and the number of
# distinct characters found in it
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters : ", input_len)
print("Total vocab:", vocab_len)

Total number of characters :  2639
Total vocab: 26


In [116]:
# Length of one input sequence: the number of characters (encoded as integers)
# in each training window
seq_length = 100
# Containers for the encoded input windows and their target characters
x_data, y_data = [], []

In [117]:
# Build the training pairs: each input is a window of `seq_length` characters
# encoded as integers, and its target is the character right after the window.
# The lists are re-created here so that re-running this cell does not append
# a second copy of every pattern onto the results of an earlier run.
x_data = []
y_data = []
for i in range(input_len - seq_length):
    # input: seq_length characters starting at position i
    in_seq = processed_inputs[i:i + seq_length]
    # output: the single character that follows the window
    out_seq = processed_inputs[i + seq_length]
    # encode characters as integers; `ch` avoids shadowing the global `chars`
    x_data.append([chars_to_num[ch] for ch in in_seq])
    y_data.append(chars_to_num[out_seq])

# Report how many input sequences were produced
n_patterns = len(x_data)
print("Total Patterns: ", n_patterns)

Total Patterns:  2539


In [118]:
# Turn the encoded windows into an array of shape (n_patterns, seq_length, 1)
# and scale the integer ids by the vocabulary size
x = np.array(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [119]:
#one-hot encoding
# num_classes is pinned to the vocabulary size so the encoded width does not
# depend on which character ids happen to occur among the targets.
y = npu.to_categorical(y_data, num_classes=vocab_len)

In [120]:
# Sequential model: three stacked LSTM layers, each followed by dropout
# (used to prevent overfitting), and a softmax layer sized to the one-hot targets
model = sq([
    LSTM(256, input_shape=(x.shape[1], x.shape[2]), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [121]:
# Configure training: categorical cross-entropy loss with the adam optimizer
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [125]:
# Checkpoint callback: monitors the training loss (mode 'min') and, with
# save_best_only, saves to `filepath` only when it improves
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [126]:
# Train the model on (x, y), passing the callbacks collected in desired_callbacks
model.fit(
    x,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4

Epoch 00001: loss improved from inf to 3.04950, saving model to model_weights_saved.hdf5
Epoch 2/4

Epoch 00002: loss improved from 3.04950 to 2.93317, saving model to model_weights_saved.hdf5
Epoch 3/4

Epoch 00003: loss improved from 2.93317 to 2.92973, saving model to model_weights_saved.hdf5
Epoch 4/4

Epoch 00004: loss improved from 2.92973 to 2.91554, saving model to model_weights_saved.hdf5


<keras.callbacks.callbacks.History at 0x1ce05fe2d88>

In [127]:
# Load the saved weights from disk, then compile the model again
filename = 'model_weights_saved.hdf5'
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [129]:
# Reverse lookup (integer id -> character) for turning model output back into text
num_to_char = dict(enumerate(chars))

In [134]:
# Pick a random training window to seed the text generation.
# np.random.randint excludes its upper bound, so len(x_data) is passed
# (not len(x_data) - 1) to make the last window selectable as well.
start = np.random.randint(0, len(x_data))
# Work on a copy so later changes to `pattern` cannot alter the row in x_data
pattern = list(x_data[start])
print("Random Seed: ")
print("\"", ''.join([num_to_char[value] for value in pattern]), "\"")

Random Seed: 
" esfurnishedunwillingadditionsblessingresolvedpeculiarfatgracefulhamsussexreallyladieselinorsirsexopi "


In [137]:
# Generate 1000 characters, one per step: predict from the current window,
# take the highest-scoring class, print it, then slide the window forward.
for step in range(1000):
    # A separate name is used for the model input so the training array `x`
    # is not overwritten by this cell.
    x_seed = np.reshape(pattern, (1, len(pattern), 1)) / float(vocab_len)
    pred = model.predict(x_seed, verbose=0)
    index = int(np.argmax(pred))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new window (drop the oldest id, add the new one) instead of
    # appending in place, which would also modify the list `pattern` came from.
    pattern = pattern[1:] + [index]
    

eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee