In [2]:
import sys

import nltk
import numpy
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, Input, LSTM
from keras.models import Sequential
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Read the whole source text into memory. The context manager closes the file
# handle even if the read fails, and the explicit encoding avoids depending on
# the platform default.
with open("frankenstein.txt", encoding="utf-8") as text_file:
    file = text_file.read()


In [6]:
def tokenize_words(input):
    """Lowercase ``input``, keep only word tokens, and drop English stopwords.

    Parameters
    ----------
    input : str
        Raw text to normalise. The name shadows the builtin but is kept so
        existing keyword callers keep working.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input.lower())
    # Fetch the stopword list once and store it as a set: membership tests are
    # O(1) and stopwords.words() is no longer called once per token.
    english_stopwords = set(stopwords.words('english'))
    return " ".join(token for token in tokens if token not in english_stopwords)


# Cleaned corpus used by every later cell.
processed_inputs = tokenize_words(file)


In [7]:
# Character vocabulary: every distinct character of the cleaned text, sorted.
chars = sorted(set(processed_inputs))
# Lookup table from each character to its position in the sorted vocabulary.
char_to_num = {char: index for index, char in enumerate(chars)}

In [8]:
# Corpus length in characters and size of the character vocabulary.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total No. of Characters: ", input_len)
print("Total Vocab: ", vocab_len)

Total No. of Characters:  269566
Total Vocab:  38


In [9]:
# Number of consecutive characters in each input window.
seq_length = 100
# Containers for the encoded input windows and their next-character targets.
x_data, y_data = [], []

In [10]:
# Slide a seq_length-wide window over the text one character at a time: each
# window is an input sample and the character right after it is its target.
# Both lists are rebuilt from scratch here so that re-running this cell does
# not append a second copy of every pattern.
encoded_text = [char_to_num[char] for char in processed_inputs]
x_data = [encoded_text[i:i + seq_length] for i in range(input_len - seq_length)]
y_data = encoded_text[seq_length:input_len]

n_patterns = len(x_data)
print("Total Patterns:", n_patterns)



Total Patterns: 269466


In [11]:
# Arrange the encoded windows as (n_patterns, seq_length, 1) and divide the
# integer codes by the vocabulary size.
X = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [12]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the target characters. num_classes is pinned to the vocabulary
# size so the width of y always matches char_to_num, even if the
# highest-numbered character never occurs as a target.
y = to_categorical(y_data, num_classes=vocab_len)


In [13]:
# Three stacked LSTM layers, each followed by dropout, ending in a softmax over
# the one-hot target classes. The input shape is declared with an explicit
# Input layer rather than an `input_shape` argument on the first LSTM, which
# Keras warns against.
model = Sequential([
    Input(shape=(X.shape[1], X.shape[2])),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

  super().__init__(**kwargs)


In [14]:
# Configure training: Adam optimizer with a categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
)

In [15]:
# Write the model to disk whenever the training loss reaches a new minimum.
filepath = "model_weights_saved.keras"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]



In [16]:
# Fit the network on the prepared windows, with the checkpoint callback attached.
model.fit(
    X,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)


Epoch 1/4
[1m1052/1053[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 63ms/step - loss: 2.9523
Epoch 1: loss improved from inf to 2.89374, saving model to model_weights_saved.keras
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 64ms/step - loss: 2.9521
Epoch 2/4
[1m1052/1053[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 65ms/step - loss: 2.6567
Epoch 2: loss improved from 2.89374 to 2.61269, saving model to model_weights_saved.keras
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m69s[0m 65ms/step - loss: 2.6566
Epoch 3/4
[1m1052/1053[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 67ms/step - loss: 2.4900
Epoch 3: loss improved from 2.61269 to 2.46102, saving model to model_weights_saved.keras
[1m1053/1053[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m84s[0m 67ms/step - loss: 2.4900
Epoch 4/4
[1m1052/1053[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 67ms/step - loss: 2.3705
Epoch 4: loss improved from 2.46102 to 2

<keras.src.callbacks.history.History at 0x781bebb51d90>

In [17]:
# Restore the checkpointed weights, then compile the model again.
filename = "model_weights_saved.keras"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)



In [18]:
# Inverse lookup: vocabulary position back to its character, used to decode
# the model's predictions.
num_to_char = dict(enumerate(chars))

In [19]:
# Pick a random training window to seed generation. randint's upper bound is
# exclusive, so len(x_data) makes every window, including the last, eligible.
start = numpy.random.randint(0, len(x_data))
# Work on a copy so that later modifications of `pattern` cannot alter the
# row stored in x_data.
pattern = list(x_data[start])
print("Random Seed:")
print("\"", "".join([num_to_char[value] for value in pattern]), "\"")

Random Seed:
" ll unhappy still avoid society time lost conjecture cause yesterday idea struck well founded conjure "


In [20]:
# Generate 1000 characters greedily: predict the next character from the
# current window, emit it, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(vocab_len)  # divide by the vocabulary size, as for X
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))  # most probable class as a plain int
    result = num_to_char[index]

    sys.stdout.write(result)
    # Build a new list instead of appending in place: `pattern` may still be
    # the same list object as a row of x_data, and append() would grow that row.
    pattern = pattern[1:] + [index]

d sererable serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer serer sere