In [1]:
# import dependencies
import numpy
import sys
import nltk
nltk.download("stopwords")
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from keras.models import Sequential
from keras.layers import Dense, Dropout, LSTM
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [2]:
# load data
# Read the whole corpus into memory. The context manager closes the file
# handle as soon as the text has been read, and the explicit encoding keeps
# decoding independent of the runtime's locale settings.
with open("/content/meta_text.txt", encoding="utf-8") as corpus_file:
    file = corpus_file.read()

In [3]:
# tokenization
# standardization
def tokenize_words(input_text):
    """Lower-case the text, split it into word tokens and drop English stopwords.

    Parameters
    ----------
    input_text : str
        Raw text to normalise.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    input_text = input_text.lower()
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = tokenizer.tokenize(input_text)
    # Build the stopword set once, outside the comprehension: calling
    # stopwords.words('english') per token would re-create the list for every
    # token and test membership by scanning it.
    english_stopwords = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in english_stopwords]
    return " ".join(filtered_tokens)


# Normalise the raw corpus; every later cell works on this cleaned string.
processed_inputs = tokenize_words(file)

In [4]:
# Map every distinct character of the cleaned text to an integer id,
# numbering the characters in sorted order.
chars = sorted(set(processed_inputs))
char_to_num = {char: idx for idx, char in enumerate(chars)}

In [5]:
# Sanity check: report the length of the cleaned text and the number of
# distinct characters found in it.
input_len, vocab_len = len(processed_inputs), len(chars)
print("Total number of characters:", input_len)
print("Total vocab:", vocab_len)

Total number of characters: 67092
Total vocab: 28


In [6]:
# Sequence length: number of consecutive characters in one input window.
seq_length = 100
# Containers for the encoded input windows and their target characters.
x_data, y_data = [], []

In [7]:
# Slide a window of seq_length characters over the text one step at a time:
# each window is one input sample and the character right after it is the target.
# Both lists are reset here so that re-running this cell rebuilds the patterns
# instead of appending a second copy of every one of them.
x_data = []
y_data = []
for i in range(0, input_len - seq_length, 1):
    in_seg = processed_inputs[i:i + seq_length]
    out_seg = processed_inputs[i + seq_length]
    x_data.append([char_to_num[char] for char in in_seg])
    y_data.append(char_to_num[out_seg])
n_patterns = len(x_data)
print("Total Patterns:", n_patterns)

Total Patterns: 66992


In [8]:
# Turn the list of encoded windows into a (samples, time steps, features)
# array and divide the integer ids by the vocabulary size, so that every
# value lies in [0, 1).
x = numpy.asarray(x_data).reshape(n_patterns, seq_length, 1) / float(vocab_len)

In [9]:
# One-hot encode the target characters. num_classes is pinned to the
# vocabulary size so y always has one column per character, even if the
# character with the highest id never occurs as a target (otherwise the
# width would be inferred from the largest label present).
y = to_categorical(y_data, num_classes=vocab_len)

In [10]:
# Build the network as a sequential stack: three LSTM layers (256, 256, 128
# units), each followed by 20% dropout to limit overfitting, and a softmax
# output layer with one unit per column of y.
n_timesteps, n_features = x.shape[1], x.shape[2]
model = Sequential([
    LSTM(256, input_shape=(n_timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(256, return_sequences=True),
    Dropout(0.2),
    LSTM(128),  # last recurrent layer returns only its final output
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])

In [11]:
# Configure training: Adam optimizer minimising categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [12]:
# Checkpoint callback: after every epoch whose training loss is the lowest
# seen so far, write the model to `filepath` (overwriting the previous best).
filepath = "model_weights_saved.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
desired_callbacks = [checkpoint]

In [14]:
# Train for 4 epochs in mini-batches of 256 samples; the callbacks handle
# checkpointing while training runs.
model.fit(
    x,
    y,
    batch_size=256,
    epochs=4,
    callbacks=desired_callbacks,
)

Epoch 1/4
Epoch 1: loss improved from inf to 2.93756, saving model to model_weights_saved.hdf5
Epoch 2/4


  saving_api.save_model(


Epoch 2: loss improved from 2.93756 to 2.89753, saving model to model_weights_saved.hdf5
Epoch 3/4
Epoch 3: loss improved from 2.89753 to 2.70110, saving model to model_weights_saved.hdf5
Epoch 4/4
Epoch 4: loss improved from 2.70110 to 2.50557, saving model to model_weights_saved.hdf5


<keras.src.callbacks.History at 0x7891381ed6f0>

In [15]:
# Restore the checkpointed weights into the model, then compile it again
# with the same loss and optimizer as before.
filename = "model_weights_saved.hdf5"
model.load_weights(filename)
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
)

In [16]:
# Inverse lookup (integer id -> character) used to turn model output back into text.
num_to_char = dict(enumerate(chars))

In [17]:
# Pick a random training window to seed text generation.
import numpy as np  # alias kept here because the generation cell below uses `np`
# randint's upper bound is exclusive, so passing len(x_data) makes every
# pattern, including the last one, eligible.
start = numpy.random.randint(0, len(x_data))
# Work on a copy: the generation loop changes `pattern`, and that must not
# alter the training sample stored in x_data[start].
pattern = list(x_data[start])
print("Random Seed:")
print("".join([num_to_char[value] for value in pattern]))

Random Seed:
ed opened eyes wide whistled waste time yank open bedroom doors shout loudly darkness bedrooms come 


In [19]:
# Generate 100 characters greedily: predict the next character from the
# current window, emit it, then slide the window forward by one position.
for i in range(100):
    # The network input gets its own name so that the training tensor `x`
    # is not overwritten by this loop.
    window = np.reshape(pattern, (1, len(pattern), 1))
    window = window / float(vocab_len)
    prediction = model.predict(window, verbose=0)
    # Most probable class id, as a plain int so it matches num_to_char's keys.
    index = int(np.argmax(prediction))
    result = num_to_char[index]
    sys.stdout.write(result)
    # Build a new list rather than appending in place: `pattern` may still be
    # the very list object held in x_data, and appending would lengthen that
    # training sample.
    pattern = pattern[1:] + [index]

e sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe sooe soo