In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
import pickle
import numpy as np

In [2]:
# Settings
# Hyperparameters and file locations read by the training cell.
window = 40                        # context tokens per training sample
epochs = 50                        # passes over the training data
learning_rate = 0.001              # handed to the Adam optimizer
activation = 'relu'                # hidden Dense layer activation
output_activation = 'softmax'      # output layer activation (one unit per vocabulary entry)
loss = 'categorical_crossentropy'  # expects one-hot encoded targets
batch_size = 32

# Model Files
# Written relative to the notebook's working directory.
token_filename = 'html_tokens.pl'  # pickled, fitted Tokenizer
model_name = 'html_model.h5'       # checkpoint of the best model seen so far

# Data directory
# Overridable through the HTML_CORPUS_DIR environment variable so the notebook
# is not tied to one machine; the default is the original hard-coded location.
data_dir = os.environ.get(
    'HTML_CORPUS_DIR',
    '/home/fignewton/Documents/Projects/Python/Investigacion_en_CC/test',
)

# Input
text_file = os.path.join(data_dir, 'ecci.html.corpus')

# Output
# NOTE(review): new_data and original_id are not used by the cells visible
# here; presumably a later generation step consumes them — confirm.
new_data = os.path.join(data_dir, 'ecci.new.html.corpus')
original_id = 'inputformcontrolformtexteditsearchblockform2'

In [3]:
# Read Data
# Lower-case the corpus and collapse every run of whitespace (newlines
# included) into a single space. The context manager closes the file as soon
# as it has been read instead of leaving the handle to the garbage collector.
with open(text_file, 'r', encoding='utf-8') as corpus_file:
    data = ' '.join(corpus_file.read().lower().split())

# Tokenize
tokenizer = Tokenizer(oov_token='OOV')
tokenizer.fit_on_texts([data])
# Persist the fitted tokenizer so the same word -> index mapping can be reused
# later; the `with` block guarantees the pickle is flushed and closed.
with open(token_filename, 'wb') as token_file:
    pickle.dump(tokenizer, token_file)
sequence_data = tokenizer.texts_to_sequences([data])[0]

# Get the Size of the Vocabulary
# +1 because the Keras Tokenizer starts word indices at 1 (0 is reserved), so
# the largest index equals len(word_index).
vocab_size = len(tokenizer.word_index) + 1

# Generate the sequences of words
# Every sample is `window` context tokens followed by the token to predict,
# so at least window + 1 tokens are needed to build a single sample.
if len(sequence_data) <= window:
    raise ValueError(
        f'Corpus has {len(sequence_data)} tokens; need more than window={window} '
        'to build at least one training sequence.'
    )
sequences = np.array([
    sequence_data[end - window:end + 1]
    for end in range(window, len(sequence_data))
])

# Split in training data
# The first `window` columns are the inputs, the last column is the target.
X = sequences[:, :window]
y = to_categorical(sequences[:, window], num_classes=vocab_size)

# Create Model
# Embedding -> two stacked LSTMs -> Dense hidden layer -> distribution over
# the vocabulary.
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=window))
model.add(LSTM(1000, return_sequences=True))  # emit the full sequence for the next LSTM
model.add(LSTM(1000))
model.add(Dense(1000, activation=activation))
model.add(Dense(vocab_size, activation=output_activation))

# Train
# Only overwrite the saved model when the training loss improves.
checkpoint = ModelCheckpoint(model_name, monitor='loss', verbose=1, save_best_only=True)
model.compile(loss=loss, optimizer=Adam(learning_rate=learning_rate))
model.fit(X, y, epochs=epochs, batch_size=batch_size, callbacks=[checkpoint])

Epoch 1/50
Epoch 1: loss improved from inf to 4.53306, saving model to html_model.h5
Epoch 2/50
Epoch 2: loss improved from 4.53306 to 3.92929, saving model to html_model.h5
Epoch 3/50
Epoch 3: loss improved from 3.92929 to 3.84832, saving model to html_model.h5
Epoch 4/50
Epoch 4: loss improved from 3.84832 to 3.81001, saving model to html_model.h5
Epoch 5/50
Epoch 5: loss did not improve from 3.81001
Epoch 6/50
Epoch 6: loss improved from 3.81001 to 3.79785, saving model to html_model.h5
Epoch 7/50
Epoch 7: loss improved from 3.79785 to 3.78746, saving model to html_model.h5
Epoch 8/50
Epoch 8: loss improved from 3.78746 to 3.77850, saving model to html_model.h5
Epoch 9/50
Epoch 9: loss did not improve from 3.77850
Epoch 10/50
Epoch 10: loss did not improve from 3.77850
Epoch 11/50
Epoch 11: loss did not improve from 3.77850
Epoch 12/50
Epoch 12: loss did not improve from 3.77850
Epoch 13/50
Epoch 13: loss did not improve from 3.77850
Epoch 14/50
Epoch 14: loss did not improve from 3

<keras.callbacks.History at 0x7f5280754910>