In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.text import text_to_word_sequence
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
# --- Training schedule ---
EPOCHS = 32          # full passes over the training samples
BATCH_SIZE = 256     # samples per gradient update

# --- Corpus ---
INPUT_FILE_NAME = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Version 2/Frankenstein.txt'

# --- Windowing of the word stream ---
WINDOW_LENGTH = 40   # words per input fragment
WINDOW_STEP = 3      # words skipped between consecutive fragments

# --- Vocabulary / embedding ---
MAX_WORDS = 10000    # vocabulary size passed to the tokenizer and output layer
EMBEDING_WIDTH = 100 # embedding vector size (name kept as-is; other cells use it)

# --- Generation ---
PREDICT_LENGTH = 3   # number of words generated after the seed words

In [3]:
# Read the whole corpus. The context manager guarantees the file handle is
# closed even if read() raises (the manual open/close pair leaked it).
with open(INPUT_FILE_NAME, 'r', encoding='utf-8') as file:
    raw_text = file.read()

# Split the raw text into a list of word tokens.
# `text` is consumed by later cells as the word list, so that name is kept.
text = text_to_word_sequence(raw_text)

# Build the training data: every WINDOW_STEP words, take WINDOW_LENGTH
# consecutive words as the input fragment and the word that immediately
# follows the fragment as its target.
fragments = []
targets = []

for i in range(0, len(text) - WINDOW_LENGTH, WINDOW_STEP):
    fragments.append(text[i: i + WINDOW_LENGTH])
    targets.append(text[i + WINDOW_LENGTH])

In [4]:
# Convert words to integer indices. The tokenizer is fitted on the word list
# and then applied to the fragments (lists of words) and to the targets
# (single words), using 'UNK' as the out-of-vocabulary token.
tokenizer = Tokenizer(num_words=MAX_WORDS, oov_token='UNK')
tokenizer.fit_on_texts(text)
fragments_indexed = tokenizer.texts_to_sequences(fragments)
targets_indexed = tokenizer.texts_to_sequences(targets)

# Convert to the array formats used for training.
# `np.int` was only an alias of the builtin `int`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
X = np.array(fragments_indexed, dtype=int)

# One-hot encode the targets. float32 halves the memory of this dense
# (n_samples, MAX_WORDS) matrix compared with the float64 default.
y = np.zeros((len(targets_indexed), MAX_WORDS), dtype=np.float32)
for i, target_index in enumerate(targets_indexed):
    # target_index is the index list returned for one target word
    y[i, target_index] = 1

In [5]:
# Build the training network: embedding -> two stacked LSTMs -> dense head
# with a softmax over the vocabulary.
lstm_dropout_options = {'dropout': 0.2, 'recurrent_dropout': 0.2}

training_model = Sequential([
    Embedding(input_dim=MAX_WORDS, output_dim=EMBEDING_WIDTH,
              mask_zero=True, input_length=None),
    # First LSTM returns the full sequence so the second LSTM can consume it
    LSTM(128, return_sequences=True, **lstm_dropout_options),
    LSTM(128, **lstm_dropout_options),
    Dense(128, activation='relu'),
    Dense(MAX_WORDS, activation='softmax'),
])

# Targets are one-hot vectors, hence categorical cross-entropy
training_model.compile(optimizer='adam', loss='categorical_crossentropy')
training_model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 100)         1000000   
_________________________________________________________________
lstm (LSTM)                  (None, None, 128)         117248    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dense_1 (Dense)              (None, 10000)             1290000   
Total params: 2,555,344
Trainable params: 2,555,344
Non-trainable params: 0
_________________________________________________________________


In [36]:
# Training: 5% of the samples are held out for validation
fit_options = dict(
    validation_split=0.05,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=2,
    shuffle=True,
)
history = training_model.fit(X, y, **fit_options)

Train on 24179 samples, validate on 1273 samples
Epoch 1/32
24179/24179 - 45s - loss: 4.2620 - val_loss: 12.3880
Epoch 2/32
24179/24179 - 43s - loss: 4.2108 - val_loss: 12.6769
Epoch 3/32
24179/24179 - 43s - loss: 4.2084 - val_loss: 12.1934
Epoch 4/32
24179/24179 - 43s - loss: 4.2148 - val_loss: 12.1311
Epoch 5/32
24179/24179 - 43s - loss: 4.3169 - val_loss: 12.8531
Epoch 6/32
24179/24179 - 43s - loss: 4.1419 - val_loss: 13.4457
Epoch 7/32
24179/24179 - 44s - loss: 4.0140 - val_loss: 13.5498
Epoch 8/32
24179/24179 - 46s - loss: 3.9556 - val_loss: 13.8036
Epoch 9/32
24179/24179 - 48s - loss: 3.9006 - val_loss: 13.9680
Epoch 10/32
24179/24179 - 47s - loss: 3.8613 - val_loss: 14.1043
Epoch 11/32
24179/24179 - 52s - loss: 3.8141 - val_loss: 14.2515
Epoch 12/32
24179/24179 - 49s - loss: 3.7673 - val_loss: 14.3641
Epoch 13/32
24179/24179 - 50s - loss: 3.7219 - val_loss: 14.4267
Epoch 14/32
24179/24179 - 50s - loss: 3.6776 - val_loss: 14.6533
Epoch 15/32
24179/24179 - 50s - loss: 3.6337 - val

In [37]:
# Build the model used for prediction.
#
# It mirrors the training architecture layer for layer so the trained weights
# can be copied over, with one difference: the LSTM layers are stateful and
# the input shape is fixed to one sample of one time step. The prediction
# loop feeds a single word per predict() call; without stateful=True each
# call would start from a zero state and the model would see no context
# beyond that one word.

inference_model = Sequential()
# Stateful RNNs need a fixed batch size, declared here as (batch=1, steps=1)
inference_model.add(Embedding(
    output_dim=EMBEDING_WIDTH, input_dim=MAX_WORDS,
    mask_zero=True, batch_input_shape=(1, 1)))
inference_model.add(LSTM(128, return_sequences=True,
                         dropout=0.2, recurrent_dropout=0.2,
                         stateful=True))
inference_model.add(LSTM(128,
                         dropout=0.2, recurrent_dropout=0.2,
                         stateful=True))
inference_model.add(Dense(128, activation='relu'))
inference_model.add(Dense(MAX_WORDS, activation='softmax'))

# Copy the trained parameters into the inference model
weights = training_model.get_weights()
inference_model.set_weights(weights)

In [38]:
# Initialise the seed phrase

first_words = ['i', 'saw']
first_words_indexed = tokenizer.texts_to_sequences(first_words)

# Clear any recurrent state left over from a previous run of this cell.
# (The original line lacked the call parentheses, so it never ran.)
inference_model.reset_states()
predicted_string = ''

# Single-word input buffer, reused for every predict() call.
# `np.int` was removed from NumPy; the builtin int is its exact replacement.
x = np.zeros((1, 1), dtype=int)

# Greedy prediction.
# NOTE: feeding one word per call only carries context from earlier words if
# the recurrent layers of inference_model keep their state between predict()
# calls (stateful=True); otherwise each prediction depends on one word only.

# Feed the seed words to the model, keeping the prediction after the last one
for i, word_index in enumerate(first_words_indexed):
    x[0][0] = word_index[0]
    predicted_string += first_words[i]
    predicted_string += ' '
    y_predict = inference_model.predict(x, verbose=0)[0]

# Predict the next PREDICT_LENGTH words, feeding each prediction back in
for i in range(PREDICT_LENGTH):
    new_word_index = np.argmax(y_predict)
    word = tokenizer.sequences_to_texts(
        [[new_word_index]])
    x[0][0] = new_word_index
    predicted_string += word[0]
    predicted_string += ' '
    y_predict = inference_model.predict(x, verbose=0)[0]

print(predicted_string)

i saw to to to 
