https://www.gutenberg.org/cache/epub/41445/pg41445.txt

In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import LSTM
import tensorflow as tf
import logging
tf.get_logger().setLevel(logging.ERROR)

In [2]:
EPOCHS = 32
BATCH_SIZE = 256
INPUT_FILE_NAME = 'C:/Users/pipel/Documents/Javeriana Topicos/Sesion 3/Version 2/Frankenstein.txt'
WINDOW_LENGTH = 40
WINDOW_STEP = 3
BEAM_SIZE = 8
NUM_LETTERS = 11
MAX_LENGTH = 50

In [3]:
# Abrir el archivo
file = open(INPUT_FILE_NAME, 'r', encoding='utf-8')
text = file.read()
file.close()

# convertir el texto a minusculas y quitar saltos de linea y espacios adicionales
text = text.lower()
text = text.replace('\n', ' ')
text = text.replace('\ufeff', ' ')
text = text.replace('  ', '')

# codificar caracteres como indices
unique_chars = list(set(text))
char_to_index = dict((ch, index) for index,
                    ch in enumerate(unique_chars))
index_to_char = dict((index, ch) for index,
                    ch in enumerate(unique_chars))
encoding_with = len(char_to_index)

In [4]:
char_to_index

{' ': 0,
 'ë': 1,
 '*': 2,
 '’': 3,
 'b': 4,
 'z': 5,
 '-': 6,
 '5': 7,
 'g': 8,
 'ô': 9,
 '#': 10,
 'q': 11,
 ';': 12,
 '(': 13,
 ')': 14,
 ':': 15,
 '1': 16,
 'w': 17,
 't': 18,
 '7': 19,
 '.': 20,
 'â': 21,
 'i': 22,
 '‘': 23,
 'j': 24,
 '_': 25,
 '“': 26,
 'p': 27,
 '4': 28,
 '&': 29,
 'f': 30,
 '2': 31,
 's': 32,
 ',': 33,
 '3': 34,
 'k': 35,
 '8': 36,
 '—': 37,
 'ê': 38,
 '%': 39,
 '$': 40,
 '6': 41,
 'm': 42,
 '[': 43,
 'e': 44,
 'o': 45,
 'r': 46,
 ']': 47,
 'a': 48,
 '”': 49,
 '?': 50,
 '9': 51,
 'y': 52,
 'h': 53,
 'u': 54,
 'v': 55,
 '/': 56,
 '0': 57,
 'c': 58,
 'd': 59,
 'n': 60,
 'æ': 61,
 '!': 62,
 'x': 63,
 'l': 64}

In [5]:
#crear Window
fragments = []
targets = []

for i in range (0, len(text)- WINDOW_LENGTH, WINDOW_STEP):
    fragments.append (text [i : i+WINDOW_LENGTH])
    targets.append(text[i+WINDOW_LENGTH])
    
print('fragment: '+fragments[0] + '      Target: ' + targets[0])
print('fragment: '+fragments[1] + '      Target: ' + targets[1])

fragment:  the project gutenberg ebook of frankens      Target: t
fragment: e project gutenberg ebook of frankenstei      Target: n


In [6]:
# one-hot encoding

x = np.zeros((len(fragments), WINDOW_LENGTH, encoding_with))
y = np.zeros((len(fragments), encoding_with))

for i, fragment in enumerate(fragments):
    for j, char in enumerate(fragment):
        x[i,j, char_to_index[char]] = 1
    target_char = targets[i]
    y[i, char_to_index[target_char]] = 1
    
print(x[0])
print(y[0])

[[1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]


In [7]:
#Entrenar Modelo

model = Sequential()
model.add(LSTM(128, return_sequences=True,
                    dropout=0.2,
                    recurrent_dropout=0.2,
                    input_shape=(None,encoding_with)))

model.add(LSTM(128, dropout=0.2,
                    recurrent_dropout=0.2))

model.add(Dense(encoding_with, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam')

model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm (LSTM)                  (None, None, 128)         99328     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 65)                8385      
Total params: 239,297
Trainable params: 239,297
Non-trainable params: 0
_________________________________________________________________


In [8]:
#ejecutar modelo

history = model.fit(x, y, validation_split=0.05,
                    batch_size=BATCH_SIZE,
                    epochs=EPOCHS,
                    shuffle=True)

Train on 134581 samples, validate on 7084 samples
Epoch 1/32
Epoch 2/32
Epoch 3/32
Epoch 4/32
Epoch 5/32
Epoch 6/32
Epoch 7/32
Epoch 8/32
Epoch 9/32
Epoch 10/32
Epoch 11/32
Epoch 12/32
Epoch 13/32
Epoch 14/32
Epoch 15/32
Epoch 16/32
Epoch 17/32
Epoch 18/32
Epoch 19/32
Epoch 20/32
Epoch 21/32
Epoch 22/32
Epoch 23/32
Epoch 24/32
Epoch 25/32
Epoch 26/32
Epoch 27/32
Epoch 28/32
Epoch 29/32
Epoch 30/32
Epoch 31/32
Epoch 32/32


In [9]:
#crear un beam inicial

letters = 'the body '
one_hots = []
for i, char in enumerate(letters):
    x = np.zeros(encoding_with)
    x[char_to_index[char]] = 1
    one_hots.append(x)

beams = [(np.log(1.0), letters, one_hots)]

In [10]:
#predecir letras futuras

for i in range(NUM_LETTERS):
    minibatch_list =[]
    
    #crear minibatch desde one-hot y predecir
    for triple in beams:
        minibatch_list.append(triple[2])
        
    minibatch = np.array(minibatch_list)
    y_predict = model.predict(minibatch, verbose=0)
    new_beams = []
    
    for j, softmax_vec in enumerate (y_predict):
        triple = beams[j]
        #crea nuevos beams desde los existentes
        for k in range(BEAM_SIZE):
            char_index = np.argmax(softmax_vec)
            new_prob = triple[0] + np.log(softmax_vec[char_index])
            new_letters = triple[1] + index_to_char[char_index]
            x = np.zeros(encoding_with)
            x[char_index]=1
            new_one_hots = triple[2].copy()
            new_one_hots.append(x)
            new_beams.append((new_prob, new_letters, new_one_hots))
            softmax_vec[char_index]=0
        
    #limpiar el arbol para quedarse con los beams mas probables
    new_beams.sort(key=lambda tup: tup[0], reverse=True)
    beams = new_beams[0:BEAM_SIZE]
    
for item in beams:
    print(item[1])

the body of the count of the 
the body of the death of the 
the body of the should of the
the body of the countion of t
the body of the should of my 
the body of the countion of m
the body of the countion of h
the body of the countion of a
