In [1]:
import glob
import os
import sys

import numpy
from keras.callbacks import ModelCheckpoint
from keras.layers import Dense, Dropout, LSTM
from keras.models import Sequential
from keras.utils import np_utils

In [3]:
# Load the corpus and lowercase it (upper- and lower-case letters become one symbol).
# The context manager closes the file handle as soon as the text is read.
# The encoding is spelled out so the accented characters do not depend on the
# platform default -- assumes the file is stored as UTF-8; adjust if it is not.
filename = "domCasmurro.txt"
with open(filename, encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()
raw_text = raw_text.lower()

In [4]:
# Display the loaded, lowercased corpus.
# NOTE(review): this prints the whole text; consider printing a slice to keep the output small.
print(raw_text)

                                  dom casmurro

                                                                             texto de referência:
                                                     obras completas de machado de assis, vol. i,
                                                              nova aguilar, rio de janeiro, 1994.


                                publicado originalmente pela editora garnier, rio de janeiro, 1899.




                             capítulo primeiro
                                 do título

uma noite destas, vindo da cidade para o engenho novo, encontrei no trem da
central um rapaz aqui do bairro, que eu conheço de vista e de chapéu.
cumprimentou-me, sentou-se ao pé de mim, falou da lua e dos ministros, e
acabou recitando-me versos. a viagem era curta, e os versos pode ser que não
fossem inteiramente maus. sucedeu, porém, que, como eu estava cansado, fechei
os olhos três ou quatro vezes; tanto bastou para que ele interrompesse a leitura e
mete

In [5]:
# Character vocabulary: every distinct symbol found in the corpus, in sorted order.
# A symbol's position in this list becomes its integer code.
chars = sorted(set(raw_text))
print(chars)

['\n', '\x0c', ' ', '!', '"', '$', "'", '(', ')', '+', ',', '-', '.', '0', '1', '2', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'z', '°', 'à', 'á', 'â', 'ã', 'ç', 'é', 'ê', 'í', 'ó', 'ô', 'õ', 'ú', 'ü', '—', '”']


In [6]:
# Lookup table from each character to its position in the sorted vocabulary
char_to_int = {symbol: position for position, symbol in enumerate(chars)}

In [7]:
# Show the character -> integer code mapping
print(char_to_int)

{'\n': 0, '\x0c': 1, ' ': 2, '!': 3, '"': 4, '$': 5, "'": 6, '(': 7, ')': 8, '+': 9, ',': 10, '-': 11, '.': 12, '0': 13, '1': 14, '2': 15, '4': 16, '5': 17, '6': 18, '7': 19, '8': 20, '9': 21, ':': 22, ';': 23, '=': 24, '?': 25, 'a': 26, 'b': 27, 'c': 28, 'd': 29, 'e': 30, 'f': 31, 'g': 32, 'h': 33, 'i': 34, 'j': 35, 'k': 36, 'l': 37, 'm': 38, 'n': 39, 'o': 40, 'p': 41, 'q': 42, 'r': 43, 's': 44, 't': 45, 'u': 46, 'v': 47, 'w': 48, 'x': 49, 'z': 50, '°': 51, 'à': 52, 'á': 53, 'â': 54, 'ã': 55, 'ç': 56, 'é': 57, 'ê': 58, 'í': 59, 'ó': 60, 'ô': 61, 'õ': 62, 'ú': 63, 'ü': 64, '—': 65, '”': 66}


In [10]:
# Corpus length (in characters) and vocabulary size (distinct characters)
n_chars, n_vocab = len(raw_text), len(chars)
print("Number of chars in Text:", n_chars)
print("Number of different chars in Text:", n_vocab)

Number of chars in Text: 383347
Number of different chars in Text: 67


In [11]:
# Splitting raw_text into overlapping windows of seq_length characters.
# Each window (as integer codes) is one input pattern; the character that
# immediately follows the window is its target.
seq_length = 100
dataX = []
dataY = []

# Encode the corpus once up front instead of re-mapping every character
# for each of the seq_length windows that contain it.
encoded_text = [char_to_int[char] for char in raw_text]

for i in range(0, n_chars - seq_length, 1):
    # Slicing yields a fresh list per window, so patterns never share storage.
    dataX.append(encoded_text[i: i + seq_length])
    dataY.append(encoded_text[i + seq_length])
n_patterns = len(dataX)
print("Total Patterns", n_patterns)

Total Patterns 383247


In [13]:
# Inspect two encoded samples: the input window (list of character codes) and its target code
print(dataX[0], dataY[0])
print(dataX[5000], dataY[5000])

[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 29, 40, 38, 2, 28, 26, 44, 38, 46, 43, 43, 40, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2] 2
[47, 34, 29, 26, 2, 41, 34, 40, 43, 23, 2, 57, 2, 40, 46, 45, 43, 26, 2, 28, 40, 34, 44, 26, 12, 2, 26, 2, 28, 30, 43, 45, 40, 44, 0, 43, 30, 44, 41, 30, 34, 45, 40, 44, 10, 2, 26, 42, 46, 30, 37, 26, 2, 47, 34, 29, 26, 2, 26, 39, 45, 34, 32, 26, 2, 26, 41, 26, 43, 30, 28, 30, 11, 38, 30, 2, 29, 30, 44, 41, 34, 29, 26, 2, 29, 30, 2, 38, 46, 34, 45, 40, 44, 2, 30, 39, 28, 26, 39, 45] 40


In [14]:
# Arrange the patterns as [samples, time steps, features] for the Keras LSTM;
# every time step carries a single feature (the character code).
X = numpy.asarray(dataX).reshape(n_patterns, seq_length, 1)

# Normalization: divide the integer codes by the vocabulary size
X = X / float(n_vocab)

In [15]:
# One-hot encode the target characters.
# num_classes is pinned to the vocabulary size so the output width always matches
# the character mapping, even if the highest-indexed character never occurs as a target.
y = np_utils.to_categorical(dataY, num_classes=n_vocab)

In [16]:
# Inspect the same two samples after reshaping/scaling (X) and one-hot encoding (y)
print(X[0], y[0])
print(X[5000], y[5000])

[[0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.43283582]
 [0.59701493]
 [0.56716418]
 [0.02985075]
 [0.41791045]
 [0.3880597 ]
 [0.65671642]
 [0.56716418]
 [0.68656716]
 [0.64179104]
 [0.64179104]
 [0.59701493]
 [0.        ]
 [0.        ]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02985075]
 [0.02

In [17]:
# Check the shapes of the input and target arrays
print(X.shape, y.shape)

(383247, 100, 1) (383247, 67)


In [18]:
# Creating LSTM Model: two stacked 256-unit LSTM layers, each followed by 20% dropout,
# then a softmax layer with one output per column of y.
n_timesteps, n_features = X.shape[1], X.shape[2]
model = Sequential([
    # return_sequences=True passes the full sequence on to the second LSTM
    LSTM(256, input_shape=(n_timesteps, n_features), return_sequences=True),
    Dropout(0.2),
    LSTM(256),
    Dropout(0.2),
    Dense(y.shape[1], activation='softmax'),
])
model.compile(loss='categorical_crossentropy', optimizer='adam')

In [19]:
# Defining Checkpoint: monitor the training loss and save only when it reaches a new minimum.
# The epoch number and loss value are substituted into the file name template.
filepath = "weights-imporvement-{epoch:02d}-{loss:.4f}.hdf5"
checkpoint = ModelCheckpoint(
    filepath,
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint]

In [21]:
# Train for a single epoch with batches of 64 samples; the checkpoint callback is passed via callbacks_list
model.fit(X, y, epochs=1, batch_size = 64, callbacks=callbacks_list)


Epoch 00001: loss improved from inf to 2.70541, saving model to weights-imporvement-01-2.7054.hdf5


<tensorflow.python.keras.callbacks.History at 0x7fbdd052e580>

In [22]:
# Load the weights written by the checkpoint callback.
# The checkpoint file name embeds the epoch's loss, which differs from run to run,
# so the file is looked up on disk instead of being hard-coded; the most recently
# written match is used. The pattern must stay in sync with the ModelCheckpoint
# file name template.
checkpoint_files = glob.glob("weights-imporvement-*.hdf5")
if not checkpoint_files:
    raise FileNotFoundError(
        "No checkpoint matching weights-imporvement-*.hdf5 was found; train the model first."
    )
filename = max(checkpoint_files, key=os.path.getmtime)
model.load_weights(filename)
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam')

In [23]:
# Show which weights file was loaded
print(filename)

weights-imporvement-01-2.7054.hdf5


In [24]:
# Reverse lookup: integer code -> character (position in the sorted vocabulary -> symbol)
int_to_char = dict(enumerate(chars))

In [25]:
# Random Seed: draw the index of the pattern that seeds generation,
# uniformly from 0 up to (but excluding) len(dataX) - 100
start = numpy.random.randint(len(dataX) - 100)
print(start)

322344


In [28]:
# Start generating text from the random seed pattern.
# Copy the seed: appending to dataX[start] directly would lengthen that training
# pattern in place and leave dataX ragged for any later use.
pattern = list(dataX[start])
print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

# Generating Chars: predict the most likely next character (argmax over the softmax
# output), emit it, then slide the window forward by one position.
for i in range(1000):
    x = numpy.reshape(pattern, (1, len(pattern), 1))
    x = x / float(n_vocab)  # scale the codes by the vocabulary size before predicting
    prediction = model.predict(x, verbose=0)
    index = int(numpy.argmax(prediction))  # plain int keeps the window a list of Python ints
    result = int_to_char[index]
    sys.stdout.write(result)
    pattern.append(index)
    pattern = pattern[1:]
print("\nConcluído.")

"  que se pode acrescentar que nem tudo o
que dura, dura muito tempo. esta segunda parte não acha crena  "
do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do comta do