In [51]:
import numpy as np
import sys
import os
import codecs

import keras
from keras.models import Sequential, Model, load_model
from keras.layers import Dense, LSTM, Dropout, Input, Embedding
from keras.optimizers import RMSprop, Adam

from keras.callbacks import ModelCheckpoint, LambdaCallback
from keras.utils import np_utils

import tensorflow as tf

In [16]:
# Load every "Harry*" file from data/ and convert the text to lowercase.
# os.listdir returns names in arbitrary order, so sort them to make the
# concatenated corpus (and every index derived from it) reproducible.
filenames = sorted(name for name in os.listdir('data') if name.startswith("Harry"))
book_texts = []
for filename in filenames:
    # The context manager closes each file promptly instead of leaking the
    # handle. Bytes that cp1252 cannot decode are skipped (errors='ignore').
    with codecs.open(os.path.join('data', filename), 'rb', 'cp1252', errors='ignore') as book_file:
        book_texts.append(book_file.read())
raw_text = "\n".join(book_texts).lower()

In [18]:
# Sanity check: print the total number of characters in the corpus, then
# display a 4000-character excerpt (positions 10000-13999) as the cell output.
print(len(raw_text))
raw_text[10000:14000]

6321551


'with... you know... her crowd." \r\n\r\nmrs. dursley sipped her tea through pursed lips. mr. dursley wondered whether he dared tell her he\'d heard the name "potter." he decided he didn\'t dare. instead he said, as casually as he could, "their son -- he\'d be about dudley\'s age now, wouldn\'t he?" \r\n\r\n"i suppose so," said mrs. dursley stiffly. \r\n\r\n"what\'s his name again? howard, isn\'t it?" \r\n\r\n"harry. nasty, common name, if you ask me." \r\n\r\n"oh, yes," said mr. dursley, his heart sinking horribly. "yes, i quite agree." \r\n\r\nhe didn\'t say another word on the subject as they went upstairs to bed. \r\n\r\nwhile mrs. dursley was in the bathroom, mr. dursley crept to the bedroom window and peered down into the front garden. the cat was still there. \r\n\r\nit was staring down privet drive as though it were waiting for something. \r\n\r\nwas he imagining things? could all this have anything to do with the potters? if it did... if it got out that they were related to a 

In [19]:
# Build the character vocabulary (sorted, so ids are stable for a given corpus)
# together with lookup tables in both directions.
chars = sorted(set(raw_text))
char_to_int = {symbol: idx for idx, symbol in enumerate(chars)}
int_to_char = dict(enumerate(chars))

# Report the corpus size and the vocabulary size.
n_chars, n_vocab = len(raw_text), len(chars)
print("Total Characters: ", n_chars)
print("Total Unique chars: ", n_vocab)

Total Characters:  6321551
Total Unique chars:  78


In [20]:
# Show the vocabulary; a character's position in this list is its integer id.
print(chars)

['\n', '\r', '\x1f', ' ', '!', '"', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '>', '?', '[', '\\', ']', '^', '_', '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{', '}', '~', '¦', '«', '\xad', '»', 'é', 'ü', '–', '‘', '’', '“', '•']


In [21]:
# Build (input window, next character) training pairs with a stride of 1.
# The text is integer-encoded once; every window is then a plain slice of it.
seq_length = 64
encoded_text = [char_to_int[symbol] for symbol in raw_text]
window_starts = range(n_chars - seq_length)
# dataX[k] holds the ids of characters k .. k+seq_length-1 (a fresh list each),
# dataY[k] holds the id of the character that immediately follows that window.
dataX = [encoded_text[begin:begin + seq_length] for begin in window_starts]
dataY = [encoded_text[begin + seq_length] for begin in window_starts]
n_patterns = len(dataX)
print("Total sequences: ", n_patterns)

Total sequences:  6321487


In [29]:
# Stack the integer-encoded windows into a 2-D array of [samples, time steps].
# The ids are left unscaled: they are looked up by the Embedding layer rather
# than treated as numeric features, so no normalisation is applied.
X = np.asarray(dataX).reshape(n_patterns, seq_length)
print(X.shape)
X[:2]

(6321487, 64)


array([[45, 38, 55, 55, 62,  3, 53, 52, 57, 57, 42, 55,  3, 38, 51, 41,  3,
        57, 45, 42,  3, 56, 52, 55, 40, 42, 55, 42, 55,  9, 56,  3, 56, 57,
        52, 51, 42,  3,  1,  0,  1,  0, 40, 45, 38, 53, 57, 42, 55,  3, 52,
        51, 42,  3,  1,  0,  1,  0, 57, 45, 42,  3, 39, 52],
       [38, 55, 55, 62,  3, 53, 52, 57, 57, 42, 55,  3, 38, 51, 41,  3, 57,
        45, 42,  3, 56, 52, 55, 40, 42, 55, 42, 55,  9, 56,  3, 56, 57, 52,
        51, 42,  3,  1,  0,  1,  0, 40, 45, 38, 53, 57, 42, 55,  3, 52, 51,
        42,  3,  1,  0,  1,  0, 57, 45, 42,  3, 39, 52, 62]])

In [26]:
# Targets are kept as integer class ids instead of one-hot vectors, because
# the model is compiled with the sparse categorical cross-entropy loss.
y = np.asarray(dataY)
print(y.shape)
y[:5]

(6321487,)


array([62,  3, 60, 45, 52])

In [34]:
# Character-level language model:
#   integer ids -> Embedding -> single LSTM -> Dropout -> softmax over the vocabulary.
hidden_size = 64
vocab_size = len(chars)

inputs = Input(shape=(X.shape[1],))
embedded = Embedding(vocab_size, hidden_size, input_length=seq_length)(inputs)
lstm_state = LSTM(hidden_size)(embedded)  # only the final LSTM state is kept
regularised = Dropout(0.3)(lstm_state)
output = Dense(vocab_size, activation='softmax')(regularised)

model = Model(inputs, output)
# Sparse loss: the targets are integer class ids, not one-hot vectors.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy')

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 64)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 64, 64)            4992      
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_5 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 78)                5070      
Total params: 43,086
Trainable params: 43,086
Non-trainable params: 0
_________________________________________________________________


In [59]:
def generate_stuff(length=100):
    """Print a random seed sequence and the text the model generates from it.

    A window is picked at random from the module-level ``dataX`` and printed
    both as integer ids and as text. The window is then extended greedily:
    it is fed to ``model.predict``, the index with the highest predicted
    score becomes the next character, and the window slides forward by one.

    Relies on the module-level names ``dataX``, ``int_to_char`` and ``model``.

    Args:
        length: Number of characters to generate. Defaults to 100.

    Returns:
        None. The seed and the generated text are printed.
    """
    # randint's upper bound is exclusive, so len(dataX) makes every window eligible.
    start = np.random.randint(0, len(dataX))
    # Work on a copy: the sliding window is modified below, and appending to
    # the list stored in dataX would lengthen that training sequence in place.
    pattern = list(dataX[start])
    print(pattern)
    print("Seed pattern:")
    print("\"", ''.join([int_to_char[value] for value in pattern]), "\"")

    generated_text = []
    # generate characters
    for _ in range(length):
        x = np.reshape(pattern, (1, len(pattern)))
        prediction = model.predict(x, verbose=0)
        index = int(np.argmax(prediction))
        generated_text.append(index)
        # Slide the window: drop the oldest id and add the newly predicted one.
        pattern = pattern[1:] + [index]
    print("\"", ''.join([int_to_char[value] for value in generated_text]), "\"")

In [None]:
# Train!

# Callbacks:
#   * epoch_print - prints a freshly generated sample at the end of every epoch
#   * checkpoint  - writes the model to disk whenever the training loss improves
#filepath="checkpoints/weights-improvement-{epoch:02d}-{loss:.4f}-gentext-CharRNN-simple.hdf5"
epoch_print = LambdaCallback(on_epoch_end=lambda epoch, logs: generate_stuff())
checkpoint = ModelCheckpoint(
    'Harrypotter_Gen.h5',
    monitor='loss',
    mode='min',
    save_best_only=True,
    verbose=1,
)
callbacks_list = [checkpoint, epoch_print]

model.fit(X, y, batch_size=64, epochs=20, callbacks=callbacks_list)

Epoch 1/20
 233536/6321487 [>.............................] - ETA: 4923s - loss: 1.7244

In [37]:
# Save the model in its current state. This is the same path the
# ModelCheckpoint callback writes to, so the best-loss checkpoint is overwritten.
model.save('Harrypotter_Gen.h5')

In [55]:
# Pick a random seed window and print the text the model generates from it.
generate_stuff()

[40, 52, 50, 42, 3, 58, 53, 3, 46, 51, 3, 52, 58, 55, 3, 52, 60, 49, 15, 5, 1, 0, 39, 58, 57, 3, 45, 38, 44, 55, 46, 41, 3, 50, 42, 55, 42, 49, 62, 3, 62, 38, 60, 51, 42, 41, 3, 60, 46, 41, 42, 49, 62, 3, 38, 51, 41, 3, 40, 38, 56, 57, 3, 38]
Seed pattern:
" come up in our owl."
but hagrid merely yawned widely and cast a "

Done.
[3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38]
" come up in our owl."
but hagrid merely yawned widely and cast an "
" nd the started to the started to the started to the started to the started to the started to the sta "



Done.
[41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52, 3, 57, 45, 42, 3, 56, 57, 38, 55, 57, 42, 41, 3, 57, 52]
"  until proven guilty, severus," he said firmly.
snape looked fur "
" started to the started to the started to the started to the started to the started to the started to "
