In [1]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from sklearn.metrics import accuracy_score



In [2]:
def get_model(X, Y, units=512):
    """Build, compile and summarize the LSTM next-character classifier.

    The stack is ``LSTM(units) -> Dropout(0.3) -> Dense(softmax)``, compiled
    with categorical cross-entropy loss and the Adam optimizer.

    Args:
        X: array-like with a ``shape``; ``shape[1]`` and ``shape[2]`` become the
            LSTM ``input_shape`` (presumably timesteps and features per step).
        Y: array-like with a ``shape``; ``shape[1]`` is the width of the
            softmax output layer.
        units: number of LSTM units.

    Returns:
        The compiled ``Sequential`` model. Its summary is printed as a side
        effect.
    """
    network = Sequential()

    # Recurrent layer sized from the per-sample shape of X.
    window_shape = (X.shape[1], X.shape[2])
    network.add(LSTM(units, input_shape=window_shape))

    # Regularization between the recurrent layer and the classifier head.
    network.add(Dropout(0.3))

    # One softmax output per column of Y.
    class_count = Y.shape[1]
    network.add(Dense(class_count, activation='softmax'))

    network.compile(optimizer='adam', loss='categorical_crossentropy')
    network.summary()
    return network

In [3]:

def read_and_process_text(filename, encoding=None):
    """Read a text file and turn it into a character vocabulary plus sentences.

    Processing steps:
      1. Read the whole file and lower-case it.
      2. Replace newlines, quotes, brackets, digits and the other listed
         punctuation with spaces, and replace ``?`` / ``!`` with ``.`` so
         that every sentence ends the same way.
      3. Collapse every run of whitespace into a single space.
      4. Build the sorted character vocabulary and its one-hot table.
      5. Split on ``.`` and re-append the ``.`` to every non-empty piece.

    Args:
        filename: path of the text file to read.
        encoding: text encoding passed to ``open``; ``None`` (the default)
            keeps the platform default.

    Returns:
        A 4-tuple ``(one_hot, char_to_int, letters, sentences)``:
          * ``one_hot``: list of lists; row ``i`` is the one-hot vector
            (floats) of ``letters[i]``.
          * ``char_to_int``: dict mapping each character to its row index.
          * ``letters``: sorted list of the distinct characters that remain.
          * ``sentences``: list of strings, each ending in ``.``. Pieces that
            follow another sentence keep their leading space.
    """
    # The context manager closes the handle deterministically instead of
    # leaving it to the garbage collector.
    with open(filename, encoding=encoding) as handle:
        raw_text = handle.read().lower()

    # The two replacement sets are disjoint and neither output character is an
    # input of the other, so a single table gives the same result as two passes.
    translation_table = dict.fromkeys(map(ord, '\n\r\"\'()&$*-0123456789:;[]_`'), ' ')
    translation_table.update(dict.fromkeys(map(ord, '?!'), '.'))
    raw_text = " ".join(raw_text.translate(translation_table).split())

    letters = sorted(set(raw_text))
    char_to_int = {c: i for i, c in enumerate(letters)}

    # One-hot rows for indices 0..n-1 are exactly the identity matrix; building
    # it with NumPy also copes with an empty vocabulary.
    one_hot = np.eye(len(letters)).tolist()

    sentences = [piece + "." for piece in raw_text.split(".") if piece]
    return one_hot, char_to_int, letters, sentences


def create_dataset(one_hot, char_to_int, letters, list_sentences, seq_length):
    """Slice every sentence into (window, next character) training pairs.

    For each sentence, every run of ``seq_length`` consecutive characters that
    is followed by at least one more character yields one sample: the window
    is encoded as a list of one-hot rows and the following character as a
    single one-hot row. Windows never span two sentences, and sentences no
    longer than ``seq_length`` contribute nothing.

    Args:
        one_hot: table of one-hot rows, indexed by character id.
        char_to_int: mapping from character to its row index in ``one_hot``.
        letters: the vocabulary; only its length is used, for the printout.
        list_sentences: iterable of sentence strings.
        seq_length: number of characters per input window.

    Returns:
        ``(X, Y)`` as NumPy arrays built from the collected windows and
        targets.

    Side effects:
        Prints the vocabulary size and the number of generated samples.
    """
    def encode(ch):
        # Look up the one-hot row of a single character.
        return one_hot[char_to_int[ch]]

    windows, targets = [], []
    for sentence in list_sentences:
        # ``stop`` is the index of the character to predict; the window is the
        # ``seq_length`` characters immediately before it.
        for stop in range(seq_length, len(sentence)):
            start = stop - seq_length
            windows.append([encode(ch) for ch in sentence[start:stop]])
            targets.append(encode(sentence[stop]))

    print("Total Vocab: ", len(letters))
    print("Total Sequents: ", len(windows))
    return np.array(windows), np.array(targets)


def run_model(model, X, Y, path, epochs=10):
    """Train ``model`` on ``(X, Y)`` and checkpoint it whenever the loss improves.

    A ``ModelCheckpoint`` callback watches the training ``loss`` (lower is
    better) and, because ``save_best_only`` is set, writes to
    ``path + "-weights.hdf5"`` only on epochs that beat the best loss so far.

    Args:
        model: object exposing a Keras-style ``fit`` method.
        X: training inputs passed to ``fit``.
        Y: training targets passed to ``fit``.
        path: prefix of the checkpoint file name.
        epochs: number of training epochs.

    Returns:
        None. The value returned by ``fit`` is discarded.
    """
    best_loss_saver = ModelCheckpoint(
        path + "-weights.hdf5",
        monitor='loss',
        verbose=1,
        save_best_only=True,
        mode='min',
    )
    model.fit(X, Y, epochs=epochs, batch_size=64, callbacks=[best_loss_saver])


def load_model(filename, model):
    """Load saved weights into ``model`` in place, then recompile it.

    Args:
        filename: path of the weights file handed to ``model.load_weights``.
        model: object exposing Keras-style ``load_weights`` and ``compile``.

    Returns:
        None. ``model`` itself is modified.
    """
    model.load_weights(filename)
    # Same loss / optimizer pair that get_model compiles with.
    compile_options = {'loss': 'categorical_crossentropy', 'optimizer': 'adam'}
    model.compile(**compile_options)

In [4]:
def LSTM_gen(string, seq_length, letters, one_hot, char_to_int, model, max_len=1000):
    """Extend ``string`` one character at a time using greedy model predictions.

    The last ``seq_length`` characters of ``string`` form the first input
    window. On every step the model is asked for the next character, the most
    probable one is appended, and the window slides forward by one character.

    Args:
        string: seed text. Every character of its last ``seq_length``
            characters must be a key of ``char_to_int`` (otherwise a
            ``KeyError`` is raised).
        seq_length: number of characters in the model's input window.
        letters: vocabulary; ``letters[i]`` is the character for class ``i``.
        one_hot: table of one-hot rows, indexed by class id.
        char_to_int: mapping from character to class id.
        model: object with a Keras-style ``predict(x, verbose=0)`` that
            returns class scores for a batch containing one window.
        max_len: generation stops once the text is longer than this, so the
            result can be up to ``max_len + 1`` characters long.

    Returns:
        The seed followed by the generated characters. Generation ends right
        after a ``.`` is produced or when the length limit is exceeded. If the
        seed is shorter than ``seq_length`` a message is printed and the seed
        is returned unchanged.
    """
    # A seed of exactly seq_length characters already fills one full window,
    # so only strictly shorter seeds have to be rejected.
    if seq_length > len(string):
        print("Cannot predict(too short init string)")
        return string

    current_window = [one_hot[char_to_int[ch]] for ch in string[-seq_length:]]
    while True:
        # Batch of one window for the model.
        x = np.array([current_window])
        prediction = model.predict(x, verbose=0)
        # Greedy decoding: always take the highest-scoring class.
        index = np.argmax(prediction)
        result = letters[index]
        string += result
        if result == '.' or len(string) > max_len:
            return string
        # Slide the window: drop the oldest character, add the new one.
        current_window = current_window[1:] + [one_hot[index]]

In [5]:
# Number of characters per input window. It is passed to create_dataset and
# LSTM_gen, and it is also part of the weights file name ("runs/<seq_length>-weights.hdf5").
seq_length = 20

In [6]:
# Read the book, build the character vocabulary / one-hot table and split the
# text into '.'-terminated sentences.
one_hot, char_to_int, letters, list_sentences = read_and_process_text("data/alice_in_wonderland.txt")

# Turn every sentence into (seq_length-character window, next character) pairs.
[X, Y] = create_dataset(one_hot, char_to_int, letters, list_sentences, seq_length)
# The sentence list is not referenced again in the cells below, so drop it.
del list_sentences
print(letters)


Total Vocab:  29
Total Sequents:  107966
[' ', ',', '.', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [7]:
# Build the network from the dataset shapes, then restore saved weights
# instead of retraining.
model = get_model(X, Y)
load_model("runs/" + str(seq_length) + "-weights.hdf5", model)
# Uncomment to train for 1000 epochs; run_model checkpoints to the same
# "runs/<seq_length>-weights.hdf5" file that is loaded above.
# run_model(model, X, Y, "runs/" + str(seq_length), 1000)


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm (LSTM)                 (None, 512)               1110016   
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense (Dense)               (None, 29)                14877     
                                                                 
Total params: 1,124,893
Trainable params: 1,124,893
Non-trainable params: 0
_________________________________________________________________


In [8]:
def print_gen(string):
    """Print ``string`` extended by the notebook's trained model.

    Thin convenience wrapper around ``LSTM_gen`` that supplies the
    notebook-level ``seq_length``, ``letters``, ``one_hot``, ``char_to_int``
    and ``model``; ``max_len`` is left at its default.

    Args:
        string: seed text to continue.
    """
    print(
        LSTM_gen(
            string=string,
            seq_length=seq_length,
            letters=letters,
            one_hot=one_hot,
            char_to_int=char_to_int,
            model=model,
        )
    )


In [9]:
# Reference continuation for this seed (presumably the original book text):
#   "never once considering how in the world she was to get out again."
print_gen("in another moment down went alice after it")

in another moment down went alice after it, never once considering how in the world she was to get out again.


In [10]:
# Reference continuation for this seed (presumably the original book text):
#   "but the rabbit was no longer to be seen: she found herself in a long,
#    low hall, which was lit up by a row of lamps hanging from the roof."
print_gen("she was close behind it when she turned the corner,")

she was close behind it when she turned the corner, but the rabbit was no longer to be seen she found herself in a long, low hall, which was lit up by a row of lamps hanging from the roof.


In [11]:
# Seed that presumably does not occur in the book, to see how the model
# continues text it has not seen.
print_gen("she has mastered machine learning")

she has mastered machine learning to the king, and the hatter hurriedly left the court, without even waiting to put his shoes on.


In [12]:
# Another seed that presumably does not occur in the book.
print_gen("i will eat this plunger for world peace")

i will eat this plunger for world peace with the white rabbit cried out, silence in the court.


In [13]:
def accuracy(X, Y, model):
    """Score the model's arg-max predictions on ``X`` against one-hot ``Y``.

    Args:
        X: inputs handed to ``model.predict``.
        Y: iterable of target rows; the arg-max of each row is the true class.
        model: object exposing a Keras-style ``predict`` method.

    Returns:
        Whatever ``accuracy_score`` returns for the true and predicted class
        ids.
    """
    scores = model.predict(X)
    # Reduce each row of scores / targets to the index of its largest entry.
    predicted_ids = [np.argmax(row) for row in scores]
    true_ids = [np.argmax(row) for row in Y]
    return accuracy_score(true_ids, predicted_ids)

In [14]:
# NOTE(review): X and Y are the windows built above from the full text, which is
# presumably also what the loaded weights were trained on, so this looks like
# accuracy on the training data rather than on held-out data. Confirm.
print(accuracy(X, Y, model))

0.9926458329474094
