In [11]:
from __future__ import print_function
import collections
import os
import tensorflow as tf
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Flatten, Dropout, TimeDistributed, Reshape, Lambda
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam, SGD
from keras import backend as K
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
import numpy as np
import argparse
import pdb


In [51]:
data_path = "./inputs"
file_name = "shakespeare.txt"

In [53]:
def read_words(filename):
    with tf.gfile.GFile(filename, "r") as f:
        return f.read().decode("utf-8").replace("\n", "<eos>").split()
    
def build_vocab(filename):
    data = read_words(filename)
    counter = collections.Counter(data)
    count_pairs = sorted(counter.items(), key = lambda x : (-x[1], x[0]))
    words, _ = zip(*count_pairs)
    word_to_id = dict(zip(words, range(len(words))))
    return word_to_id

def file_to_word_ids(filename, word_to_id):
    data = read_words(filename)
    return [word_to_id[x] for x in data if x in word_to_id]

def load_data():
    train_path = os.path.join(data_path, file_name)
    valid_path = os.path.join(data_path, file_name)
    test_path = os.path.join(data_path, file_name)

    # build the complete vocabulary, then convert text data to list of integers
    word_to_id = build_vocab(train_path)
    train_data = file_to_word_ids(train_path, word_to_id)
    valid_data = file_to_word_ids(valid_path, word_to_id)
    test_data = file_to_word_ids(test_path, word_to_id)
    vocabulary = len(word_to_id)
    reversed_dictionary = dict(zip(word_to_id.values(), word_to_id.keys()))
    print(train_data[:5])
    print(word_to_id)
    print(vocabulary)
    print(" ".join([reversed_dictionary[x] for x in train_data[:10]]))
    return train_data, valid_data, test_data, vocabulary, reversed_dictionary

train_data, valid_data, test_data, vocabulary, reversed_dictionary = load_data()

[1432, 1574, 210, 1, 118]
6775
<eos>The First part of King Henry the Sixth<eos>Shakespeare homepage |


In [58]:
class KerasBatchGenerator(object):
    def __init__(self, data, num_steps, batch_size, vocabulary, skip_step = 5):
        self.data = data
        self.num_steps = num_steps
        self.batch_size = batch_size
        self.vocabulary = vocabulary
        self.current_idx = 0
        self.skip_step = skip_step
        
    def generate(self):
        x = np.zeros((self.batch_size, self.num_steps))
        y = np.zeros((self.batch_size, self.num_steps, self.vocabulary))
        while True:
            for i in range(self.batch_size):
                if self.current_idx + self.num_steps >= len(self.data):
                    self.current_idx = 0
                x[i, :] = self.data[self.current_idx:self.current_idx + self.num_steps]
                temp_y = self.data[self.current_idx + 1 : self.current_idx + self.num_steps + 1]
                y[i, :, :] = to_categorical(temp_y, num_classes = self.vocabulary)
                self.current_idx += self.skip_step
                
        yield x, y

In [None]:
num_steps = 30
batch_size = 20
train_data_generator = KerasBatchGenerator(train_data, num_steps, batch_size, vocabulary, skip_step = num_steps)
valid_data_generator = KerasBatchGenerator(valid_data, num_steps, batch_size, vocabulary,
                                           skip_step=num_steps)

for x, y in train_data_generator.generate():
    print(x, y)

In [56]:
hidden_size = 500
use_dropout = True
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length = num_steps))
model.add(LSTM(hidden_size, return_sequences = True))
model.add(LSTM(hidden_size, return_sequences = True))
if use_dropout:
    model.add(Dropout(0.5))
model.add(TimeDistributed(Dense(vocabulary)))
model.add(Activation("softmax"))


optimizer = Adam()
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics=['categorical_accuracy'])


print(model.summary())

6775
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 500)           3387500   
_________________________________________________________________
lstm_1 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 30, 500)           2002000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 30, 500)           0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 6775)          3394275   
_________________________________________________________________
activation_1 (Activation)    (None, 30, 6775)          0         
Total params: 10,785,775
Trainable params: 10,785,775
Non-trainable params: 0
___________________________________________________________

In [None]:
checkpointer = ModelCheckpoint(filepath=data_path + '/model-{epoch:02d}.hdf5', verbose=1)
num_epochs = 50

model.fit_generator(train_data_generator.generate(), len(train_data)//(batch_size*num_steps), num_epochs,
                        validation_data=valid_data_generator.generate(),
                        validation_steps=len(valid_data)//(batch_size*num_steps), callbacks=[checkpointer])
    # model.fit_generator(train_data_generator.generate(), 2000, num_epochs,
    #                     validation_data=valid_data_generator.generate(),
    #                     validation_steps=10)
    model.save(data_path + "final_model.hdf5")
    
