In [1]:
from __future__ import division, print_function
import numpy as np

from keras.models import Sequential
from keras.layers import Embedding, Dropout, Activation
from keras.layers import LSTM, TimeDistributed, Dense
from keras.optimizers import Adam
from keras.preprocessing import sequence

Using Theano backend.
Using gpu device 0: GeForce GTX 850M (CNMeM is disabled, cuDNN 5105)


In [2]:
sentences = []
fd = open('data/sentences.txt', 'r')
for line in fd:
    sentences.append(line[:-1])
fd.close()

In [3]:
print('Number of sentences:', len(sentences))

Number of sentences: 29878


In [4]:
corpus = ' '.join(sentences)
print('Corpus length:', len(corpus))

Corpus length: 1277948


In [5]:
chars = sorted(list(set(corpus)))
vocab_size = len(chars)
print('Total chars:', len(chars))
print(chars)

Total chars: 72
[' ', '!', '"', "'", ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
char_indices = dict((c, i) for i,c in enumerate(chars))
indices_char = dict((i, c) for i,c in enumerate(chars))

In [7]:
corpus_idx = [char_indices[c] for c in corpus]
sentences_idx = []
for sentence in sentences:
    sentences_idx.append([char_indices[c] for c in sentence])

# Preprocessing

In [8]:
maxlen = 40

In [9]:
x = []
y = []
for i in range(0, len(corpus_idx) - maxlen+1):
    x.append(corpus_idx[i:i+maxlen])
    y.append(corpus_idx[i+1:i+maxlen+1])
print('x:', len(x), 'y:', len(y))

x: 1277909 y: 1277909


In [10]:
x = np.concatenate([[np.array(i)] for i in x[:-2]])
y = np.concatenate([[np.array(i)] for i in y[:-2]])
print('x:', x.shape, 'y:', y.shape)

x: (1277907, 40) y: (1277907, 40)


# Model

In [11]:
model = Sequential()

model.add(Embedding(vocab_size, 24, input_length=maxlen))
model.add(LSTM(
            128,
            return_sequences=True,
            dropout_U=0.2,
            dropout_W=0.2,
            consume_less='gpu'))
model.add(Dropout(0.2))
model.add(LSTM(
            128,
            return_sequences=True,
            dropout_U=0.2,
            dropout_W=0.2,
            consume_less='gpu'))
model.add(TimeDistributed(Dense(vocab_size)))
model.add(Activation('softmax'))

In [12]:
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam()
)

In [13]:
def print_example(start, n_char, maxlen):
    for i in range(n_char):
        start_idx = [char_indices[c] for c in start]
        z = sequence.pad_sequences([start_idx], maxlen=maxlen, truncating='pre')
        pred = model.predict_classes(z, verbose=0)[0][-1]
        start = start + indices_char[pred]
    print(start)

In [14]:
def train():
    model.fit(
        x,
        np.expand_dims(y, -1),
        batch_size=64,
        nb_epoch=1,
        shuffle=False
    )

In [16]:
train()
print_example('I am ', 100, maxlen)

Epoch 1/1
I am the war the Stark the war the Stark the North the Stark the war the Stark the North the Stark the wa


In [17]:
train()
print_example('I am ', 100, maxlen)

Epoch 1/1
I am the war the war the war the war the war the war the war the war the war the war the war the war the 


In [18]:
train()
print_example('I am ', 100, maxlen)

Epoch 1/1
I am the way to stay the way to stay the way to stay the way to stay the way to stay the way to stay the 


In [19]:
train()
print_example('I am ', 100, maxlen)


Epoch 1/1
I am the way to stay to stay the way to stay to stay the way to stay to stay the way to stay to stay the 


In [None]:
train()
print_example('I am ', 100, maxlen)


Epoch 1/1
  55936/1277907 [>.............................] - ETA: 1051s - loss: 1.5255