In [14]:
from __future__ import division, print_function
import numpy as np

from keras.models import Sequential
from keras.layers import Embedding, Dropout, Activation
from keras.layers import LSTM, TimeDistributed, Dense
from keras.optimizers import Adam
from keras.preprocessing import sequence

In [2]:
# Load the corpus: one sentence per line of data/sentences.txt.
# The `with` block guarantees the file handle is closed even if reading fails.
# Only the line terminator is stripped (instead of blindly dropping the final
# character), so the last sentence stays intact when the file does not end
# with a trailing newline.
with open('data/sentences.txt', 'r') as fd:
    sentences = [line.rstrip('\n') for line in fd]

In [3]:
# Sanity check: how many sentences were read from the data file.
print('Number of sentences: {}'.format(len(sentences)))

Number of sentences: 29878


In [4]:
# Flatten the sentence list into one long string (sentences separated by a
# single space) so the character inventory can be computed over all the text.
corpus = ' '.join(sentences)
print('Corpus length: {}'.format(len(corpus)))

Corpus length: 1277948


In [5]:
# Character vocabulary: every distinct character of the corpus, sorted so the
# character -> id assignment made from this list is deterministic.
chars = sorted(set(corpus))
vocab_size = len(chars)
print('Total chars: {}'.format(vocab_size))
print(chars)

Total chars: 72
[' ', '!', '"', "'", ',', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']


In [6]:
# Two-way lookup tables between characters and integer ids; a character's id
# is its position in the sorted `chars` list.
char_indices = {ch: idx for idx, ch in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [7]:
# Integer-encode the text twice: once as a single flat id sequence for the
# whole corpus, and once sentence by sentence (a list of id lists).
corpus_idx = [char_indices[ch] for ch in corpus]
sentences_idx = [[char_indices[ch] for ch in sent] for sent in sentences]

# Preprocessing

In [8]:
# Every row holds maxlen + 1 ids so that it can be split into an input and a
# one-step-shifted target of maxlen ids each.  Sentences that are too long
# lose their tail (truncating='post'); shorter ones are padded on the left
# (padding='pre', which is also pad_sequences' default).
# NOTE(review): the pad value is pad_sequences' default, 0, which is also the
# id that char_indices gives to ' ' for this corpus, so padding and real
# spaces look identical to the model -- confirm this is intended.
maxlen = 100
dataset = sequence.pad_sequences(
    sentences_idx,
    maxlen=maxlen + 1,
    padding='pre',
    truncating='post'
)

In [9]:
# Next-character pairs: for each padded row the input is every id except the
# last, and the target is the same row shifted left by one position.
x = [row[:-1] for row in dataset]
y = [row[1:] for row in dataset]

In [10]:
# Stack the per-sentence rows into 2-D arrays, one row per sentence.
x = np.vstack(x)
y = np.vstack(y)
print('x: {} y: {}'.format(x.shape, y.shape))

x: (29878, 100) y: (29878, 100)


# Model

In [20]:
# Character-level language model:
#   embedding -> LSTM -> dropout -> LSTM -> per-time-step dense + softmax.
# Both LSTMs return the full sequence so that every time step produces its
# own distribution over the vocabulary.
lstm_options = dict(
    return_sequences=True,
    dropout_U=0.2,
    dropout_W=0.2,
    consume_less='gpu'
)

model = Sequential()
for layer in (
        Embedding(vocab_size, 24, input_length=maxlen),
        LSTM(128, **lstm_options),
        Dropout(0.2),
        LSTM(128, **lstm_options),
        TimeDistributed(Dense(vocab_size)),
        Activation('softmax')):
    model.add(layer)

In [21]:
# Targets are fed as integer class ids (not one-hot vectors), hence the sparse
# variant of categorical cross-entropy.  Adam is used with its defaults.
model.compile(optimizer=Adam(), loss='sparse_categorical_crossentropy')

In [33]:
def print_example(start, n_char, maxlen):
    """Greedily extend `start` by `n_char` characters and print the result.

    On every step the text produced so far is encoded with `char_indices`,
    fitted to `maxlen` ids with `pad_sequences`, and passed to the global
    `model`.  The class predicted for the last time step is decoded through
    `indices_char` and appended to the text.

    Args:
        start: seed text; each character is looked up in `char_indices`.
        n_char: number of characters to generate.
        maxlen: length of the id window handed to the model.

    Returns:
        None. The seed followed by the generated characters is printed.
    """
    text = start
    for _ in range(n_char):
        encoded = [char_indices[ch] for ch in text]
        # truncating='pre' drops ids from the front once the text is longer
        # than `maxlen`, so the model always sees the most recent characters.
        window = sequence.pad_sequences(
            [encoded], maxlen=maxlen, truncating='pre')
        # [0] selects the only sample in the batch, [-1] its final time step.
        next_id = model.predict_classes(window, verbose=0)[0][-1]
        text = text + indices_char[next_id]
    print(text)

In [35]:
def train():
    """Fit the global `model` on the global `x` / `y` arrays for one epoch.

    The targets are given a trailing axis of size 1 before being passed to
    `fit`; batches contain 64 samples.
    """
    targets = np.expand_dims(y, -1)
    model.fit(x, targets, batch_size=64, nb_epoch=1)

In [36]:
# Alternate between one epoch of training and a greedy 100-character sample,
# so progress can be judged by eye after every epoch.
n_rounds = 10
for epoch in range(n_rounds):
    train()
    print_example('I am ', 100, maxlen)

Epoch 1/1
I am toud tous tous tous tous tous tous tous tous tous toun toun toun toun toun toun toun toun toun toun 
Epoch 1/1
I am to to to to tore to tore to tore to to to to the to the the the the the the the the the the the the 
Epoch 1/1
I am the the the the the the the the the the the the the the the the the the the the the the the the the 
Epoch 1/1
I am the wang.r..n.....................s the wang the wang the wang the wang the wang the wang the wang t
Epoch 1/1
I am the san..................................t.r..t.r..t.r..t.r..t the be the sant the sant the sant the
Epoch 1/1
I am the san.......t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r..t the be the be the be the be the be the 
Epoch 1/1
I am the wan..........t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r..t.r.r.r the be the be the wa
Epoch 1/1
I am the Nor the Nor the Nor the Nor the Nor the Nor the Nor the Nor the Nor the Nor the Nor the Not the 
Epoch 1/1
I am the No................................t.r............n.r.