In [1]:
from __future__ import division, print_function
import numpy as np

from keras.models import Sequential
from keras.layers import Embedding, Dropout, Activation
from keras.layers import LSTM, TimeDistributed, Dense
from keras.optimizers import Adam
from keras.preprocessing import sequence

Using Theano backend.
Using gpu device 0: GeForce GTX 850M (CNMeM is disabled, cuDNN 5105)


In [2]:
# Read the corpus: one sentence per line of data/sentences.txt.
# The context manager guarantees the file is closed even if reading fails.
sentences = []
with open('data/sentences.txt', 'r') as fd:
    for line in fd:
        # Strip only the line terminator. Slicing off the last character
        # unconditionally would eat a real character from a final line that
        # does not end with a newline.
        sentences.append(line.rstrip('\n'))

In [3]:
# Sanity check: how many lines were read from the corpus file.
print('Number of sentences:', len(sentences))

Number of sentences: 29878


In [4]:
# Merge all sentences into one space-separated string; it is used below to
# derive the character vocabulary.
corpus = ' '.join(sentences)
print('Corpus length:', len(corpus))

Corpus length: 1277948


In [5]:
# Character vocabulary: every distinct character of the corpus, in sorted order.
chars = sorted(set(corpus))
vocab_size = len(chars)
print('Total chars:', vocab_size)
print(' '.join(chars))

Total chars: 72
  ! " ' , . 0 1 2 3 4 5 6 7 8 9 : ; ? A B C D E F G H I J K L M N O P Q R S T U V W X Y Z _ a b c d e f g h i j k l m n o p q r s t u v w x y z


In [6]:
# Two-way lookup tables between characters and their integer ids
# (the id of a character is its position in the sorted `chars` list).
char_indices = {c: i for i, c in enumerate(chars)}
indices_char = dict(enumerate(chars))

In [7]:
# Encode every sentence as the list of integer ids of its characters.
sentences_idx = [
    [char_indices[c] for c in sentence]
    for sentence in sentences
]

# Preprocessing

In [8]:
# Length of the model's input window, in characters.
maxlen = 100
# Bring every encoded sentence to maxlen + 1 ids: one more than the window so
# that shifting by one position below yields an input and a target of length
# maxlen each. Sentences that are too long lose their beginning
# (truncating='pre').
# NOTE(review): pad_sequences pads with 0 by default (per the Keras docs), and
# 0 is also the id char_indices assigns to chars[0]; padded positions are thus
# indistinguishable from that character -- confirm this is intended.
dataset = sequence.pad_sequences(sentences_idx, maxlen=maxlen+1, truncating='pre')

In [9]:
# Next-character prediction pairs: the input is each row without its last id,
# the target is the same row shifted left by one position.
x = [sentence[:-1] for sentence in dataset]
y = [sentence[1:] for sentence in dataset]

In [10]:
# Stack the per-sentence rows into 2-D arrays (one row per sentence).
# np.array builds each matrix in a single call, instead of wrapping every row
# in its own one-row list and concatenating tens of thousands of tiny arrays.
# The result is the same, and the cell stays safe to re-run because np.array
# accepts both a list of rows and an already-stacked array.
x = np.array(x)
y = np.array(y)
print('x:', x.shape, 'y:', y.shape)

x: (29878, 100) y: (29878, 100)


# Model

In [11]:
# Character-level language model: embedding -> two stacked LSTMs (with dropout
# in between) -> per-time-step softmax over the vocabulary.
model = Sequential([
    # Map each character id to a 24-dimensional vector.
    Embedding(vocab_size, 24, input_length=maxlen),
    # return_sequences=True keeps one output per time step so the next layer
    # (and finally the per-step classifier) sees the whole sequence.
    LSTM(256,
         return_sequences=True,
         dropout_U=0.2,
         dropout_W=0.2,
         consume_less='gpu'),
    Dropout(0.2),
    LSTM(256,
         return_sequences=True,
         dropout_U=0.2,
         dropout_W=0.2,
         consume_less='gpu'),
    # Same dense projection applied at every time step, followed by softmax
    # to obtain a distribution over the next character.
    TimeDistributed(Dense(vocab_size)),
    Activation('softmax'),
])

In [12]:
# The targets fed to fit() are integer character ids rather than one-hot
# vectors, hence the sparse variant of categorical cross-entropy.
# Adam is used with its default settings.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer=Adam()
)

In [13]:
def print_example(start, n_char, maxlen):
    """Sample ``n_char`` characters from the model and print the result.

    Starting from the seed text ``start``, the model is repeatedly asked for
    the distribution of the next character given the text so far; a character
    is drawn from that distribution and appended.

    Relies on the notebook globals ``model``, ``chars``, ``char_indices`` and
    ``sequence``.

    Args:
        start: seed text; every character must be a key of ``char_indices``.
        n_char: number of characters to generate.
        maxlen: length of the model input window; the encoded text is
            left-padded or left-truncated to this length before prediction.

    Returns:
        None. The seed followed by the generated characters is printed.

    Raises:
        KeyError: if ``start`` contains a character missing from
            ``char_indices``.
    """
    text = start
    # Encode the seed once and extend the id list incrementally, rather than
    # re-encoding the whole (growing) string on every iteration.
    encoded = [char_indices[c] for c in start]
    for _ in range(n_char):
        z = sequence.pad_sequences([encoded], maxlen=maxlen, truncating='pre')
        # Distribution predicted for the position right after the last input.
        preds = model.predict(z, verbose=0)[0][-1]
        # Renormalise in float64: low-precision softmax output may not sum to 1
        # closely enough for np.random.choice, which then raises ValueError.
        probs = np.asarray(preds, dtype='float64')
        probs = probs / probs.sum()
        next_char = chars[np.random.choice(len(chars), p=probs)]
        encoded.append(char_indices[next_char])
        text = text + next_char
    print(text)

In [14]:
def train(batch_size=64, nb_epoch=1):
    """Fit the global ``model`` on the global ``x`` / ``y`` arrays.

    The targets are given a trailing axis of size 1 before fitting, so each
    time step carries a single integer class id.

    Args:
        batch_size: mini-batch size passed to ``model.fit`` (default 64, the
            value that was previously hard-coded).
        nb_epoch: number of passes over the data per call (default 1, the
            value that was previously hard-coded).

    Returns:
        Whatever ``model.fit`` returns, so callers can inspect the training
        history if they wish; existing callers may ignore it.
    """
    return model.fit(
        x,
        np.expand_dims(y, -1),
        batch_size=batch_size,
        nb_epoch=nb_epoch
    )

In [15]:
# First 5 training steps: after each one-epoch fit, print a 400-character
# sample so progress can be judged by eye.
for step in range(1, 6):
    print('STEP', step)
    train()
    print_example('Winter is coming.', 400, maxlen)
    print()

STEP 1
Epoch 1/1
Winter is coming.e..k In teo oa chi fha wlrt. iio yhiae Ler dlca Ya tpaao di darwfc., yl cow?dmr.rers.. lLa Ronann onklf ;orto imem myr aratdi,t9ni Ied ionr thenr Dadletth?dy dome Nnt Ins wotde fnstctgn karts toua,gdn hannedrss.d.s. hed.me.t wtspe. Murta ilt?nL ther woms So Maoed,.d iode. heys .erfr fomt kvdair the too ei. irt tourr lat cav the Nu i thind aasls.,h saasrssvys roe to kasnhe tsr fritd. kot senthal.s 

STEP 2
Epoch 1/1
Winter is coming...t.u...m.....s.sa....s.unn.rot.eas. iped.t...e......te.n..ett...t... olrl pors irl fatc fwr Hevt Boddef she Rey.t orefg laprey fisove he olded ml, moud and me annne whet's ona. lutcuret mesf wnel Lurt So whanyand loil. the de as los Srueingr to mer gwig whe ghe wtan alloddes. ond hrm Tirl'sitherd shigl I'm taurd the l'vess cteigg to mamese he fangeresh fo' toom besgir hy toleir? ti fel chey walle

STEP 3
Epoch 1/1
Winter is coming.p... hya.k..t.enteong.or...s.s..n.... you.re.ne......s.ore rarding..is an weld.s Tore onowe th

In [16]:
# Checkpoint the weights reached after the 5 training steps above.
model.save_weights('data/model_by_line_5.hf5')

In [17]:
# 95 further training steps (the step counter restarts at 1 for this run),
# again printing a 400-character sample after every one-epoch fit.
for step in range(1, 96):
    print('STEP', step)
    train()
    print_example('Winter is coming.', 400, maxlen)
    print()

STEP 1
Epoch 1/1
Winter is coming..d.ly...ce.ted..t....t..d.es all.t.t.s winds...t..t.....s.m.ld.m.s..n..t.s.. her froth.l? That it a brother she. napthings mrouth the gasters arout anr trunseds.n every new and Minks my luster alow? I well lades.s fre recpye wutch and do you seed morern my darat you sow should daan asbome to be say.s brother the sreapen,d Deands and bemoars worring.se you and Gessad the dactle he starle syeet colp

STEP 2
Epoch 1/1
Winter is coming.s..t..ng.ce.s.ne..s.is.s.tle.s.n.s.s.se.we.rles.talyarfen.ut..t.n.als..ked.s need Keeppeny of the goes.ngs Blage then incelie.n of proteste to mut un wutch with Vake Fawaun the hanbcess the gows.n chister a couf me my sof Laesnit to got an the peace them before ut walgers with come we't, a spreid burt us ver about you? I'll not they dotary of the clease muy in no one of the lieds, the Ereesan Told

STEP 3
Epoch 1/1
Winter is coming..t.s.s.nk..ts.n.ug.es.od..y suave.s.re.s. destagy.ne shore.n.t.stane.n.s lastess.stand a the k

In [18]:
# Checkpoint the weights after the 95 additional steps (5 + 95 = 100 in total,
# matching the file name).
model.save_weights('data/model_by_line_100.hf5')