In [61]:
from __future__ import print_function
from keras.models import Sequential
from keras.layers import Dense, Activation, LSTM, TimeDistributed,Dropout
from keras.optimizers import RMSprop
from keras.utils.data_utils import get_file
from keras.callbacks import ModelCheckpoint

import numpy as np
import random
import sys
import io

In [62]:
# Fetch the Nietzsche corpus through Keras' get_file helper and load it as
# lower-cased text.
path = get_file('nietzsche.txt', origin='https://s3.amazonaws.com/text-datasets/nietzsche.txt')
# A context manager closes the file handle as soon as the text has been read,
# instead of leaving it open until garbage collection.
with io.open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read().lower()
print('corpus length: ', len(text))

corpus length:  600893


In [63]:
# Vocabulary: every distinct character of the corpus, in sorted order so that
# the character <-> index mapping is stable between runs.
unique_chars = set(text)
chars = sorted(unique_chars)
print('total chars: ', len(chars))

total chars:  57


In [64]:
# Show the complete vocabulary for a quick visual sanity check.
print(chars)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '=', '?', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'ä', 'æ', 'é', 'ë']


In [65]:
# Lookup tables between characters and their position in the sorted vocabulary.
indices_char = dict(enumerate(chars))
char_indices = {symbol: position for position, symbol in indices_char.items()}

In [66]:
# Cut the corpus into overlapping windows of max_len characters, advancing
# `step` characters each time; the character right after each window is the
# value the network has to predict.
max_len = 40
step = 3
window_starts = range(0, len(text) - max_len, step)
sentences = [text[start:start + max_len] for start in window_starts]
next_chars = [text[start + max_len] for start in window_starts]
print('nb sequences: ', len(sentences))

nb sequences:  200285


In [67]:
# Allocate the one-hot tensors: x holds one (max_len, vocabulary) matrix per
# window, y holds one vocabulary-sized row per target character.
print('Vectorization....')
# The builtin `bool` is used as dtype: the `np.bool` alias was deprecated in
# NumPy 1.20 and removed in 1.24, where it raises AttributeError.
x = np.zeros((len(sentences), max_len, len(chars)), dtype=bool)
y = np.zeros((len(sentences), len(chars)), dtype=bool)
print(x.shape, y.shape)

Vectorization....
(200285, 40, 57) (200285, 57)


In [68]:
# One-hot encode every window into x and its following character into y.
for row, (window, target) in enumerate(zip(sentences, next_chars)):
    for position, symbol in enumerate(window):
        x[row, position, char_indices[symbol]] = 1
    y[row, char_indices[target]] = 1


In [81]:
# Character-level language model: two stacked 512-unit LSTM layers, each
# followed by 20% dropout, then a dense projection onto the vocabulary with a
# softmax on top.
vocab_size = len(chars)
model = Sequential([
    # return_sequences=True so the second LSTM receives the full sequence.
    LSTM(512, input_shape=(max_len, vocab_size), return_sequences=True),
    Dropout(0.2),
    LSTM(512),
    Dropout(0.2),
    Dense(vocab_size),
    Activation('softmax'),
])
# The optimizer is selected by name rather than by passing a configured instance.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_16 (LSTM)               (None, 40, 512)           1167360   
_________________________________________________________________
dropout_7 (Dropout)          (None, 40, 512)           0         
_________________________________________________________________
lstm_17 (LSTM)               (None, 512)               2099200   
_________________________________________________________________
dropout_8 (Dropout)          (None, 512)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 57)                29241     
_________________________________________________________________
activation_7 (Activation)    (None, 57)                0         
Total params: 3,295,801
Trainable params: 3,295,801
Non-trainable params: 0
_________________________________________________________________


In [82]:
def sample(preds, temperature=1):
    """Draw one index from a probability vector after temperature scaling.

    Args:
        preds: 1-D array-like of probabilities (for example one softmax
            output row of the model).
        temperature: Positive scaling factor applied in log-space. Values
            below 1 sharpen the distribution towards its most likely entry,
            values above 1 flatten it.

    Returns:
        The index of the sampled entry (as returned by ``np.argmax``).
    """
    preds = np.asarray(preds).astype('float64')
    # Zero probabilities become -inf in log-space and turn back into 0 after
    # the exponential, so the divide-by-zero warning carries no information.
    with np.errstate(divide='ignore'):
        scaled = np.log(preds) / temperature
    # Shifting by the maximum leaves the normalised result unchanged but keeps
    # the exponentials from all underflowing to 0 at low temperatures.
    scaled -= np.max(scaled)
    exp_preds = np.exp(scaled)
    # multinomial needs non-negative values that sum to 1, so the
    # exponentiated values are re-normalised before sampling (passing the
    # log-scaled values directly raises ValueError).
    probabilities = exp_preds / np.sum(exp_preds)
    draw = np.random.multinomial(1, probabilities, 1)
    return np.argmax(draw)

In [84]:
# Train the network, then generate 400 characters starting from a random
# 40-character seed taken from the corpus.
#
# The checkpoint monitors the training loss: fit() below is called without
# validation data and the model is compiled without metrics, so a 'val_acc'
# value is never produced and a checkpoint monitoring it would never save.
checkpoint = ModelCheckpoint('weights.{epoch:02d}-{loss:.5f}.hdf5', monitor='loss', verbose=1, save_best_only=True, mode='min')
model.fit(x, y, batch_size=128, epochs=10, callbacks=[checkpoint])

# random.randint is inclusive on both ends; this upper bound keeps the seed
# window completely inside the corpus.
start_index = random.randint(0, len(text) - max_len - 1)

generated = ''
sentence = text[start_index:start_index + max_len]
generated += sentence
print('Generated with seed: "' + sentence + '"')
sys.stdout.write(generated)

for i in range(400):
    # One-hot encode the current window as a batch of size 1.
    x_pred = np.zeros((1, max_len, len(chars)))
    for t, char in enumerate(sentence):
        x_pred[0, t, char_indices[char]] = 1.

    preds = model.predict(x_pred, verbose=0)[0]
    # NOTE: argmax always takes the single most likely character (greedy
    # decoding), which is deterministic for a given seed window.
    next_index = np.argmax(preds)
    next_char = indices_char[next_index]

    generated += next_char
    # Slide the window: drop the oldest character, append the new one.
    sentence = sentence[1:] + next_char
    # '~' is written before every generated character as a visual separator.
    sys.stdout.write("~")
    sys.stdout.write(next_char)
    sys.stdout.flush()
print()

Epoch 1/10
Epoch 2/10
   128/200285 [..............................] - ETA: 3:39 - loss: 1.8102



Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Generated with seed: "id to do wrong"; while they accept "good"
id to do wrong"; while they accept "good~"~ ~a~s~ ~i~n~ ~t~h~e~ ~f~a~c~t~ ~t~h~a~t~ ~t~h~e~ ~s~e~n~s~e~ ~o~f~ ~t~h~e~ ~s~p~i~r~i~t~ ~o~f~ ~t~h~e~ ~s~a~m~e~ ~m~o~r~a~l~i~t~y~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~i~o~n~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h~e~ ~s~u~b~j~e~c~t~,~ ~t~h~e~ ~c~o~n~s~c~i~e~n~c~e~ ~o~f~ ~t~h


In [85]:
# Persist the trained model to disk.
saved_model_path = 'text-gen1.h5'
model.save(saved_model_path)