In [58]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import keras
from keras.utils.data_utils import get_file
from keras.models import Model, Sequential
from keras.layers import Input, Embedding, Dense, Flatten
from keras.layers import SimpleRNN
from keras.layers.merge import Add
from keras.optimizers import RMSprop, Adam

In [2]:
path = get_file('nietzsche.txt', origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt")
# Read through a context manager so the file handle is closed deterministically,
# and decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding (the corpus contains non-ASCII characters).
with open(path, encoding='utf-8') as corpus_file:
    text = corpus_file.read()
print('corpus length:', len(text))

corpus length: 600893


In [3]:
# Vocabulary: every distinct character of the corpus in sorted order, with '\0'
# placed at index 0 so that a zero value is available (e.g. for padding).
chars = ['\0'] + sorted(set(text))
vocab_size = len(chars)

print('vocab size:', vocab_size)
''.join(chars)

vocab size: 85


'\x00\n !"\'(),-.0123456789:;=?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]_abcdefghijklmnopqrstuvwxyzÆäæéë'

In [4]:
# Two-way lookup tables between characters and their integer ids
# (the id of a character is its position in `chars`).
idx_to_char = dict(enumerate(chars))
char_to_idx = {ch: pos for pos, ch in idx_to_char.items()}

In [5]:
# Encode the whole corpus as a list of integer character ids.
idx = list(map(char_to_idx.__getitem__, text))

## 根据前 3 个字符预测第 4 个字符

In [6]:
clen = 3

# Slide a window over the corpus in steps of `clen` (a step of 1 should work as
# well). For each window start, offset 0..clen-1 gives the input characters and
# offset clen gives the character to predict.
c1_data, c2_data, c3_data, c4_data = (
    [idx[start + offset] for start in range(0, len(idx) - clen - 1, clen)]
    for offset in range(clen + 1)
)

In [7]:
# Turn the per-position id lists into NumPy arrays: three inputs and one target.
x1, x2, x3, y = (
    np.array(column) for column in (c1_data, c2_data, c3_data, c4_data)
)

In [8]:
x1.shape, y.shape

((200297,), (200297,))

embedding layers

In [9]:
def embedding_input(name, n_in, n_out):
    """Build a single-index input and its flattened embedding.

    Args:
        name: Name given to the Keras input layer.
        n_in: Size of the embedding vocabulary (number of distinct ids).
        n_out: Dimensionality of each embedding vector.

    Returns:
        A tuple ``(input_tensor, flat_embedding)`` where ``flat_embedding`` is
        the embedding of the input with the length-1 sequence axis flattened
        away.
    """
    token_in = Input(shape=(1,), dtype='int64', name=name)
    embedded = Embedding(n_in, n_out, input_length=1)(token_in)
    flat = Flatten()(embedded)
    return token_in, flat

In [10]:
n_fac = 42  # number of latent factors (embedding dimensionality)

# One independent (input, embedding) pair per character position, created in
# the order c1, c2, c3.
(c1_inp, c1_emb), (c2_inp, c2_emb), (c3_inp, c3_emb) = (
    embedding_input(name, vocab_size, n_fac) for name in ('c1', 'c2', 'c3')
)

hidden layers

In [11]:
n_hidden = 256  # width of the hidden representation

# Each layer object is created once and then applied at several character
# positions, so its weights are shared across positions.
dense_in = Dense(n_hidden, activation='relu')  # embedding -> hidden
dense_hidden = Dense(n_hidden, activation='tanh')  # hidden -> hidden

In [12]:
# Character 1: project its embedding into the hidden space.
c1_den = dense_in(c1_emb)

# Character 2: project it, transform the state so far, and sum the two.
c2_den = dense_in(c2_emb)
hidden_2 = dense_hidden(c1_den)
hidden_2_add = Add()([c2_den, hidden_2])

# Character 3: same step again, reusing the same dense_in / dense_hidden layers.
c3_den = dense_in(c3_emb)
hidden_3 = dense_hidden(hidden_2_add)
hidden_3_add = Add()([c3_den, hidden_3])


In [13]:
c4_out = Dense(vocab_size, activation='softmax')(hidden_3_add)

In [14]:
model = Model([c1_inp, c2_inp, c3_inp], c4_out)

In [15]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
c3 (InputLayer)                  (None, 1)             0                                            
____________________________________________________________________________________________________
c2 (InputLayer)                  (None, 1)             0                                            
____________________________________________________________________________________________________
c1 (InputLayer)                  (None, 1)             0                                            
____________________________________________________________________________________________________
embedding_3 (Embedding)          (None, 1, 42)         3570        c3[0][0]                         
___________________________________________________________________________________________

In [16]:
model.compile(loss='sparse_categorical_crossentropy', optimizer=Adam())

In [17]:
model.fit([x1, x2, x3], y, batch_size=64, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x11671fe80>

#### Test model

In [18]:
def get_next(inp):
    """Predict the character that follows ``inp`` with the multi-input model.

    Args:
        inp: String whose characters are fed to the model one per input, in
            order; every character must be a key of ``char_to_idx``.

    Returns:
        The character from ``chars`` with the highest predicted probability.
    """
    # The model takes one array of shape (1,) per input character.
    per_char_inputs = [np.array([char_to_idx[ch]]) for ch in inp]
    probs = model.predict(per_char_inputs)
    return chars[np.argmax(probs)]

In [19]:
get_next('phi')

'l'

In [20]:
get_next(' th')

'e'

In [21]:
get_next(' an')

'd'

## first RNN

In [37]:
clen = 8

# Non-overlapping windows of `clen` characters: c_in_data[n] holds, for every
# window, the id of the character at offset n inside that window.
c_in_data = [[idx[i+n] for i in range(0, len(idx)-clen-1, clen)]
             for n in range(clen)]
# The target is the character right after each window. Index it with `clen`
# rather than a literal 8 so inputs and targets stay aligned if clen changes.
c_out_data = [idx[i+clen] for i in range(0, len(idx)-clen-1, clen)]

In [38]:
# One 1-D id array per input position, plus the 1-D target array.
xs = list(map(np.array, c_in_data))
y = np.array(c_out_data)

In [39]:
def embeding_input(name, n_in, n_out):
    """Build a named single-index input and its flattened embedding.

    Unlike ``embedding_input``, both layers get explicit names derived from
    ``name``: the input is called ``name + '_in'`` and the embedding
    ``name + '_emb'``.

    Args:
        name: Prefix used for the layer names.
        n_in: Size of the embedding vocabulary (number of distinct ids).
        n_out: Dimensionality of each embedding vector.

    Returns:
        A tuple ``(input_tensor, flat_embedding)``.
    """
    input_name = name+'_in'
    embedding_name = name+'_emb'
    token_in = Input(shape=(1,), dtype='int64', name=input_name)
    embedding_layer = Embedding(n_in, n_out, input_length=1, name=embedding_name)
    flat = Flatten()(embedding_layer(token_in))
    return token_in, flat

In [40]:
n_fac = 42

# Create one (input, embedding) pair per character position, named c0..c{clen-1},
# then unzip the pairs into a tuple of inputs and a tuple of embeddings.
c_inps, c_embs = zip(*(embeding_input('c' + str(pos), vocab_size, n_fac)
                       for pos in range(clen)))

dense_hidden 的权重用 identity（单位矩阵）初始化：训练开始时该层对输入做恒等变换（A·I = A），而它的输入来自 ReLU 的输出（非负），所以再经过一次 ReLU 也不会改变数据，隐藏状态可以原样向后传递。

原因参考 Le、Jaitly 和 Hinton 的论文 *A Simple Way to Initialize Recurrent Networks of Rectified Linear Units*：https://arxiv.org/abs/1504.00941

In [41]:
n_hidden = 256  # width of the hidden representation

dense_in = Dense(n_hidden, activation='relu')  # embedding -> hidden
# Hidden -> hidden transition; its kernel starts as the identity matrix.
dense_hidden = Dense(n_hidden, activation='relu', kernel_initializer='identity')
dense_out = Dense(vocab_size, activation='softmax')  # hidden -> distribution over characters

In [42]:
# Project every position's embedding into the hidden space with the shared dense_in layer.
c_dens = list(map(dense_in, c_embs))

In [43]:
# Running state, seeded with the first character's projection.
c_add = c_dens[0]

# Unrolled recurrence: at each later position, transform the running state with
# the shared dense_hidden layer and add that position's projected character.
for n in range(1, clen):
    c_hidden = dense_hidden(c_add)
    c_add = Add()([c_hidden, c_dens[n]])

In [44]:
# Predict from the final merged state `c_add`, not from `c_hidden`: c_hidden is
# computed before the last character's projection (c_dens[clen-1]) is added, so
# using it would leave the most recent input character out of the prediction.
c_out = dense_out(c_add)

In [45]:
model = Model(list(c_inps), c_out)

In [46]:
model.summary()

____________________________________________________________________________________________________
Layer (type)                     Output Shape          Param #     Connected to                     
c0_in (InputLayer)               (None, 1)             0                                            
____________________________________________________________________________________________________
c0_emb (Embedding)               (None, 1, 42)         3570        c0_in[0][0]                      
____________________________________________________________________________________________________
c1_in (InputLayer)               (None, 1)             0                                            
____________________________________________________________________________________________________
flatten_12 (Flatten)             (None, 42)            0           c0_emb[0][0]                     
___________________________________________________________________________________________

In [47]:
model.compile(Adam(), 'sparse_categorical_crossentropy')

In [52]:
model.fit(xs, y, epochs=5)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x116b5fe10>

#### Test model

In [53]:
get_next('for thos')

'e'

In [54]:
get_next('part of ')

't'

In [55]:
get_next('queens a')

'f'

## first RNN with Keras

In [57]:
n_fac = 42
n_hidden = 256
clen = 8

In [60]:
# The same architecture expressed with Keras' built-in recurrent layer.
model = Sequential([
    # Maps each of the clen character ids to an n_fac-dimensional vector.
    Embedding(vocab_size, n_fac, input_length=clen),
    # ReLU recurrence whose recurrent kernel starts as the identity matrix.
    SimpleRNN(n_hidden, activation='relu', recurrent_initializer='identity'),
    # Distribution over the next character.
    Dense(vocab_size, activation='softmax'),
])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 8, 42)             3570      
_________________________________________________________________
simple_rnn_2 (SimpleRNN)     (None, 256)               76544     
_________________________________________________________________
dense_11 (Dense)             (None, 85)                21845     
Total params: 101,959
Trainable params: 101,959
Non-trainable params: 0
_________________________________________________________________


In [61]:
model.compile(Adam(), 'sparse_categorical_crossentropy')

In [66]:
# `xs` is a list of clen 1-D arrays (one per character position). Stack them as
# columns into a single (n_samples, clen) matrix, which is what
# Embedding(input_length=clen) expects. np.concatenate(..., axis=1) cannot do
# this for 1-D inputs because they have no axis 1.
model.fit(np.column_stack(xs), y, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x11c6e04a8>

In [85]:
def get_next(s):
    """Predict the character that follows ``s`` with the single-input model.

    Args:
        s: Input string; every character must be a key of ``char_to_idx``.

    Returns:
        The character from ``idx_to_char`` with the highest predicted
        probability.
    """
    # A batch containing one sequence of character ids.
    batch = np.array([[char_to_idx[ch] for ch in s]])
    best = model.predict(batch).argmax(axis=1)[0]
    return idx_to_char[best]

In [86]:
get_next('this is ')

't'

In [87]:
get_next('part of ')

't'

In [88]:
get_next('queens a')

'n'