In [1]:
import keras as ks
from keras.models import Sequential, Model
from keras.layers import Input, SimpleRNN, Activation, LSTM, Reshape, Lambda, Dense
from keras.preprocessing import sequence
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
filepath = 'data/chi_names.txt'
# Read the whole corpus once; the context manager closes the file handle
# instead of leaving it to the garbage collector.
with open(filepath, encoding='utf-8') as corpus_file:
    data = corpus_file.read().lower()
# Sort the vocabulary so each character gets the same index on every run.
# Iteration order of a set of strings changes between interpreter sessions
# (string hashes are randomised per process), which would make a model saved
# in one session undecodable after reloading it in another.
uniq = sorted(set(data))
ndim = len(uniq)
print('unique characters: len =', ndim)

unique characters: len = 3781


In [3]:
# Two-way lookup between characters and their one-hot positions.
# Build index -> character first, then invert it for character -> index.
idx2ch = dict(enumerate(uniq))
ch2idx = {ch: position for position, ch in idx2ch.items()}

In [4]:
def make_onehot(array):
    """One-hot encode an integer index, or an array of integer indices.

    Parameters
    ----------
    array : int or array-like of int
        Index/indices into the vocabulary; negative values count from the end.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``np.shape(array) + (ndim,)`` holding a single
        1.0 per row, where ``ndim`` is the module-level vocabulary size.

    Raises
    ------
    IndexError
        If an index is out of range or is not an integer.
    """
    indices = np.asarray(array)
    # Allocate only the rows that were asked for, rather than materialising a
    # full ndim x ndim identity matrix on every call just to pick rows from it.
    onehot = np.zeros(indices.shape + (ndim,))
    np.put_along_axis(onehot, indices[..., np.newaxis], 1.0, axis=-1)
    return onehot

In [5]:
def word2arr(name):
    """Encode a string as a matrix of one-hot rows, one row per character.

    Parameters
    ----------
    name : str
        Text to encode; every character must be a key of ``ch2idx``.

    Returns
    -------
    numpy.ndarray
        float64 array of shape ``(len(name), ndim)``.

    Raises
    ------
    KeyError
        If ``name`` contains a character that is not in the vocabulary.
    """
    arr = np.zeros((len(name), ndim))
    for row, ch in enumerate(name):
        # Set the single hot entry in place; building a separate one-hot
        # vector per character is unnecessary work for the same result.
        arr[row, ch2idx[ch]] = 1.0
    return arr


def arr2word(arr, showProb=False):
    """Sample one character per row of ``arr`` and join them into a string.

    Each row is used as the probability distribution for ``np.random.choice``
    over the ``ndim`` vocabulary entries. When ``showProb`` is true, the
    probability of every sampled character is printed as it is drawn.

    Returns a ``(text, prob)`` pair where ``prob`` is the product of the
    probabilities of the sampled characters (``1`` for an empty ``arr``).
    """
    sampled = []
    prob = 1
    for distribution in arr:
        pick = np.random.choice(ndim, p=distribution)
        pick_prob = distribution[pick]
        if showProb:
            print(pick_prob)
        prob *= pick_prob
        sampled.append(idx2ch[pick])
    return ''.join(sampled), prob

In [6]:
# Size of the recurrent state vector.
hidden = 128
# return_sequences=True makes the RNN emit its state at every time step, so
# the network predicts a next character after each input character.
rnn_cell = SimpleRNN(units=hidden, return_sequences=True)

# Input: variable-length sequences (None time steps) of ndim-wide one-hot rows.
x = Input(shape=(None, ndim), name='x')
hidden_states = rnn_cell(x)
# Project every time step back onto the vocabulary as a softmax distribution.
out = Dense(units=ndim, activation='softmax')(hidden_states)

model = Model(inputs=x, outputs=out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
x (InputLayer)               (None, None, 3781)        0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, None, 128)         500480    
_________________________________________________________________
dense_1 (Dense)              (None, None, 3781)        487749    
Total params: 988,229
Trainable params: 988,229
Non-trainable params: 0
_________________________________________________________________


In [7]:
# Cross-entropy between each step's softmax output and the one-hot target,
# optimised with RMSprop.
model.compile(loss='categorical_crossentropy', optimizer='rmsprop')

## Training model

In [8]:
from itertools import groupby
from random import shuffle
import progressbar

BATCH_SIZE = 128

# Lower-case the names exactly like the corpus the vocabulary was built from,
# so every character is guaranteed to be a key of ch2idx. Each line keeps its
# trailing newline, which serves as the end-of-name character.
with open(filepath, encoding='utf-8') as names_file:
    lines = [line.lower() for line in names_file]

# itertools.groupby only merges *consecutive* items with equal keys, so the
# names must be ordered by length before grouping. Shuffling first and then
# using a stable sort keeps the order random within each length.
shuffle(lines)
lines_by_length = sorted(lines, key=len)

# Cut every same-length group into mini-batches, then shuffle the batches so
# training does not see all short names before all long ones.
batches = []
for length, chunks in groupby(lines_by_length, key=len):
    chk = list(chunks)
    batches.extend(chk[start:start + BATCH_SIZE]
                   for start in range(0, len(chk), BATCH_SIZE))
shuffle(batches)

counter = 0
bar = progressbar.ProgressBar(max_value=len(lines))
for chk in batches:
    length = len(chk[0])
    counter += len(chk)
    # A one-character line (e.g. a blank line holding only the newline) has no
    # next-character target, so it cannot contribute a training step.
    if length >= 2:
        arr = np.zeros((len(chk), length, ndim), dtype=bool)
        for i, name in enumerate(chk):
            for j, ch in enumerate(name):
                arr[i, j, ch2idx[ch]] = 1
        # Inputs are all characters but the last; targets are the same
        # sequence shifted one step ahead. Distinct names avoid clobbering
        # the Keras Input tensor called `x`.
        x_batch = arr[:, :-1]
        y_batch = arr[:, 1:]
        model.fit(x_batch, y_batch, batch_size=BATCH_SIZE, verbose=0)
    bar.update(counter)

100% (86893 of 86893) |###################| Elapsed Time: 0:13:46 ETA:  0:00:00

*Here is an inefficient way to fit the model, with batch size = 1, using fit_generator:*
```python
# Read and lower-case the names once, closing the file when done.
with open(filepath, encoding='utf-8') as names_file:
    lines = [line.lower() for line in names_file]

def lines_generator():
    """Yield one ``(x, y)`` training pair per name, cycling forever.

    ``x`` holds every character of the name except the last and ``y`` the
    same sequence shifted one step ahead; both have shape
    ``(1, len(name) - 1, ndim)``, i.e. a batch containing a single name.
    """
    while True:
        for line in lines:
            arr = word2arr(line)
            steps, width = arr.shape
            x = arr[:-1].reshape(1, steps - 1, width)
            y = arr[1:].reshape(1, steps - 1, width)
            yield (x, y)
```

```
# lines_generator() yields a batch holding a single name, so the 1000 steps of
# this one epoch cover 1000 names.
model.fit_generator(generator=lines_generator(), steps_per_epoch=1000, epochs=1)
```

In [9]:
import os

# Make sure the target directory exists first; saving into a missing
# directory fails on a fresh checkout.
os.makedirs('model', exist_ok=True)
model.save('model/chi_name.h5')
# Drop the in-memory model so the following section works from the saved file.
del model

## Load model and sampling

In [10]:
from keras.models import load_model
# Reload the network from the HDF5 file written by the save step and rebind it
# to `model`, the name the sampling code reads.
model = load_model('model/chi_name.h5')

In [11]:
def sample(first_name, max_len=10):
    """Generate a name by sampling characters from the model one at a time.

    Parameters
    ----------
    first_name : str
        Non-empty seed text (one or more vocabulary characters) the generated
        name starts with.
    max_len : int, optional
        Generation stops once the name reaches this many characters.

    Returns
    -------
    (str, float)
        The generated name and the probability of the last character that was
        appended (``1`` if nothing was appended).

    Raises
    ------
    ValueError
        If ``first_name`` is empty.
    """
    if not first_name:
        raise ValueError('first_name must contain at least one character')

    name = first_name
    last_prob = 1
    while len(name) < max_len:
        # Feed the whole prefix as a batch of one. Its real length is used, so
        # seeds longer than one character work too.
        step_probs = model.predict(word2arr(name)[np.newaxis])
        # Only the distribution predicted after the final character matters.
        ch, prob = arr2word(step_probs[0, -1][np.newaxis])
        # Newline marks the end of a name. Compare by value: `is` on strings
        # tests object identity, not equality.
        if ch == '\n':
            break
        name += ch
        last_prob = prob
    return name, last_prob

In [13]:
# Seed characters (common Chinese family names) to generate names from.
first_name = ['张', '王', '赵', '钱', '孙', '李', '黄', '周', '杨', '何']
samples_per_seed = 10
for seed in first_name:
    for _attempt in range(samples_per_seed):
        generated = sample(seed)
        print(generated)

('张兆文', 0.014937767758965492)
('张璋', 0.00011286656081210822)
('张威毅', 0.0006412349175661802)
('张义石', 0.00030982488533481956)
('张肖力', 0.0014862999087199569)
('张晓娃', 9.306262654718012e-05)
('张长显', 0.0008578095003031194)
('张正鸿', 0.0013419606257230043)
('张霞', 0.002493421547114849)
('张良', 0.0033084985334426165)
('王惠建', 0.006828772369772196)
('王中雪', 0.003974865190684795)
('王', 1)
('王玉蓉', 0.002176048466935754)
('王育利', 0.0036887871101498604)
('王子兴', 0.0032367785461246967)
('王正', 0.004068646114319563)
('王启弛', 1.24761254483019e-05)
('王迪群', 0.0015167791862040758)
('王金菲', 0.0005232281400822103)
('赵吉文', 0.019701942801475525)
('赵庭相', 0.000520802685059607)
('赵霖', 0.0003490494564175606)
('赵兴江', 0.0038434681482613087)
('赵丽', 0.004908709786832333)
('赵商跃', 0.0016150326700881124)
('赵海栋', 0.0013796888524666429)
('赵一君', 0.0029013724997639656)
('赵锦松', 0.003136803163215518)
('赵树耀', 0.0015497293788939714)
('钱静', 0.0043167914263904095)
('钱静', 0.0043167914263904095)
('钱建战', 0.00017651547386776656)
('钱振燕', 0.00393