In [1]:
import keras as ks
from keras.models import Sequential, Model
from keras.layers import Input, SimpleRNN, Activation, LSTM, Reshape, Lambda, Dense
from keras.preprocessing import sequence
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
filepath = 'data/chi_names.txt'
data = open(filepath, encoding='utf-8').read().lower()
uniq = set(data)
ndim = len(uniq)
print('unique characters: len =', ndim)

unique characters: len = 3757


## helper function

In [3]:
def word2arr(name):
    arr = np.zeros((len(name), ndim))
    for i, ch in enumerate(name):
        arr[i, ch2idx[ch]] = 1
    return arr


def arr2word(arr, showProb=False):
    name = ''
    prob = 1.0
    for vec in arr:
        ch = np.random.choice(ndim, p=vec)
        if showProb:
            print(vec[ch])
        prob = vec[ch]
        name += idx2ch[ch]
    return name, prob

import pickle
def save_dict(filepath, dict_obj):
    with open(filepath, "wb") as f:
        pickle.dump(dict_obj, f)

        
def load_dict(filepath):
    with open(filepath, "rb") as f:
        return pickle.load(f)

## Creating Model

In [4]:
ch2idx = {word: idx for idx, word in enumerate(uniq)}
idx2ch = {idx: word for idx, word in enumerate(uniq)}

In [5]:
hidden = 128
rnn_cell = SimpleRNN(hidden, return_sequences=True)

x = Input(shape=(None, ndim), name='x')
out = rnn_cell(x)
out = Dense(ndim, activation='softmax')(out)

model = Model(x, out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
x (InputLayer)               (None, None, 3757)        0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, None, 128)         497408    
_________________________________________________________________
dense_1 (Dense)              (None, None, 3757)        484653    
Total params: 982,061
Trainable params: 982,061
Non-trainable params: 0
_________________________________________________________________


In [6]:
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy')

## Training model

### Option 1: traning model using variable length

In [8]:
from itertools import groupby
from random import shuffle
import progressbar
lines = open(filepath, encoding='utf-8').readlines()
shuffle(lines)

for epochs in range(20):
    counter = 0
    bar = progressbar.ProgressBar(max_value=len(lines))
    for length, chunks in groupby(lines, key=len):
        chk = list(chunks)
        arr = np.zeros((len(chk), length, ndim), dtype=bool)
        counter += len(chk)
        for i, name in enumerate(chk):
            for j, ch in enumerate(name):
                arr[i, j, ch2idx[ch]] = 1
        x = arr[:, :-1]
        y = arr[:, 1:]
        model.fit(x, y, batch_size=128, verbose=0)
        bar.update(counter)

100% (86716 of 86716) |###################| Elapsed Time: 0:01:08 Time: 0:01:08
100% (86716 of 86716) |###################| Elapsed Time: 0:01:08 Time: 0:01:08
100% (86716 of 86716) |###################| Elapsed Time: 0:01:09 Time: 0:01:09
100% (86716 of 86716) |###################| Elapsed Time: 0:01:09 Time: 0:01:09
100% (86716 of 86716) |###################| Elapsed Time: 0:01:08 Time: 0:01:08
100% (86716 of 86716) |###################| Elapsed Time: 0:01:09 Time: 0:01:09
100% (86716 of 86716) |###################| Elapsed Time: 0:01:09 Time: 0:01:09
100% (86716 of 86716) |###################| Elapsed Time: 0:01:09 Time: 0:01:09
100% (86716 of 86716) |###################| Elapsed Time: 0:01:09 Time: 0:01:09
100% (86716 of 86716) |###################| Elapsed Time: 0:01:08 Time: 0:01:08
100% (86716 of 86716) |###################| Elapsed Time: 0:01:08 Time: 0:01:08
100% (86716 of 86716) |###################| Elapsed Time: 0:01:08 Time: 0:01:08
100% (86716 of 86716) |#################

### Option 2: padding name to maxlength using '\n'

In [None]:
def padding_name(name, max_length):
    name += '\n' * (max_length - len(name))
    return name

In [None]:
from random import shuffle
lines = open(filepath, encoding='utf-8').readlines()
shuffle(lines)
max_length = len(max(lines, key=len))
lines = [padding_name(name, max_length) for name in lines]

def batch_generator(lines, batch_size=128):
    chunks = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
    while True:
        for chunk in chunks:
            arr = np.zeros((batch_size, max_length, ndim))
            for i, name in enumerate(chunk):
                for j, ch in enumerate(name):
                    arr[i, j, ch2idx[ch]] = 1
            x = arr[:, :-1]
            y = arr[:, 1:]
            yield (x, y)

model.fit_generator(generator=batch_generator(lines), steps_per_epoch=len(lines)/128, epochs=1)

## Save model
Since Python use different random seed for hash function in every session, in order to reload trained model, you cannot save only model itself, but also ch2idx, idx2ch object as well because these 2 objects use set function to generate dict object.

See also https://stackoverflow.com/a/27522708/7620214

In [9]:
# save model
model.save('model/chi_name.h5')

# save dict
save_dict('model/ch2idx.pkl', ch2idx)
save_dict('model/idx2ch.pkl', idx2ch)

## Load model and sampling

In [10]:
from keras.models import load_model
# load model
model = load_model('model/chi_name.h5')

# load dict
ch2idx = load_dict('model/ch2idx.pkl')
idx2ch = load_dict('model/idx2ch.pkl')

In [11]:
def sample(first_name):
    name = first_name
    last_prob = 1
    for i in range(1, 10):
        next = word2arr(name)
        next = model.predict(next.reshape(1, i, ndim))
        ch, prob = arr2word(next.reshape(i, ndim)[-1].reshape(1, ndim))
        if ch[0] is '\n':
            break
        name += ch[0]
        last_prob *= prob
    return name, last_prob

In [12]:
first_name = ['陈', '张', '王', '赵', '钱', '孙', '李', '黄', '周', '杨', '何', '欧', '上']
for name in first_name:
    for _ in range(10):
        print(sample(name))

('陈开静', 3.2350599947714447e-06)
('陈进俊', 0.00010984689275285626)
('陈欣志', 2.845677842833465e-05)
('陈芳林', 2.1386588258407238e-05)
('陈晓海', 0.00025052245772494083)
('陈福忠', 1.5073073087405915e-05)
('陈青', 0.0016482708742842078)
('陈民', 0.0007179697277024388)
('陈中锦', 2.071043745415803e-05)
('陈天', 0.014639856293797493)
('张国翠', 5.243099708932865e-05)
('张博', 0.0016613940242677927)
('张慧', 0.009406695142388344)
('张敏萍', 0.00010950164997166473)
('张万', 0.0007539856596849859)
('张明安', 0.0001086827391844885)
('张', 1)
('张世会', 8.934337866905616e-06)
('张晓建', 0.00021921020577242434)
('张春旺', 7.235907900231037e-06)
('王武明', 1.2880774324599718e-06)
('王建刚', 1.5832891328023345e-05)
('王小', 0.012258147820830345)
('王秀', 0.012613345868885517)
('王芳', 0.006224652286618948)
('王龙', 0.005746682174503803)
('王', 1)
('王兴', 0.012211103923618793)
('王华', 0.01619190350174904)
('王莹君', 3.6088496159969783e-07)
('赵柏文', 2.577176772160304e-06)
('赵凤贵', 4.4025703446773595e-06)
('赵', 1)
('赵家双', 0.00017261608799236036)
('赵广永', 9.77951371311