In [1]:
import keras as ks
from keras.models import Sequential, Model
from keras.layers import Input, SimpleRNN, Activation, LSTM, Reshape, Lambda, Dense
from keras.preprocessing import sequence
import numpy as np

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
filepath = 'data/chi_names.txt'
data = open(filepath, encoding='utf-8').read().lower()
uniq = set(data)
ndim = len(uniq)
print('unique characters: len =', ndim)

unique characters: len = 3757


## helper function

In [3]:
def word2arr(name):
    arr = np.zeros((len(name), ndim))
    for i, ch in enumerate(name):
        arr[i, ch2idx[ch]] = 1
    return arr


def arr2word(arr, showProb=False):
    name = ''
    prob = 1.0
    for vec in arr:
        ch = np.random.choice(ndim, p=vec)
        if showProb:
            print(vec[ch])
        prob *= vec[ch]
        name += idx2ch[ch]
    return name, prob

import pickle
def save_dict(filepath, dict_obj):
    with open(filepath, "wb") as f:
        pickle.dump(dict_obj, f)

        
def load_dict(filepath):
    with open(filepath, "rb") as f:
        return pickle.load(f)

## Creating Model

In [4]:
ch2idx = {word: idx for idx, word in enumerate(uniq)}
idx2ch = {idx: word for idx, word in enumerate(uniq)}

In [5]:
hidden = 128
rnn_cell = SimpleRNN(hidden, return_sequences=True)

x = Input(shape=(None, ndim), name='x')
out = rnn_cell(x)
out = Dense(ndim, activation='softmax')(out)

model = Model(x, out)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
x (InputLayer)               (None, None, 3757)        0         
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, None, 128)         497408    
_________________________________________________________________
dense_1 (Dense)              (None, None, 3757)        484653    
Total params: 982,061
Trainable params: 982,061
Non-trainable params: 0
_________________________________________________________________


In [6]:
model.compile(optimizer='rmsprop',
              loss='categorical_crossentropy')

## Training model

### Option 1: traning model using variable length

In [7]:
from itertools import groupby
from random import shuffle
import progressbar
lines = open(filepath, encoding='utf-8').readlines()
shuffle(lines)

counter = 0
bar = progressbar.ProgressBar(max_value=len(lines))
for length, chunks in groupby(lines, key=len):
    chk = list(chunks)
    arr = np.zeros((len(chk), length, ndim), dtype=bool)
    counter += len(chk)
    for i, name in enumerate(chk):
        for j, ch in enumerate(name):
            arr[i, j, ch2idx[ch]] = 1
    x = arr[:, :-1]
    y = arr[:, 1:]
    model.fit(x, y, batch_size=128, verbose=0)
    bar.update(counter)

100% (86716 of 86716) |###################| Elapsed Time: 0:13:37 ETA:  0:00:00

### Option 2: padding name to maxlength using '\n'

In [None]:
def padding_name(name, max_length):
    name += '\n' * (max_length - len(name))
    return name

In [None]:
from random import shuffle
lines = open(filepath, encoding='utf-8').readlines()
shuffle(lines)
max_length = len(max(lines, key=len))
lines = [padding_name(name, max_length) for name in lines]

def batch_generator(lines, batch_size=128):
    chunks = [lines[i:i + batch_size] for i in range(0, len(lines), batch_size)]
    while True:
        for chunk in chunks:
            arr = np.zeros((batch_size, max_length, ndim))
            for i, name in enumerate(chunk):
                for j, ch in enumerate(name):
                    arr[i, j, ch2idx[ch]] = 1
            x = arr[:, :-1]
            y = arr[:, 1:]
            yield (x, y)

model.fit_generator(generator=batch_generator(lines), steps_per_epoch=len(lines)/128, epochs=1)

## Save model
Since Python use different random seed for hash function in every session, in order to reload trained model, you cannot save only model itself, but also ch2idx, idx2ch object as well because these 2 objects use set function to generate dict object.

See also https://stackoverflow.com/a/27522708/7620214

In [8]:
# save model
model.save('model/chi_name.h5')

# save dict
save_dict('model/ch2idx.pkl', ch2idx)
save_dict('model/idx2ch.pkl', idx2ch)

## Load model and sampling

In [None]:
from keras.models import load_model
# load model
model = load_model('model/chi_name.h5')

# load dict
ch2idx = load_dict('model/ch2idx.pkl')
idx2ch = load_dict('model/idx2ch.pkl')

In [9]:
def sample(first_name):
    name = first_name
    last_prob = 1
    for i in range(1, 10):
        next = word2arr(name)
        next = model.predict(next.reshape(1, i, ndim))
        ch, prob = arr2word(next.reshape(i, ndim)[-1].reshape(1, ndim))
        if ch[0] is '\n':
            break
        name += ch[0]
        last_prob = prob
    return name, last_prob

In [10]:
first_name = ['陈', '张', '王', '赵', '钱', '孙', '李', '黄', '周', '杨', '何', '欧', '上']
for name in first_name:
    for _ in range(10):
        print(sample(name))

('陈天雄', 0.003409971483051777)
('陈澍', 3.940694296034053e-05)
('陈', 1)
('陈海厚', 0.0002645377826411277)
('陈概张', 0.00016857880109455436)
('陈清', 0.0034715617075562477)
('陈灵尤', 0.0003563206410035491)
('陈桃', 0.0003612065629567951)
('陈博', 0.001150886993855238)
('陈秀海', 0.008844899013638496)
('张东', 0.005207078997045755)
('张怀伯', 0.0002961941354442388)
('张凤学', 0.0027856621891260147)
('张赞双', 0.0009214482852257788)
('张华', 0.010215764865279198)
('张璇', 0.00032445212127640843)
('张静锋', 0.004459536634385586)
('张亚', 0.003247500630095601)
('张双', 0.0015154107240960002)
('张立树', 0.002470780862495303)
('王美', 0.004075216129422188)
('王强', 0.0023727361112833023)
('王维', 0.003937355242669582)
('王', 1)
('王家海', 0.00729594798758626)
('王仙雯', 0.001730739139020443)
('王小青', 0.002841019770130515)
('王铁成', 0.0037237636279314756)
('王雪革', 0.00010903873044298962)
('王晗', 0.0004707835614681244)
('赵阳月', 0.0014148115878924727)
('赵苏', 0.0009791180491447449)
('赵承', 0.0010355053236708045)
('赵儿', 0.00010591329191811383)
('赵惠顺', 0.000988