https://pypi.org/project/keras-transformer/

https://github.com/GlassyWing/transformer-word-segmenter

集束搜索可以认为是维特比算法的贪心形式，在维特比所有中由于利用动态规划导致当字典较大时效率低，而集束搜索使用beam size参数来限制在每一步保留下来的可能性词的数量。集束搜索是在测试阶段为了获得更好准确性而采取的一种策略，在训练阶段无需使用。

In [8]:
# toy数据
# Build a small toy token dictionary
tokens = 'all work and no play makes jack a dull boy'.split(' ')
token_dict = {
    '<PAD>': 0,
    '<START>': 1,
    '<END>': 2,
}
# 字典
for token in tokens:
    if token not in token_dict:
        token_dict[token] = len(token_dict)

print(token_dict)
# Generate toy data
encoder_inputs_no_padding = []
encoder_inputs, decoder_inputs, decoder_outputs = [], [], []
for i in range(1, len(tokens) - 1):
    encode_tokens, decode_tokens = tokens[:i], tokens[i:]
    encode_tokens = ['<START>'] + encode_tokens + ['<END>'] + ['<PAD>'] * (len(tokens) - len(encode_tokens))
    output_tokens = decode_tokens + ['<END>', '<PAD>'] + ['<PAD>'] * (len(tokens) - len(decode_tokens))
    decode_tokens = ['<START>'] + decode_tokens + ['<END>'] + ['<PAD>'] * (len(tokens) - len(decode_tokens))
    
    encode_tokens = list(map(lambda x: token_dict[x], encode_tokens))
    decode_tokens = list(map(lambda x: token_dict[x], decode_tokens))
    output_tokens = list(map(lambda x: [token_dict[x]], output_tokens))
    encoder_inputs_no_padding.append(encode_tokens[:i + 2])
    
    encoder_inputs.append(encode_tokens)
    decoder_inputs.append(decode_tokens)
    decoder_outputs.append(output_tokens)
for e in encoder_inputs:
    print(e)
print("__")
for e in decoder_inputs:
    print(e)
    
for e in decoder_outputs:
    print(e)

{'<PAD>': 0, '<START>': 1, '<END>': 2, 'all': 3, 'work': 4, 'and': 5, 'no': 6, 'play': 7, 'makes': 8, 'jack': 9, 'a': 10, 'dull': 11, 'boy': 12}
[1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 2, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 2, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 8, 2, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 8, 9, 2, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 8, 9, 10, 2, 0, 0]
__
[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2, 0]
[1, 5, 6, 7, 8, 9, 10, 11, 12, 2, 0, 0]
[1, 6, 7, 8, 9, 10, 11, 12, 2, 0, 0, 0]
[1, 7, 8, 9, 10, 11, 12, 2, 0, 0, 0, 0]
[1, 8, 9, 10, 11, 12, 2, 0, 0, 0, 0, 0]
[1, 9, 10, 11, 12, 2, 0, 0, 0, 0, 0, 0]
[1, 10, 11, 12, 2, 0, 0, 0, 0, 0, 0, 0]
[1, 11, 12, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[[4], [5], [6], [7], [8], [9], [10], [11], [12], [2], [0], [0]]
[[5], [6], [7], [8], [9], [10], [11], [12], [2], [0], [0], [0]]
[[6], [7], [8], [9], [10], [11], [12], [2], [0], [0], [0], [0]]
[[7], [8], [9], [10], [11], [12], [2], [0], 

In [9]:
# 机器翻译toy-data
source_tokens = [
    'i need more power'.split(' '),
    'eat jujube and pill'.split(' '),
]
target_tokens = [
    list('我要更多的抛瓦'),
    list('吃枣💊'),
]

# 生成不同语言的词典
def build_token_dict(token_list):
    token_dict = {
        '<PAD>': 0,
        '<START>': 1,
        '<END>': 2,
    }
    for tokens in token_list:
        for token in tokens:
            if token not in token_dict:
                token_dict[token] = len(token_dict)
    return token_dict

source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v: k for k, v in target_token_dict.items()}

# 添加特殊符号
encode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in source_tokens]
decode_tokens = [['<START>'] + tokens + ['<END>'] for tokens in target_tokens]
output_tokens = [tokens + ['<END>', '<PAD>'] for tokens in target_tokens]

# 补齐长度
source_max_len = max(map(len, encode_tokens))
target_max_len = max(map(len, decode_tokens))

encode_tokens = [tokens + ['<PAD>'] * (source_max_len - len(tokens)) for tokens in encode_tokens]
decode_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in decode_tokens]
output_tokens = [tokens + ['<PAD>'] * (target_max_len - len(tokens)) for tokens in output_tokens]

encode_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encode_tokens]
decode_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decode_tokens]
decode_output = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

for e in encoder_inputs:
    print(e)
print("__")
for e in decoder_inputs:
    print(e)
    
for e in decoder_outputs:
    print(e)

[1, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 2, 0, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 2, 0, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 2, 0, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 8, 2, 0, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 8, 9, 2, 0, 0, 0]
[1, 3, 4, 5, 6, 7, 8, 9, 10, 2, 0, 0]
__
[1, 4, 5, 6, 7, 8, 9, 10, 11, 12, 2, 0]
[1, 5, 6, 7, 8, 9, 10, 11, 12, 2, 0, 0]
[1, 6, 7, 8, 9, 10, 11, 12, 2, 0, 0, 0]
[1, 7, 8, 9, 10, 11, 12, 2, 0, 0, 0, 0]
[1, 8, 9, 10, 11, 12, 2, 0, 0, 0, 0, 0]
[1, 9, 10, 11, 12, 2, 0, 0, 0, 0, 0, 0]
[1, 10, 11, 12, 2, 0, 0, 0, 0, 0, 0, 0]
[1, 11, 12, 2, 0, 0, 0, 0, 0, 0, 0, 0]
[[4], [5], [6], [7], [8], [9], [10], [11], [12], [2], [0], [0]]
[[5], [6], [7], [8], [9], [10], [11], [12], [2], [0], [0], [0]]
[[6], [7], [8], [9], [10], [11], [12], [2], [0], [0], [0], [0]]
[[7], [8], [9], [10], [11], [12], [2], [0], [0], [0], [0], [0]]
[[8], [9], [10], [11], [12], [2], [0], [0], [0], [0], [0], [0]]
[[9], [10], [11], [12], [2], [0], [0], [0], [0], [0], [0], [0

In [10]:
import numpy as np
from keras_transformer import get_model, decode

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [11]:
# 构建和训练模型
model = get_model(
    token_num=max(len(source_token_dict), len(target_token_dict)),
    embed_dim=32,
    encoder_num=2,
    decoder_num=2,
    head_num=4,
    hidden_dim=128,
    dropout_rate=0.05,
    use_same_embed=False,  # 不同语言需要使用不同的词嵌入
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()

model.fit(
    x=[np.array(encode_input * 1024), np.array(decode_input * 1024)],
    y=np.array(decode_output * 1024),
    epochs=10,
    batch_size=32,
)






Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Encoder-Input (InputLayer)      (None, None)         0                                            
__________________________________________________________________________________________________
Encoder-Token-Embedding (Embedd [(None, None, 32), ( 416         Encoder-Input[0][0]              
__________________________________________________________________________________________________
Encoder-Embedding (TrigPosEmbed (None, None, 32)     0           Encoder-Token-Embedding[0][0]    
__________________________________________________________________________________________________
Encoder-1-MultiHeadSelfAttentio (None, None, 32)     4224        Encoder-Embedding[0][0

<keras.callbacks.History at 0x7f0374ddc710>

In [12]:
# 预测过程
decoded = decode(
    model,
    encode_input,
    start_token=target_token_dict['<START>'],
    end_token=target_token_dict['<END>'],
    pad_token=target_token_dict['<PAD>'],
)
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[0][1:-1])))
print(''.join(map(lambda x: target_token_dict_inv[x], decoded[1][1:-1])))

我要更多的抛瓦
吃枣💊
