In [1]:
from keras_bert import load_trained_model_from_checkpoint

# Locations of the BERT config JSON and the checkpoint to load.
config_path = './model/bert_config.json'
checkpoint_path = './model/model.ckpt-1400000'

# Build the model from the config + checkpoint and print its layer summary.
# `bert` is kept at module level so later cells can call bert.predict(...).
bert = load_trained_model_from_checkpoint(config_path, checkpoint_path)
bert.summary()

Using TensorFlow backend.


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Input-Token (InputLayer)        (None, 512)          0                                            
__________________________________________________________________________________________________
Input-Segment (InputLayer)      (None, 512)          0                                            
__________________________________________________________________________________________________
Embedding-Token (TokenEmbedding [(None, 512, 768), ( 24576000    Input-Token[0][0]                
__________________________________________________________________________________________________
Embedding-Segment (Embedding)   (None, 512, 768)     1536        Input-Segment[0][0]              
__________________________________________________________________________________________________
Embedding-

In [2]:
import sentencepiece as spm
import numpy as np

# Module-level SentencePiece processor, loaded from the model file below;
# texts2matrix uses it for tokenization and piece -> id lookup.
spp = spm.SentencePieceProcessor()
spp.Load('./model/wiki-ja.model')

def texts2matrix(texts):
    """Tokenize texts, run them through BERT, and return the output at position 0.

    Each text has its ASCII spaces removed, is split into SentencePiece
    pieces with the module-level ``spp`` processor, wrapped as
    ``[CLS] + pieces + [SEP]``, and converted to ids that fill one row of a
    ``(len(texts), 512)`` zero-initialised matrix. The matrix, together with an
    all-zero segment input of the same shape, is fed to the module-level
    ``bert`` model.

    Args:
        texts: Sequence of strings to embed.

    Returns:
        ``bert.predict([ids, segments])[:, 0]`` -- the model output at the
        first sequence position (the ``[CLS]`` slot) for every input row.

    Notes:
        A text that yields no pieces, or more pieces than fit next to
        ``[CLS]``/``[SEP]`` within 512 positions, is reported with
        "skip processing" and its row is left as all zeros. That row is still
        passed to the model, so the result keeps one row per input text, but
        the values in a skipped row do not represent the text.
    """
    maxlen = 512 # from BERT config
    n_special = 2  # every sequence is wrapped in '[CLS]' ... '[SEP]'
    common_seg_input = np.zeros((len(texts), maxlen), dtype = np.float32)
    matrix = np.zeros((len(texts), maxlen), dtype = np.float32)
    for i, text in enumerate(texts):
        tok = list(spp.encode_as_pieces(text.replace(" ", "")))
        # Leave room for the two wrapper tokens: comparing against maxlen alone
        # lets 511/512-piece texts through and then overflows the row below.
        if not tok or len(tok) > maxlen - n_special:
            print("skip processing", tok)
            continue
        tokens = ['[CLS]'] + tok + ['[SEP]']
        for t, token in enumerate(tokens):
            try:
                matrix[i, t] = spp.piece_to_id(token)
            except Exception:
                # Best-effort fallback kept from the original, but narrowed so
                # KeyboardInterrupt / SystemExit are no longer swallowed.
                print(token+"is unknown")
                matrix[i, t] = spp.piece_to_id('<unk>')
    return bert.predict([matrix, common_seg_input])[:,0] # embedding of [CLS]

In [3]:
# Inspect how the loaded SentencePiece model splits a sample sentence into pieces.
spp.encode_as_pieces("隣の客はよく柿食う客だ")

['▁', '隣の', '客', 'は', 'よく', '柿', '食', 'う', '客', 'だ']

In [4]:
# Run the same sample sentence through texts2matrix and display the returned array.
texts2matrix(["隣の客はよく柿食う客だ"])

array([[-3.41196954e-01,  1.03131640e+00,  1.53629273e-01,
        -4.40728009e-01,  1.48483515e-02,  8.22932497e-02,
        -2.90441006e-01,  7.41560638e-01,  9.66181457e-01,
        -5.53366125e-01, -3.13522905e-01,  4.32044834e-01,
        -3.25102150e-01, -2.92775273e-01, -3.10505331e-02,
        -4.70475703e-01, -5.41527793e-02, -5.28014079e-02,
         5.45616820e-02, -4.55384225e-01, -1.48637548e-01,
        -2.70385772e-01,  3.95462364e-01,  7.42649853e-01,
         3.23779643e-01, -1.18695222e-01,  2.62434155e-01,
        -3.65382016e-01,  3.53820950e-01,  9.89902139e-01,
         5.05595505e-01,  5.27373552e-01,  3.30920160e-01,
        -6.04039207e-02, -9.53062773e-02, -2.08897784e-01,
        -3.60898823e-01, -4.41889614e-01, -1.66381931e+00,
         8.48136604e-01,  9.16482806e-02,  1.83489114e-01,
         2.44988471e-01, -1.15357660e-01,  4.47331876e-01,
        -1.04940629e+00,  1.93154171e-01,  1.48590133e-02,
         1.32922947e-01,  1.21201493e-01, -2.23365307e-0