In [1]:
import json
import numpy as np
from bert4keras.backend import keras, K, batch_gather
from bert4keras.layers import Loss
from bert4keras.layers import LayerNormalization
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam, extend_with_exponential_moving_average
from bert4keras.snippets import sequence_padding 
from bert4keras.snippets import open, to_array
from keras.layers import Input, Dense, Lambda, Reshape
from keras.models import Model

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [2]:
bert_vocab_path = 'vocab.txt'
tokenizer = Tokenizer(bert_vocab_path, do_lower_case=True)

In [3]:
text = '爱德华·尼科·埃尔南迪斯（1986-），是一位身高只有70公分哥伦比亚男子，体重10公斤，只比随身行李高一些，2010年获吉尼斯世界纪录正式认证，成为全球当今最矮的成年男人'
print(tokenizer.encode(text))

([101, 4263, 2548, 1290, 185, 2225, 4906, 185, 1812, 2209, 1298, 6832, 3172, 8020, 8629, 118, 8021, 8024, 3221, 671, 855, 6716, 7770, 1372, 3300, 8203, 1062, 1146, 1520, 840, 3683, 762, 4511, 2094, 8024, 860, 7028, 8108, 1062, 3165, 8024, 1372, 3683, 7390, 6716, 6121, 3330, 7770, 671, 763, 8024, 8166, 2399, 5815, 1395, 2225, 3172, 686, 4518, 5279, 2497, 3633, 2466, 6371, 6395, 8024, 2768, 711, 1059, 4413, 2496, 791, 3297, 4765, 4638, 2768, 2399, 4511, 782, 102], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])


In [5]:
print(tokenizer.tokenize(text))

['[CLS]', '爱', '德', '华', '·', '尼', '科', '·', '埃', '尔', '南', '迪', '斯', '（', '1986', '-', '）', '，', '是', '一', '位', '身', '高', '只', '有', '70', '公', '分', '哥', '伦', '比', '亚', '男', '子', '，', '体', '重', '10', '公', '斤', '，', '只', '比', '随', '身', '行', '李', '高', '一', '些', '，', '2010', '年', '获', '吉', '尼', '斯', '世', '界', '纪', '录', '正', '式', '认', '证', '，', '成', '为', '全', '球', '当', '今', '最', '矮', '的', '成', '年', '男', '人', '[SEP]']


In [6]:
tokenizer.rematch(text, tokenizer.tokenize(text))

[[],
 [0],
 [1],
 [2],
 [3],
 [4],
 [5],
 [6],
 [7],
 [8],
 [9],
 [10],
 [11],
 [12],
 [13, 14, 15, 16],
 [17],
 [18],
 [19],
 [20],
 [21],
 [22],
 [23],
 [24],
 [25],
 [26],
 [27, 28],
 [29],
 [30],
 [31],
 [32],
 [33],
 [34],
 [35],
 [36],
 [37],
 [38],
 [39],
 [40, 41],
 [42],
 [43],
 [44],
 [45],
 [46],
 [47],
 [48],
 [49],
 [50],
 [51],
 [52],
 [53],
 [54],
 [55, 56, 57, 58],
 [59],
 [60],
 [61],
 [62],
 [63],
 [64],
 [65],
 [66],
 [67],
 [68],
 [69],
 [70],
 [71],
 [72],
 [73],
 [74],
 [75],
 [76],
 [77],
 [78],
 [79],
 [80],
 [81],
 [82],
 [83],
 [84],
 [85],
 []]

In [7]:
class DataGenerator(object):
    """数据生成器模版
    """
    def __init__(self, data, batch_size=32, buffer_size=None):
        self.data = data
        self.batch_size = batch_size
        if hasattr(self.data, '__len__'):
            self.steps = len(self.data) // self.batch_size
            if len(self.data) % self.batch_size != 0:
                self.steps += 1
        else:
            self.steps = None
        self.buffer_size = buffer_size or batch_size * 1000

    def __len__(self):
        return self.steps

    def sample(self, random=False):
        """采样函数，每个样本同时返回一个is_end标记
        """
        if random:
            if self.steps is None:

                def generator():
                    caches, isfull = [], False
                    for d in self.data:
                        caches.append(d)
                        if isfull:
                            i = np.random.randint(len(caches))
                            yield caches.pop(i)
                        elif len(caches) == self.buffer_size:
                            isfull = True
                    while caches:
                        i = np.random.randint(len(caches))
                        yield caches.pop(i)

            else:

                def generator():
                    for i in np.random.permutation(len(self.data)):
                        yield self.data.loc[i]

            data = generator()
        else:
            data = iter(self.data.iterrows())
        d_current = next(data)
        for d_next in data:
            yield False, d_current
            d_current = d_next

        yield True, d_current

    def __iter__(self, random=False):
        raise NotImplementedError

    def forfit(self, random=True):
        while True:
            for d in self.__iter__(random):
                yield d


In [8]:
def search(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到，返回第一个下标；否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1

In [30]:
maxlen = 128

class data_generator(DataGenerator):
    """数据生成器
    """
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids = [], []
        batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []
        for is_end, d in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(d[1][1], maxlen=maxlen)
            # 整理三元组 {s: [(o, p)]}
            spoes = {}
            for spo in d[1][2]:
                print(spo)
                s = tokenizer.encode(spo['subject'])[1:-1]
                print(s)
                p = predicate2id[spo['predicate']]
                o = tokenizer.encode(spo['object'])[1:-1]
                s_idx = search(s, token_ids)
                o_idx = search(o, token_ids)
                if s_idx != -1 and o_idx != -1:
                    s = (s_idx, s_idx + len(s) - 1)
                    o = (o_idx, o_idx + len(o) - 1, p)
                    if s not in spoes:
                        spoes[s] = []
                    spoes[s].append(o)
            if spoes:
                # subject标签
                subject_labels = np.zeros((len(token_ids), 2))
                for s in spoes:
                    subject_labels[s[0], 0] = 1
                    subject_labels[s[1], 1] = 1
                # 随机选一个subject（这里没有实现错误！这就是想要的效果！！）
                start, end = np.array(list(spoes.keys())).T
                start = np.random.choice(start)
                end = np.random.choice(end[end >= start])
                subject_ids = (start, end)
                # 对应的object标签
                object_labels = np.zeros((len(token_ids), len(predicate2id), 2))
                for o in spoes.get(subject_ids, []):
                    object_labels[o[0], o[2], 0] = 1
                    object_labels[o[1], o[2], 1] = 1
                # 构建batch
                batch_token_ids.append(token_ids)
                batch_segment_ids.append(segment_ids)
                batch_subject_labels.append(subject_labels)
                batch_subject_ids.append(subject_ids)
                batch_object_labels.append(object_labels)
                if len(batch_token_ids) == self.batch_size or is_end:
                    batch_token_ids = sequence_padding(batch_token_ids)
                    batch_segment_ids = sequence_padding(batch_segment_ids)
                    batch_subject_labels = sequence_padding(
                        batch_subject_labels
                    )
                    batch_subject_ids = np.array(batch_subject_ids)
                    batch_object_labels = sequence_padding(batch_object_labels)
                    yield [
                        batch_token_ids, batch_segment_ids,
                        batch_subject_labels, batch_subject_ids,
                        batch_object_labels
                    ], None
                    batch_token_ids, batch_segment_ids = [], []
                    batch_subject_labels, batch_subject_ids, batch_object_labels = [], [], []

In [10]:
import pandas as pd
train_path = r'D:\Documents\project\demo\transformers_tutorial\data\百度关系抽取数据集\train_data.json'
train_data = pd.read_json(train_path, lines=True)[:4]

schemads_path = 'D:/Documents/project/demo/transformers_tutorial/data/百度关系抽取数据集/all_50_schemas'
predicate_data = pd.read_json(schemads_path, lines=True)
id2p = predicate_data['predicate'].drop_duplicates().reset_index(drop=True).to_dict()
predicate2id = dict(zip(id2p.values(), id2p.keys()))

In [31]:
train_generator = data_generator(train_data, batch_size=32)

In [32]:
list(train_generator)

()
()
()
()
()
()


[]

In [37]:
import tensorflow as tf

x = K.constant([5, 4, 6])
y = K.constant([5, 2, 5])
z = K.cast(K.greater(x, y),'float32')

with tf.Session() as sess:
    print(sess.run(z))


[0. 1. 1.]


In [44]:
x = K.constant([5, 4, 6])
y = K.constant([5, 2, 5])
z = K.sum([x,y],axis=0)
with tf.Session() as sess:
    print(sess.run(z))

[10.  6. 11.]


In [76]:
tokenizer.rematch(text, tokens)

NameError: name 'tokenizer' is not defined