In [1]:
import random
from random import choice
import numpy as np
import pandas as pd
from transformers import BertTokenizer,  BertConfig, TFBertModel
import tensorflow as tf
from tensorflow.keras import backend as K

# 数据集整理
## 读取数据

In [4]:
def json_to_df(path, nrows=False):
    if nrows:
        df = pd.read_json(path, nrows=nrows, lines=True)
    else:
        df = pd.read_json(path, lines=True)
    df = df[['text', 'spo_list']]
    return df

In [5]:
%%time
train_path ='../data/百度关系抽取数据集/train_data.json'
# train_path = '../data/百度关系抽取数据集/experiment.json']

train_data = json_to_df(train_path, nrows=5000)
print(f'Train data size: {train_data.shape}') #

dev_path = '../data/百度关系抽取数据集/dev_data.json'
# dev_path =  '../data/百度关系抽取数据集/experiment.json'
dev_data = json_to_df(dev_path, nrows=5000)
print(f'Validation data size: {dev_data.shape}') #

Train data size: (5000, 2)
Validation data size: (5000, 2)
Wall time: 365 ms


## 清洗数据

In [6]:
def clean_spo(spo_list):
    for spo in spo_list:
        spo['predicate'] = spo['predicate'].lower()
        spo['subject'] = spo['subject'].lower()
        spo['object'] = spo['object'].lower()
    return spo_list

In [7]:
def data_clean(df):
    df['text'] = df['text'].str.lower()
    df['spo_list'] = df['spo_list'].apply(clean_spo)
    return df

In [9]:
train_data = data_clean(train_data)
dev_data = data_clean(train_data)

train_text = train_data['text'].to_list()
train_spo = train_data['spo_list'].to_list()

dev_text = dev_data['text'].to_list()
dev_spo = dev_data['spo_list'].to_list()
dev_spo[:2]

[[{'predicate': '主演',
   'object_type': '人物',
   'subject_type': '影视作品',
   'object': '周星驰',
   'subject': '喜剧之王'}],
 [{'predicate': '目',
   'object_type': '目',
   'subject_type': '生物',
   'object': '半翅目',
   'subject': '茶树茶网蝽'}]]

In [10]:
%%time
def read_schemads(path):
    predicate_data = pd.read_json(schemads_path, lines=True)
    id2p = predicate_data['predicate'].drop_duplicates().reset_index(drop=True).to_dict()
    p2id = dict(zip(id2p.values(), id2p.keys()))
    print(f'length of p2id :{len(p2id)}')#
    print(f'random p2id sample:{random.sample(p2id.items(), 5)}')#
    return id2p, p2id

Wall time: 0 ns


In [15]:
schemads_path = '../data/百度关系抽取数据集/all_50_schemas'
id2p, p2id = read_schemads(schemads_path)

length of p2id :49
random p2id sample:[('所在城市', 41), ('简称', 6), ('嘉宾', 45), ('编剧', 20), ('所属专辑', 9)]


## 数据集处理

In [11]:
def find_head_idx(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到，返回第一个下标；否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1

In [118]:
def data_process(df, p2id, tokenizer, max_length):
    total_token_ids, total_token_type_ids, total_attention_mask = [], [], []
    total_subject_labels, total_subject_ids, total_object_labels = [], [], []
    for idx in df.index:
        row = df.loc[idx]
        inputs = tokenizer(row['text'], max_length=max_length, padding='max_length', truncation=True)
        token_ids, token_type_ids, attention_mask = inputs['input_ids'], inputs['token_type_ids'], inputs['attention_mask']
        # 实体关系token id 位置字典
        s2op_map = {}
        for sop_n, spo in enumerate(row['spo_list']):
            sub_ids = tokenizer.encode(spo['subject'])[1:-1]
            p_id = p2id[spo['predicate']]
            obj_ids = tokenizer.encode(spo['object'])[1:-1]
            # 查找subject 和 object对应的token id 起始索引
            sub_head_idx = find_head_idx(sub_ids, inputs['input_ids'])
            obj_head_idx = find_head_idx(obj_ids, inputs['input_ids'])
            if sub_head_idx != -1 and obj_head_idx != -1:
                # 获取subject 起始位置的索引和结束位置索引的元组
                sub = (sub_head_idx, sub_head_idx + len(sub_ids) - 1)
                # 获取object 起始位置的索引和结束位置索引元组以及与关系标签id的元组对
                obj = (obj_head_idx, obj_head_idx + len(obj_ids) - 1, p_id)
                # print('---- sub -----', sub)#    
                if sub not in s2op_map:
                    s2op_map[sub] = []
                s2op_map[sub].append(obj)
                # print('-----------', s2op_map)#                
                {(22,23):[(28, 29, 7), (25, 26, 8)]}  
            else:
                print('--idx--', idx, '--text--', row['text'])
                print('--sop_n--', sop_n, '--spo--', spo, '\n')


        if s2op_map:
        # subject标签
            subject_labels = np.zeros((max_length, 2))
            for s in s2op_map:
                #sub_head
                subject_labels[s[0], 0] = 1
                #sub_tail
                subject_labels[s[1], 1] = 1
            # 随机选一个subject
            sub_head, sub_tail = choice(list(s2op_map.keys()))
            subject_ids = (sub_head, sub_tail)
            # sub_head, sub_tail = np.array(list(s2op_map.keys())).T
            # sub_head = np.random.choice(sub_head)
            # sub_tail = np.random.choice(sub_tail[sub_tail >= sub_head])
            # 对应的object标签
            object_labels = np.zeros((len(token_ids), len(p2id), 2))
            for op in s2op_map.get((sub_head, sub_tail), []):
                # print(op)
                # obj_head
                object_labels[op[0], op[2], 0] = 1
                # obj_tail
                object_labels[op[1], op[2], 1] = 1

            # 所有数据汇总
            total_token_ids.append(token_ids)
            total_token_type_ids.append(token_type_ids)
            total_attention_mask.append(attention_mask)
            total_subject_labels.append(subject_labels)
            total_subject_ids.append(subject_ids)
            total_object_labels.append(object_labels)                
    return total_token_ids, total_token_type_ids, total_attention_mask, \
           total_subject_labels, total_subject_ids, total_object_labels

In [161]:
input_demo = data_process(train_data[3:4], p2id, tokenizer, max_length)
print(np.array(input_demo[5]).shape)
print(train_spo[3])
print(np.array(input_demo[5])[0, 25, 14, 0], np.array(input_demo[5])[0, 27, 14, 1])
print(np.array(input_demo[5])[0, 14, 17, 0], np.array(input_demo[5])[0, 14, 17, 1])
# tokenizer.decode(train_input_ids[3][op_s:op_e+1])
print(train_spo[3])

(25, 27, 14)
(14, 14, 17)
(28, 31, 21)
(1, 256, 49, 2)
[{'predicate': '身高', 'object_type': 'Number', 'subject_type': '人物', 'object': '70公分', 'subject': '爱德华·尼科·埃尔南迪斯'}, {'predicate': '出生日期', 'object_type': 'Date', 'subject_type': '人物', 'object': '1986', 'subject': '爱德华·尼科·埃尔南迪斯'}, {'predicate': '国籍', 'object_type': '国家', 'subject_type': '人物', 'object': '哥伦比亚', 'subject': '爱德华·尼科·埃尔南迪斯'}]
1.0 1.0
1.0 1.0
[{'predicate': '身高', 'object_type': 'Number', 'subject_type': '人物', 'object': '70公分', 'subject': '爱德华·尼科·埃尔南迪斯'}, {'predicate': '出生日期', 'object_type': 'Date', 'subject_type': '人物', 'object': '1986', 'subject': '爱德华·尼科·埃尔南迪斯'}, {'predicate': '国籍', 'object_type': '国家', 'subject_type': '人物', 'object': '哥伦比亚', 'subject': '爱德华·尼科·埃尔南迪斯'}]


In [114]:
%%time
model_path = '../model_dirs/bert-base-chinese'  
tokenizer = BertTokenizer.from_pretrained(model_path)
max_length = 256
train_input_ids, train_token_type_ids, train_attention_mask, train_subject_labels, train_subject_ids, train_object_labels  = data_process(train_data, p2id, tokenizer, max_length)
print(train_subject_ids[:2])
print(np.array(train_subject_labels).shape)

--idx-- 1178 --text-- ▌1999年：「喜剧之王」前两年的贺岁档其实都有星爷，只不过作品票房一直跟不上
--sop_n-- 0 --spo-- {'predicate': '上映时间', 'object_type': 'Date', 'subject_type': '影视作品', 'object': '1999年', 'subject': '喜剧之王'} 

[(22, 25), (1, 5)]
(9715, 256, 2)
Wall time: 6.87 s


In [162]:
%%time
val_inputs = tokenizer(dev_text, max_length=max_length, padding='max_length', truncation=True, return_tensors='tf') 
val_input_ids, val_attention_mask = val_inputs['input_ids'], val_inputs['attention_mask']

Wall time: 2.57 s


In [169]:
ix = 1178
text = train_data.loc[ix,'text']
print(len(text), text)
sub = train_data.loc[ix,'spo_list'][0]['subject']
obj = train_data.loc[ix,'spo_list'][0]['object']
print(sub, obj)
max_length = 256
s_ids = tokenizer.encode(sub, max_length=max_length, truncation=True)[1:-1]
o_ids = tokenizer.encode(obj, max_length=max_length, truncation=True)[1:-1]
text_ids = tokenizer.encode(text, max_length=max_length, padding='max_length', truncation=True)
print(text_ids)
print(s_ids)
print(o_ids)
find_head_idx(s_ids, text_ids), find_head_idx(o_ids, text_ids)

39 ▌1999年：「喜剧之王」前两年的贺岁档其实都有星爷，只不过作品票房一直跟不上
喜剧之王 1999年
[101, 456, 8818, 8653, 2399, 8038, 519, 1599, 1196, 722, 4374, 520, 1184, 697, 2399, 4638, 6590, 2259, 3440, 1071, 2141, 6963, 3300, 3215, 4267, 8024, 1372, 679, 6814, 868, 1501, 4873, 2791, 671, 4684, 6656, 679, 677, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[1599, 1196, 722, 4374]
[8338, 2399]


(7, -1)

In [166]:
## debug
model_path = '../model_dirs/bert-base-chinese'  
tokenizer = BertTokenizer.from_pretrained(model_path)
max_length = 128
print(f"text ---\n {train_data.loc[47:100,'text']}")
inputs = tokenizer(train_data.loc[47:100,'text'].to_list(), max_length=max_length, padding='max_length', return_tensors='tf', truncation=True)
print('inputs keys --\n', inputs.keys())
print(f"input_ids --\n {inputs['input_ids']}")
tokens = tokenizer.decode(inputs['input_ids'][0])
print('tokens --\n', tokens)
print(f"spo_list ------\n {train_data.loc[47,'spo_list']}")
text_len = len(tokens)
text_len
bert_model = TFBertModel.from_pretrained(model_path, output_hidden_states=True)
outputs = bert_model(inputs)
last_hidden_state, pooler_output, hidden_states = outputs[:3]

text ---
 47                       《头文字d4》是2005年接力出版社出版的图书，作者是重野秀一
48          苏州硕诺尔自动化设备有限公司于2014年11月14日在苏州市吴中区市场监督管理局登记成立
49     爱德华多·戈塔尔迪1985年7月22日出生于巴西，身高193厘米，惯用脚右脚，主要效力于莱里...
50     个人信息姓名：伊万-佩恩 erwan peron (法国)  国籍：法国  性别：男  生日...
51     【陈赫告白母亲，张子萱晒女儿萌照，低调示爱的二人获得大家的认可】文/热门电影君（原创文章，谢...
52                    严九岳(1574～1621年)，字以赞，号海日，明代福建永安市贡川人
53                         张辉，女，心血管内科，汉族，1963年6月8日生，天津市人
54                  《是我不小心》是陈明演唱的一首歌曲，收录在专辑《相信你总会被我感动》当中
55     覃林盛，湖北力帝机床股份有限公司董事长，在机械制造行业已辛勤耕耘了三十余年，在董事长兼总经理...
56                               姚官保，男，1985年10月出生于河南省汝南县
57                        2015年10月13日，北京龙泉寺举行装藏法会，贤佳法师主持
58                                 刘才光  男，1930年4月生，福建福州人
59     尼尔森总部位于美国纽约，并在伊利诺伊州的商堡（schaumburg）、比利时的瓦韦尔（wav...
60     孙晓健，男，1964年9月出生，研究生学历，1986年2月加入中国共产党，现任泰兴市粮食系统...
61                         《成交百分百》是2003年海潮出版社出版的图书，作者是景斓
62               云南展博电气设备有限公司于2012年11月13日在昆明市工商行政管理局登记成立
63                0后的第三波主持人：纳豆、夏宇童-(代班2周) 节目改名电玩快打 ver 2
64                   

Some layers from the model checkpoint at ../model_dirs/bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at ../model_dirs/bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [18]:
hidden_states[-1].shape

TensorShape([54, 128, 768])

In [170]:
def extract_subject(output, subject_ids):
    start = tf.gather(output, subject_ids[:,:1], batch_dims=1)
    end = tf.gather(output, subject_ids[:,1:], batch_dims=1)
    subject = tf.keras.layers.Concatenate(axis=2)([start, end])
    return subject[:, 0]

extract_subject(hidden_states[-1], tf.constant(train_subject_ids[:hidden_states[-1].shape[0]]))

<tf.Tensor: shape=(54, 1536), dtype=float32, numpy=
array([[ 0.25873297,  0.49238703,  0.63826686, ...,  0.11304718,
        -0.9661733 , -0.4333823 ],
       [ 0.42852157, -0.17608842,  0.8069094 , ...,  1.4964044 ,
        -0.49943933, -0.20207542],
       [ 0.5664912 , -0.12230435,  0.19252312, ...,  0.5424045 ,
         0.27006415,  0.08944741],
       ...,
       [ 0.46302688, -0.4424666 ,  0.5141795 , ...,  0.4153874 ,
         0.5806328 ,  0.04473295],
       [-0.11299253, -0.3138919 , -1.4666147 , ...,  0.36222684,
         0.43323463,  0.29208207],
       [ 0.6611088 ,  0.7782924 , -0.7245413 , ...,  0.38568816,
         0.23528749,  0.6906782 ]], dtype=float32)>

In [172]:
class LayerNormalization(tf.keras.layers.Layer):
    """(Conditional) Layer Normalization
    hidden_*系列参数仅为有条件输入时(conditional=True)使用
    """
    def __init__(
        self,
        center=True,
        scale=True,
        epsilon=None,
        conditional=False,
        hidden_units=None,
        hidden_activation='linear',
        hidden_initializer='glorot_uniform',
        **kwargs):
        super(LayerNormalization, self).__init__(**kwargs)
        self.center = center
        self.scale = scale
        self.conditional = conditional
        self.hidden_units = hidden_units
        self.hidden_activation = tf.keras.activations.get(hidden_activation)
        self.hidden_initializer = tf.keras.initializers.get(hidden_initializer)
        self.epsilon = epsilon or 1e-12
        
    def compute_mask(self, inputs, mask=None):
        if self.conditional:
            masks = mask if mask is not None else []
            masks = [m[None] for m in masks if m is not None]
            if len(masks) == 0:
                return None
            else:
                return K.all(K.concatenate(masks, axis=0), axis=0)
        else:
            return mask
        
    def build(self, input_shape):
        super(LayerNormalization, self).build(input_shape)
        if self.conditional:
            shape = (input_shape[0][-1],)
        else:
            shape = (input_shape[-1],)
        if self.center:
            self.beta = self.add_weight(
                shape=shape, initializer='zeros', name='beta')
        if self.scale:
            self.gamma = self.add_weight(
                shape=shape, initializer='ones', name='gamma')
        if self.conditional:
            if self.hidden_units is not None:
                self.hidden_dense = tf.keras.layers.Dense(
                    units=self.hidden_units,
                    activation=self.hidden_activation,
                    use_bias=False,
                    kernel_initializer=self.hidden_initializer)
            if self.center:
                self.beta_dense = tf.keras.layers.Dense(
                    units=shape[0], use_bias=False, kernel_initializer='zeros')
            if self.scale:
                self.gamma_dense = tf.keras.layers.Dense(
                    units=shape[0], use_bias=False, kernel_initializer='zeros')

    def call(self, inputs):
        """如果是条件Layer Norm，则默认以list为输入，第二个是condition
        """
        if self.conditional:
            inputs, cond = inputs
            if self.hidden_units is not None:
                cond = self.hidden_dense(cond)
            for _ in range(K.ndim(inputs) - K.ndim(cond)):
                cond = K.expand_dims(cond, 1)
            if self.center:
                beta = self.beta_dense(cond) + self.beta
            if self.scale:
                gamma = self.gamma_dense(cond) + self.gamma
        else:
            if self.center:
                beta = self.beta
            if self.scale:
                gamma = self.gamma
        outputs = inputs
        if self.center:
            mean = K.mean(outputs, axis=-1, keepdims=True)
            outputs = outputs - mean
        if self.scale:
            variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
            std = K.sqrt(variance + self.epsilon)
            outputs = outputs / std
            outputs = outputs * gamma
        if self.center:
            outputs = outputs + beta
        return outputs        

In [177]:

def E2EModel(pretrained_path, max_length, p2id):
    input_ids = tf.keras.layers.Input((max_length,), dtype=tf.int32, name='input_ids')
    token_type_ids = tf.keras.layers.Input((max_length,), dtype=tf.int32, name='total_segment_ids')
    attention_mask = tf.keras.layers.Input((max_length,), dtype=tf.int32, name='attention_mask')
    subject_ids = tf.keras.layers.Input((2,), dtype=tf.int32, name='subject_ids')

    bert_model = TFBertModel.from_pretrained(pretrained_path, output_hidden_states=True)
    outputs = bert_model(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
    last_hidden_state, pooler_output, hidden_states = outputs[:3]
    layer_1 = hidden_states[-1]
    
    subject_preds = tf.keras.layers.Dense(units=2, activation='sigmoid',)(layer_1)
    subject_preds = tf.keras.layers.Lambda(lambda x: x**2)(subject_preds)
    subject_model = tf.keras.models.Model(inputs=[input_ids, token_type_ids, attention_mask], outputs=subject_preds)

    subject = extract_subject([layer_1, subject_ids])
    Normalization_1 = LayerNormalization(conditional=True)([layer_1, subject])
    output = tf.keras.layers.Dense(units=len(p2d) * 2, activation='sigmoid' )(output)
    output = tf.keras.layers.Lambda(lambda x: x**4)(output)
    object_preds = tf.reshape((-1, len(p2id), 2))(output)
    object_model = tf.keras.models.Model(input=[input_ids, token_type_ids, attention_mask, subject_ids], outputs=object_preds)

    train_model = tf.keras.models.Model(input=[input_ids, token_type_ids, attention_mask, subject_ids], outputs= [subject_preds, object_preds])


In [None]:
pid = os.getpid()
!kill -9 $pid

'kill' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���


In [None]:

current=np.array([
        [0,7,1,2,2],
        [1,7,3,4,3],
        [2,7,5,6,6],
        [3,7,7,8,7],
        [4,7,7,8,7],
        [5,7,7,8,7]
])

current =tf.constant(current)
print(current)
points_e = tf.expand_dims(current, axis=1)
points_e 

tf.Tensor(
[[0 7 1 2 2]
 [1 7 3 4 3]
 [2 7 5 6 6]
 [3 7 7 8 7]
 [4 7 7 8 7]
 [5 7 7 8 7]], shape=(6, 5), dtype=int32)


<tf.Tensor: shape=(6, 1, 5), dtype=int32, numpy=
array([[[0, 7, 1, 2, 2]],

       [[1, 7, 3, 4, 3]],

       [[2, 7, 5, 6, 6]],

       [[3, 7, 7, 8, 7]],

       [[4, 7, 7, 8, 7]],

       [[5, 7, 7, 8, 7]]])>