In [149]:
import random
from random import choice
import numpy as np
import pandas as pd
from transformers import BertTokenizer,  BertConfig, TFBertModel
import tensorflow as tf
from tensorflow.keras import backend as K

# 数据集整理
## 读取数据

In [150]:
def json_to_df(path, nrows=False):
    if nrows:
        df = pd.read_json(path, nrows=1000, lines=True)
    else:
        df = pd.read_json(path, lines=True)
    df = df[['text', 'spo_list']]
    return df

In [151]:
%%time
train_path ='../data/百度关系抽取数据集/train_data.json'
# train_path = '../data/百度关系抽取数据集/experiment.json']

train_data = json_to_df(train_path, nrows=1000)
print(f'Train data size: {train_data.shape}') #

dev_path = '../data/百度关系抽取数据集/dev_data.json'
# dev_path =  '../data/百度关系抽取数据集/experiment.json'
dev_data = json_to_df(dev_path, nrows=1000)
print(f'Validation data size: {dev_data.shape}') #

Train data size: (1000, 2)
Validation data size: (1000, 2)
Wall time: 115 ms


## 清洗数据

In [152]:
def clean_spo(spo_list):
    for spo in spo_list:
        spo['predicate'] = spo['predicate'].lower()
        spo['subject'] = spo['subject'].lower()
        spo['object'] = spo['object'].lower()
    return spo_list

In [153]:
def data_clean(df):
    df['text'] = df['text'].str.lower()
    df['spo_list'] = df['spo_list'].apply(clean_spo)
    return df

In [154]:
train_data = data_clean(train_data)
dev_data = data_clean(train_data)
train_text = train_data['text'].to_list()
train_spo = train_data['spo_list'].to_list()
dev_text = dev_data['text'].to_list()
dev_spo = dev_data['spo_list'].to_list()

In [155]:
%%time
def read_schemads(path):
    predicate_data = pd.read_json(schemads_path, lines=True)
    id2p = predicate_data['predicate'].drop_duplicates().reset_index(drop=True).to_dict()
    p2id = dict(zip(id2p.values(), id2p.keys()))
    print(f'length of p2id :{len(p2id)}')#
    print(f'random p2id sample:{random.sample(p2id.items(), 5)}')#
    return id2p, p2id

Wall time: 0 ns


In [156]:
schemads_path = '../data/百度关系抽取数据集/all_50_schemas'
id2p, p2id = read_schemads(schemads_path)

length of p2id :49
random p2id sample:[('上映时间', 7), ('成立日期', 35), ('主角', 33), ('父亲', 1), ('简称', 6)]


## 数据集处理

In [167]:
model_path = '../model_dirs/bert-base-chinese'  
tokenizer = BertTokenizer.from_pretrained(model_path)
max_length = 256
print(f"text ---\n {train_data.loc[47:48,'text']}")
inputs = tokenizer(train_data.loc[47:48,'text'].to_list(), max_length=max_length, padding='max_length', truncation=True)
print('inputs keys --\n', inputs.keys())
print(f"input_ids --\n {inputs['input_ids']}")
tokens = tokenizer.decode(inputs['input_ids'][0])
print('tokens --\n', tokens)
print(f"spo_list ------\n {train_data.loc[47,'spo_list']}")
text_len = len(tokens)
text_len

text ---
 47                 《头文字d4》是2005年接力出版社出版的图书，作者是重野秀一
48    苏州硕诺尔自动化设备有限公司于2014年11月14日在苏州市吴中区市场监督管理局登记成立
Name: text, dtype: object
inputs keys --
 dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
input_ids --
 [[101, 517, 1928, 3152, 2099, 146, 8159, 518, 3221, 8232, 2399, 2970, 1213, 1139, 4276, 4852, 1139, 4276, 4638, 1745, 741, 8024, 868, 5442, 3221, 7028, 7029, 4899, 671, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

1425

In [11]:
def find_head_idx(pattern, sequence):
    """从sequence中寻找子串pattern
    如果找到，返回第一个下标；否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1

In [141]:
def data_process(df, p2id, tokenizer, max_length):
    total_token_ids, total_segment_ids = [], []
    total_subject_labels, total_subject_ids, total_object_labels = [], [], []
    for idx in df.index:
        row = df.loc[idx]
        inputs = tokenizer(row['text'], max_length=max_length, padding='max_length', truncation=True)
        token_ids, segment_ids, attention_mask = inputs['input_ids'], inputs['token_type_ids'], inputs['attention_mask']
        # 实体关系token id 位置字典
        s2ro_map = {}
        for sop_n, spo in enumerate(row['spo_list']):
            sub_ids = tokenizer.encode(spo['subject'], max_length=max_length)[1:-1]
            p_id = p2id[spo['predicate']]
            obj_ids = tokenizer.encode(spo['object'], max_length=max_length)[1:-1]
            # 查找subject 和 object对应的token id 起始索引
            sub_head_idx = find_head_idx(sub_ids, inputs['input_ids'])
            obj_head_idx = find_head_idx(obj_ids, inputs['input_ids'])
            if sub_head_idx != -1 and obj_head_idx != -1:
                # 获取subject 起始位置的索引和结束位置索引的元组
                sub = (sub_head_idx, sub_head_idx + len(sub_ids) - 1)
                # 获取object 起始位置的索引和结束位置索引元组以及与关系标签id的元组对
                obj = (obj_head_idx, obj_head_idx + len(obj_ids) - 1, p_id)
                # print('---- sub -----', sub)#    
                if sub not in s2ro_map:
                    s2ro_map[sub] = []
                    # [(28, 29,)，7), (25, 26,)，8)......]   
                s2ro_map[sub].append(obj)
            else:
                print('--idx--', idx, '--text--', row['text'])
                print('--sop_n--', sop_n, '--spo--', spo, '\n')
            # print('-----------', s2ro_map)#                
            # {(22,23):[((28, 29,), 7), ((25, 26,), 8)]}  

        if s2ro_map:
        # subject标签
            subject_labels = np.zeros((len(token_ids), 2))
            for s in s2ro_map:
                #sub_head
                subject_labels[s[0], 0] = 1
                #sub_tail
                subject_labels[s[1], 1] = 1
            # 随机选一个subject
            sub_head, sub_tail = choice(list(s2ro_map.keys()))
            subject_ids = (sub_head, sub_tail)
            # sub_head, sub_tail = np.array(list(s2ro_map.keys())).T
            # sub_head = np.random.choice(sub_head)
            # sub_tail = np.random.choice(sub_tail[sub_tail >= sub_head])
            # 对应的object标签
            object_labels = np.zeros((len(token_ids), len(p2id), 2))
            for ro in s2ro_map.get((sub_head, sub_tail), []):
                # obj_head
                object_labels[ro[0], ro[2], 0] = 1
                # obj_tail
                object_labels[ro[1], ro[2], 1] = 1
            # 所有数据汇总
            total_token_ids.append(token_ids)
            total_segment_ids.append(segment_ids)
            total_subject_labels.append(subject_labels)
            total_subject_ids.append(subject_ids)
            total_object_labels.append(object_labels)                
    return total_token_ids, total_segment_ids, total_subject_labels, total_subject_ids, total_object_labels   

In [171]:
model_path = '../model_dirs/bert-base-chinese'  
tokenizer = BertTokenizer.from_pretrained(model_path)
max_length = 256
total_token_ids, total_segment_ids, total_subject_labels, total_subject_ids, total_object_labels  = data_process(train_data, p2id, tokenizer, max_length)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [172]:
len(total_token_ids)

1000

In [159]:
def extract_subject(inputs):
    """根据subject_ids从output中取出subject的向量表征
    """
    output, subject_ids = inputs
    start = batch_gather(output, subject_ids[:, :1])
    end = batch_gather(output, subject_ids[:, 1:])
    subject = K.concatenate([start, end], 2)
    return subject[:, 0]

In [160]:
def extract_subject(inputs):
    """根据subject_ids从output中取出subject的向量表征
    """
    output, subject_ids = inputs
    start = tf.gather(output, subject_ids[:,0],axis=1,dims=0)
    end = tf.gather(output, subject_ids[:,1],axis=1,dims=0)
    subject = tf.keras.layers.Concatenate(axis=2)([start, end])
    return subject[:,0]

In [161]:
text = train_data.loc[971,'text']
print(len(text), text)
sub = train_data.loc[971,'spo_list'][0]['subject']
obj = train_data.loc[971,'spo_list'][0]['object']
print(sub, obj)
max_length = 256
s_ids = tokenizer.encode(sub, max_length=max_length, truncation=True)[1:-1]
o_ids = tokenizer.encode(obj, max_length=max_length, truncation=True)[1:-1]
text_ids = tokenizer.encode(text, max_length=max_length, padding='max_length', truncation=True)
print(text_ids)
print(s_ids)
find_head_idx(s_ids, text_ids), find_head_idx(o_ids, text_ids)


175 总之单曲循环停不下来，各种脑洞大开7.《前世滇南行》演唱：奇然作词：小鱼萝莉（我很喜欢鱼妈的词）作曲：加菲老猫编曲：加菲老猫混缩：mr.鱼这首歌让我特别想走一趟滇南，歌词里缓缓诉说了一个透露着悲伤的故事8.《泸沽寻梦》作词:pavane a·g orison演唱/作曲:银临编曲/混音:灰原穷和声:银临、灰原穷银临专辑《腐草为萤》中收录的一首民乐歌曲
泸沽寻梦 银临
[101, 2600, 722, 1296, 3289, 2542, 4384, 977, 679, 678, 3341, 8024, 1392, 4905, 5554, 3822, 1920, 2458, 128, 119, 517, 1184, 686, 3995, 1298, 6121, 518, 4028, 1548, 8038, 1936, 4197, 868, 6404, 8038, 2207, 7824, 5850, 5799, 8020, 2769, 2523, 1599, 3614, 7824, 1968, 4638, 6404, 8021, 868, 3289, 8038, 1217, 5838, 5439, 4344, 5356, 3289, 8038, 1217, 5838, 5439, 4344, 3921, 5367, 8038, 8912, 119, 7824, 6821, 7674, 3625, 6375, 2769, 4294, 1166, 2682, 6624, 671, 6636, 3995, 1298, 8024, 3625, 6404, 7027, 5353, 5353, 6401, 6432, 749, 671, 702, 6851, 7463, 4708, 2650, 839, 4638, 3125, 752, 129, 119, 517, 3810, 3782, 2192, 3457, 518, 868, 6404, 131, 9519, 13045, 8154, 143, 185, 149, 8549, 10316, 4028, 1548, 120, 868, 3289, 131, 7213, 707, 5356, 3289, 120, 3921, 7509, 131, 4129, 1333, 4956, 1469, 1898, 131, 7213

(104, 126)

In [162]:
%%time
val_inputs = tokenizer(dev_text, max_length=max_length, padding='max_length', truncation=True, return_tensors='tf') 
val_input_ids, val_attention_mask = val_inputs['input_ids'], val_inputs['attention_mask']

Wall time: 2.26 s


In [163]:
class LayerNormalization(tf.keras.layers.Layer):
    """(Conditional) Layer Normalization
    hidden_*系列参数仅为有条件输入时(conditional=True)使用
    """
    def __init__(
        self,
        center=True,
        scale=True,
        epsilon=None,
        conditional=False,
        hidden_units=None,
        hidden_activation='linear',
        hidden_initializer='glorot_uniform',
        **kwargs):
        super(LayerNormalization, self).__init__(**kwargs)
        self.center = center
        self.scale = scale
        self.conditional = conditional
        self.hidden_units = hidden_units
        self.hidden_activation = tf.keras.activations.get(hidden_activation)
        self.hidden_initializer = tf.keras.initializers.get(hidden_initializer)
        self.epsilon = epsilon or 1e-12
        
    def compute_mask(self, inputs, mask=None):
        if self.conditional:
            masks = mask if mask is not None else []
            masks = [m[None] for m in masks if m is not None]
            if len(masks) == 0:
                return None
            else:
                return K.all(K.concatenate(masks, axis=0), axis=0)
        else:
            return mask
        
    def build(self, input_shape):
        super(LayerNormalization, self).build(input_shape)
        if self.conditional:
            shape = (input_shape[0][-1],)
        else:
            shape = (input_shape[-1],)
        if self.center:
            self.beta = self.add_weight(
                shape=shape, initializer='zeros', name='beta')
        if self.scale:
            self.gamma = self.add_weight(
                shape=shape, initializer='ones', name='gamma')
        if self.conditional:
            if self.hidden_units is not None:
                self.hidden_dense = tf.keras.layers.Dense(
                    units=self.hidden_units,
                    activation=self.hidden_activation,
                    use_bias=False,
                    kernel_initializer=self.hidden_initializer)
            if self.center:
                self.beta_dense = tf.keras.layers.Dense(
                    units=shape[0], use_bias=False, kernel_initializer='zeros')
            if self.scale:
                self.gamma_dense = tf.keras.layers.Dense(
                    units=shape[0], use_bias=False, kernel_initializer='zeros')

    def call(self, inputs):
        """如果是条件Layer Norm，则默认以list为输入，第二个是condition
        """
        if self.conditional:
            inputs, cond = inputs
            if self.hidden_units is not None:
                cond = self.hidden_dense(cond)
            for _ in range(K.ndim(inputs) - K.ndim(cond)):
                cond = K.expand_dims(cond, 1)
            if self.center:
                beta = self.beta_dense(cond) + self.beta
            if self.scale:
                gamma = self.gamma_dense(cond) + self.gamma
        else:
            if self.center:
                beta = self.beta
            if self.scale:
                gamma = self.gamma
        outputs = inputs
        if self.center:
            mean = K.mean(outputs, axis=-1, keepdims=True)
            outputs = outputs - mean
        if self.scale:
            variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
            std = K.sqrt(variance + self.epsilon)
            outputs = outputs / std
            outputs = outputs * gamma
        if self.center:
            outputs = outputs + beta
        return outputs
        


In [8]:
def new_loss(true,pred):
    true = tf.cast(true,tf.float32)
    loss = K.sum(K.binary_crossentropy(true, pred))
    return loss

In [9]:
def extract_subject(inputs):
    """根据subject_ids从output中取出subject的向量表征
    """
    output, subject_ids = inputs
    start = tf.gather(output,subject_ids[:,0],axis=1,dims=0)
    end = tf.gather(output,subject_ids[:,1],axis=1,dims=0)
    subject = tf.keras.layers.Concatenate(axis=2)([start, end])
    return subject[:,0]
'''
   output.shape = (None,128,768)
   subjudec_ids.shape = (None,2)
   start.shape = (None,None,768)
   subject.shape = (None,None,1536)
   subject[:,0].shape = (None,1536)
   这一部分给出各个变量的shape应该一目了然
'''
   
def build_model_2(pretrained_path, MAX_LEN, p2id):
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    s_po_index =  tf.keras.layers.Input((2,), dtype=tf.int32)
    
    bert_model = TFBertModel.from_pretrained(pretrained_path, output_hidden_states=True)
    outputs = bert_model(ids, attention_mask=att)
    x, pooler_output, hidden_states  = outputs[:3]
    layer_1 = hidden_states[-1]
    
    start_logits = tf.keras.layers.Dense(1,activation = 'sigmoid')(layer_1)
    start_logits = tf.keras.layers.Lambda(lambda x: x**2)(start_logits)
    
    end_logits = tf.keras.layers.Dense(1,activation = 'sigmoid')(layer_1)
    end_logits = tf.keras.layers.Lambda(lambda x: x**2)(end_logits)
    
    subject_1 = extract_subject([layer_1,s_po_index])
    Normalization_1 = LayerNormalization(conditional=True)([layer_1, subject_1])
    
    op_out_put_start = tf.keras.layers.Dense(len(p2id),activation = 'sigmoid')(Normalization_1)
    op_out_put_start = tf.keras.layers.Lambda(lambda x: x**4)(op_out_put_start)
    
    op_out_put_end = tf.keras.layers.Dense(len(p2id),activation = 'sigmoid')(Normalization_1)
    op_out_put_end = tf.keras.layers.Lambda(lambda x: x**4)(op_out_put_end)
    
    model = tf.keras.models.Model(inputs=[ids, att, s_po_index], outputs=[start_logits, end_logits, op_out_put_start, op_out_put_end])
    model_2 = tf.keras.models.Model(inputs=[ids, att], outputs=[start_logits,end_logits])
    model_3 = tf.keras.models.Model(inputs=[ids, att, s_po_index], outputs=[op_out_put_start, op_out_put_end])
    return model, model_2, model_3


In [10]:
class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, model_2, model_3, id2tag, val_spo_list, val_input_ids, val_attention_mask, tokenizer):
        super(Metrics, self).__init__()
        self.model_2 = model_2
        self.model_3 = model_3
        self.id2tag = id2tag
        self.val_input_ids = val_input_ids
        self.val_attention_mask = val_attention_mask
        self.val_spo_list = val_spo_list
        self.tokenizer = tokenizer
        
    def on_train_begin(self, logs=None):
        self.val_f1s = []
        self.best_val_f1 = 0
    
    def get_same_element_index(self,ob_list):
        return [i for (i, v) in enumerate(ob_list) if v == 1]
    
    def evaluate_data(self):
        question=[]
        answer=[]
        y1 = self.model_2.predict([self.val_input_ids,self.val_attention_mask])
        for i in range(len(y1[0])):
            for z in self.val_spo_list[i]:
                question.append((z['subject'][0],z['subject'][-1],z['predicate'],z['object'][0],z['object'][-1]))
            x_ = [self.tokenizer.decode([t]) for t in self.val_input_ids[i]]
            x1 = np.array(y1[0][i]>0.5,dtype='int32')
            x2 = np.array(y1[1][i]>0.5,dtype='int32')
            union = x1 + x2
            index_list = self.get_same_element_index(list(union))
            start = 0
            S_list=[]
            while start+1 < len(index_list):
                S_list.append((index_list[start], index_list[start+1]+1))
                start += 2
            for os_s,os_e in S_list:
                S = ''.join(x_[os_s:os_e])
                s_e = tf.constant(np.array([[os_s,os_e]]))
                Y2 = self.model_3.predict([self.val_input_ids[i:i+1], self.val_attention_mask[i:i+1], s_e]) 
                for m in range(len(self.id2tag)):
                    x3 = np.array(Y2[0][0][:,m]>0.4,dtype='int32')
                    x4 = np.array(Y2[1][0][:,m]>0.4,dtype='int32')
                    if sum(x3)>0 and sum(x4)>0:
                        predict = self.id2tag[m]
                        union = x3 + x4
                        index_list = self.get_same_element_index(list(union))
                        start = 0
                        P_list=[]
                        while start+1 < len(index_list):
                            P_list.append((index_list[start],index_list[start+1]+1))
                            start += 2
                        for os_s,os_e in P_list:
                            if os_e>=os_s:
                                P = ''.join(x_[os_s:os_e])
                                answer.append((S[0],S[-1],predict,P[0],P[-1]))
        Q = set(question)
        S = set(answer)
        f1 = 2*len(Q&S)/(len(Q)+len(S))
        return f1
    
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        _val_f1 = self.evaluate_data()
        _val_f1 = round(_val_f1, 5)
        self.val_f1s.append(_val_f1)
        logs['val_f1'] = _val_f1
        print(f'f1:{_val_f1}')
        if _val_f1 > self.best_val_f1:
            # self.model.save_weights(f'../model_dirs/fine_tune_relation_extraction/{_val_f1}_tf_model.h5')
            self.best_val_f1 = _val_f1
            print("best f1: {}".format(self.best_val_f1))
        else:
            # self.model.save_weights(f'../model_dirs/fine_tune_relation_extraction/{_val_f1}_tf_model.h5')
            print("val f1: {}, but not the best f1".format(_val_f1))
        return 

In [11]:
MAX_LEN = 128
# config = BertConfig.from_json_file('../model_dirs/bert-base-chinese/config.json')
# TFBertModel.from_pretrained(pretrained_path, config=config)
K.clear_session()
model,model_2,model_3 = build_model_2(model_path,  MAX_LEN, p2id)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(loss={'lambda': new_loss,
                'lambda_1': new_loss,
                'lambda_2': new_loss,
                'lambda_3': new_loss},optimizer=optimizer)
model.fit([input_ids, attention_mask, send_s_po],\
          [start_tokens,end_tokens,object_start_tokens,object_end_tokens], \
        epochs=3, size=8, callbacks=[Metrics(model_2, model_3 ,id2p, val_spo_list,val_input_ids,val_attention_mask,tokenizer)])
model.save_weights(f'../model_dirs/fine_tune_relation_extraction/_tf_model.h5')


Some layers from the model checkpoint at ../model_dirs/bert-base-chinese were not used when initializing TFBertModel: ['nsp___cls', 'mlm___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at ../model_dirs/bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and

KeyboardInterrupt: 

In [13]:
import os 
pid = os.getpid()
!kill -9 $pid