In [2]:
import numpy as np
import pandas as pd
from ast import literal_eval
from transformers import BertTokenizer,  BertConfig, TFBertModel
import tensorflow as tf
from tensorflow.keras import backend as K

## 数据加载

In [3]:
def load_data(path):
    text_list = []
    spo_list = []
    with open(path, encoding='utf-8') as json_file:
        for line in json_file:
            text_list.append(literal_eval(line)['text'])
            spo_list.append(literal_eval(line)['spo_list'])
    return text_list, spo_list
    
path='../data/百度关系抽取数据集/train_data.json'
# path='../data/百度关系抽取数据集/experiment.json'
text_list, spo_list = load_data(path)
print(text_list[0], spo_list[0])
val_path = '../data/百度关系抽取数据集/dev_data.json'
# val_path = '../data/百度关系抽取数据集/experiment.json'
val_text_list, val_spo_list = load_data(path)

如何演好自己的角色，请读《演员自我修养》《喜剧之王》周星驰崛起于穷困潦倒之中的独门秘笈 [{'predicate': '主演', 'object_type': '人物', 'subject_type': '影视作品', 'object': '周星驰', 'subject': '喜剧之王'}]


In [4]:
len(text_list)

173108

In [5]:
def load_predicate(path):
    with open(path,'r', encoding='utf-8')  as f:
        predicate_list = [literal_eval(i)['predicate'] for i in f]
    p2id = {}
    id2p = {}
    data = list(set(predicate_list))
    for i in range(len(data)):
        p2id[data[i]] = i
        id2p[i] = data[i]
    return p2id, id2p
    
path = '../data/百度关系抽取数据集/all_50_schemas'
p2id, id2p = load_predicate(path)
print(p2id)
print(id2p)


{'字': 0, '号': 1, '占地面积': 2, '出生日期': 3, '祖籍': 4, '目': 5, '邮政编码': 6, '所在城市': 7, '气候': 8, '作者': 9, '毕业院校': 10, '身高': 11, '制片人': 12, '民族': 13, '董事长': 14, '嘉宾': 15, '导演': 16, '专业代码': 17, '主持人': 18, '作词': 19, '朝代': 20, '编剧': 21, '连载网站': 22, '人口数量': 23, '修业年限': 24, '所属专辑': 25, '海拔': 26, '面积': 27, '出品公司': 28, '作曲': 29, '创始人': 30, '改编自': 31, '官方语言': 32, '简称': 33, '国籍': 34, '丈夫': 35, '注册资本': 36, '妻子': 37, '父亲': 38, '出版社': 39, '成立日期': 40, '首都': 41, '主演': 42, '上映时间': 43, '总部地点': 44, '出生地': 45, '歌手': 46, '主角': 47, '母亲': 48}
{0: '字', 1: '号', 2: '占地面积', 3: '出生日期', 4: '祖籍', 5: '目', 6: '邮政编码', 7: '所在城市', 8: '气候', 9: '作者', 10: '毕业院校', 11: '身高', 12: '制片人', 13: '民族', 14: '董事长', 15: '嘉宾', 16: '导演', 17: '专业代码', 18: '主持人', 19: '作词', 20: '朝代', 21: '编剧', 22: '连载网站', 23: '人口数量', 24: '修业年限', 25: '所属专辑', 26: '海拔', 27: '面积', 28: '出品公司', 29: '作曲', 30: '创始人', 31: '改编自', 32: '官方语言', 33: '简称', 34: '国籍', 35: '丈夫', 36: '注册资本', 37: '妻子', 38: '父亲', 39: '出版社', 40: '成立日期', 41: '首都', 42: '主演', 43: '上映时间', 44: '总部地点', 45: '出生

## 数据集处理

In [6]:
def proceed_data(text_list,spo_list,p2id,id2p,tokenizer,max_length):
    id_label = {}
    ct = len(text_list)
    input_ids = np.zeros((ct,max_length),dtype='int32')
    attention_mask = np.zeros((ct,max_length),dtype='int32')
    start_tokens = np.zeros((ct,max_length),dtype='int32')
    end_tokens = np.zeros((ct,max_length),dtype='int32')
    send_s_po = np.zeros((ct,2),dtype='int32')
    object_start_tokens = np.zeros((ct,max_length,len(p2id)),dtype='int32')
    object_end_tokens = np.zeros((ct,max_length,len(p2id)),dtype='int32')
    invalid_index = []
    for k in range(ct):
        context_k = text_list[k].lower().replace(' ','')
        enc_context = tokenizer.encode(context_k,max_length=max_length,padding='max_length',truncation=True) 
        if len(spo_list[k])==0:
            invalid_index.append(k)
            continue
        start = []
        end = []
        S_index = []
        for j in range(len(spo_list[k])):
            answers_text_k = spo_list[k][j]['subject'].lower().replace(' ','')
            chars = np.zeros((len(context_k)))
            index = context_k.find(answers_text_k)
            chars[index:index+len(answers_text_k)]=1
            offsets = []
            idx=0
            for t in enc_context[1:]:
                w = tokenizer.decode([t])
                if '#' in w and len(w)>1:
                    w = w.replace('#','')
                if w == '[UNK]':
                    w = '。'
                offsets.append((idx,idx+len(w)))
                
                idx += len(w)
            toks = []
            for i,(a,b) in enumerate(offsets):
                sm = np.sum(chars[a:b])
                if sm>0: 
                    toks.append(i) 
            input_ids[k,:len(enc_context)] = enc_context
            attention_mask[k,:len(enc_context)] = 1
            if len(toks)>0:
                start_tokens[k,toks[0]+1] = 1
                end_tokens[k,toks[-1]+1] = 1
                start.append(toks[0]+1)
                end.append(toks[-1]+1)
                S_index.append(j)
                #随机抽取可以作为负样本提高准确率（不认同）
        if len(start) > 0:
            start_np = np.array(start)
            end_np = np.array(end)
            start_ = np.random.choice(start_np)
            end_ = np.random.choice(end_np[end_np >= start_])
            send_s_po[k,0] = start_
            send_s_po[k,1] = end_
            s_index = start.index(start_)
            #随机选取object的首位，如果选取错误，则作为负样本
            if end_ == end[s_index]:
                for index in range(len(start)):
                    if start[index] == start_ and end[index] == end_:
                        object_text_k = spo_list[k][S_index[index]]['object'].lower().replace(' ','')
                        predicate = spo_list[k][S_index[index]]['predicate']
                        p_id = p2id[predicate]
                        chars = np.zeros((len(context_k)))
                        index = context_k.find(object_text_k)
                        chars[index:index+len(object_text_k)]=1
                        offsets = [] 
                        idx=0
                        for t in enc_context[1:]:
                            w = tokenizer.decode([t])
                            if '#' in w and len(w)>1:
                                w = w.replace('#','')
                            if w == '[UNK]':
                                w = '。'
                            offsets.append((idx,idx+len(w)))
                            idx += len(w)
                        toks = []
                        for i,(a,b) in enumerate(offsets):
                            sm = np.sum(chars[a:b])
                            if sm>0: 
                                toks.append(i) 
                        if len(toks)>0:
                            id_label[p_id] = predicate
                            object_start_tokens[k,toks[0]+1,p_id] = 1
                            object_end_tokens[k,toks[-1]+1,p_id] = 1
        else:
            invalid_index.append(k)
    input_ids = tf.constant(input_ids)
    attention_mask = tf.constant(attention_mask)
    start_tokens = tf.constant(start_tokens)
    end_tokens = tf.constant(end_tokens)
    send_s_po = tf.constant(send_s_po)
    object_start_tokens = tf.constant(object_start_tokens)
    object_end_tokens = tf.constant(object_end_tokens)
    # invalid_index = tf.constant(invalid_index)
    return input_ids, attention_mask, start_tokens, end_tokens, send_s_po, object_start_tokens, object_end_tokens, invalid_index, id_label

max_length = 128  
model_path = '../model_dirs/bert-base-chinese'  
tokenizer = BertTokenizer.from_pretrained(model_path)    
input_ids, attention_mask, start_tokens, end_tokens, send_s_po, object_start_tokens, object_end_tokens, invalid_index, id_label \
= proceed_data(text_list,spo_list,p2id,id2p,tokenizer,max_length)

In [7]:
val_inputs = tokenizer(text_list, max_length=max_length, padding='max_length', truncation=True, return_tensors='tf') 
val_input_ids, val_attention_mask = val_inputs['input_ids'], val_inputs['attention_mask']


In [8]:
class LayerNormalization(tf.keras.layers.Layer):
    """(Conditional) Layer Normalization
    hidden_*系列参数仅为有条件输入时(conditional=True)使用
    """
    def __init__(
        self,
        center=True,
        scale=True,
        epsilon=None,
        conditional=False,
        hidden_units=None,
        hidden_activation='linear',
        hidden_initializer='glorot_uniform',
        **kwargs):
        super(LayerNormalization, self).__init__(**kwargs)
        self.center = center
        self.scale = scale
        self.conditional = conditional
        self.hidden_units = hidden_units
        self.hidden_activation = tf.keras.activations.get(hidden_activation)
        self.hidden_initializer = tf.keras.initializers.get(hidden_initializer)
        self.epsilon = epsilon or 1e-12
        
    def compute_mask(self, inputs, mask=None):
        if self.conditional:
            masks = mask if mask is not None else []
            masks = [m[None] for m in masks if m is not None]
            if len(masks) == 0:
                return None
            else:
                return K.all(K.concatenate(masks, axis=0), axis=0)
        else:
            return mask
        
    def build(self, input_shape):
        super(LayerNormalization, self).build(input_shape)
        if self.conditional:
            shape = (input_shape[0][-1],)
        else:
            shape = (input_shape[-1],)
        if self.center:
            self.beta = self.add_weight(
                shape=shape, initializer='zeros', name='beta')
        if self.scale:
            self.gamma = self.add_weight(
                shape=shape, initializer='ones', name='gamma')
        if self.conditional:
            if self.hidden_units is not None:
                self.hidden_dense = tf.keras.layers.Dense(
                    units=self.hidden_units,
                    activation=self.hidden_activation,
                    use_bias=False,
                    kernel_initializer=self.hidden_initializer)
            if self.center:
                self.beta_dense = tf.keras.layers.Dense(
                    units=shape[0], use_bias=False, kernel_initializer='zeros')
            if self.scale:
                self.gamma_dense = tf.keras.layers.Dense(
                    units=shape[0], use_bias=False, kernel_initializer='zeros')

    def call(self, inputs):
        """如果是条件Layer Norm，则默认以list为输入，第二个是condition
        """
        if self.conditional:
            inputs, cond = inputs
            if self.hidden_units is not None:
                cond = self.hidden_dense(cond)
            for _ in range(K.ndim(inputs) - K.ndim(cond)):
                cond = K.expand_dims(cond, 1)
            if self.center:
                beta = self.beta_dense(cond) + self.beta
            if self.scale:
                gamma = self.gamma_dense(cond) + self.gamma
        else:
            if self.center:
                beta = self.beta
            if self.scale:
                gamma = self.gamma
        outputs = inputs
        if self.center:
            mean = K.mean(outputs, axis=-1, keepdims=True)
            outputs = outputs - mean
        if self.scale:
            variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
            std = K.sqrt(variance + self.epsilon)
            outputs = outputs / std
            outputs = outputs * gamma
        if self.center:
            outputs = outputs + beta
        return outputs
        


In [9]:
def new_loss(true,pred):
    true = tf.cast(true,tf.float32)
    loss = K.sum(K.binary_crossentropy(true, pred))
    return loss

In [10]:
def extract_subject(inputs):
    """根据subject_ids从output中取出subject的向量表征
    """
    output, subject_ids = inputs
    start = tf.gather(output,subject_ids[:,0],axis=1,batch_dims=0)
    end = tf.gather(output,subject_ids[:,1],axis=1,batch_dims=0)
    subject = tf.keras.layers.Concatenate(axis=2)([start, end])
    return subject[:,0]
'''
   output.shape = (None,128,768)
   subjudec_ids.shape = (None,2)
   start.shape = (None,None,768)
   subject.shape = (None,None,1536)
   subject[:,0].shape = (None,1536)
   这一部分给出各个变量的shape应该一目了然
'''
   
def build_model_2(pretrained_path, MAX_LEN, p2id):
    ids = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    att = tf.keras.layers.Input((MAX_LEN,), dtype=tf.int32)
    s_po_index =  tf.keras.layers.Input((2,), dtype=tf.int32)
    
    bert_model = TFBertModel.from_pretrained(pretrained_path, output_hidden_states=True)
    outputs = bert_model(ids, attention_mask=att)
    x, _, hidden_states  = outputs[:3]
    layer_1 = hidden_states[-1]
    start_logits = tf.keras.layers.Dense(1,activation = 'sigmoid')(layer_1)
    start_logits = tf.keras.layers.Lambda(lambda x: x**2)(start_logits)
    
    end_logits = tf.keras.layers.Dense(1,activation = 'sigmoid')(layer_1)
    end_logits = tf.keras.layers.Lambda(lambda x: x**2)(end_logits)
    
    subject_1 = extract_subject([layer_1,s_po_index])
    Normalization_1 = LayerNormalization(conditional=True)([layer_1, subject_1])
    
    op_out_put_start = tf.keras.layers.Dense(len(p2id),activation = 'sigmoid')(Normalization_1)
    op_out_put_start = tf.keras.layers.Lambda(lambda x: x**4)(op_out_put_start)
    
    op_out_put_end = tf.keras.layers.Dense(len(p2id),activation = 'sigmoid')(Normalization_1)
    op_out_put_end = tf.keras.layers.Lambda(lambda x: x**4)(op_out_put_end)
    
    model = tf.keras.models.Model(inputs=[ids, att, s_po_index], outputs=[start_logits, end_logits, op_out_put_start, op_out_put_end])
    model_2 = tf.keras.models.Model(inputs=[ids, att], outputs=[start_logits,end_logits])
    model_3 = tf.keras.models.Model(inputs=[ids, att, s_po_index], outputs=[op_out_put_start, op_out_put_end])
    return model, model_2, model_3


In [11]:
class Metrics(tf.keras.callbacks.Callback):
    def __init__(self, model_2, model_3, id2tag, val_spo_list, val_input_ids, val_attention_mask, tokenizer):
        super(Metrics, self).__init__()
        self.model_2 = model_2
        self.model_3 = model_3
        self.id2tag = id2tag
        self.val_input_ids = val_input_ids
        self.val_attention_mask = val_attention_mask
        self.val_spo_list = val_spo_list
        self.tokenizer = tokenizer
        
    def on_train_begin(self, logs=None):
        self.val_f1s = []
        self.best_val_f1 = 0
    
    def get_same_element_index(self,ob_list):
        return [i for (i, v) in enumerate(ob_list) if v == 1]
    
    def evaluate_data(self):
        question=[]
        answer=[]
        y1 = self.model_2.predict([self.val_input_ids,self.val_attention_mask])
        for i in range(len(y1[0])):
            for z in self.val_spo_list[i]:
                question.append((z['subject'][0],z['subject'][-1],z['predicate'],z['object'][0],z['object'][-1]))
            x_ = [self.tokenizer.decode([t]) for t in self.val_input_ids[i]]
            x1 = np.array(y1[0][i]>0.5,dtype='int32')
            x2 = np.array(y1[1][i]>0.5,dtype='int32')
            union = x1 + x2
            index_list = self.get_same_element_index(list(union))
            start = 0
            S_list=[]
            while start+1 < len(index_list):
                S_list.append((index_list[start], index_list[start+1]+1))
                start += 2
            for os_s,os_e in S_list:
                s_e = ''.join(x_[os_s:os_e])
                she = tf.constant(np.array([[os_s,os_e]]))
                Y2 = self.model_3.predict([self.val_input_ids[i:i+1], self.val_attention_mask[i:i+1], s_e]) 
                for m in range(len(self.id2tag)):
                    x3 = np.array(Y2[0][0][:,m]>0.4,dtype='int32')
                    x4 = np.array(Y2[1][0][:,m]>0.4,dtype='int32')
                    if sum(x3)>0 and sum(x4)>0:
                        predict = self.id2tag[m]
                        union = x3 + x4
                        index_list = self.get_same_element_index(list(union))
                        start = 0
                        P_list=[]
                        while start+1 < len(index_list):
                            P_list.append((index_list[start],index_list[start+1]+1))
                            start += 2
                        for os_s,os_e in P_list:
                            if os_e>=os_s:
                                P = ''.join(x_[os_s:os_e])
                                answer.append((S[0],S[-1],predict,P[0],P[-1]))
        Q = set(question)
        S = set(answer)
        f1 = 2*len(Q&S)/(len(Q)+len(S))
        return f1
    
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        _val_f1 = self.evaluate_data()
        self.val_f1s.append(_val_f1)
        logs['val_f1'] = _val_f1
        if _val_f1 > self.best_val_f1:
            self.model.save_weights('..model_dirs/fine_tune_relation_extraction'.format(_val_f1))
            self.best_val_f1 = _val_f1
            print("best f1: {}".format(self.best_val_f1))
        else:
            print("val f1: {}, but not the best f1".format(_val_f1))
        return 

In [12]:
pretrained_path = '../model_dirs/bert-base-chinese'
MAX_LEN = 128
# config = BertConfig.from_json_file('../model_dirs/bert-base-chinese/config.json')
# TFBertModel.from_pretrained(pretrained_path, config=config)
K.clear_session()
model,model_2,model_3 = build_model_2(pretrained_path,  MAX_LEN, p2id)
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-5)
model.compile(loss={'lambda': new_loss,
                'lambda_1': new_loss,
                'lambda_2': new_loss,
                'lambda_3': new_loss},optimizer=optimizer)
model.fit([input_ids, attention_mask, send_s_po],\
          [start_tokens,end_tokens,object_start_tokens,object_end_tokens], \
        epochs=3, batch_size=8, callbacks=[Metrics(model_2, model_3 ,id2p, val_spo_list,val_input_ids,val_attention_mask,tokenizer)])


Some layers from the model checkpoint at ../model_dirs/bert-base-chinese were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at ../model_dirs/bert-base-chinese.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Please report this to the TensorFlow team. When filing the bug, set the verbosity to 10 (on Linux, `export AUTOGRAPH_VERBOSITY=10`) and attach the full output.
Cause: module, class, method, function, traceback, frame, or code object was expected, got cython_function_or_method
Epoch 1/3
 2161/21639 [=>............................] - ETA: 3:20:38 - loss: 650.0965 - lambda_loss: 35.7846 - lambda_1_loss: 43.1129 - lambda_2_loss: 286.4878 - lambda_3_loss: 284.7110

In [None]:
import os 
pid = os.getpid()
!kill -9 $pid

NameError: name 'os' is not defined