# <center>基于Bert的高细粒度命名实体识别模型</center>

## Part I. 模型构建与数据集加载

### 导入相关库

In [1]:
import torch

import torch.utils.data as torchData

import time

import numpy as np

from loguru import logger

import json


### 加载bert模型

In [2]:
import pytorch_pretrained_bert as bert

bert_model_dir = "../bert_model/bert-chinese/"

tokenizer = bert.BertTokenizer.from_pretrained(bert_model_dir)

bert = bert.BertModel.from_pretrained(bert_model_dir )


### 训练数据预处理

In [3]:
root_dir = "D:\\nlpCorpus\\CLUE\\dataset\\NER\\"

for subset in ["train","dev"]:

    logger.info("处理 : ",subset)
    
    seqs = []
    
    labels = []
    
    spans = []
    
    with open(root_dir + subset + ".json","r",encoding = "utf-8") as f :

        data = []

        for line in f.readlines():

            line = json.loads(line)

            text = line["text"]
            
            text_labelseq = ["O" for i in range(len(text))]
            
            text_labels = line["label"]
            
            seq_span = []
            
            for NR_label in text_labels:
                
                seq_spans = list(text_labels[NR_label].values())[0]
                
                for span in seq_spans:
                    
                    seq_span.append([span,NR_label])
                    
                    for j in range(span[0],span[1] + 1):
                   
                        if j == span[0]:
                        
                            text_labelseq[j] = "B-"+ NR_label
                            
                        else:
                            
                            text_labelseq[j] = "I-"+ NR_label

            seqs.append(text)
            
            labels.append(text_labelseq)
            
            spans.append(seq_span)
    print("个数:",len(seqs))
    
    print("序列样例:"+str(seqs[0]))
    
    print("标签样例:"+str(labels[0]))
    
    print("span样例:"+str(spans[0]))
    with open(subset + "_seqs.json",'w',encoding = "utf-8") as f:

        f.write(json.dumps(seqs))

    with open(subset + "_labels.json",'w',encoding = "utf-8") as f:

        f.write(json.dumps(labels))
        
    with open(subset + "_spans.json","w",encoding = "utf-8") as f:
        
        f.write(json.dumps(spans))
        
    logger.info("写入文件成功:")

2020-02-20 14:14:32.124 | INFO     | __main__:<module>:5 - 处理 : 


个数: 10748
序列样例:浙商银行企业信贷部叶老桂博士则从另一个角度对五道门槛进行了解读。叶老桂认为，对目前国内商业银行而言，
标签样例:['B-company', 'I-company', 'I-company', 'I-company', 'O', 'O', 'O', 'O', 'O', 'B-name', 'I-name', 'I-name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
span样例:[[[9, 11], 'name'], [[0, 3], 'company']]


2020-02-20 14:14:32.574 | INFO     | __main__:<module>:71 - 写入文件成功:
2020-02-20 14:14:32.575 | INFO     | __main__:<module>:5 - 处理 : 


个数: 1343
序列样例:彭小军认为，国内银行现在走的是台湾的发卡模式，先通过跑马圈地再在圈的地里面选择客户，
标签样例:['B-name', 'I-name', 'I-name', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-address', 'I-address', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
span样例:[[[15, 16], 'address'], [[0, 2], 'name']]


2020-02-20 14:14:32.665 | INFO     | __main__:<module>:71 - 写入文件成功:


### 构建命名实体识别模型

In [4]:
class Bert_NER_Model(torch.nn.Module):

    def __init__(self, n_hidden,labelnum):

        super().__init__()

        self.bert = bert

        self.output = torch.nn.Sequential(

            torch.nn.Linear(768 , n_hidden),

            torch.nn.Tanh(),

            torch.nn.Dropout(0.2),

            torch.nn.Linear(n_hidden,labelnum),

        )

        self.prob = torch.nn.Softmax(dim = -1 )
        
    def forward(self, b_seqs):

        layers = self.bert(b_seqs, output_all_encoded_layers= False)[0]
        
        output = self.output(layers)

        prob = self.prob(output)
        
        return prob

### 构建数据集

In [5]:
labels = {
    "address":1,
    "book":2,
    "company":3,
    "game":4,
    "government":5,
    "movie":6,
    "name":7,
    "organization":8,
    "position":9,
    "scene":10
}
ne_labels = {"O":0}
count = 1
for label in labels:
    for head in ['B-',"I-"]:
        ne_labels[head + label] = count 
        count += 1
print(ne_labels)

class NER_DataSet(torchData.Dataset):
    
    def __init__(self,subset = "train",num = - 1):

        self.seqs = []

        self.labels = []

        if subset.lower() in ["train","dev"]:
            
            with open(subset+"_seqs.json","r",encoding = "utf-8") as f :

                self.seqs = list(map(lambda seq:[char for char in seq],json.loads(f.read())))[:num]
                
            with open(subset+"_labels.json","r",encoding = "utf-8") as f :
                
                self.labels = list(map(lambda seq_label: [ne_labels[label] for label in seq_label],json.loads(f.read())))[:num]
                         
            with open(subset+"_spans.json","r",encoding = "utf-8") as f :

                self.spans = json.loads(f.read())[:num]
                
        len_seqs = [len(x) for x in self.seqs]
        
        self.MAX_SEQ_LEN = max(len_seqs)
        
        self.labelNum =sum(len_seqs)
        
        self.IB_Num = sum([sum([label > 0 for label in seq_label]) for seq_label in self.labels] )
        
        self.O_Num = self.labelNum - self.IB_Num
        
        self.IB_weight = round(self.O_Num/self.IB_Num,2)
        
        self.O_weight = round(self.IB_Num/self.O_Num,2)
        
        self.seqs = list(map(lambda seq : ["[CLS]"] + seq + ["[PAD]"] * (self.MAX_SEQ_LEN - len(seq)),self.seqs))

        self.labels = list(map(lambda seq : [-1] + seq + [-1] * (self.MAX_SEQ_LEN - len(seq)),self.labels))
        
        self.NE_num = sum([len(span) for span in self.spans]) 
        
        logger.info("加载数据集"+subset+" token长度: "+str(self.labelNum)+ " 序列个数: "+str(len(self.seqs))+" 标签个数:"+str(len(self.labels)) + " 实体个数:" + str(self.NE_num))
        
        logger.debug("O-label / IB-label: " + str(self.IB_weight) + " O-NUM : " + str(self.O_Num) + " IB-NUM: " + str(self.IB_Num))

    def __len__(self):

        return len(self.seqs)

    def __getitem__(self, index):

        x = seq2ids(self.seqs[index])

        y = torch.tensor(self.labels[index]).T.cuda()
        
        span = self.spans[index]
        
        return (x,y,span)
def seq2ids(seq):
    
    ids = []
    
    for char in seq:
        
        char_replace = {
            8220 :"\"",
            8221:"\"",
            8212:"-",
            8230: "...",
            8216: "＇",
            8217:"＇",
        }
        if char != "[CLS]" and char != "[PAD]":
            
            if ord(char ) >= 65 and ord(char) <= 90:
                char = char.lower()

            if ord(char) in char_replace:

                char = char_replace[ord(char)]

        try:
            
            ids += tokenizer.convert_tokens_to_ids([char])
            
        except:
            
            ids += tokenizer.convert_tokens_to_ids(["[UNK]"])
            

    return ids

{'O': 0, 'B-address': 1, 'I-address': 2, 'B-book': 3, 'I-book': 4, 'B-company': 5, 'I-company': 6, 'B-game': 7, 'I-game': 8, 'B-government': 9, 'I-government': 10, 'B-movie': 11, 'I-movie': 12, 'B-name': 13, 'I-name': 14, 'B-organization': 15, 'I-organization': 16, 'B-position': 17, 'I-position': 18, 'B-scene': 19, 'I-scene': 20}


### 实例化模型与数据集

In [6]:
model = Bert_NER_Model(300,21)

model = torch.nn.DataParallel(model).cuda()

optimizer = torch.optim.Adam([{'params': model.parameters()}, ], lr=1e-2)

trainingSet = NER_DataSet(subset = "train",num = 200)

devSet = NER_DataSet(subset = "dev",num = 200)

weight =  torch.FloatTensor([trainingSet.IB_weight for i in range(21)])

weight[0] = 1

print("权重",weight)

loss = torch.nn.CrossEntropyLoss(weight = weight.cuda(),ignore_index = -1,reduction = "sum")

2020-02-20 14:14:36.408 | INFO     | __main__:__init__:63 - 加载数据集train token长度: 7268 序列个数: 200 标签个数:200 实体个数:342
2020-02-20 14:14:36.410 | DEBUG    | __main__:__init__:65 - O-label / IB-label: 3.89 O-NUM : 5782 IB-NUM: 1486
2020-02-20 14:14:36.441 | INFO     | __main__:__init__:63 - 加载数据集dev token长度: 7550 序列个数: 200 标签个数:200 实体个数:347
2020-02-20 14:14:36.442 | DEBUG    | __main__:__init__:65 - O-label / IB-label: 3.89 O-NUM : 6006 IB-NUM: 1544


权重 tensor([1.0000, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900,
        3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900, 3.8900,
        3.8900, 3.8900, 3.8900])


## Part II. 训练模型

In [7]:
ne_labels_list = list(ne_labels.keys())
def parse_predict(padded_predict : torch.tensor,label : torch.tensor):
    '''
    根据预测的标签序列解析出预测出的实体
    '''

    parsed = []

    predict = [ ]
    
    for i in range(len(padded_predict)):
        
        if label[i].item() != -1 :
            
            predict.append(padded_predict[i].item())
           
    for i in range(len(predict)):
        
        predict_label = predict[i]
        
        if predict_label % 2 == 1:
            
            entity_span_start = i 
            
            entity_span_end = i
            
            entity_type = ne_labels_list[predict_label][2:]
            
            for j in range(i + 1, len(predict)):
                
                next_predict_label = predict[j]
                
                if next_predict_label == predict_label + 1 :
                
                    entity_span_end  = j
                    
                if next_predict_label == 0 or next_predict_label == predict_label :
                    
                    break
            parsed.append([[entity_span_start,entity_span_end],entity_type])

    return parsed

def measure(pred,label,span):
    
    parsed = parse_predict(pred,label)
    
    accu = 0
    
    for pred_span in parsed:
    
        if pred_span in span :
            
            accu += 1
            
    return {"parsed_num":len(parsed),"accu":accu,"exists":len(span),"parsed":parsed}

In [8]:
import os

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

def collate_fn(data):
    
        b_seq,b_target,b_span =zip(*data)
        
        b_seq = torch.tensor(b_seq)

        b_target = torch.stack(b_target,dim = 0)
        
        return b_seq,b_target,b_span
    
trainDataGenerator = torchData.DataLoader(dataset = trainingSet,batch_size = 15,shuffle = True,collate_fn = collate_fn)

devDataGenerator = torchData.DataLoader(dataset = devSet,batch_size = 15,shuffle = True,collate_fn = collate_fn)

history = []

for epoch in range(50):

    epoch_loss = [ ]

    start_time = time.time()

    used_seq = 0
    
    accu = 0
    
    parsed_num = 0
    
    label_accu = 0 
    
    for b_seq,b_target,b_span in trainDataGenerator:

        time_usage = time.time() - start_time
         
        used_seq += len(b_seq)

        process = str(round( used_seq / len(trainingSet) * 100 , 6)) + "%"
        
        b_prob = model(b_seq)

        b_predict = b_prob.argmax(dim = 2)

        label_accu += torch.sum(torch.eq(b_target,b_predict)).item()
        
        shape = list(b_prob.size())
        
        b_prob = b_prob.view(shape[0] * shape[1] ,shape[2])

        b_loss = loss(b_prob,b_target.view(shape[0] * shape[1]))

        epoch_loss.append(b_loss.item())

        b_measured = list(measure(b_predict[idx],b_target[idx],b_span[idx]) for idx in range(len(b_predict)))

        for measured in b_measured:
                        
            parsed_num += measured["parsed_num"]
            
            accu += measured["accu"]

        optimizer.zero_grad()

        b_loss.backward()

        optimizer.step()
        
        print("\r epoch : {},process : {} ,timeUsage:{}".format(epoch,process,time_usage),end = "",flush=True)
    
    recall = round( accu / trainingSet.NE_num,2)# 召回率 = 识别出的正确实体数 / 样本的实体数
    
    accuracy = round(accu / parsed_num if parsed_num > 0 else 0 ,2) #正确率 = 识别出的正确实体数 / 识别出的实体数
    
    label_accu = round(label_accu/trainingSet.labelNum,4)
    
    epoch_loss =  sum(epoch_loss)/len(trainingSet)
    
    print("\r epoch {} : Loss :{} ,timeUsage:{},recall:{},parsed_num : {},accu_num : {}".format(epoch,epoch_loss, time_usage,recall,parsed_num,accu))

    print("label accu ",label_accu,"entity accu",accuracy," O-label-ratio:",trainingSet.O_Num/trainingSet.labelNum )
    
    test_loss = []
    
    test_parsed_num = 0
    
    test_accu_num = 0
    
    for b_seq,b_target,b_span in devDataGenerator:
        
        b_prob = model(b_seq)

        b_predict = b_prob.argmax(dim = 2)
                
        shape = list(b_prob.size())
        
        b_prob = b_prob.view(shape[0] * shape[1] ,shape[2])

        b_loss = loss(b_prob,b_target.view(shape[0] * shape[1])).item()

        test_loss.append(b_loss)

        b_measured = list(measure(b_predict[idx],b_target[idx],b_span[idx]) for idx in range(len(b_predict)))

        for measured in b_measured:
                        
            test_parsed_num += measured["parsed_num"]
            
            test_accu_num += measured["accu"]
                
    test_recall = round( test_accu_num / devSet.NE_num,2)
    
    test_accuracy = round(test_accu_num / test_parsed_num if test_parsed_num > 0 else 0 ,2)
    
    test_loss = round(sum(test_loss) / len(devSet),4)
    
    history.append({"train":[accu,parsed_num,epoch_loss],"test":[test_accu_num,test_parsed_num,test_loss]})
    
    print("performance on dev set : loss:",test_loss,"accu rate:{},recall : {},parsed_num : {},accu_num : {}".format(test_accuracy,test_recall,test_parsed_num,test_accu_num))

    if epoch % 5 == 0:
        
        torch.save(model, "NER_MODEL_EPOCH_" + str(epoch))


 epoch 0 : Loss :155.88104095458985 ,timeUsage:7.109553337097168,recall:0.0,parsed_num : 390,accu_num : 0
label accu  0.6877 entity accu 0.0  O-label-ratio: 0.7955421023665382
performance on dev set : loss: 157.5501 accu rate:0,recall : 0.0,parsed_num : 0,accu_num : 0


  "type " + obj.__name__ + ". It won't be checked "


 epoch 1 : Loss :151.64886596679688 ,timeUsage:7.292750835418701,recall:0.0,parsed_num : 0,accu_num : 0
label accu  0.7955 entity accu 0  O-label-ratio: 0.7955421023665382
performance on dev set : loss: 157.5501 accu rate:0,recall : 0.0,parsed_num : 0,accu_num : 0
 epoch 2 : Loss :151.64886047363282 ,timeUsage:8.268913984298706,recall:0.0,parsed_num : 0,accu_num : 0
label accu  0.7955 entity accu 0  O-label-ratio: 0.7955421023665382
performance on dev set : loss: 157.5501 accu rate:0,recall : 0.0,parsed_num : 0,accu_num : 0


KeyboardInterrupt: 

In [None]:
import torch
a = torch.tensor(
[
    [
        [0,0],[1,1],[2,2]
    ],
    [
        [3,3],[4,4],[5,5],

    ],  
    [
        [7,7],[8,8],[5,5],

    ],  
])
b =torch.tensor(
[
    [
        [0,0],[1,1],[2,2]
    ],
    [
        [3,3],[4,4],[5,5],

    ],  
    [
        [7,7],[8,8],[5,5],

    ],  
])
print(b > 1)