In [7]:
import paddle
import paddlenlp as ppnlp
from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Pad, Tuple
from paddlenlp.metrics import ChunkEvaluator
import paddle.nn.functional as F
import numpy as np
from functools import partial #partial()函数可以用来固定某些参数值，并返回一个新的callable对象
import pdb

# 数据集查看

In [8]:
!head -n10 data/train.conll

浙 B-prov
江 E-prov
杭 B-city
州 I-city
市 E-city
江 B-district
干 I-district
区 E-district
九 B-town
堡 I-town


In [9]:
!head -n10  data/dev.conll

杭 B-city
州 E-city
五 B-poi
洲 I-poi
国 I-poi
际 E-poi

浙 B-prov
江 I-prov
省 E-prov


In [10]:
!head data/final_test.txt

1朝阳区小关北里000-0号
2朝阳区惠新东街00号
3朝阳区南磨房路与西大望路交口东南角
4朝阳区潘家园南里00号
5朝阳区向军南里二巷0号附近
6朝阳区多处营业网点
7朝阳区多处营业网点
8朝阳区多处营业网点
9朝阳区北三环中路00号商房大厦0楼
10朝阳区孙河乡康营家园00区北侧底商


# **数据处理已经预先处理好，后面需要转为序列，模型需要的数据。构建batch**

# 1.数据和标签分开，把实体类别转为id


In [11]:
#加载数据文件datafiles
def load_dataset(datafiles):
    #读取数据文件data_path
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header  #Deleted by WGM
            #处理每行数据（文本+‘\t’+标注）
            for line in fp.readlines():
                #提取文本和标注
                words, labels = line.strip('\n').split('\t')
                #文本中单字和标注构成的数组
                words = words.split('\002')
                labels = labels.split('\002')
                #迭代返回文本和标注
                yield words, labels
    
    #根据datafiles的数据类型，选择合适的处理方式
    if isinstance(datafiles, str):#字符串，单个文件名称
        #返回单个文件对应的单个数据集
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):#列表或元组，多个文件名称
        #返回多个文件对应的多个数据集
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
        
#加载字典文件，文件由单列构成，需要设置value
def load_dict_single(dict_path):
    #字典初始化为空
    vocab = {}
    #value是自增数值，从0开始
    i = 0
    #逐行读取字典文件
    for line in open(dict_path, 'r', encoding='utf-8'):
        #将每行文字设置为key
        key = line.strip('\n')
        #设置对应的value
        vocab[key] = i
        i+=1
    return vocab


In [12]:
#把数据集转为当个字符
train_ds, dev_ds = load_dataset(datafiles=('./data/train.txt', './data/dev.txt'))

#把类别名转为id
label_vocab = load_dict_single('./data/mytag.dic')

#查看训练集和测试集的大小
print("训练集大小:",len(train_ds))
print("测试集集大小:",len(dev_ds))
print(train_ds[0])
print(dev_ds[0])
print(label_vocab) #57个分类


训练集大小: 8854
测试集集大小: 1969
(['浙', '江', '省', '温', '州', '市', '平', '阳', '县', '海', '西', '镇', '宋', '埠', '公', '园', '南', '路', '0', '0', '0', '0', '号'], ['B-prov', 'I-prov', 'E-prov', 'B-city', 'I-city', 'E-city', 'B-district', 'I-district', 'E-district', 'B-town', 'I-town', 'E-town', 'B-poi', 'I-poi', 'I-poi', 'E-poi', 'B-road', 'E-road', 'B-roadno', 'I-roadno', 'I-roadno', 'I-roadno', 'E-roadno'])
(['浙', '江', '省', '杭', '州', '市', '余', '杭', '乔', '司', '街', '道', '博', '卡', '路', '0', '号', '博', '卡', '制', '衣'], ['B-prov', 'I-prov', 'E-prov', 'B-city', 'I-city', 'E-city', 'B-district', 'E-district', 'B-town', 'I-town', 'I-town', 'E-town', 'B-road', 'I-road', 'E-road', 'B-roadno', 'E-roadno', 'B-poi', 'I-poi', 'I-poi', 'E-poi'])
{'B-prov': 0, 'E-prov': 1, 'B-city': 2, 'I-city': 3, 'E-city': 4, 'B-district': 5, 'I-district': 6, 'E-district': 7, 'B-town': 8, 'I-town': 9, 'E-town': 10, 'B-community': 11, 'I-community': 12, 'E-community': 13, 'B-poi': 14, 'E-poi': 15, 'I-prov': 16, 'I-poi': 17, 'B-road'

# 2.加载bert分词器，对数据进行序列化，数据处理成模型想要的格式。

In [13]:
def convert_example(example,tokenizer,label_vocab,max_seq_len=128,is_test=False):
    #测试集没有标签
    if is_test:
        text = example
    else:
        text, label = example
    tokenizer_input = tokenizer.encode(text=text, max_seq_len=None, pad_to_max_seq_len=False,return_length=True)
    input_ids = tokenizer_input["input_ids"]
    token_type_ids = tokenizer_input["token_type_ids"]
    seq_len = tokenizer_input["seq_len"]
    if not is_test:
        # 加入cls和sep
        label = ['O']+label+['O']
        # 将标签转为序列
        label = [label_vocab[x] for x in label]
        return input_ids, token_type_ids, seq_len, label
    else: # 测试集，不返回标签
        return input_ids, token_type_ids, seq_len

    


In [14]:
#加载Ernie的Tokenizer
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained('ernie-1.0')

[32m[2024-02-18 17:17:32,942] [    INFO][0m - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt and saved to /home/aistudio/.paddlenlp/models/ernie-1.0[0m
[32m[2024-02-18 17:17:33,031] [    INFO][0m - Downloading vocab.txt from https://bj.bcebos.com/paddlenlp/models/transformers/ernie/vocab.txt[0m
100%|██████████| 89.5k/89.5k [00:00<00:00, 804kB/s]
[32m[2024-02-18 17:17:33,227] [    INFO][0m - tokenizer config file saved in /home/aistudio/.paddlenlp/models/ernie-1.0/tokenizer_config.json[0m
[32m[2024-02-18 17:17:33,227] [    INFO][0m - Special tokens file saved in /home/aistudio/.paddlenlp/models/ernie-1.0/special_tokens_map.json[0m


In [15]:
#偏函数，固定参数
trans_func = partial(convert_example, tokenizer=tokenizer, label_vocab=label_vocab, max_seq_len=128)

In [16]:
#对数据集进行编码（转为TinyBert需要的格式）
train_ds.map(trans_func)
dev_ds.map(trans_func)
print(train_ds[0])

([1, 1382, 409, 244, 565, 404, 99, 157, 507, 308, 233, 213, 484, 945, 3074, 53, 509, 219, 216, 540, 540, 540, 540, 500, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 25, [24, 0, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 17, 17, 15, 18, 19, 20, 21, 21, 21, 22, 24])


In [17]:
#数据组装成一个batch一个batch

#创建Tuple对象，将多个批处理函数的处理结果连接在一起
ignore_label = -1
#因为数据集train_ds、dev_ds的每条数据包含4部分，所以Tuple对象中包含4个批处理函数
batchify_fn = lambda samples, fn=Tuple(
    #将每条数据的input_ids组合为数组，如果input_ids不等长，那么填充为pad_val
    Pad(axis=0, pad_val=tokenizer.pad_token_id),
    #将每条数据的segment_ids组合为数组，如果segment_ids不等长，那么填充为pad_val
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    #将每条数据的seq_len组合为数组
    Stack(),
    #将每条数据的label组合为数组，如果label不等长，那么填充为pad_val
    Pad(axis=0, pad_val=ignore_label)
): fn(samples)

#paddle.io.DataLoader加载给定数据集，返回迭代器，每次迭代访问batch_size条数据
#使用collate_fn定义所读取数据的格式
#训练集
train_loader = paddle.io.DataLoader(
    dataset=train_ds,
    batch_size=32,
    return_list=True,
    collate_fn=batchify_fn)
#验证集
dev_loader = paddle.io.DataLoader(
    dataset=dev_ds,
    batch_size=32,
    return_list=True,
    collate_fn=batchify_fn)

# 组合Ernir+BiGRU+CRF

In [18]:
import paddle.nn as nn
from paddlenlp.transformers import ErnieModel
from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder

In [19]:
class ErnieGRUCRF(nn.Layer):
    def __init__(self,Ernie,gru_hidden_size=300,
                num_class=2,
                crf_lr=100):
        super().__init__()
        self.num_classes = num_class
        self.Ernie = Ernie
        self.gru = nn.GRU(self.Ernie.config["hidden_size"],
                          gru_hidden_size,
                          num_layers = 2,
                          direction='bidirect')
        self.fc = nn.Linear(gru_hidden_size*2,num_class+2)
        self.crf = LinearChainCrf(self.num_classes)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)

    def forward(self,input_ids,token_type_ids,lengths=None,labels=None):
        encoder_output,_ = self.Ernie(input_ids, token_type_ids = token_type_ids)
        gru_output, _ = self.gru(encoder_output)
        emission = self.fc(gru_output)
        if labels is not None:
            loss = self.crf_loss(emission, lengths, labels)
            return loss
        else:
            _,prediction = self.viterbi_decoder(emission, lengths)
            return prediction

In [20]:
#加载预训练模型TinyBert
Ernie = ErnieModel.from_pretrained('ernie-1.0')
model = ErnieGRUCRF(Ernie, 300, len(label_vocab), 100)

[32m[2024-02-18 17:17:35,593] [    INFO][0m - Configuration saved in /home/aistudio/.paddlenlp/models/ernie-1.0/config.json[0m
[32m[2024-02-18 17:17:35,652] [    INFO][0m - Downloading https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams[0m
[32m[2024-02-18 17:17:35,662] [    INFO][0m - Downloading ernie_v1_chn_base.pdparams from https://bj.bcebos.com/paddlenlp/models/transformers/ernie/ernie_v1_chn_base.pdparams[0m
100%|██████████| 383M/383M [00:08<00:00, 46.4MB/s] 
[32m[2024-02-18 17:17:44,570] [    INFO][0m - Loading weights file model_state.pdparams from cache at /home/aistudio/.paddlenlp/models/ernie-1.0/model_state.pdparams[0m
[32m[2024-02-18 17:17:45,349] [    INFO][0m - Loaded weights file from disk, setting weights to model.[0m
W0218 17:17:45.355096   190 gpu_resources.cc:119] Please NOTE: device: 0, GPU Compute Capability: 7.0, Driver API Version: 12.0, Runtime API Version: 11.8
W0218 17:17:45.356474   190 gpu_resources.cc:

In [21]:
#设置Fine-Tune优化策略
#1.计算了块检测的精确率、召回率和F1-score。常用于序列标记任务，如命名实体识别
metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
#2.在Adam的基础上加入了权重衰减的优化器，可以解决L2正则化失效问题
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
#损失函数由模型给出
#3.
#评估函数
def evaluate(model, metric, data_loader):
    model.eval()
    metric.reset()#评估器复位
    #依次处理每批数据
    for input_ids, seg_ids, lens, labels in data_loader:
        #CRF Loss
        preds = model(input_ids, seg_ids, lengths=lens)
        n_infer, n_label, n_correct = metric.compute(lens,preds,labels)
        metric.update(n_infer.numpy(),n_label.numpy(),n_correct.numpy())
        precision, recall, f1_score = metric.accumulate()    
    print("评估准确度: %.6f - 召回率: %.6f - f1得分: %.6f" % (precision, recall, f1_score))
    model.train()
#模型训练
global_step = 0
for epoch in range(20):
    #依次处理每批数据
    for step, (input_ids, segment_ids, seq_lens, labels) in enumerate(train_loader, start=1):
        #直接得到CRF Loss
        loss = model(input_ids, token_type_ids=segment_ids,lengths=seq_lens, labels=labels)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        if global_step % 10 == 0 :
            print("训练集的当前epoch:%d - step:%d" % (epoch, step))
            print("损失函数: %.6f" % (avg_loss))
        global_step += 1
    #评估训练模型
    evaluate(model, metric, dev_loader)
    paddle.save(model.state_dict(),
            './checkpoint/model_%d.pdparams'  % (global_step))

训练集的当前epoch:0 - step:1
损失函数: 80.728073
训练集的当前epoch:0 - step:11
损失函数: 65.700790
训练集的当前epoch:0 - step:21
损失函数: 56.292873
训练集的当前epoch:0 - step:31
损失函数: 47.664459
训练集的当前epoch:0 - step:41
损失函数: 44.414978
训练集的当前epoch:0 - step:51
损失函数: 38.578400
训练集的当前epoch:0 - step:61
损失函数: 33.767685
训练集的当前epoch:0 - step:71
损失函数: 29.349903
训练集的当前epoch:0 - step:81
损失函数: 29.596001
训练集的当前epoch:0 - step:91
损失函数: 28.053680
训练集的当前epoch:0 - step:101
损失函数: 21.799549
训练集的当前epoch:0 - step:111
损失函数: 21.693764
训练集的当前epoch:0 - step:121
损失函数: 22.277697
训练集的当前epoch:0 - step:131
损失函数: 18.083397
训练集的当前epoch:0 - step:141
损失函数: 14.989843
训练集的当前epoch:0 - step:151
损失函数: 14.366377
训练集的当前epoch:0 - step:161
损失函数: 12.595774
训练集的当前epoch:0 - step:171
损失函数: 12.422470
训练集的当前epoch:0 - step:181
损失函数: 10.778782
训练集的当前epoch:0 - step:191
损失函数: 10.650402
训练集的当前epoch:0 - step:201
损失函数: 10.225702
训练集的当前epoch:0 - step:211
损失函数: 9.687275
训练集的当前epoch:0 - step:221
损失函数: 9.811920
训练集的当前epoch:0 - step:231



损失函数: 3.783197
训练集的当前epoch:2 - step:87
损失函数: 2.624102
训练集的当前epoch:2 - step:97
损失函数: 2.453515
训练集的当前epoch:2 - step:107
损失函数: 3.692891
训练集的当前epoch:2 - step:117
损失函数: 2.761435
训练集的当前epoch:2 - step:127
损失函数: 4.389126
训练集的当前epoch:2 - step:137
损失函数: 3.679394
训练集的当前epoch:2 - step:147
损失函数: 2.506398
训练集的当前epoch:2 - step:157
损失函数: 3.978191
训练集的当前epoch:2 - step:167
损失函数: 6.257996
训练集的当前epoch:2 - step:177
损失函数: 2.877659
训练集的当前epoch:2 - step:187
损失函数: 4.026735
训练集的当前epoch:2 - step:197
损失函数: 3.564148
训练集的当前epoch:2 - step:207
损失函数: 3.763389
训练集的当前epoch:2 - step:217
损失函数: 4.845933
训练集的当前epoch:2 - step:227
损失函数: 3.248820
训练集的当前epoch:2 - step:237
损失函数: 4.038624
训练集的当前epoch:2 - step:247
损失函数: 4.929435
训练集的当前epoch:2 - step:257
损失函数: 2.667067
训练集的当前epoch:2 - step:267
损失函数: 4.255585
训练集的当前epoch:2 - step:277
损失函数: 1.442499
评估准确度: 0.972461 - 召回率: 0.969031 - f1得分: 0.970743
训练集的当前epoch:3 - step:10
损失函数: 3.112317
训练集的当前epoch:3 - step:20
损失函数: 2.976952
训练集的当前epoch:3 