# * 环境更新

In [None]:

!pip install --upgrade paddle -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install --upgrade paddlepaddle -i https://pypi.tuna.tsinghua.edu.cn/simple
!pip install --upgrade paddlenlp -i https://pypi.tuna.tsinghua.edu.cn/simple

In [3]:
import paddle
paddle.is_compiled_with_cuda()

False

In [1]:
import paddle
import paddlenlp as ppnlp
from paddlenlp.datasets import MapDataset
from paddlenlp.data import Stack, Pad, Tuple
from paddlenlp.metrics import ChunkEvaluator
import paddle.nn.functional as F
import numpy as np
from functools import partial #partial()函数可以用来固定某些参数值，并返回一个新的callable对象
import pdb

# 数据集查看

In [10]:
!head -n10 data/train.conll

浙 B-prov
江 E-prov
杭 B-city
州 I-city
市 E-city
江 B-district
干 I-district
区 E-district
九 B-town
堡 I-town


In [12]:
!head -n10  data/dev.conll

杭 B-city
州 E-city
五 B-poi
洲 I-poi
国 I-poi
际 E-poi

浙 B-prov
江 I-prov
省 E-prov


In [14]:
!head data/final_test.txt

1朝阳区小关北里000-0号
2朝阳区惠新东街00号
3朝阳区南磨房路与西大望路交口东南角
4朝阳区潘家园南里00号
5朝阳区向军南里二巷0号附近
6朝阳区多处营业网点
7朝阳区多处营业网点
8朝阳区多处营业网点
9朝阳区北三环中路00号商房大厦0楼
10朝阳区孙河乡康营家园00区北侧底商


# **数据处理已经预先处理好，后面需要转为序列，模型需要的数据。构建batch**

# 1.数据和标签分开，把实体类别转为id


In [13]:
#加载数据文件datafiles
def load_dataset(datafiles):
    #读取数据文件data_path
    def read(data_path):
        with open(data_path, 'r', encoding='utf-8') as fp:
            next(fp)  # Skip header  #Deleted by WGM
            #处理每行数据（文本+‘\t’+标注）
            for line in fp.readlines():
                #提取文本和标注
                words, labels = line.strip('\n').split('\t')
                #文本中单字和标注构成的数组
                words = words.split('\002')
                labels = labels.split('\002')
                #迭代返回文本和标注
                yield words, labels
    
    #根据datafiles的数据类型，选择合适的处理方式
    if isinstance(datafiles, str):#字符串，单个文件名称
        #返回单个文件对应的单个数据集
        return MapDataset(list(read(datafiles)))
    elif isinstance(datafiles, list) or isinstance(datafiles, tuple):#列表或元组，多个文件名称
        #返回多个文件对应的多个数据集
        return [MapDataset(list(read(datafile))) for datafile in datafiles]
        
#加载字典文件，文件由单列构成，需要设置value
def load_dict_single(dict_path):
    #字典初始化为空
    vocab = {}
    #value是自增数值，从0开始
    i = 0
    #逐行读取字典文件
    for line in open(dict_path, 'r', encoding='utf-8'):
        #将每行文字设置为key
        key = line.strip('\n')
        #设置对应的value
        vocab[key] = i
        i+=1
    return vocab


In [15]:
#把数据集转为当个字符
train_ds, dev_ds = load_dataset(datafiles=('./data/train.txt', './data/dev.txt'))

#把类别名转为id
label_vocab = load_dict_single('./data/tag.dic')

#查看训练集和测试集的大小
print("训练集大小:",len(train_ds))
print("测试集集大小:",len(dev_ds))
print(train_ds[0])
print(dev_ds[0])
print(label_vocab) #57个分类


训练集大小: 1600
测试集集大小: 200
(['1', '6', '6', '2', '0', '2', '0', '0', '0', '7', '7', '宣', '荣', '嗣', '甘', '肃', '省', '白', '银', '市', '会', '宁', '县', '河', '畔', '镇', '十', '字', '街', '金', '海', '超', '市', '西', '行', '5', '0', '米'], ['T-B', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'P-B', 'P-I', 'P-I', 'A1-B', 'A1-I', 'A1-I', 'A2-B', 'A2-I', 'A2-I', 'A3-B', 'A3-I', 'A3-I', 'A4-B', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I'])
(['喻', '晓', '刚', '云', '南', '省', '楚', '雄', '彝', '族', '自', '治', '州', '南', '华', '县', '东', '街', '古', '城', '路', '3', '7', '号', '1', '8', '5', '1', '3', '3', '8', '6', '1', '6', '3'], ['P-B', 'P-I', 'P-I', 'A1-B', 'A1-I', 'A1-I', 'A2-B', 'A2-I', 'A2-I', 'A2-I', 'A2-I', 'A2-I', 'A2-I', 'A3-B', 'A3-I', 'A3-I', 'A4-B', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'A4-I', 'T-B', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I', 'T-I'])
{'P-B': 0, 'P-I': 1, 'T-B': 2, 'T-I': 3, 'A1-

# 2.加载bert分词器，对数据进行序列化，数据处理成模型想要的格式。

In [4]:
def convert_example(example,tokenizer,label_vocab,max_seq_len=128,is_test=False):
    #测试集没有标签
    if is_test:
        text = example
    else:
        text, label = example
    tokenizer_input = tokenizer.encode(text=text, max_seq_len=None, pad_to_max_seq_len=False,return_length=True)
    input_ids = tokenizer_input["input_ids"]
    token_type_ids = tokenizer_input["token_type_ids"]
    seq_len = tokenizer_input["seq_len"]
    if not is_test:
        # 加入cls和sep
        label = ['O']+label+['O']
        # 将标签转为序列
        label = [label_vocab[x] for x in label]
        return input_ids, token_type_ids, seq_len, label
    else: # 测试集，不返回标签
        return input_ids, token_type_ids, seq_len

    


In [2]:
#加载Ernie的Tokenizer
tokenizer = ppnlp.transformers.ErnieTokenizer.from_pretrained(r"ernie-3.0-medium-zh")

[32m[2024-03-04 11:10:13,338] [    INFO][0m - Already cached C:\Users\huangkai\.paddlenlp\models\ernie-3.0-medium-zh\ernie_3.0_medium_zh_vocab.txt[0m
[32m[2024-03-04 11:10:13,379] [    INFO][0m - tokenizer config file saved in C:\Users\huangkai\.paddlenlp\models\ernie-3.0-medium-zh\tokenizer_config.json[0m
[32m[2024-03-04 11:10:13,382] [    INFO][0m - Special tokens file saved in C:\Users\huangkai\.paddlenlp\models\ernie-3.0-medium-zh\special_tokens_map.json[0m


In [6]:
#偏函数，固定参数
trans_func = partial(convert_example, tokenizer=tokenizer, label_vocab=label_vocab, max_seq_len=128)

In [7]:
#对数据集进行编码（转为Ernie需要的格式）
train_ds.map(trans_func)
dev_ds.map(trans_func)
print(train_ds[0])

([1, 1382, 409, 244, 565, 404, 99, 157, 507, 308, 233, 213, 484, 945, 3074, 53, 509, 219, 216, 540, 540, 540, 540, 500, 2], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 25, [24, 0, 16, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 14, 17, 17, 15, 18, 19, 20, 21, 21, 21, 22, 24])


In [8]:
#数据组装成一个batch一个batch

#创建Tuple对象，将多个批处理函数的处理结果连接在一起
ignore_label = -1
#因为数据集train_ds、dev_ds的每条数据包含4部分，所以Tuple对象中包含4个批处理函数
batchify_fn = lambda samples, fn=Tuple(
    #将每条数据的input_ids组合为数组，如果input_ids不等长，那么填充为pad_val
    Pad(axis=0, pad_val=tokenizer.pad_token_id),
    #将每条数据的segment_ids组合为数组，如果segment_ids不等长，那么填充为pad_val
    Pad(axis=0, pad_val=tokenizer.pad_token_type_id),
    #将每条数据的seq_len组合为数组
    Stack(),
    #将每条数据的label组合为数组，如果label不等长，那么填充为pad_val
    Pad(axis=0, pad_val=ignore_label)
): fn(samples)

#paddle.io.DataLoader加载给定数据集，返回迭代器，每次迭代访问batch_size条数据
#使用collate_fn定义所读取数据的格式
#训练集
train_loader = paddle.io.DataLoader(
    dataset=train_ds,
    batch_size=32,
    return_list=True,
    collate_fn=batchify_fn)
#验证集
dev_loader = paddle.io.DataLoader(
    dataset=dev_ds,
    batch_size=32,
    return_list=True,
    collate_fn=batchify_fn)

# 组合Ernie+BiGRU+CRF

In [5]:
import paddle.nn as nn
from paddlenlp.transformers import ErnieModel
from paddlenlp.layers.crf import LinearChainCrf, LinearChainCrfLoss, ViterbiDecoder

In [17]:
class ErnieGRUCRF(nn.Layer):
    def __init__(self,Ernie,gru_hidden_size=300,
                num_class=2,
                crf_lr=100):
        super().__init__()
        self.num_classes = num_class
        self.Ernie = Ernie
        for param in self.Ernie.parameters():
            # param.trainable = False
            print(param.stop_gradient)
        self.gru = nn.GRU(self.Ernie.config["hidden_size"],
                          gru_hidden_size,
                          num_layers = 2,
                          direction='bidirect')
        self.fc = nn.Linear(gru_hidden_size*2,num_class+2)
        self.crf = LinearChainCrf(self.num_classes)
        self.crf_loss = LinearChainCrfLoss(self.crf)
        self.viterbi_decoder = ViterbiDecoder(self.crf.transitions)

    def forward(self,input_ids,token_type_ids,lengths=None,labels=None):
        encoder_output,_ = self.Ernie(input_ids, token_type_ids = token_type_ids)
        gru_output, _ = self.gru(encoder_output)
        emission = self.fc(gru_output)
        if labels is not None:
            loss = self.crf_loss(emission, lengths, labels)
            return loss
        else:
            _,prediction = self.viterbi_decoder(emission, lengths)
            return prediction

In [9]:
#加载预训练模型Ernie
ernie_model = ErnieModel.from_pretrained(r"ernie-3.0-medium-zh")
for param in ernie_model.parameters():
    param.trainable = False
    print(param)

Parameter containing:
Tensor(shape=[40000, 768], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[-0.02744583, -0.00382745,  0.04380905, ..., -0.03190089,
          0.00601174,  0.03409085],
        [-0.02095849, -0.01926271, -0.02129839, ..., -0.01542175,
          0.00038802, -0.01393166],
        [ 0.04409645, -0.03055532, -0.02618456, ..., -0.00846301,
         -0.03102261, -0.02667180],
        ...,
        [-0.01379568,  0.01972155, -0.01737572, ...,  0.01119119,
         -0.01017644, -0.00422090],
        [-0.00676420,  0.00158248,  0.00291453, ...,  0.01389278,
          0.00160268,  0.00113276],
        [ 0.00575782, -0.01698007,  0.02542021, ..., -0.01337493,
         -0.00425118,  0.00661711]])
Parameter containing:
Tensor(shape=[2048, 768], dtype=float32, place=Place(cpu), stop_gradient=True,
       [[-0.03167305, -0.00013653, -0.02624370, ..., -0.00823384,
         -0.00882055, -0.02770122],
        [-0.00672439,  0.00420699, -0.03575226, ..., -0.00454897,
   

In [18]:
model = ErnieGRUCRF(ernie_model, 300, len(label_vocab), 100)

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True


In [None]:
#设置Fine-Tune优化策略
#1.计算了块检测的精确率、召回率和F1-score。常用于序列标记任务，如命名实体识别
metric = ChunkEvaluator(label_list=label_vocab.keys(), suffix=True)
#2.在Adam的基础上加入了权重衰减的优化器，可以解决L2正则化失效问题
optimizer = paddle.optimizer.AdamW(learning_rate=2e-5, parameters=model.parameters())
#损失函数由模型给出
#3.
#评估函数
def evaluate(model, metric, data_loader):
    model.eval()
    metric.reset()#评估器复位
    #依次处理每批数据
    for input_ids, seg_ids, lens, labels in data_loader:
        #CRF Loss
        preds = model(input_ids, seg_ids, lengths=lens)
        n_infer, n_label, n_correct = metric.compute(lens,preds,labels)
        metric.update(n_infer.numpy(),n_label.numpy(),n_correct.numpy())
        precision, recall, f1_score = metric.accumulate()    
    print("评估准确度: %.6f - 召回率: %.6f - f1得分: %.6f" % (precision, recall, f1_score))
    model.train()
#模型训练
global_step = 0
for epoch in range(20):
    #依次处理每批数据
    for step, (input_ids, segment_ids, seq_lens, labels) in enumerate(train_loader, start=1):
        #直接得到CRF Loss
        loss = model(input_ids, token_type_ids=segment_ids,lengths=seq_lens, labels=labels)
        avg_loss = paddle.mean(loss)
        avg_loss.backward()
        optimizer.step()
        optimizer.clear_grad()
        if global_step % 10 == 0 :
            print("训练集的当前epoch:%d - step:%d" % (epoch, step))
            print("损失函数: %.6f" % (avg_loss))
        global_step += 1
    #评估训练模型
    evaluate(model, metric, dev_loader)
    paddle.save(model.state_dict(),
            './checkpoint/model_%d.pdparams'  % (global_step))

训练集的当前epoch:0 - step:1
损失函数: 80.013527
训练集的当前epoch:0 - step:11
损失函数: 66.766640
训练集的当前epoch:0 - step:21
损失函数: 56.832478
训练集的当前epoch:0 - step:31
损失函数: 48.255463
训练集的当前epoch:0 - step:41
损失函数: 44.758476
训练集的当前epoch:0 - step:51
损失函数: 38.669243
训练集的当前epoch:0 - step:61
损失函数: 34.507248
训练集的当前epoch:0 - step:71
损失函数: 29.148651
训练集的当前epoch:0 - step:81
损失函数: 29.373930
训练集的当前epoch:0 - step:91
损失函数: 28.151989
训练集的当前epoch:0 - step:101
损失函数: 21.719574
训练集的当前epoch:0 - step:111
损失函数: 21.785767
训练集的当前epoch:0 - step:121
损失函数: 21.802521
训练集的当前epoch:0 - step:131
损失函数: 17.572851
训练集的当前epoch:0 - step:141
损失函数: 14.919171
训练集的当前epoch:0 - step:151
损失函数: 15.116917
训练集的当前epoch:0 - step:161
损失函数: 12.999769
训练集的当前epoch:0 - step:171
损失函数: 13.329678
训练集的当前epoch:0 - step:181
损失函数: 10.552842
训练集的当前epoch:0 - step:191
损失函数: 10.812273
训练集的当前epoch:0 - step:201
损失函数: 9.820799
训练集的当前epoch:0 - step:211
损失函数: 10.503597
训练集的当前epoch:0 - step:221
损失函数: 10.150091
训练集的当前epoch:0 - step:23

有什么问题，欢迎到评论区留言，我们一起讨论。

请点击[此处](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576)查看本环境基本用法.  <br>
Please click [here ](https://ai.baidu.com/docs#/AIStudio_Project_Notebook/a38e5576) for more detailed instructions. 