# 导包

In [1]:
import os
import json

from tqdm import tqdm

# 定义参数

In [2]:
import os
import torch
import json

from transformers import BertTokenizer


class CommonConfig:
    """Project-wide locations shared by the task-specific configs.

    The values are plain class attributes, so they can be read either from
    the class itself or from an instance.
    """

    # Name of the pretrained checkpoint handed to `from_pretrained`.
    bert_dir: str = "hfl/chinese-macbert-base"
    # Directory that receives the saved arguments (and, presumably, the
    # model checkpoints — confirm in Trainer).
    output_dir: str = "./checkpoint/"
    # Dataset directory; NOTE(review): the visible cells read their data
    # from "../origin/..." and never reference this value.
    data_dir: str = "./data"

class NerConfig:
    """Hyper-parameters and label vocabulary for the NER task.

    Constructing an instance copies the shared paths from `CommonConfig`,
    makes sure the output directory exists, builds the label <-> id lookup
    tables and sets the training hyper-parameters. All values are stored as
    instance attributes, so `vars(NerConfig())` yields the complete
    configuration as a dict.

    Side effects:
        Creates `output_dir` (including missing parent directories) on disk.
    """

    def __init__(self):
        cf = CommonConfig()
        self.bert_dir = cf.bert_dir
        self.output_dir = cf.output_dir
        # makedirs(..., exist_ok=True) creates missing parents and does not
        # fail when the directory already exists, unlike a separate
        # exists() check followed by mkdir().
        os.makedirs(self.output_dir, exist_ok=True)
        self.data_dir = cf.data_dir

        # BIO tag set: 'O' plus a B-/I- pair for each of the four entity
        # types. The position in this list is the integer label id.
        labels_list = ['O', 'B-HCCX', 'I-HCCX', 'B-HPPX', 'I-HPPX', 'B-XH', 'I-XH', 'B-MISC', 'I-MISC']
        self.num_labels = len(labels_list)
        self.label2id = {l: i for i, l in enumerate(labels_list)}
        self.id2label = {i: l for i, l in enumerate(labels_list)}

        # Maximum sequence length (presumably in tokens — confirm in NerDataset).
        self.max_seq_len = 512
        self.epochs = 3
        self.train_batch_size = 64
        self.dev_batch_size = 64
        # Two learning rates are kept; presumably the Trainer applies them to
        # the BERT encoder and the CRF layer respectively — confirm in Trainer.
        self.bert_learning_rate = 3e-5
        self.crf_learning_rate = 3e-3
        self.adam_epsilon = 1e-8
        self.weight_decay = 0.01
        self.warmup_proportion = 0.01
        self.save_step = 500


In [3]:
# Load the configuration.
args = NerConfig()

# Persist the configuration inside the output directory. The file is written
# as UTF-8 explicitly: with ensure_ascii=False any non-ASCII value is emitted
# verbatim, so relying on the locale's default encoding could fail or produce
# a file that other machines cannot read.
# Note: JSON object keys are always strings, so the integer keys of
# `id2label` are stored as "0", "1", ... and must be converted back to int
# by whoever reloads this file.
with open(os.path.join(args.output_dir, "ner_args.json"), "w", encoding="utf-8") as fp:
    json.dump(vars(args), fp, ensure_ascii=False, indent=2)

# 定义DataLoader

In [4]:
# Data-reading helper
def read_data(file, label2id):
    """Read a token-per-line tagged file and group it into sentences.

    Every non-empty line is split on single spaces; the first field is the
    token and the second field is its tag. An empty line ends the current
    sentence.

    Args:
        file: Path of the text file to read (decoded as UTF-8).
        label2id: Mapping from tag string to integer label id.

    Returns:
        A list with one dict per sentence, in file order, with the keys
        "id" (running sentence index as a string), "tokens" (list of
        tokens), "ner_tags" (list of tag strings) and "label" (list of
        integer ids looked up in `label2id`).

    Raises:
        IndexError: If a non-empty line contains no space.
        KeyError: If a tag is missing from `label2id`.
    """

    def make_sample(sample_id, tokens, ner_tags, ner_labels):
        # Build the record for one finished sentence.
        return {
            "id": str(sample_id),
            "tokens": tokens,
            "ner_tags": ner_tags,
            "label": ner_labels
        }

    lists = []
    # The corpus is Chinese text; decode it as UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open(file, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    sample_id = 0
    tokens, ner_tags, ner_labels = [], [], []
    for line in tqdm(lines):
        line = line.replace("\n", "")
        if len(line) == 0:
            # Blank line: the current sentence is complete.
            lists.append(make_sample(sample_id, tokens, ner_tags, ner_labels))
            tokens, ner_tags, ner_labels = [], [], []
            sample_id += 1
            continue
        texts = line.split(" ")
        tokens.append(texts[0])
        ner_tags.append(texts[1])
        ner_labels.append(label2id[texts[1]])

    # A file that does not end with a blank line still holds one pending
    # sentence here; emit it instead of silently dropping it.
    if tokens:
        lists.append(make_sample(sample_id, tokens, ner_tags, ner_labels))
    return lists

In [5]:
from data_loader import NerDataset
from transformers import AutoTokenizer
from torch.utils.data import DataLoader

# Tokenizer for the pretrained checkpoint named in the config.
tokenizer = AutoTokenizer.from_pretrained(args.bert_dir)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Parse each split into sentence records, then wrap the records in a dataset.
train_records = read_data("../origin/train.txt", args.label2id)
train_dataset = NerDataset(train_records, args, tokenizer=tokenizer)
dev_records = read_data("../origin/dev.txt", args.label2id)
dev_dataset = NerDataset(dev_records, args, tokenizer=tokenizer)
# No test split is loaded in this cell.

# Only the training loader shuffles; the dev loader keeps file order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=args.train_batch_size,
    num_workers=2,
)
dev_loader = DataLoader(
    dev_dataset,
    shuffle=False,
    batch_size=args.dev_batch_size,
    num_workers=2,
)

100%|██████████| 180562/180562 [00:00<00:00, 946122.06it/s]
100%|██████████| 30184/30184 [00:00<00:00, 1012151.10it/s]


# 加载模型

In [6]:
from model import BertNer
# Build the NER model from the shared config and move it to the selected
# device. Because `model.to(device)` is the last expression of the cell, the
# notebook also displays the module's repr as the cell output.
model = BertNer(args)
model.to(device)



BertNer(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(21128, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

# 定义Trainer

In [7]:
from Trainer import Trainer

# Number of batches per epoch multiplied by the number of epochs.
total_steps = len(train_loader) * args.epochs

trainer_kwargs = dict(
    output_dir=args.output_dir,
    model=model,
    train_loader=train_loader,
    dev_loader=dev_loader,
    # The dev loader is reused as the test loader, so the later test report
    # is computed on the dev split.
    test_loader=dev_loader,
    epochs=args.epochs,
    device=device,
    id2label=args.id2label,
    t_total_num=total_steps,
    # The whole config object is handed over as the optimizer settings.
    optimizer_args=args,
)
train = Trainer(**trainer_kwargs)



# 训练

In [8]:
# Run training with the Trainer built above (constructed with
# epochs=args.epochs and train_loader). The captured output shows one
# progress bar and one loss line per epoch.
# NOTE(review): the last logged line reads "283/282", so the step counter
# inside Trainer appears to run one past t_total_num — verify in Trainer.
train.train()

100%|██████████| 94/94 [02:50<00:00,  1.81s/it]


【train】1/3 95/282 loss:0.32739126682281494


100%|██████████| 94/94 [02:49<00:00,  1.80s/it]


【train】2/3 189/282 loss:0.2356387972831726


100%|██████████| 94/94 [02:48<00:00,  1.79s/it]


【train】3/3 283/282 loss:0.21381179988384247


# 评估

In [9]:
# Evaluate with Trainer.test() and print the returned report. The Trainer
# was constructed with test_loader=dev_loader, so these figures are computed
# on the dev split rather than on a separate held-out test set.
report = train.test()
print(report)

100%|██████████| 16/16 [00:11<00:00,  1.45it/s]


              precision    recall  f1-score   support

        HCCX       0.83      0.84      0.83      3430
        HPPX       0.78      0.79      0.78       484
        MISC       0.76      0.81      0.79       755
          XH       0.71      0.79      0.75       272

   micro avg       0.81      0.83      0.82      4941
   macro avg       0.77      0.81      0.79      4941
weighted avg       0.81      0.83      0.82      4941



# 推理预测

In [10]:
from predict import Predictor

# Run the predictor on one sample product title and show input and result.
predictor = Predictor()
text = "川珍浓香型香菇干货去根肉厚干香菇500g热销品质抢购"
ner_result = predictor.ner_predict(text)

# In the captured output the result maps each entity type to a list of
# (surface text, start index, end index) tuples, with the end index
# inclusive (e.g. ('川珍', 0, 1)).
for caption, value in (("文本>>>>>：", text), ("实体>>>>>：", ner_result)):
    print(caption, value)



文本>>>>>： 川珍浓香型香菇干货去根肉厚干香菇500g热销品质抢购
实体>>>>>： {'HPPX': [('川珍', 0, 1)], 'HCCX': [('香菇', 5, 6), ('干货', 7, 8), ('干香菇', 13, 15)], 'MISC': [('500g', 16, 19)]}
