In [5]:
!pip install loguru
!pip install seqeval



In [6]:
import torch
import torch.nn as nn
import json
import tqdm
from loguru import logger
from transformers import BertTokenizerFast, BertModel, AutoModel
from torch.optim import AdamW
import torch.utils.data as Data
from torch.utils.data import DataLoader, Dataset
from seqeval.metrics import f1_score, classification_report, accuracy_score
%config Completer.use_jedi = False

In [7]:
#读取数据并做基本处理
sentences = []
cur_token = []
cur_ner = []
with open('/kaggle/input/conll003-englishversion/train.txt','r') as f:    
    for line in f:
        line = line.strip()
        if not line: #空行 即句子结束
            if cur_token:
                #sentences是一个包含{token ner_tag}字典的列表
                sentences.append({
                    'token':cur_token,
                    'ner_tag':cur_ner})
                cur_token = []
                cur_ner = []
            continue
        line = line.split()
        if len(line) >= 4:
            token = line[0]
            ner_tag = line[3]

            cur_token.append(token)
            cur_ner.append(ner_tag)
            

In [8]:
#创建label index映射
def create_label_mapping(sentences):
    label = set()
    for i in sentences:
        label.update(i['ner_tag'])

    label2idx = {k: v for v, k in enumerate(sorted(label))}
    idx2label = {v: k for v, k in enumerate(sorted(label))}
    
        
    return label2idx, idx2label
    
label2idx, idx2label = create_label_mapping(sentences)

In [9]:
print(label2idx)
print(idx2label)

{'B-LOC': 0, 'B-MISC': 1, 'B-ORG': 2, 'B-PER': 3, 'I-LOC': 4, 'I-MISC': 5, 'I-ORG': 6, 'I-PER': 7, 'O': 8}
{0: 'B-LOC', 1: 'B-MISC', 2: 'B-ORG', 3: 'B-PER', 4: 'I-LOC', 5: 'I-MISC', 6: 'I-ORG', 7: 'I-PER', 8: 'O'}


In [10]:
tokenizer = BertTokenizerFast.from_pretrained('/kaggle/input/bert-base-cased')

In [11]:
# 构建数据集，此处采用动态构建的方式，即在模型训练时对每一条样本进行处理
class NERDataset(Dataset):
    def __init__(self, sentences, tokenizer,label2idx, max_length = 128):
        self.sentences = sentences
        self.tokenizer = tokenizer
        self.label2idx = label2idx
        self.max_length = max_length

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx): # 此处是动态地处理数据，在训练中处理
        # logger.info(f'now processing the {idx}th sentence')
        tokens, labels = self.sentences[idx]['token'], self.sentences[idx]['ner_tag']

        encoding = self.tokenizer( #等价于 tokenizer.encode_plus()
            tokens,
            is_split_into_words = True, #已切分
            padding = 'max_length',
            max_length = self.max_length,
            truncation = True,
            return_tensors = 'pt'
        )

        #标签对齐
        word_ids = encoding.word_ids() #获取token在原始序列中的序号
        aligned_labels = []
        previous_word_idx = None

        for word_idx in word_ids:
            if word_idx is None: 
                aligned_labels.append(-100)
            elif word_idx != previous_word_idx:
                # 新单词的第一个subword，使用原始标签
                aligned_labels.append(label2idx[labels[word_idx]])
            else:
                # 原单词的后续subword，需要被忽略
                #这也是BERT的sub-word level与NER作为word level问题之间的gap
                aligned_labels.append(-100)
            previous_word_idx = word_idx
        return {
            'input_ids':encoding['input_ids'].squeeze(0),#去掉batch维度
            'attention_mask':encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(aligned_labels, dtype=torch.long),
            'word_ids':word_ids
        }

In [12]:
# 划分训练集和测试集
from sklearn.model_selection import train_test_split
train_sentences, val_sentences = train_test_split(
    sentences, test_size=0.2, random_state=42, shuffle=True
)
train_set = NERDataset(train_sentences, tokenizer,label2idx)
val_set = NERDataset(val_sentences, tokenizer,label2idx)

In [13]:
# 创建DataLoader
# collate_fn的作用就是按照需要的方式堆叠张量
def ner_collate_fn(data):
    input_ids = torch.stack([item['input_ids'] for item in data])
    attn_mask = torch.stack([item['attention_mask'] for item in data])
    labels = torch.stack([item['labels'] for item in data])
    word_ids = [item.get('word_ids', []) for item in data] 
    return {'input_ids':input_ids, 'attention_mask':attn_mask, 'labels':labels, 'word_ids':word_ids}
    
trainloader = DataLoader(
    train_set, 
    batch_size=100,
    shuffle = True,
    collate_fn = ner_collate_fn
)
valloader = DataLoader(
    val_set, 
    batch_size=100,
    shuffle = True,
    collate_fn = ner_collate_fn
)

In [14]:
# 构建模型
class Bert(nn.Module):
    def __init__(self,num_labels):
        super(Bert, self).__init__()
        self.bert = AutoModel.from_pretrained('/kaggle/input/bert-base-cased')
        for param in self.bert.parameters(): #此处是全部参数参与更新
            param.require_grads = True
        #NER是对每个位置的token进行分类
        self.classifier = nn.Linear(768, num_labels)

        #可选: 加上Droupout
        self.dropout = nn.Dropout(0.2)

    def forward(self, input_ids, attention_mask):
        output = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            return_dict = True
        )
        # 对于NER等序列标注最后一层的隐藏层输出last_hidden_state, size = [batch_size,seq_len,hidden_dim]
        seq_output = output.last_hidden_state
        if self.training == True:
            seq_output = self.dropout(seq_output)
        # size = [batch_size, seq_len, num_labels]
        logits = self.classifier(seq_output)

        return logits

In [15]:
# 定义模型、优化器和损失函数
num_labels = len(label2idx)
device ='cuda' if torch.cuda.is_available() else 'cpu'

model = Bert(num_labels).to(device)

optimizer =AdamW(model.parameters(), lr=2e-5)
loss_func = nn.CrossEntropyLoss(ignore_index=-100)# 对于-100不参与损失计算，即[CLS] [SEP]等

num_epoch = 3



In [16]:
# 训练流程
# 训练阶段是subword级别的
model.train()
for i in range(num_epoch):
    acc_loss = 0
    for idx, batch in enumerate(trainloader):
        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        
        optimizer.zero_grad()
    
        output = model(input_ids, attention_masks)

        loss = loss_func(
            output.view(-1, output.size(-1)),
            labels.view(-1)
        )
        
        loss.backward()
        optimizer.step()
        acc_loss += loss.item()
        
        if idx %100 == 0:
            avg_loss = acc_loss / (idx + 1) if idx > 0 else acc_loss
            print(f'epoch{i+1} batch{idx + 1}的 acc_loss：{avg_loss:.4f}')

    epoch_avg_loss = acc_loss / len(trainloader)
    print(f'Epoch {i+1} finished. Average Loss: {epoch_avg_loss:.4f}')
        

epoch1 batch1的 acc_loss：2.4830
epoch1 batch101的 acc_loss：0.4677
Epoch 1 finished. Average Loss: 0.4193
epoch2 batch1的 acc_loss：0.1168
epoch2 batch101的 acc_loss：0.1059
Epoch 2 finished. Average Loss: 0.1029
epoch3 batch1的 acc_loss：0.0476
epoch3 batch101的 acc_loss：0.0597
Epoch 3 finished. Average Loss: 0.0593


In [17]:
def align_pred2entity(preds, labels, word_ids, idx2label):
    """
    将subword级别的预测转换回word级别
    preds和labels都是subword级别的idx
    需要遍历每个word_idx, 若等于previous_word_idx则说明是同一个word, 可跳过
    """
    true_labels = []
    pred_labels = []

    previous_word_idx = None
    for i, (pred, label, word_idx) in enumerate(zip(preds, labels, word_ids)):
        if word_idx is None: #特殊字符 [CLS]等
            continue
        if word_idx != previous_word_idx:
            if label != -100:
                true_labels.append(idx2label[label])
                pred_labels.append(idx2label[pred])
            else:
                #对于每个词的第一个subword就是-100的情况，实则是数据问题，用O来处理
                true_labels.append('O')
                pred_labels.append(idx2label[pred])
        previous_word_idx = word_idx

    return true_labels, pred_labels

In [18]:
#模型评估
#注意模型评估阶段，是要汇总到实体(span) level的，这一点与训练阶段不同

model.eval()
model.to(device)

true_list = []
pred_list = []
with torch.no_grad():
    for idx, batch in enumerate(valloader):
        input_ids = batch['input_ids'].to(device)
        attention_masks = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)
        word_ids = batch['word_ids']
    
        output = model(input_ids, attention_masks)
        prediction = torch.argmax(output, dim=2) #沿着num_labels
    
        # 处理这个batch中的每个样本
        batch_size = batch['input_ids'].shape[0]
        for i in range(batch_size):
            seq_len = attention_masks[i].sum().item()
            # 获取当前样本的word_ids，并且截取有效长度
            w_ids = word_ids[i][:seq_len] 
            true_labels, pred_labels = align_pred2entity(
                        prediction[i][:seq_len].cpu().numpy(),
                        labels[i][:seq_len].cpu().numpy(),
                        w_ids,
                        idx2label
                    )
            true_list.append(true_labels)
            pred_list.append(pred_labels)

In [19]:
print(f1_score(true_list, pred_list,average='macro'))
print(classification_report(true_list,  pred_list))

0.8815379226995269
              precision    recall  f1-score   support

         LOC       0.91      0.94      0.92      1413
        MISC       0.82      0.80      0.81       708
         ORG       0.84      0.85      0.84      1220
         PER       0.95      0.95      0.95      1390

   micro avg       0.89      0.90      0.89      4731
   macro avg       0.88      0.88      0.88      4731
weighted avg       0.89      0.90      0.89      4731



In [20]:
#保存模型
torch.save(model.state_dict(),"bert_ner_baseline_sd.pth")

In [69]:
# 模型读取与推理
def align_output(output, word_ids, idx2label):
    output_list =[]
    previous_word_idx = None

    for i,(out, word_id, idx) in enumerate(zip(output, word_ids, idx2label)):
        if word_id is None:
            continue
        if word_id != previous_word_idx:
            pred_label = idx2label[out]
            output_list.append(pred_label)
        previous_word_idx = word_id
    return output_list

num_labels = len(label2idx)
model_infer = Bert(num_labels)
model_infer.load_state_dict(torch.load("/kaggle/working/bert_ner_baseline_sd.pth"))

text = "Jughead is in Guangzhou"
tokenizer_infer =  BertTokenizerFast.from_pretrained('/kaggle/input/bert-base-cased')
encoding_infer = tokenizer_infer(
    text,
    is_split_into_words = False,
    return_tensors = 'pt')

output = model_infer(encoding_infer['input_ids'], encoding_infer['attention_mask'])
output = torch.argmax(output, dim=2)
print(align_output(output.view(-1).numpy(),encoding_infer.word_ids(),idx2label))

['B-PER', 'O', 'O', 'B-LOC']
