In [1]:
import json
import re
import time
import numpy as np
import pandas as pd
import jsonlines
import eval4ner.muc as muc

In [2]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from transformers import BertTokenizer, BertModel 
# torch.manual_seed(1)

## 数据准备

In [3]:
filename=['中国图象图形学报','中文信息学报','模式识别与人工智能','计算机应用','计算机辅助设计与图形学学报']

In [4]:
DATA=[]
for f in filename:
    with open("./DATA/"+f+".jsonl","r+",encoding="utf-8") as ori:
        for item in jsonlines.Reader(ori):
            DATA.append(item)

In [5]:
# with open('./title_all_.jsonl', 'w', encoding='utf-8') as f:
#     for i in DATA:
#         json.dump(i,f,ensure_ascii=False)
#         f.write('\n')

In [2]:
print(len(DATA))
print(DATA[0])
print(DATA[0]['label'])

2501
{'id': 32764, 'text': '3D多尺度深度卷积神经网络肺结节检测', 'label': [[5, 13, 'method'], [13, 18, 'RP'], [13, 18, 'rp']], 'Comments': []}
[[5, 13, 'method'], [13, 18, 'RP'], [13, 18, 'rp']]


In [7]:
method=[]
for i in range(len(DATA)):
    for j in range(len(DATA[i]['label'])):
        if DATA[i]['label'][j][2]=='rp':
            m = DATA[i]['text'][int(DATA[i]['label'][j][0]):int(DATA[i]['label'][j][1])]
            method.append(m)
        

In [8]:
# method_df = pd.DataFrame({'methods':method}).to_csv('./method.csv',encoding='utf_8_sig')

In [9]:
labels = []
sen = [i['text'] for i in DATA]
tags = ['O','B-MED','I-MED','B-rp','I-rp','<START>','<STOP>']

for s in range(len(sen)):
    l = ['O' for i in range(len(sen[s]))]
    B_M=[]
    E_M=[]
    B_rp=[]
    E_rp=[]
    for label in DATA[s]['label']:
        if label[2] == 'method':
            l[label[0]] = 'B-MED'
            l[label[0]+1:label[1]] = ["I-MED" for i in range(label[1]-label[0]-1)]
        elif label[2] == 'rp':
            l[label[0]] = 'B-rp'
            l[label[0]+1:label[1]] = ["I-rp" for i in range(label[1]-label[0]-1)]
            
    L = " ".join(l)
    labels.append(L)

In [11]:
sentences=[]
for i in range(len(sen)):
    s=[]
    for j in sen[i]:
        s.append(j)
    sentences.append(s)

In [12]:
data=[]
for i in range(len(sentences)):
    data.append( (sentences[i], labels[i].split(' ')) )

In [14]:
START_TAG = "<START>"
STOP_TAG = "<STOP>"
tag_to_ix = {}
word_to_ix = {"<UNK>":0} #生字给id=0

for sentence, tags in data:
    for word in sentence:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
            
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag]=len(tag_to_ix)

In [17]:
tag_to_ix[START_TAG]=5
tag_to_ix[STOP_TAG]=6

In [18]:
tag_to_ix

{'O': 0,
 'B-MED': 1,
 'I-MED': 2,
 'B-rp': 3,
 'I-rp': 4,
 '<START>': 5,
 '<STOP>': 6}

In [19]:
from sklearn.model_selection import train_test_split

training_data,test_data=train_test_split(data,test_size=0.2, random_state=0)

## 模型

In [21]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# def prepare_sequence(seq, to_ix):
#     idxs = [to_ix[w] for w in seq]
#     return torch.tensor(idxs, dtype=torch.long)


#这里增加未登录字处理
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] if w in to_ix else to_ix["<UNK>"] for w in seq] #to_ix.has_key(x)
    return torch.tensor(idxs, dtype=torch.long)


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    # 计算当前步骤下的最优路径得分
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [22]:
def get_method(output, text):
    output = np.array(output)
    m_b = np.where(output==1)
    output[m_b]=2
    m = np.where(output==2)
    m = m[0].tolist()
    m_text = []
    if len(m)>0:
        method=[[],[],[],[],[],[],[]]
        method[0].append(m[0])
        num=0
        for i in range(1,len(m)):
            if(m[i]-1==m[i-1]):
                method[num].append(m[i])
            else:
                num+=1
                method[num].append(m[i])
        m_text = []
        for i in method:
            if len(i)!=0:
                m_text.append(("method",text[i[0]:i[-1]+1]))
    return m_text

# predictions = get_method([0,0,1,2,2,2,0,1,2,2,2],'一个机器学习的智能算法')
# print(predictions)
# texts = '一个机器学习的智能算法'
# muc.evaluate_one(predictions, predictions, texts)

In [23]:
def get_problem(output, text):
    
    output = np.array(output)
    m_b = np.where(output==3)
    output[m_b]=4
    m = np.where(output==4)
    m = m[0].tolist()
#     print(m)
    m_text = []
    if len(m)>0:
        method=[[],[],[],[],[],[],[]]
        method[0].append(m[0])
        num=0
        for i in range(1,len(m)):
            if(m[i]-1==m[i-1]):
                
                method[num].append(m[i])
            else:
                num+=1
                method[num].append(m[i])
        m_text = []
        for i in method:
            if len(i)!=0:
                m_text.append(("problem",text[i[0]:i[-1]+1]))
    return m_text

# predictions = get_method([0,0,1,2,2,2,0,1,2,2,2],'一个机器学习的智能算法')
# print(predictions)
# texts = '一个机器学习的智能算法'
# muc.evaluate_one(predictions, predictions, texts)

In [24]:
def cal_M(model_,test_data):
    with torch.no_grad():  
        y_pred,y_true,texts=[],[],[]
        for pair in test_data: #抽取部分看看效果
            sentence=pair[0]
            tag=pair[1]

            precheck_sent = prepare_sequence(sentence, word_to_ix)
            precheck_tags = torch.tensor([tag_to_ix[t] for t in tag], dtype=torch.long) 
            score,pred=model_(precheck_sent)
            pred = torch.max(pred, 1)[1]
            
            sentence_ = "".join(sentence)

            METHOD = get_method(pred,sentence_)
            y_pred.append(METHOD)

            method_ = get_method(precheck_tags,sentence_)
            y_true.append(method_)
            
            texts.append(sentence_)
#         print(y_pred)
#         print(y_true)
#         print(texts)
        
        return muc.evaluate_all(y_pred, y_true, texts)['exact']['f1_score']

In [25]:
def cal_P(model_,test_data):
    with torch.no_grad():  
        y_pred,y_true,texts=[],[],[]
        for pair in test_data: #抽取部分看看效果
            sentence=pair[0]
            tag=pair[1]

            precheck_sent = prepare_sequence(sentence, word_to_ix)
            precheck_tags = torch.tensor([tag_to_ix[t] for t in tag], dtype=torch.long) 
            score,pred=model_(precheck_sent)
            pred = torch.max(pred, 1)[1]
            
            sentence_ = "".join(sentence)

            PROBLEM = get_problem(pred,sentence_)
            y_pred.append(PROBLEM)

            problem_ = get_problem(precheck_tags,sentence_)
            y_true.append(problem_)
            
            texts.append(sentence_)
#         print(y_pred)
#         print(y_true)
#         print(texts)
        
        return muc.evaluate_all(y_pred, y_true, texts)['exact']['f1_score']

In [26]:
def cal_f1(model_,test_data):
    with torch.no_grad():  
        y_pred,y_true,texts=[],[],[]
        for pair in test_data: #抽取部分看看效果
            sentence=pair[0]
            tag=pair[1]

            precheck_sent = prepare_sequence(sentence, word_to_ix)
            precheck_tags = torch.tensor([tag_to_ix[t] for t in tag], dtype=torch.long) 
            score,pred=model_(precheck_sent)

            pred = torch.max(pred, 1)[1]
            
            sentence_ = "".join(sentence)

            METHOD = get_method(pred,sentence_)
            PROBLEM = get_problem(pred,sentence_)
            METHOD.extend(PROBLEM)
            y_pred.append(METHOD)

            method_ = get_method(precheck_tags,sentence_)
            problem_ = get_problem(precheck_tags,sentence_)
            method_.extend(problem_)
            y_true.append(method_)
            texts.append(sentence_)
#         print(y_pred)
#         print(y_true)
#         print(texts)
        
        return muc.evaluate_all(y_pred, y_true, texts)['exact']['f1_score']

In [27]:
def bert_cal_M(MODEl,test_data):
    with torch.no_grad():  
        y_pred,y_true,texts=[],[],[]
        for pair in test_data: #抽取部分看看效果
            sentence=pair[0]
            tag=pair[1]

        #         precheck_sent = prepare_sequence(sentence, word_to_ix)
            input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
            outputs = model(input_ids)
            sequence_in = outputs[0][0][1:len(sentence)+1]
            sequence_in = sequence_in.clone().detach().requires_grad_(True)


            precheck_tags = torch.tensor([tag_to_ix[t] for t in tag], dtype=torch.long) 
            score,pred=MODEl(sequence_in)

            sentence_ = "".join(sentence)

            METHOD = get_method(pred,sentence_)
            y_pred.append(METHOD)

            method_ = get_method(precheck_tags,sentence_)
            y_true.append(method_)

            texts.append(sentence_)
    #         print(y_pred)
    #         print(y_true)
    #         print(texts)

    return muc.evaluate_all(y_pred, y_true, texts)['exact']['f1_score']

In [28]:
 def bert_cal_P(MODEl,test_data):
    with torch.no_grad():  
        y_pred,y_true,texts=[],[],[]
        for pair in test_data: #抽取部分看看效果
            sentence=pair[0]
            tag=pair[1]

        #         precheck_sent = prepare_sequence(sentence, word_to_ix)
            input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
            outputs = model(input_ids)
            sequence_in = outputs[0][0][1:len(sentence)+1]
            sequence_in = sequence_in.clone().detach().requires_grad_(True)


            precheck_tags = torch.tensor([tag_to_ix[t] for t in tag], dtype=torch.long) 
            score,pred=MODEl(sequence_in)

            sentence_ = "".join(sentence)
            
            PROBLEM = get_problem(pred,sentence_)
            y_pred.append(PROBLEM)

            problem_ = get_problem(precheck_tags,sentence_)
            y_true.append(problem_)

            texts.append(sentence_)
    #         print(y_pred)
    #         print(y_true)
    #         print(texts)

    return muc.evaluate_all(y_pred, y_true, texts)['exact']['f1_score']

In [29]:
def bert_cal_F1(MODEl,test_data):
    with torch.no_grad():  
        y_pred,y_true,texts=[],[],[]
        for pair in test_data: #抽取部分看看效果
            sentence=pair[0]
            tag=pair[1]

        #         precheck_sent = prepare_sequence(sentence, word_to_ix)
            input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
            outputs = model(input_ids)
            sequence_in = outputs[0][0][1:len(sentence)+1]
            sequence_in = sequence_in.clone().detach().requires_grad_(True)


            precheck_tags = torch.tensor([tag_to_ix[t] for t in tag], dtype=torch.long) 
            score,pred=MODEl(sequence_in)

            sentence_ = "".join(sentence)

            METHOD = get_method(pred,sentence_)
            PROBLEM = get_problem(pred,sentence_)
            METHOD.extend(PROBLEM)
            y_pred.append(METHOD)

            method_ = get_method(precheck_tags,sentence_)
            problem_ = get_problem(precheck_tags,sentence_)
            method_.extend(problem_)
            y_true.append(method_)

            texts.append(sentence_)
    #         print(y_pred)
    #         print(y_true)
    #         print(texts)

    return muc.evaluate_all(y_pred, y_true, texts)['exact']['f1_score']

In [30]:
def compare(y, y_pred):
    error_index = []
    if len(y) == len(y_pred):
        for i in range(0, len(y)):
            if y[i] != y_pred[i]:
                error_index.append(i)

    print("error_index:",error_index)

In [31]:
class BiLSTM(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # 将LSTM的输出映射到标签空间
        # 相当于公式中的发射矩阵U
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """初始化LSTM"""
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))
    
    def forward(self, sentence):
        """预测函数，注意这个函数和_forward_alg不一样
        这里给定一个句子，预测最有可能的标签序列
        """
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        
        return _,lstm_feats

In [32]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # 将LSTM的输出映射到标签空间
        # 相当于公式中的发射矩阵U
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # 转移矩阵，从标签i转移到标签j的分数
        # tagset_size包含了人为加入的START_TAG和STOP_TAG
        # transitions表示前一个字是j后一个是i的概率
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # 下面这两个约束不能转移到START_TAG，也不能从STOP_TAG开始转移
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """初始化LSTM"""
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """计算配分函数Z(x)"""

        # 对应于伪码第一步
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # 对应于伪码第二步的循环，迭代整个句子
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                # 这里对应了伪码第二步中三者求和
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # 对应于伪码第三步，注意损失函数最终是要logZ(x)，所以又是一个logsumexp
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """调用LSTM获得每个token的隐状态，这里可以替换为任意的特征函数，
        LSTM返回的特征就是公式中的x
        """
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """计算给定输入序列和标签序列的匹配函数，即公式中的s函数"""
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """维特比解码，给定输入x和相关参数(发射矩阵和转移矩阵)，或者概率最大的标签序列
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """损失函数 = Z(x) - s(x,y)
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """预测函数，注意这个函数和_forward_alg不一样
        这里给定一个句子，预测最有可能的标签序列
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [33]:
class BiLSTM_CRF_ATT(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF_ATT, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.Q = nn.Linear(hidden_dim, hidden_dim)
        self.K = nn.Linear(hidden_dim, hidden_dim)
        self.V = nn.Linear(hidden_dim, hidden_dim)


        # 将LSTM的输出映射到标签空间
        # 相当于公式中的发射矩阵U
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # 转移矩阵，从标签i转移到标签j的分数
        # tagset_size包含了人为加入的START_TAG和STOP_TAG
        # transitions表示前一个字是j后一个是i的概率
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # 下面这两个约束不能转移到START_TAG，也不能从STOP_TAG开始转移
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """初始化LSTM"""
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """计算配分函数Z(x)"""

        # 对应于伪码第一步
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # 对应于伪码第二步的循环，迭代整个句子
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                # 这里对应了伪码第二步中三者求和
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # 对应于伪码第三步，注意损失函数最终是要logZ(x)，所以又是一个logsumexp
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """调用LSTM获得每个token的隐状态，这里可以替换为任意的特征函数，
        LSTM返回的特征就是公式中的x
        """
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        Q,K,V = lstm_out,lstm_out,lstm_out
#         Q = self.Q(lstm_out)
#         K = self.K(lstm_out)
#         V = self.V(lstm_out)

        scores = torch.matmul(Q,K.transpose(0,1))
        alpha_n = F.softmax(scores,dim=-1)
        context = torch.matmul(alpha_n,V)
        
        lstm_feats = self.hidden2tag(context)
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """计算给定输入序列和标签序列的匹配函数，即公式中的s函数"""
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """维特比解码，给定输入x和相关参数(发射矩阵和转移矩阵)，或者概率最大的标签序列
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, sentence, tags):
        """损失函数 = Z(x) - s(x,y)
        """
        feats = self._get_lstm_features(sentence)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """预测函数，注意这个函数和_forward_alg不一样
        这里给定一个句子，预测最有可能的标签序列
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)
        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [34]:
class Bert_CRF(nn.Module):
    def __init__(self,tag_to_ix, embedding_dim):
        super(Bert_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        #定义线性函数      
        self.word_embeds = nn.Linear(768, self.embedding_dim)  #bert默认的隐藏单元数是768， 输出单元是2，表示二分类

        # 将LSTM的输出映射到标签空间
        # 相当于公式中的发射矩阵U
        self.hidden2tag = nn.Linear(self.embedding_dim, self.tagset_size)

        # 转移矩阵，从标签i转移到标签j的分数
        # tagset_size包含了人为加入的START_TAG和STOP_TAG
        # transitions表示前一个字是j后一个是i的概率
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # 下面这两个约束不能转移到START_TAG，也不能从STOP_TAG开始转移
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

    def _forward_alg(self, feats):
        """计算配分函数Z(x)"""

        # 对应于伪码第一步
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # 对应于伪码第二步的循环，迭代整个句子
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                # 这里对应了伪码第二步中三者求和
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # 对应于伪码第三步，注意损失函数最终是要logZ(x)，所以又是一个logsumexp
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """调用LSTM获得每个token的隐状态，这里可以替换为任意的特征函数，
        LSTM返回的特征就是公式中的x
        """

        embeds = self.word_embeds(sentence).view(sentence.shape[0],1,-1).squeeze(1)
        
        lstm_feats = self.hidden2tag(embeds)
        
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """计算给定输入序列和标签序列的匹配函数，即公式中的s函数"""
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """维特比解码，给定输入x和相关参数(发射矩阵和转移矩阵)，或者概率最大的标签序列
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, w2v, tags):
        """损失函数 = Z(x) - s(x,y)
        """
        feats = self._get_lstm_features(w2v)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """预测函数，注意这个函数和_forward_alg不一样
        这里给定一个句子，预测最有可能的标签序列
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

In [35]:
class Bert_ATT_CRF(nn.Module):
    def __init__(self,tag_to_ix, embedding_dim):
        super(Bert_ATT_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        #定义线性函数      
        self.word_embeds = nn.Linear(768, self.embedding_dim)  #bert默认的隐藏单元数是768， 输出单元是2，表示二分类

        # 将LSTM的输出映射到标签空间
        # 相当于公式中的发射矩阵U
        self.hidden2tag = nn.Linear(self.embedding_dim, self.tagset_size)

        # 转移矩阵，从标签i转移到标签j的分数
        # tagset_size包含了人为加入的START_TAG和STOP_TAG
        # transitions表示前一个字是j后一个是i的概率
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # 下面这两个约束不能转移到START_TAG，也不能从STOP_TAG开始转移
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000


    def _forward_alg(self, feats):
        """计算配分函数Z(x)"""

        # 对应于伪码第一步
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # 对应于伪码第二步的循环，迭代整个句子
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                # 这里对应了伪码第二步中三者求和
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # 对应于伪码第三步，注意损失函数最终是要logZ(x)，所以又是一个logsumexp
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """调用LSTM获得每个token的隐状态，这里可以替换为任意的特征函数，
        LSTM返回的特征就是公式中的x
        """
        embeds = self.word_embeds(sentence).view(sentence.shape[0],1,-1).squeeze(1)
        
        Q,K,V = embeds,embeds,embeds
#         Q = self.Q(embeds)
#         K = self.K(embeds)
#         V = self.V(embeds)
        scores = torch.matmul(Q,K.transpose(0,1))
        alpha_n = F.softmax(scores,dim=-1)
        context = torch.matmul(alpha_n,V)
        
        lstm_feats = self.hidden2tag(context)
        
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """计算给定输入序列和标签序列的匹配函数，即公式中的s函数"""
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """维特比解码，给定输入x和相关参数(发射矩阵和转移矩阵)，或者概率最大的标签序列
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, w2v, tags):
        """损失函数 = Z(x) - s(x,y)
        """
        feats = self._get_lstm_features(w2v)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """预测函数，注意这个函数和_forward_alg不一样
        这里给定一个句子，预测最有可能的标签序列
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        
        return score, tag_seq

In [36]:
class Bert_BiLSTM_CRF(nn.Module):
    def __init__(self,tag_to_ix, embedding_dim, hidden_dim):
        super(Bert_BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
#         self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)
        #定义线性函数      
        self.word_embeds = nn.Linear(768, self.embedding_dim)  #bert默认的隐藏单元数是768， 输出单元是2，表示二分类
        
#         self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # 将LSTM的输出映射到标签空间
        # 相当于公式中的发射矩阵U
        self.hidden2tag = nn.Linear(self.hidden_dim, self.tagset_size)

        # 转移矩阵，从标签i转移到标签j的分数
        # tagset_size包含了人为加入的START_TAG和STOP_TAG
        # transitions表示前一个字是j后一个是i的概率
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # 下面这两个约束不能转移到START_TAG，也不能从STOP_TAG开始转移
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()

    def init_hidden(self):
        """初始化LSTM"""
        return (torch.randn(2, 1, self.hidden_dim // 2),
                torch.randn(2, 1, self.hidden_dim // 2))

    def _forward_alg(self, feats):
        """计算配分函数Z(x)"""

        # 对应于伪码第一步
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # 对应于伪码第二步的循环，迭代整个句子
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                # broadcast the emission score: it is the same regardless of
                # the previous tag
                emit_score = feat[next_tag].view(
                    1, -1).expand(1, self.tagset_size)
                # the ith entry of trans_score is the score of transitioning to
                # next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                # 这里对应了伪码第二步中三者求和
                next_tag_var = forward_var + trans_score + emit_score
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
            forward_var = torch.cat(alphas_t).view(1, -1)
        # 对应于伪码第三步，注意损失函数最终是要logZ(x)，所以又是一个logsumexp
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        return alpha

    def _get_lstm_features(self, sentence):
        """调用LSTM获得每个token的隐状态，这里可以替换为任意的特征函数，
        LSTM返回的特征就是公式中的x
        """
        
#         self.hidden = self.init_hidden()
#         embeds = self.word_embeds(sentence).view(sentence.shape[0],1,-1).squeeze(1)
#         Q,K,V = embeds,embeds,embeds
#         Q = self.Q(lstm_out)
#         K = self.K(lstm_out)
#         V = self.V(lstm_out)
#         scores = torch.matmul(Q,K.transpose(0,1))
#         alpha_n = F.softmax(scores,dim=-1)
#         context = torch.matmul(alpha_n,V)
#         lstm_feats = self.hidden2tag(w2v)
        
        self.hidden = self.init_hidden()
        embeds = self.word_embeds(sentence).view(len(sentence), 1, -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        
        return lstm_feats

    def _score_sentence(self, feats, tags):
        """计算给定输入序列和标签序列的匹配函数，即公式中的s函数"""
        score = torch.zeros(1)
        tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags])
        for i, feat in enumerate(feats):
            score = score + \
                self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        return score

    def _viterbi_decode(self, feats):
        """维特比解码，给定输入x和相关参数(发射矩阵和转移矩阵)，或者概率最大的标签序列
        """
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    def neg_log_likelihood(self, w2v, tags):
        """损失函数 = Z(x) - s(x,y)
        """
        feats = self._get_lstm_features(w2v)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score

    def forward(self, sentence):
        """预测函数，注意这个函数和_forward_alg不一样
        这里给定一个句子，预测最有可能的标签序列
        """
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

## 训练

In [37]:
def train_lstm(model,optimizer,EPOCHS):
    criterion = nn.CrossEntropyLoss()
    F1=[]
    for epoch in range(EPOCHS):
        print("epoch %d =============" % epoch)
        time_start = time.time()

        count=1
        for sentence, tags in training_data:
            model.zero_grad()
            sentence_in = prepare_sequence(sentence, word_to_ix) 
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

            # Step 3. Run our forward pass.
            _,pred = model(sentence_in)
            loss = criterion(pred,targets)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()
            
            if count%500==0:
                print("iter %d: loss %f" %(count,loss))
            count+=1

        time_end=time.time()
        F1.append(cal_f1(model,test_data))
        print("time used: %d s" % (time_end-time_start)  )
        
    return F1,model  

In [38]:
def train(model,optimizer,EPOCHS):

    F1=[]
    for epoch in range(EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data
        print("epoch %d =============" % epoch)
        time_start = time.time()

        count=1
        for sentence, tags in training_data:
            # Step 1. Remember that Pytorch accumulates gradients.
            # We need to clear them out before each instance
            model.zero_grad()

            # Step 2. Get our inputs ready for the network, that is,
            # turn them into Tensors of word indices.
            sentence_in = prepare_sequence(sentence, word_to_ix) 
            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

            # Step 3. Run our forward pass.
            loss = model.neg_log_likelihood(sentence_in, targets)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()

            if count==500 or count==1000:
                print("iter %d: loss %f" %(count,loss))
            count+=1

        time_end=time.time()
        F1.append(cal_f1(model,test_data))
        print("time used: %d s" % (time_end-time_start)  )
        
    return F1,model

In [39]:
def train_bert(bert_model,optimizer,EPOCHS):
    F1=[]

    for epoch in range(EPOCHS):  # again, normally you would NOT do 300 epochs, it is toy data
        print("epoch %d =============" % epoch)
        time_start = time.time()

        count=1
        for sentence, tags in training_data:
            
            bert_model.zero_grad()
            
          # 使用这种方法对句子编码会自动添加[CLS] 和[SEP]
            input_ids = torch.tensor(tokenizer.encode(sentence)).unsqueeze(0)
            outputs = model(input_ids)
            sequence_in = outputs[0][0][1:len(sentence)+1]
            sequence_in = sequence_in.clone().detach().requires_grad_(True)

            targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

            # Step 3. Run our forward pass.
            loss = bert_model.neg_log_likelihood(sequence_in, targets)

            # Step 4. Compute the loss, gradients, and update the parameters by
            # calling optimizer.step()
            loss.backward()
            optimizer.step()

            if count==500 or count==1000:
#                 score,y_pred=bert_model(sequence_in)
    #             print(sentence)
    #             print(targets)
    #             print(y_pred)
                print("iter %d: loss %f" %(count,loss))
            count+=1

        time_end=time.time()
        F1.append(bert_cal_F1(bert_model,test_data))
        print("time used: %d s" % (time_end-time_start))
        
    return F1,bert_model

In [40]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 64*2
EPOCHS=15

In [68]:
bilstm = BiLSTM(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer0 = optim.Adagrad(bilstm.parameters(), lr=0.01, weight_decay=1e-5)

F1_bilstm,bilstm = train_lstm(bilstm,optimizer0,EPOCHS)
torch.save(bilstm.state_dict(),'./Model/bilstm.pkl')

iter 500: loss 0.757516
iter 1000: loss 0.278210

 NER evaluation scores:
  strict mode, Precision=0.6262, Recall=0.6530, F1:0.6356
   exact mode, Precision=0.6279, Recall=0.6547, F1:0.6373
 partial mode, Precision=0.7794, Recall=0.8221, F1:0.7945
    type mode, Precision=0.9206, Recall=0.9745, F1:0.9400
time used: 10 s
iter 500: loss 0.428748
iter 1000: loss 0.170184

 NER evaluation scores:
  strict mode, Precision=0.6529, Recall=0.6823, F1:0.6629
   exact mode, Precision=0.6538, Recall=0.6834, F1:0.6638
 partial mode, Precision=0.7958, Recall=0.8365, F1:0.8097
    type mode, Precision=0.9260, Recall=0.9729, F1:0.9419
time used: 10 s
iter 500: loss 0.322457
iter 1000: loss 0.136235

 NER evaluation scores:
  strict mode, Precision=0.6719, Recall=0.6987, F1:0.6811
   exact mode, Precision=0.6727, Recall=0.6998, F1:0.6820
 partial mode, Precision=0.8081, Recall=0.8455, F1:0.8211
    type mode, Precision=0.9337, Recall=0.9773, F1:0.9490
time used: 10 s
iter 500: loss 0.260165
iter 1000:

In [57]:
bilstm_crf = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer1 = optim.Adagrad(bilstm_crf.parameters(), lr=0.01, weight_decay=1e-5)

F1_bilstm_crf,bilstm_crf = train(bilstm_crf,optimizer1,EPOCHS)
torch.save(bilstm_crf.state_dict(),'./Model/bilstm_crf.pkl')

iter 500: loss 17.308954
iter 1000: loss 6.905617

 NER evaluation scores:
  strict mode, Precision=0.6130, Recall=0.6496, F1:0.6263
   exact mode, Precision=0.6130, Recall=0.6496, F1:0.6263
 partial mode, Precision=0.7614, Recall=0.8198, F1:0.7825
    type mode, Precision=0.9021, Recall=0.9784, F1:0.9294
time used: 39 s
iter 500: loss 9.706703
iter 1000: loss 4.953720

 NER evaluation scores:
  strict mode, Precision=0.6709, Recall=0.7009, F1:0.6818
   exact mode, Precision=0.6709, Recall=0.7009, F1:0.6818
 partial mode, Precision=0.8022, Recall=0.8455, F1:0.8177
    type mode, Precision=0.9289, Recall=0.9823, F1:0.9478
time used: 39 s
iter 500: loss 6.836761
iter 1000: loss 3.642677

 NER evaluation scores:
  strict mode, Precision=0.7019, Recall=0.7302, F1:0.7124
   exact mode, Precision=0.7036, Recall=0.7319, F1:0.7140
 partial mode, Precision=0.8221, Recall=0.8615, F1:0.8364
    type mode, Precision=0.9349, Recall=0.9834, F1:0.9523
time used: 38 s
iter 500: loss 7.476097
iter 1000

In [58]:
bilstm_att_crf = BiLSTM_CRF_ATT(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer2 = optim.Adagrad(bilstm_att_crf.parameters(), lr=0.01, weight_decay=1e-5)

F1_bilstm_att_crf,bilstm_att_crf = train(bilstm_att_crf,optimizer2,EPOCHS)
torch.save(bilstm_att_crf.state_dict(),'./Model/bilstm_att_crf.pkl')

iter 500: loss 10.339867
iter 1000: loss 8.545563

 NER evaluation scores:
  strict mode, Precision=0.6342, Recall=0.6519, F1:0.6400
   exact mode, Precision=0.6386, Recall=0.6564, F1:0.6444
 partial mode, Precision=0.7930, Recall=0.8145, F1:0.7994
    type mode, Precision=0.9346, Recall=0.9576, F1:0.9407
time used: 37 s
iter 500: loss 4.933735
iter 1000: loss 3.919632

 NER evaluation scores:
  strict mode, Precision=0.6728, Recall=0.6886, F1:0.6775
   exact mode, Precision=0.6772, Recall=0.6930, F1:0.6819
 partial mode, Precision=0.8140, Recall=0.8323, F1:0.8191
    type mode, Precision=0.9403, Recall=0.9599, F1:0.9453
time used: 37 s
iter 500: loss 2.256699
iter 1000: loss 2.300110

 NER evaluation scores:
  strict mode, Precision=0.6992, Recall=0.7109, F1:0.7019
   exact mode, Precision=0.7053, Recall=0.7169, F1:0.7079
 partial mode, Precision=0.8309, Recall=0.8446, F1:0.8340
    type mode, Precision=0.9488, Recall=0.9640, F1:0.9521
time used: 37 s
iter 500: loss 1.905033
iter 1000

In [41]:
pretrained_weights="../bert-base/bert-base-chinese/"
model = BertModel.from_pretrained(pretrained_weights)
tokenizer = BertTokenizer.from_pretrained(pretrained_weights)

Some weights of the model checkpoint at ../bert-base/bert-base-chinese/ were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [42]:
bert_crf = Bert_CRF(tag_to_ix, EMBEDDING_DIM)
optimizer3 = optim.SGD(bert_crf.parameters(), lr=0.001, weight_decay=1e-5)

F1_bert_crf,bert_crf = train_bert(bert_crf,optimizer3,EPOCHS)
torch.save(bert_crf.state_dict(),'./Model/bert_crf.pkl')

iter 500: loss 12.053375
iter 1000: loss 2.533348

 NER evaluation scores:
  strict mode, Precision=0.6504, Recall=0.6924, F1:0.6649
   exact mode, Precision=0.6581, Recall=0.7002, F1:0.6727
 partial mode, Precision=0.7935, Recall=0.8398, F1:0.8087
    type mode, Precision=0.9045, Recall=0.9529, F1:0.9199
time used: 116 s
iter 500: loss 5.707264
iter 1000: loss 1.945801

 NER evaluation scores:
  strict mode, Precision=0.7317, Recall=0.7539, F1:0.7384
   exact mode, Precision=0.7405, Recall=0.7616, F1:0.7463
 partial mode, Precision=0.8492, Recall=0.8714, F1:0.8547
    type mode, Precision=0.9401, Recall=0.9646, F1:0.9466
time used: 120 s
iter 500: loss 3.551781
iter 1000: loss 1.687943

 NER evaluation scores:
  strict mode, Precision=0.7694, Recall=0.7832, F1:0.7722
   exact mode, Precision=0.7788, Recall=0.7910, F1:0.7805
 partial mode, Precision=0.8713, Recall=0.8833, F1:0.8721
    type mode, Precision=0.9521, Recall=0.9651, F1:0.9530
time used: 118 s
iter 500: loss 2.648972
iter 1

In [44]:
bert_att_crf = Bert_ATT_CRF(tag_to_ix, EMBEDDING_DIM)
optimizer4 = optim.SGD(bert_att_crf.parameters(), lr=0.001, weight_decay=1e-5)

F1_bilstm_att_crf,bert_att_crf = train_bert(bert_att_crf,optimizer4,EPOCHS)
torch.save(bert_att_crf.state_dict(),'./Model/bert_att_crf.pkl')

iter 500: loss 9.204525
iter 1000: loss 1.952400

 NER evaluation scores:
  strict mode, Precision=0.6440, Recall=0.6730, F1:0.6526
   exact mode, Precision=0.6490, Recall=0.6791, F1:0.6581
 partial mode, Precision=0.7888, Recall=0.8220, F1:0.7972
    type mode, Precision=0.9096, Recall=0.9427, F1:0.9165
time used: 124 s
iter 500: loss 4.199558
iter 1000: loss 1.651016

 NER evaluation scores:
  strict mode, Precision=0.7325, Recall=0.7384, F1:0.7304
   exact mode, Precision=0.7403, Recall=0.7445, F1:0.7371
 partial mode, Precision=0.8498, Recall=0.8527, F1:0.8448
    type mode, Precision=0.9405, Recall=0.9444, F1:0.9354
time used: 124 s
iter 500: loss 3.457542
iter 1000: loss 1.357452

 NER evaluation scores:
  strict mode, Precision=0.7544, Recall=0.7605, F1:0.7529
   exact mode, Precision=0.7622, Recall=0.7666, F1:0.7595
 partial mode, Precision=0.8605, Recall=0.8648, F1:0.8567
    type mode, Precision=0.9471, Recall=0.9513, F1:0.9429
time used: 124 s
iter 500: loss 2.388721
iter 10

In [42]:
bert_bilstm_crf = Bert_BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer5 = optim.SGD(bert_bilstm_crf.parameters(), lr=0.001, weight_decay=1e-5)

F1_bert_bilstm_crf,bert_bilstm_crf = train_bert(bert_bilstm_crf,optimizer5,EPOCHS)
torch.save(bert_bilstm_crf.state_dict(),'./Model/bert_bilstm_crf_2.pkl')

iter 500: loss 13.297150
iter 1000: loss 3.740913

 NER evaluation scores:
  strict mode, Precision=0.6903, Recall=0.7283, F1:0.7043
   exact mode, Precision=0.6997, Recall=0.7377, F1:0.7137
 partial mode, Precision=0.8175, Recall=0.8630, F1:0.8339
    type mode, Precision=0.9132, Recall=0.9612, F1:0.9302
time used: 131 s
iter 500: loss 4.492828
iter 1000: loss 2.787628

 NER evaluation scores:
  strict mode, Precision=0.7791, Recall=0.7937, F1:0.7818
   exact mode, Precision=0.7879, Recall=0.8026, F1:0.7906
 partial mode, Precision=0.8739, Recall=0.8905, F1:0.8772
    type mode, Precision=0.9455, Recall=0.9640, F1:0.9495
time used: 124 s
iter 500: loss 1.515541
iter 1000: loss 1.545715

 NER evaluation scores:
  strict mode, Precision=0.7943, Recall=0.8070, F1:0.7969
   exact mode, Precision=0.8020, Recall=0.8148, F1:0.8047
 partial mode, Precision=0.8826, Recall=0.8963, F1:0.8853
    type mode, Precision=0.9535, Recall=0.9679, F1:0.9561
time used: 126 s
iter 500: loss 0.822693
iter 1

In [44]:
{'O': 0,
 'B-MED': 1,
 'I-MED': 2,
 'B-rp': 3,
 'I-rp': 4,
 '<START>': 5,
 '<STOP>': 6}
bert_bilstm_crf.state_dict()

OrderedDict([('transitions',
              tensor([[ 3.5112e+00, -8.1621e-01,  1.5422e-02, -8.9862e-01,  1.2429e+00,
                       -1.4511e+00, -1.0000e+04],
                      [ 1.3920e+00, -7.5281e-01,  5.5016e-02, -8.8791e-02, -6.7963e-01,
                        7.7156e-01, -1.0000e+04],
                      [-2.8223e+00,  1.2693e+00,  1.5500e+00, -3.2786e-02,  9.1519e-01,
                        1.2727e-01, -1.0000e+04],
                      [ 1.6845e+00,  1.3555e+00, -3.9388e-01, -1.7229e+00, -2.3801e+00,
                        5.4012e-01, -1.0000e+04],
                      [-1.3457e+00,  3.5078e-01, -1.4227e+00,  6.2656e-01,  2.4739e+00,
                       -4.4986e-01, -1.0000e+04],
                      [-1.0000e+04, -1.0000e+04, -1.0000e+04, -1.0000e+04, -1.0000e+04,
                       -1.0000e+04, -1.0000e+04],
                      [ 2.5859e+00, -1.0751e+00,  1.3245e+00, -7.9399e-01,  1.0516e+00,
                        5.6089e-01, -1.0000e+04]])),
  

## 评价

In [45]:
EMBEDDING_DIM = 128
HIDDEN_DIM = 64*2
EPOCHS=15

In [45]:
# 模型加载
bilstm = BiLSTM(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
bilstm.load_state_dict(torch.load('./Model/bilstm.pkl'))

bilstm_crf = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
bilstm_crf.load_state_dict(torch.load('./Model/bilstm_crf.pkl'))

bert_crf = Bert_CRF(tag_to_ix, EMBEDDING_DIM)
bert_crf.load_state_dict(torch.load('./Model/bert_crf.pkl'))

bert_att_crf = Bert_ATT_CRF(tag_to_ix, EMBEDDING_DIM)
bert_att_crf.load_state_dict(torch.load('./Model/bert_att_crf.pkl'))

bert_bilstm_crf = Bert_BiLSTM_CRF(tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
bert_bilstm_crf.load_state_dict(torch.load('./Model/bert_bilstm_crf_2.pkl'))

<All keys matched successfully>

In [53]:
print('bilstm')
print(cal_f1(bilstm,test_data))
print('Merhod')
print(cal_M(bilstm,test_data))
print('Problem')
print(cal_P(bilstm,test_data))

bilstm

 NER evaluation scores:
  strict mode, Precision=0.6970, Recall=0.7208, F1:0.7047
   exact mode, Precision=0.6987, Recall=0.7225, F1:0.7063
 partial mode, Precision=0.8213, Recall=0.8546, F1:0.8325
    type mode, Precision=0.9338, Recall=0.9740, F1:0.9477
0.7063439329220063
Merhod

 NER evaluation scores:
  strict mode, Precision=0.6980, Recall=0.7115, F1:0.6988
   exact mode, Precision=0.6980, Recall=0.7115, F1:0.6988
 partial mode, Precision=0.8127, Recall=0.8433, F1:0.8191
    type mode, Precision=0.9275, Recall=0.9751, F1:0.9394
0.698781838316722
Problem

 NER evaluation scores:
  strict mode, Precision=0.7193, Recall=0.7292, F1:0.7226
   exact mode, Precision=0.7193, Recall=0.7292, F1:0.7226
 partial mode, Precision=0.8391, Recall=0.8563, F1:0.8447
    type mode, Precision=0.9590, Recall=0.9834, F1:0.9668
0.7225913621262459


In [48]:
print('bilstm_crf')
print(cal_f1(bilstm_crf))
print('Merhod')
print(cal_M(bilstm_crf))
print('Problem')
print(cal_P(bilstm_crf))

bilstm_crf

 NER evaluation scores:
  strict mode, Precision=0.7491, Recall=0.7529, F1:0.7474
   exact mode, Precision=0.7507, Recall=0.7546, F1:0.7491
 partial mode, Precision=0.8590, Recall=0.8657, F1:0.8579
    type mode, Precision=0.9615, Recall=0.9707, F1:0.9609
0.7490903338079419
Merhod

 NER evaluation scores:
  strict mode, Precision=0.7519, Recall=0.7558, F1:0.7495
   exact mode, Precision=0.7519, Recall=0.7558, F1:0.7495
 partial mode, Precision=0.8499, Recall=0.8571, F1:0.8477
    type mode, Precision=0.9480, Recall=0.9585, F1:0.9460
0.749501661129568
Problem

 NER evaluation scores:
  strict mode, Precision=0.7276, Recall=0.7359, F1:0.7303
   exact mode, Precision=0.7276, Recall=0.7359, F1:0.7303
 partial mode, Precision=0.8475, Recall=0.8596, F1:0.8513
    type mode, Precision=0.9673, Recall=0.9834, F1:0.9723
0.730343300110742


In [50]:
print('bert_crf')
print(bert_cal_F1(bert_crf))
print('Merhod')
print(bert_cal_M(bert_crf))
print('Problem')
print(bert_cal_P(bert_crf))

bert_crf

 NER evaluation scores:
  strict mode, Precision=0.7865, Recall=0.7816, F1:0.7780
   exact mode, Precision=0.7949, Recall=0.7877, F1:0.7848
 partial mode, Precision=0.8805, Recall=0.8717, F1:0.8685
    type mode, Precision=0.9557, Recall=0.9468, F1:0.9430
0.7848204398038288
Merhod

 NER evaluation scores:
  strict mode, Precision=0.7503, Recall=0.7614, F1:0.7484
   exact mode, Precision=0.7503, Recall=0.7614, F1:0.7484
 partial mode, Precision=0.8391, Recall=0.8533, F1:0.8372
    type mode, Precision=0.9280, Recall=0.9452, F1:0.9260
0.7483942414174969
Problem

 NER evaluation scores:
  strict mode, Precision=0.7924, Recall=0.7973, F1:0.7940
   exact mode, Precision=0.7924, Recall=0.7973, F1:0.7940
 partial mode, Precision=0.8679, Recall=0.8754, F1:0.8704
    type mode, Precision=0.9435, Recall=0.9535, F1:0.9468
0.7940199335548173


In [51]:
print('bert_att_crf')
print(bert_cal_F1(bert_att_crf))
print('Merhod')
print(bert_cal_M(bert_att_crf))
print('Problem')
print(bert_cal_P(bert_att_crf))

bert_att_crf

 NER evaluation scores:
  strict mode, Precision=0.7688, Recall=0.7763, F1:0.7672
   exact mode, Precision=0.7782, Recall=0.7841, F1:0.7755
 partial mode, Precision=0.8671, Recall=0.8753, F1:0.8646
    type mode, Precision=0.9421, Recall=0.9527, F1:0.9403
0.7754627432368298
Merhod

 NER evaluation scores:
  strict mode, Precision=0.7381, Recall=0.7553, F1:0.7398
   exact mode, Precision=0.7381, Recall=0.7553, F1:0.7398
 partial mode, Precision=0.8372, Recall=0.8596, F1:0.8398
    type mode, Precision=0.9363, Recall=0.9640, F1:0.9398
0.7397563676633441
Problem

 NER evaluation scores:
  strict mode, Precision=0.7791, Recall=0.7940, F1:0.7841
   exact mode, Precision=0.7791, Recall=0.7940, F1:0.7841
 partial mode, Precision=0.8530, Recall=0.8704, F1:0.8588
    type mode, Precision=0.9269, Recall=0.9468, F1:0.9336
0.7840531561461793


In [52]:
print('bert_bilstm_crf')
print(bert_cal_F1(bert_bilstm_crf))
print('Merhod')
print(bert_cal_M(bert_bilstm_crf))
print('Problem')
print(bert_cal_P(bert_bilstm_crf))

bert_bilstm_crf

 NER evaluation scores:
  strict mode, Precision=0.8209, Recall=0.8148, F1:0.8144
   exact mode, Precision=0.8319, Recall=0.8225, F1:0.8232
 partial mode, Precision=0.9055, Recall=0.8958, F1:0.8962
    type mode, Precision=0.9679, Recall=0.9612, F1:0.9604
0.8232320835310871
Merhod

 NER evaluation scores:
  strict mode, Precision=0.8023, Recall=0.7996, F1:0.7953
   exact mode, Precision=0.8023, Recall=0.7996, F1:0.7953
 partial mode, Precision=0.8832, Recall=0.8832, F1:0.8771
    type mode, Precision=0.9640, Recall=0.9668, F1:0.9589
0.795348837209302
Problem

 NER evaluation scores:
  strict mode, Precision=0.8272, Recall=0.8306, F1:0.8283
   exact mode, Precision=0.8272, Recall=0.8306, F1:0.8283
 partial mode, Precision=0.8962, Recall=0.8995, F1:0.8970
    type mode, Precision=0.9651, Recall=0.9684, F1:0.9657
0.8283499446290145


In [None]:
plt.plot(np.arange(15),F)
plt.legend()
plt.grid()
# plt.title('Loss')
plt.show()

# plt.plot(np.arange(15),train_acc_lst3,label="train_acc")
# plt.legend()
# plt.grid()
# plt.title('Accuracy')
# plt.show()