In [8]:
from collections import Counter
import torch
import argparse
from gensim.models import Word2Vec
import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', type=int, default=1024)
parser.add_argument('--num_epoch', type=int, default=500)
args = parser.parse_args(args=[])

In [2]:
#读入训练集
with open('./CoNLL2003_NER/train/seq.in', encoding='utf-8') as ftrain_feature:
    train_feature_line = [line.strip() for line in ftrain_feature.readlines()][:10]
with open('./CoNLL2003_NER/train/seq.out', encoding='utf-8') as ftrain_label:
    train_label_line = [line.strip() for line in ftrain_label.readlines()][:10]
#读入验证集
with open('./CoNLL2003_NER/test/seq.in', encoding='utf-8') as ftest_feature:
    test_feature_line = [line.strip() for line in ftest_feature.readlines()][:10]
with open('./CoNLL2003_NER/test/seq.out', encoding='utf-8') as ftest_label:
    test_label_line = [line.strip() for line in ftest_label.readlines()][:10]

#转换大小写并用split分隔开存入列表
train_feature_line = [line.lower().split(" ") for line in train_feature_line]
train_label_line = [line.split(" ") for line in train_label_line]
test_feature_line = [line.lower().split(" ") for line in test_feature_line]
test_label_line = [line.split(" ") for line in test_label_line]

#获得单词字典
word_counter = []
for line in train_feature_line:
    word_counter.extend(line)
word_counter = Counter(word_counter).most_common()                                          #len(counter):21009
vocab = ['[UNK]','[PAD]'] + [word[0] for word in word_counter[:int(len(word_counter)*0.8)]] #UNK:低频词；PAD:填充词
word2id = dict(zip(vocab,range(len(vocab))))                                                # word -> id
id2word = {idx:word for idx,word in enumerate(vocab)}                                       # id -> word
#获得标签字典
label2id = {'O':0, 'B-LOC':1, 'B-PER':2, 'B-ORG':3, 'I-PER':4, 'I-ORG':5, 'B-MISC':6, 'I-LOC':7, 'I-MISC':8, 'START':9, 'STOP':10}

#获得数据和标签序列
train_feature = [[word2id[word] if word in word2id else 0 for word in line] for line in train_feature_line]
train_label = [[label2id[word] for word in line] for line in train_label_line]
test_feature = [[word2id[word] if word in word2id else 0 for word in line] for line in test_feature_line]
test_label = [[label2id[word] for word in line] for line in test_label_line]

#转成Tensor的形式
train_feature = [torch.Tensor(line).long() for line in train_feature]
train_label = [torch.Tensor(line).long() for line in train_label]
test_feature = [torch.Tensor(line).long() for line in test_feature]
test_label = [torch.Tensor(line).long() for line in test_label]

def get_data(sample_features, sample_labels):
    sample_data = []                                                    #为了能够将data放到DataLoader中
    for i in range(len(sample_features)):
        temp = []
        temp.append(sample_features[i])
        temp.append(sample_labels[i])
        sample_data.append(temp)
    return sample_data

def collate_fn(sample_data):
    sample_data.sort(key=lambda data: len(data[0]), reverse=True)                          #倒序排序
    sample_features, sample_labels = [], []
    for data in sample_data:
        sample_features.append(data[0])
        sample_labels.append(data[1])
    data_length = [len(data[0]) for data in sample_data]                                   #取出所有data的长度             
    sample_features = torch.nn.utils.rnn.pad_sequence(sample_features, batch_first=True, padding_value=word2id['[PAD]']) 
    return sample_features, sample_labels, data_length

train_data = get_data(train_feature, train_label)
test_data = get_data(test_feature, test_label)

#处理非定长序列
train_dataloader = torch.utils.data.DataLoader(train_data, args.batch_size, collate_fn=collate_fn, shuffle=True)
test_dataloader = torch.utils.data.DataLoader(test_data, args.batch_size, collate_fn=collate_fn, shuffle=True)

In [3]:
class BiLSTM(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size, embedding_vector, label2id):
        super(BiLSTM,self).__init__()
        
        # ============================ BiLSTM的系列参数 ============================ #
        self.hidden_size = hidden_size
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        self.embedding.weight.data.copy_(torch.from_numpy(embedding_vector))
        self.embedding.weight.requires_grad = False
        self.bilstm = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size, num_layers=2, 
                                    batch_first=True, dropout=0.5, bidirectional=True)
        self.out = torch.nn.Linear(2 * hidden_size,output_size)
        
        # ============================= CRF的系列参数 ============================= #
        self.tagset_size = len(label2id)
        self.tag_to_ix = label2id
        #转移矩阵，transitions[i][j]表示从label j转移到label i的概率,虽然是随机生成的但是后面会迭代更新
        self.transitions = torch.nn.Parameter(torch.randn(self.tagset_size,self.tagset_size))
        #这两个语句强制执行了这样的约束条件：我们永远不会转移到开始标签，也永远不会从停止标签转移
        self.transitions.data[self.tag_to_ix['START'], :] = -10000     #从任何标签转移到START_TAG不可能
        self.transitions.data[:, self.tag_to_ix['STOP']] = -10000      #从STOP_TAG转移到任何标签不可能

# ======================================= 模型前向传播 ======================================= #
    def forward(self, x, batch_seq_len):
        
        lstm_feats = self.BiLSTM(x, batch_seq_len)
        score, tag_seq = self._viterbi_decode(lstm_feats) #BiLSTM处理结果作为CRF的输入,输出为分数和预测的标签序列
        return score, tag_seq

# ======================================= BiLSTM部分 ======================================= #
    def BiLSTM(self, x, batch_seq_len):
        batch_size = x.size(0)
        seq_len = x.size(1)
        x = self.embedding(x)
        
        h = torch.zeros(4, batch_size, self.hidden_size).to(x.device) 
        c = torch.zeros(4, batch_size, self.hidden_size).to(x.device) 
        
        x = torch.nn.utils.rnn.pack_padded_sequence(x,batch_seq_len, batch_first=True)
        output, hidden = self.bilstm(x, (h, c))
        output, _ = torch.nn.utils.rnn.pad_packed_sequence(output,batch_first=True)
        lstm_feats = self.out(output)     
        return lstm_feats
     
# ======================================= CRF的decode部分 ======================================= #
    def _viterbi_decode(self, feats):
        
        # 加入batch的信息
        batch_path_score = []
        batch_best_path = []
        for batch in range(feats.shape[0]):
            backpointers = []

            init_vvars = torch.full((1, self.tagset_size), -10000.).to(device)    #保证了一定是从START到其他标签
            init_vvars[0][self.tag_to_ix['START']] = 0                            #START的位置变成0

            #        我  是  梁  棋  棋
            #        |   |   |   |   |         这部分对应 for next_tag in range(self.tagset_size):
            # START->O ->O ->B ->I ->I ->STOP  这部分对应 self.transitions

            #每个time_step都需要前一个time_step的分数，每个time_step 对应 for feat in feats:
            #                                            分数score 对应 forward_var
            #所以forward_var = (torch.cat(viterbivars_t) + feat)
            
            forward_var = init_vvars                           
            for feat in feats[batch]:                                                    
                bptrs_t = []                                                     
                viterbivars_t = []                                                
                for next_tag in range(self.tagset_size):                          
                    next_tag_var = forward_var + self.transitions[next_tag]       
                    best_tag_id = argmax(next_tag_var)                  
                    bptrs_t.append(best_tag_id)                                   
                    viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
                forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
                backpointers.append(bptrs_t)                                      

            #   棋
            #   |
            #   I ->STOP 这部分对应 terminal_var
            #因为没有字对应‘STOP’标签，所以只需要加上转移到STOP的分数即可，即self.transitions[self.tag_to_ix['STOP']]

            terminal_var = forward_var + self.transitions[self.tag_to_ix['STOP']] # 其他标签到STOP_TAG的转移概率
            best_tag_id = argmax(terminal_var)
            path_score = terminal_var[0][best_tag_id]

            # 根据动态规划，由最后的节点，向前选取最佳的路径
            best_path = [best_tag_id]
            for bptrs_t in reversed(backpointers):
                best_tag_id = bptrs_t[best_tag_id]
                best_path.append(best_tag_id)
            start = best_path.pop()
            assert start == self.tag_to_ix['START']
            best_path.reverse()                                 # 把从后向前的路径正过来
            
            batch_path_score.append(path_score)
            batch_best_path.append(best_path)
        return batch_path_score, batch_best_path
    
# ======================================= 计算模型loss ======================================= #    
    def neg_log_likelihood(self, sentence, tags, batch_seq_len):
        
        feats = self.BiLSTM(sentence, batch_seq_len)
        # loss = log(∑ e^s(X,y)) - s(X,y) 
        forward_score = self._forward_alg(feats)           # loss的log部分的结果
        gold_score = self._score_sentence(feats, tags)     # loss的S(X,y)部分的结果
        return (forward_score - gold_score)

# ======================================= 计算loss的log部分 ======================================= #     
    def _forward_alg(self, feats):
    # 关于log_sum_exp的具体解释： https://blog.csdn.net/Suan2014/article/details/89477037
    
        # 加入batch的信息
        alpha = torch.zeros(1).to(device)
        for batch in range(feats.shape[0]):        
            init_alphas = torch.full((1, self.tagset_size), -10000.).to(device)  #保证了一定是从START到其他标签
            init_alphas[0][self.tag_to_ix['START']] = 0.                         #START的位置变成0

            forward_var = init_alphas                                            #包装到一个变量里面以便自动反向传播

            for feat in feats[batch]:
                alphas_t = []  
                for next_tag in range(self.tagset_size):
                    emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                    trans_score = self.transitions[next_tag].view(1, -1)
                    next_tag_var = forward_var + trans_score + emit_score
                    alphas_t.append(log_sum_exp(next_tag_var).view(1))
                forward_var = torch.cat(alphas_t).view(1, -1)
            terminal_var = forward_var + self.transitions[self.tag_to_ix['STOP']]
            alpha = alpha + log_sum_exp(terminal_var)
        return alpha
    
# ======================================= 计算loss的S(X,y)部分 ======================================= #    
    def _score_sentence(self, feats, tags):
        
        #_forward_alg   ：算一个最大可能路径，实际上可能不是真实标签转移的值
        #_score_sentence：用真实标签转移的值计算
        score = torch.zeros(1).to(device)
        for batch in range(feats.shape[0]):
            # 将START_TAG的标签３拼接到tag序列最前面
            tags = torch.cat([torch.tensor([self.tag_to_ix['START']], dtype=torch.long).to(device), tags])
            for i, feat in enumerate(feats[batch]):
                score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
            score = score + self.transitions[self.tag_to_ix['STOP'], tags[-1]]
        return score

In [4]:
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

# 关于log_sum_exp的具体解释： https://blog.csdn.net/Suan2014/article/details/89477037
def log_sum_exp(vec): 
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1]) 
    
    #等同于torch.log(torch.sum(torch.exp(vec)))，防止e的指数导致计算机上溢
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

#通过batch_seq_len留下out和label中不重复的部分进行loss计算和指标计算
def processing_len(out, label, batch_seq_len):
    out_pred = out[:batch_seq_len[0]]
    out_true = label[:batch_seq_len[0]]
    for i in range(1,len(batch_seq_len)):
        out_pred = torch.cat((out_pred,out[i*batch_seq_len[0]:i*batch_seq_len[0]+batch_seq_len[i]]),dim=0)
        out_true = torch.cat((out_true,label[i*batch_seq_len[0]:i*batch_seq_len[0]+batch_seq_len[i]]),dim=0)

    return out_pred, out_true
    
def test_evaluate(model, test_dataloader, batch_size):
    test_l, test_p, test_r, test_f, n = 0.0, 0.0, 0.0, 0.0, 0
    model.eval()
    with torch.no_grad():
        for data_x, data_y, batch_seq_len in test_dataloader:
            _, out = model(data_x.to(device),batch_seq_len)                      #out就是路径序列 [10, 40]

            label = [line.numpy().tolist() for line in data_y]
            for line in label:
                for i in range(data_x.shape[1]-len(line)):
                    line.append(line[len(line)-1])

            label = torch.tensor(label).view(-1,1).squeeze(-1).to(device)        #torch.Size([274])
            out = torch.tensor(out).view(-1,1).squeeze(-1).to(device)          #torch.Size([274])
            out, label = processing_len(out, label, batch_seq_len)
            
            loss = model.neg_log_likelihood(data_x.to(device), label, batch_seq_len)
            out = out.data.cpu().numpy()
            label = label.data.cpu().numpy()
            
            test_l += loss.item()/data_x.shape[0]/data_x.shape[1]
            test_p += precision_score(label, out, average='weighted')
            test_r += recall_score(label, out, average='weighted')
            test_f += f1_score(label, out, average='weighted')
            n += 1
    return test_l/n, test_p/n, test_r/n, test_f/n

In [16]:
device = 'cuda:0'
loss_func = torch.nn.CrossEntropyLoss()

# 让Embedding层使用训练好的Word2Vec权重
# model_word2vec = Word2Vec(train_feature_line, sg=1, min_count=1, size=128, window=5)
# model_word2vec.save('word2vec_model.txt')
w2v_model = Word2Vec.load('word2vec_model.txt')
embedding_matrix = w2v_model.wv.vectors
input_size = embedding_matrix.shape[0]   
hidden_size = embedding_matrix.shape[1]  
model = BiLSTM(input_size, hidden_size, 11, embedding_matrix,label2id).to(device)
model.load_state_dict(torch.load('./model_3.pt'))
optimizer = torch.optim.Adam(model.parameters(), lr=0.005)  #, weight_decay=1e-5

train_loss, train_accuracy, train_precision, train_recall, train_f1 = [], [], [], [], []
test_loss, test_accuracy, test_precision, test_recall, test_f1 = [], [], [], [], []
f1_min = 0.782223
loss_min = 785.145015
for epoch in range(args.num_epoch):
    model.train()
    train_l, train_p, train_r, train_f, n = 0.0, 0.0, 0.0, 0.0, 0
    start = datetime.datetime.now()
    for data_x, data_y, batch_seq_len in train_dataloader:
        _, out = model(data_x.to(device),batch_seq_len)                      #out就是路径序列 [10, 40]
        
        label = [line.numpy().tolist() for line in data_y]
        for line in label:
            for i in range(data_x.shape[1]-len(line)):
                line.append(line[len(line)-1])
        label = torch.tensor(label).view(-1,1).squeeze(-1).to(device)        #torch.Size([274])
        out = torch.tensor(out).view(-1,1).squeeze(-1).to(device)            #torch.Size([274])
        out, label = processing_len(out, label, batch_seq_len)
       
        loss = model.neg_log_likelihood(data_x.to(device), label, batch_seq_len)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        out = out.data.cpu().numpy()
        label = label.data.cpu().numpy()
        train_l += loss.item()/data_x.shape[0]/data_x.shape[1]
        train_p += precision_score(label, out, average='weighted')
        train_r += recall_score(label, out, average='weighted')
        train_f += f1_score(label, out, average='weighted')
        n += 1
    #训练集评价指标
    train_loss.append(train_l/n)
    train_precision.append(train_p/n)
    train_recall.append(train_r/n)
    train_f1.append(train_f/n)
    #测试集评价指标
    test_l, test_p, test_r, test_f = test_evaluate(model, test_dataloader, args.batch_size)
    test_loss.append(test_l)
    test_precision.append(test_p)
    test_recall.append(test_r)
    test_f1.append(test_f)
    end = datetime.datetime.now()
    print('epoch %d, train: loss %f, precision %f, recall %f, f1 %f, time %s'% 
          (epoch+1, train_loss[epoch], train_precision[epoch], train_recall[epoch], train_f1[epoch], end-start))
    print('          test: loss %f,  precision %f,  recall %f,  f1 %f'% 
          (test_loss[epoch], test_precision[epoch], test_recall[epoch], test_f1[epoch]))
    if test_f1[epoch] > f1_min or test_loss[epoch] < loss_min:
        f1_min = test_f1[epoch]
        loss_min = test_loss[epoch]
        torch.save(model.state_dict(), './model_3.pt')
        print("save model......")

  _warn_prf(average, modifier, msg_start, len(result))


epoch 1, train: loss 824.450469, precision 0.698506, recall 0.835766, f1 0.760996, time 0:00:22.479681
          test: loss 785.080060,  precision 0.723872,  recall 0.850806,  f1 0.782223
save model......
epoch 2, train: loss 824.335781, precision 0.698506, recall 0.835766, f1 0.760996, time 0:00:34.551093
          test: loss 784.937202,  precision 0.723872,  recall 0.850806,  f1 0.782223
save model......
epoch 3, train: loss 824.174922, precision 0.698506, recall 0.835766, f1 0.760996, time 0:00:20.850348
          test: loss 784.719345,  precision 0.723872,  recall 0.850806,  f1 0.782223
save model......
epoch 4, train: loss 823.933203, precision 0.698506, recall 0.835766, f1 0.760996, time 0:00:38.825323
          test: loss 784.477902,  precision 0.723872,  recall 0.850806,  f1 0.782223
save model......
epoch 5, train: loss 823.670781, precision 0.698506, recall 0.835766, f1 0.760996, time 0:00:15.544467
          test: loss 784.270238,  precision 0.723872,  recall 0.850806,  f1 0