In [39]:
from collections import Counter
import torch
import argparse
from gensim.models import Word2Vec
import datetime
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Hyper-parameters are declared as command-line style options so that they
# live in a single place.
parser = argparse.ArgumentParser()
parser.add_argument('--batch_size', default=8, type=int)    # sequences per mini-batch
parser.add_argument('--num_epoch', default=10, type=int)    # number of training epochs
# Parsing an explicit empty argument list (instead of sys.argv) means every
# option simply takes its default value inside the notebook.
args = parser.parse_args([])

In [2]:
def _read_lines(path, limit=10):
    """Return the first `limit` lines of the UTF-8 text file at `path`, stripped.

    NOTE(review): the 10-line cap reproduces the slice the original cell
    applied to every split. It looks like a debugging leftover (the vocabulary
    comment below mentions 21009 distinct words) -- confirm before a real run.
    Pass ``limit=None`` to read the whole file.
    """
    with open(path, encoding='utf-8') as handle:
        return [line.strip() for line in handle][:limit]


# Read the training split (seq.in: words, seq.out: tags; one sequence per line).
train_feature_line = _read_lines('./CoNLL2003_NER/train/seq.in')
train_label_line = _read_lines('./CoNLL2003_NER/train/seq.out')
# Read the held-out split from the test/ directory.
test_feature_line = _read_lines('./CoNLL2003_NER/test/seq.in')
test_label_line = _read_lines('./CoNLL2003_NER/test/seq.out')

# Lower-case the words (tags keep their case) and split every line on single spaces.
train_feature_line = [line.lower().split(" ") for line in train_feature_line]
train_label_line = [line.split(" ") for line in train_label_line]
test_feature_line = [line.lower().split(" ") for line in test_feature_line]
test_label_line = [line.split(" ") for line in test_label_line]

# Build the word vocabulary from the training sequences only.
word_counter = []
for line in train_feature_line:
    word_counter.extend(line)
# (word, count) pairs, most frequent first. Original note: len(counter) was 21009.
word_counter = Counter(word_counter).most_common()
# Keep the most frequent 80% of the distinct words. '[UNK]' (id 0) stands for
# every other word and '[PAD]' (id 1) is the padding token.
vocab = ['[UNK]', '[PAD]'] + [word[0] for word in word_counter[:int(len(word_counter) * 0.8)]]
word2id = {word: idx for idx, word in enumerate(vocab)}      # word -> id
id2word = {idx: word for idx, word in enumerate(vocab)}      # id -> word
# Tag dictionary.
label2id = {'O':0, 'B-LOC':1, 'B-PER':2, 'B-ORG':3, 'I-PER':4, 'I-ORG':5, 'B-MISC':6, 'I-LOC':7, 'I-MISC':8}

# Encode words and tags as integer ids; out-of-vocabulary words map to id 0
# ('[UNK]'), while an unknown tag raises KeyError.
train_feature = [[word2id.get(word, 0) for word in line] for line in train_feature_line]
train_label = [[label2id[word] for word in line] for line in train_label_line]
test_feature = [[word2id.get(word, 0) for word in line] for line in test_feature_line]
test_label = [[label2id[word] for word in line] for line in test_label_line]

# Convert every sequence to a LongTensor. The tensors are created directly with
# an integer dtype: torch.Tensor(...).long() would first store the ids as
# float32, which cannot represent every large integer exactly.
train_feature = [torch.tensor(line, dtype=torch.long) for line in train_feature]
train_label = [torch.tensor(line, dtype=torch.long) for line in train_label]
test_feature = [torch.tensor(line, dtype=torch.long) for line in test_feature]
test_label = [torch.tensor(line, dtype=torch.long) for line in test_label]

def get_data(sample_features, sample_labels):
    """Pair each feature sequence with the label sequence at the same index.

    Returns a list of two-element lists ``[features_i, labels_i]``, one per
    entry of ``sample_features``, so the result can be handed to a DataLoader.
    Raises IndexError if ``sample_labels`` is shorter than ``sample_features``.
    """
    return [[sample_features[idx], sample_labels[idx]]
            for idx in range(len(sample_features))]

def collate_fn(sample_data):
    """Turn a list of ``[features, labels]`` pairs into one padded batch.

    The incoming list is sorted in place, longest feature sequence first.

    Returns:
        sample_features: tensor of shape (batch, longest_length); shorter
            sequences are filled with the id of '[PAD]' taken from ``word2id``.
        sample_labels: the (unpadded) label sequences, in the sorted order.
        data_length: the feature-sequence lengths, in the sorted order.
    """
    # A negated key gives the same stable, descending-length ordering.
    sample_data.sort(key=lambda pair: -len(pair[0]))
    sample_features = [pair[0] for pair in sample_data]
    sample_labels = [pair[1] for pair in sample_data]
    data_length = [len(sequence) for sequence in sample_features]
    padded_features = torch.nn.utils.rnn.pad_sequence(
        sample_features, batch_first=True, padding_value=word2id['[PAD]'])
    return padded_features, sample_labels, data_length

train_data = get_data(train_feature, train_label)
test_data = get_data(test_feature, test_label)

# The sequences have different lengths, so collate_fn pads each batch.
train_dataloader = torch.utils.data.DataLoader(
    train_data, batch_size=args.batch_size, collate_fn=collate_fn, shuffle=True)
# The evaluation loader is NOT shuffled: test_evaluate averages per-batch
# metrics, so a random batch composition would make the reported test numbers
# vary from run to run.
test_dataloader = torch.utils.data.DataLoader(
    test_data, batch_size=args.batch_size, collate_fn=collate_fn, shuffle=False)

In [3]:
class BiLSTM_CRF(torch.nn.Module):
    """Sequence tagger: frozen pretrained embedding -> 2-layer BiLSTM -> linear layer.

    Despite the class name, no CRF layer is defined here; ``forward`` returns
    the raw per-token scores produced by the final linear layer.
    """

    def __init__(self, input_size, hidden_size, output_size, embedding_vector):
        """Build the layers and load the pretrained embedding weights.

        Args:
            input_size: number of rows in the embedding table.
            hidden_size: embedding width; also the LSTM hidden size per direction.
            output_size: number of scores produced per token.
            embedding_vector: numpy array copied into the embedding weights.
        """
        super().__init__()
        self.hidden_size = hidden_size

        # Embedding table filled with the pretrained vectors and excluded
        # from gradient updates.
        self.embedding = torch.nn.Embedding(input_size, hidden_size)
        pretrained = torch.from_numpy(embedding_vector)
        self.embedding.weight.data.copy_(pretrained)
        self.embedding.weight.requires_grad = False

        self.bilstm = torch.nn.LSTM(input_size=hidden_size, hidden_size=hidden_size,
                                    num_layers=2, bidirectional=True,
                                    batch_first=True, dropout=0.5)
        # Forward and backward hidden states are concatenated, hence 2 * hidden_size.
        self.out = torch.nn.Linear(hidden_size * 2, output_size)

    def forward(self, x, batch_seq_len):
        """Score every token of a padded batch.

        Args:
            x: padded batch of token ids, indexed as (batch, sequence).
            batch_seq_len: true length of each sequence; passed unchanged to
                pack_padded_sequence, which by default requires them in
                decreasing order.

        Returns:
            Tensor of shape (batch, longest length in batch_seq_len, output_size).
        """
        embedded = self.embedding(x)

        # Explicit zero initial states: 2 layers x 2 directions = 4 slices.
        state_shape = (4, x.size(0), self.hidden_size)
        h0 = torch.zeros(*state_shape).to(embedded.device)
        c0 = torch.zeros(*state_shape).to(embedded.device)

        packed_in = torch.nn.utils.rnn.pack_padded_sequence(embedded, batch_seq_len, batch_first=True)
        packed_out, _ = self.bilstm(packed_in, (h0, c0))
        padded_out, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_out, batch_first=True)
        return self.out(padded_out)

# Use batch_seq_len to keep only the real (non-padded) positions of out and
# label, so that loss and metrics are computed on genuine tokens only.
def processing_len(out, label, batch_seq_len):
    """Drop the padded positions from flattened predictions and labels.

    ``out`` and ``label`` are indexed along dim 0 in blocks of
    ``batch_seq_len[0]`` positions per sequence; from block ``i`` only the
    first ``batch_seq_len[i]`` positions are kept.

    Returns:
        (out_pred, out_true): the kept rows of ``out`` and entries of
        ``label``, concatenated in sequence order.
    """
    stride = batch_seq_len[0]
    pred_parts, true_parts = [], []
    for row, length in enumerate(batch_seq_len):
        begin = row * stride
        pred_parts.append(out[begin:begin + length, :])
        true_parts.append(label[begin:begin + length])
    return torch.cat(pred_parts, dim=0), torch.cat(true_parts, dim=0)
    
def test_evaluate(model, test_dataloader, batch_size):
    test_l, test_p, test_r, test_f, n = 0.0, 0.0, 0.0, 0.0, 0
    model.eval()
    with torch.no_grad():
        for data_x, data_y, batch_seq_len in test_dataloader:
            out = model(data_x.to(device),batch_seq_len).view(-1, 9)
            label = [line.numpy().tolist() for line in data_y]
            for line in label:
                for i in range(data_x.shape[1]-len(line)):
                    line.append(line[len(line)-1])
            label = torch.tensor(label).view(-1,1).squeeze(-1).to(device)
            out, label = processing_len(out, label,batch_seq_len)
            prediction = out.argmax(dim=1).data.cpu().numpy()
            label = label.data.cpu().numpy()
            test_l += loss.item()
            test_p += precision_score(label, prediction, average='weighted')
            test_r += recall_score(label, prediction, average='weighted')
            test_f += f1_score(label, prediction, average='weighted')
            n += 1
    return test_l/n, test_p/n, test_r/n, test_f/n

In [40]:
# Use the first GPU when one is available, otherwise fall back to the CPU so
# the cell also runs on machines without CUDA.
device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
loss_func = torch.nn.CrossEntropyLoss()

# Let the embedding layer use the pretrained Word2Vec weights.
# model_word2vec = Word2Vec(train_feature_line, sg=1, min_count=1, size=128, window=5)
# model_word2vec.save('word2vec_model.txt')
w2v_model = Word2Vec.load('word2vec_model.txt')
embedding_matrix = w2v_model.wv.vectors
# NOTE(review): the rows of embedding_matrix follow the Word2Vec model's own
# word index, while the inputs are encoded with word2id (which reserves ids
# 0/1 for '[UNK]'/'[PAD]'). Confirm that both indexings agree; left unchanged
# here because the checkpoint loaded below was trained with this setup.
input_size = embedding_matrix.shape[0]
hidden_size = embedding_matrix.shape[1]
model = BiLSTM_CRF(input_size=input_size, hidden_size=hidden_size, output_size=9, embedding_vector=embedding_matrix).to(device)
# map_location lets a checkpoint saved from a GPU be restored on the chosen device.
model.load_state_dict(torch.load('./model_2.pt', map_location=device))
# NOTE(review): lr=1e-9 makes each update extremely small; the captured output
# shows nearly constant metrics across epochs. Confirm that this is intended.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-9, weight_decay=1e-5)  #

train_loss, train_accuracy, train_precision, train_recall, train_f1 = [], [], [], [], []
test_loss, test_accuracy, test_precision, test_recall, test_f1 = [], [], [], [], []
# Test F1 threshold used by the (currently disabled) checkpoint-saving code below.
f1_min = 0.808494
for epoch in range(args.num_epoch):
    model.train()
    train_l, train_p, train_r, train_f, n = 0.0, 0.0, 0.0, 0.0, 0
    start = datetime.datetime.now()
    for data_x, data_y, batch_seq_len in train_dataloader:
        # Flatten the scores to (batch * sequence, 9 tags).
        out = model(data_x.to(device),batch_seq_len).view(-1, 9)
        # Pad every label list to the batch width by repeating its last tag;
        # processing_len removes these padded positions again.
        label = [line.numpy().tolist() for line in data_y]
        for line in label:
            for i in range(data_x.shape[1]-len(line)):
                line.append(line[len(line)-1])
        label = torch.tensor(label).view(-1,1).squeeze(-1).to(device)

        out, label = processing_len(out, label,batch_seq_len)

        loss = loss_func(out, label)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        prediction = out.argmax(dim=1).data.cpu().numpy()
        label = label.data.cpu().numpy()
        train_l += loss.item()
        train_p += precision_score(label, prediction, average='weighted')
        train_r += recall_score(label, prediction, average='weighted')
        train_f += f1_score(label, prediction, average='weighted')
        n += 1
    # Training-set metrics (mean over the batches of this epoch).
    train_loss.append(train_l/n)
    train_precision.append(train_p/n)
    train_recall.append(train_r/n)
    train_f1.append(train_f/n)
    # Test-set metrics.
    test_l, test_p, test_r, test_f = test_evaluate(model, test_dataloader, args.batch_size)
    test_loss.append(test_l)
    test_precision.append(test_p)
    test_recall.append(test_r)
    test_f1.append(test_f)
    end = datetime.datetime.now()
    print('epoch %d, train: loss %f, precision %f, recall %f, f1 %f, time %s'% 
          (epoch+1, train_loss[epoch], train_precision[epoch], train_recall[epoch], train_f1[epoch], end-start))
    print('          test: loss %f,  precision %f,  recall %f,  f1 %f'% 
          (test_loss[epoch], test_precision[epoch], test_recall[epoch], test_f1[epoch]))
#     if test_f1[epoch] > f1_min:
#         f1_min = test_f1[epoch]
#         torch.save(model.state_dict(), './model_3.pt')
#         print("save model......")

  _warn_prf(average, modifier, msg_start, len(result))


epoch 1, train: loss 0.621037, precision 0.736780, recall 0.839416, f1 0.775558, time 0:00:00.034181
          test: loss 0.621037,  precision 0.767120,  recall 0.858871,  f1 0.808494
epoch 2, train: loss 0.618712, precision 0.736780, recall 0.839416, f1 0.775558, time 0:00:00.036902
          test: loss 0.618712,  precision 0.767120,  recall 0.858871,  f1 0.808494
epoch 3, train: loss 0.612661, precision 0.728588, recall 0.835766, f1 0.774240, time 0:00:00.035748
          test: loss 0.612661,  precision 0.767120,  recall 0.858871,  f1 0.808494
epoch 4, train: loss 0.612314, precision 0.736780, recall 0.839416, f1 0.775558, time 0:00:00.036307
          test: loss 0.612314,  precision 0.767120,  recall 0.858871,  f1 0.808494
epoch 5, train: loss 0.621237, precision 0.736780, recall 0.839416, f1 0.775558, time 0:00:00.039819
          test: loss 0.621237,  precision 0.767120,  recall 0.858871,  f1 0.808494
epoch 6, train: loss 0.615311, precision 0.736780, recall 0.839416, f1 0.775558,