# BiLSTM + CRF for NER
BiLSTM的输入时文档向量, 并不需要对文档向量padding到一致的长度, 因为对应1个文档, BiLSTM输出1个feats矩阵(len_doc, num_tag), 将feats矩阵输入CRF, CRF最主要是要训练其转移矩阵(tag->tag), 所以即使feats矩阵行数不一, CRF都能用feats训练转移矩阵

官方实现: https://pytorch.apachecn.org/docs/1.0/nlp_advanced_tutorial.html?h=Bidirection

In [1]:
import keras
from keras.models import Sequential
from keras.layers import *
import nltk
import torch.nn as nn
import torch

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 数据准备

In [2]:
num_sentence = 10000
with open(r'D:/CS/dataset/NLP/命名实体识别/单个字特征数据集/example.train', 'r', 
          encoding='utf-8') as f:
    i = 0;
    a = []
    while(i < num_sentence):     # 读取了10000句预料
        content = f.readline()
        if content == '\n':
            i += 1
        a.append(content)

In [3]:
def get_sentences(temp_list):
    index_list = []
    for i in range(len(temp_list)):
        if temp_list[i] == '\n':
            index_list.append(i)
    
    pointer = 0
    sentence_list = []
    for index in index_list:
        sentence_list.append(temp_list[pointer : index])
        pointer = index + 1
    return sentence_list

In [4]:
sentence_list = get_sentences(a)

In [5]:
sentence_list[-2]   # 最后一句没有句号

['但 O\n',
 '是 O\n',
 '， O\n',
 '金 O\n',
 '融 O\n',
 '危 O\n',
 '机 O\n',
 '也 O\n',
 '可 O\n',
 '能 O\n',
 '来 O\n',
 '自 O\n',
 '内 O\n',
 '部 O\n',
 '。 O\n']

In [6]:
# 去掉'\n'和句号
for sentence in sentence_list:
    try:
        sentence.remove('。 O\n')
    except:
        pass
    
    for i in range(len(sentence)):
        sentence[i] = tuple(sentence[i].strip('\n').split(' '))

In [7]:
sentence_list[-1]

[('首', 'B-ORG'),
 ('届', 'I-ORG'),
 ('立', 'I-ORG'),
 ('法', 'I-ORG'),
 ('会', 'I-ORG'),
 ('选', 'O'),
 ('举', 'O'),
 ('是', 'O'),
 ('“', 'O'),
 ('港', 'B-LOC'),
 ('人', 'O'),
 ('治', 'O'),
 ('港', 'B-LOC'),
 ('”', 'O'),
 ('重', 'O'),
 ('要', 'O'),
 ('一', 'O'),
 ('步', 'O')]

In [8]:
# 去除特殊符号
stop_words = '[a-zA-Z0-9’\n·\s＊!"：#$%&\'()◆●（）＠②*+,-./:;<=>?@，。?★、…【】《》？——“”‘’！[\$$^_`{|}~]+'
sentence_list = [[(word, tag) for word, tag in sentence if word not in stop_words] for sentence in sentence_list]

In [9]:
sentence_list[-1]

[('首', 'B-ORG'),
 ('届', 'I-ORG'),
 ('立', 'I-ORG'),
 ('法', 'I-ORG'),
 ('会', 'I-ORG'),
 ('选', 'O'),
 ('举', 'O'),
 ('是', 'O'),
 ('港', 'B-LOC'),
 ('人', 'O'),
 ('治', 'O'),
 ('港', 'B-LOC'),
 ('重', 'O'),
 ('要', 'O'),
 ('一', 'O'),
 ('步', 'O')]

## 准备数据

In [10]:
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence



In [11]:
docs = [[word for word, tag in sentence] for sentence in sentence_list]

In [12]:
# 总词数
dic = Dictionary(docs)
vocab_size = max(dic.keys()) + 1
print('总词数: ', vocab_size)     # 词id从0开始, 最大id 3818

# 寻找最大句长
sentences_len = [len(doc) for doc in docs]
max_len_a_doc = max(sentences_len)
print('最大文档长度', max_len_a_doc)

总词数:  3819
最大文档长度 550


查看sentences_len发现绝大多数文档长度在120以内

In [13]:
max_len_a_doc = 120
trainY = [[tag for word, tag in sentence] for sentence in sentence_list]    # padding之前

my_dic = dict([('O',0), ('B-LOC', 1), ('I-LOC', 2),
            ('B-PER', 3), ('I-PER', 4), ('B-ORG', 5), ('I-ORG', 6)])
def tag2idx(tag):
    return my_dic[tag]

# 先将tag映射为idx, 再对idx序列padding, 由于'O'对应idx为0, 所以句子padding部分都视作'O', 对应数字0
# 不能用padding, 会使 B-LOC, I-LOC 都变成 B, I
trainY_vec = [[my_dic[tag] for tag in sentence] for sentence in trainY]
trainY_vec = sequence.pad_sequences(trainY_vec, maxlen=max_len_a_doc, value=0, padding='post')
trainY_vec = torch.LongTensor(trainY_vec)
trainY_vec[0]

tensor([0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np

In [15]:
# 文本向量化
doc_vec_list = [dic.doc2idx(doc) for doc in docs]
# 不能填充-1 因为embedding矩阵中没有-1索引
padded_doc_vec_list = sequence.pad_sequences(doc_vec_list, maxlen=120, value=3819, padding='post')
trainX = np.array(padded_doc_vec_list)
trainX[0]

array([   7,   12,    6,   10,    4,    8,    3,    2,   13,    0,   11,
         13,    1,   14,    9,    7,    5, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819])

## BiLSTM
获取BiLSTM的输出feats

In [16]:
from torch import nn
import torch

embedding_dim = 128
lstm_dim = 128
num_tag = 7

class Net(torch.nn.Module):
    def __init__(self, vocab_size, num_tag, embedding_dim, lstm_dim):  # vocab_size + padding的-1
        super(Net, self).__init__()
        self.vocab_size = vocab_size
        self.num_tag = num_tag
        self.embedding_dim = embedding_dim
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_dim, num_layers=1, bidirectional=True)
        self.lstm2tag = nn.Linear(lstm_dim*2, num_tag)   # BiLSTM 会把正反向lstm每个时间步的输出拼接为1行
#         self.lstm2tag = nn.Softmax(num_tag)
    def forward(self, x):
        x = self.word_embeds(x)
        x, hidden = self.lstm(x)
        x = self.lstm2tag(x)
        return x

In [17]:
trainX = torch.LongTensor(trainX)
trainX[0]

tensor([   7,   12,    6,   10,    4,    8,    3,    2,   13,    0,   11,   13,
           1,   14,    9,    7,    5, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819])

In [18]:
net = Net(vocab_size+1, num_tag, embedding_dim, lstm_dim)
feats = net(trainX[0:1])
feats

tensor([[[ 0.0332, -0.1004,  0.1632, -0.0808, -0.0067,  0.1000,  0.0260],
         [ 0.0974, -0.0688,  0.1360, -0.0215,  0.1057, -0.0165,  0.0161],
         [ 0.0682, -0.0807,  0.0579, -0.0824, -0.1232,  0.0390, -0.0255],
         [ 0.1617,  0.0086, -0.0476,  0.0115,  0.0057,  0.0495,  0.0105],
         [-0.0037,  0.0084,  0.0238, -0.0170,  0.0337,  0.1049,  0.0563],
         [-0.0327, -0.0249, -0.0067, -0.0044,  0.0743,  0.0092,  0.0237],
         [ 0.1097, -0.0926,  0.1432, -0.0593, -0.0714, -0.0344,  0.0445],
         [ 0.0453, -0.1467, -0.0797, -0.0966, -0.0020, -0.0579,  0.0122],
         [-0.0499, -0.1244,  0.0158, -0.0457, -0.1384, -0.0048,  0.0376],
         [ 0.1179, -0.1381,  0.1598,  0.0099, -0.0188,  0.0093, -0.0037],
         [ 0.1006, -0.0481, -0.0147, -0.0339,  0.1813,  0.0253,  0.0542],
         [-0.0499, -0.1244,  0.0158, -0.0457, -0.1384, -0.0048,  0.0376],
         [ 0.0212, -0.1168,  0.0046, -0.0235, -0.0189, -0.0442,  0.0041],
         [ 0.0193, -0.2451,  0.0807, -

## 尝试单独使用BiLSTM

In [19]:
trainY_vec.shape

torch.Size([10000, 120])

In [20]:
trainX.shape

torch.Size([10000, 120])

## BiLSTM训练

In [21]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as func

class myLoss(nn.Module):
    def __init__(self):
        super(myLoss, self).__init__()
        self.crossentropy = nn.CrossEntropyLoss()

    def forward(self, scores, true_y):
        the_loss = 0.0
        for i in range(true_y.shape[0]):
            the_loss += self.crossentropy(scores[i], true_y[i])
        return the_loss

criterion = myLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

loss = 0.0
for i in range(500):   # 训练500个句子
    optimizer.zero_grad()
    x = trainX[i:i+1]
    out = net(x)
    loss = criterion(out, trainY_vec[i:i+1])
    print(loss, '\n')
    loss.backward()
    optimizer.step()
    loss = 0.0

tensor(1.9050, grad_fn=<AddBackward0>) 

tensor(1.8987, grad_fn=<AddBackward0>) 

tensor(1.8977, grad_fn=<AddBackward0>) 

tensor(1.8888, grad_fn=<AddBackward0>) 

tensor(1.8849, grad_fn=<AddBackward0>) 

tensor(1.8722, grad_fn=<AddBackward0>) 

tensor(1.8457, grad_fn=<AddBackward0>) 

tensor(1.8341, grad_fn=<AddBackward0>) 

tensor(1.8145, grad_fn=<AddBackward0>) 

tensor(1.8686, grad_fn=<AddBackward0>) 

tensor(1.7925, grad_fn=<AddBackward0>) 

tensor(1.7831, grad_fn=<AddBackward0>) 

tensor(1.7636, grad_fn=<AddBackward0>) 

tensor(1.8610, grad_fn=<AddBackward0>) 

tensor(1.8762, grad_fn=<AddBackward0>) 

tensor(1.6947, grad_fn=<AddBackward0>) 

tensor(1.6428, grad_fn=<AddBackward0>) 

tensor(1.7525, grad_fn=<AddBackward0>) 

tensor(1.6755, grad_fn=<AddBackward0>) 

tensor(1.6425, grad_fn=<AddBackward0>) 

tensor(1.5839, grad_fn=<AddBackward0>) 

tensor(1.5574, grad_fn=<AddBackward0>) 

tensor(1.5570, grad_fn=<AddBackward0>) 

tensor(1.5338, grad_fn=<AddBackward0>) 

tensor(1.4670, g

tensor(0.3602, grad_fn=<AddBackward0>) 

tensor(0.6167, grad_fn=<AddBackward0>) 

tensor(0.6921, grad_fn=<AddBackward0>) 

tensor(0.4142, grad_fn=<AddBackward0>) 

tensor(0.4450, grad_fn=<AddBackward0>) 

tensor(0.3868, grad_fn=<AddBackward0>) 

tensor(0.3829, grad_fn=<AddBackward0>) 

tensor(0.4982, grad_fn=<AddBackward0>) 

tensor(0.6447, grad_fn=<AddBackward0>) 

tensor(0.3941, grad_fn=<AddBackward0>) 

tensor(0.4123, grad_fn=<AddBackward0>) 

tensor(0.2325, grad_fn=<AddBackward0>) 

tensor(0.6346, grad_fn=<AddBackward0>) 

tensor(0.3918, grad_fn=<AddBackward0>) 

tensor(0.3387, grad_fn=<AddBackward0>) 

tensor(0.3179, grad_fn=<AddBackward0>) 

tensor(0.6101, grad_fn=<AddBackward0>) 

tensor(0.3412, grad_fn=<AddBackward0>) 

tensor(0.4232, grad_fn=<AddBackward0>) 

tensor(0.2651, grad_fn=<AddBackward0>) 

tensor(0.3759, grad_fn=<AddBackward0>) 

tensor(0.2221, grad_fn=<AddBackward0>) 

tensor(0.5521, grad_fn=<AddBackward0>) 

tensor(0.2507, grad_fn=<AddBackward0>) 

tensor(0.5143, g

tensor(0.2759, grad_fn=<AddBackward0>) 

tensor(0.6671, grad_fn=<AddBackward0>) 

tensor(0.3609, grad_fn=<AddBackward0>) 

tensor(0.2663, grad_fn=<AddBackward0>) 

tensor(0.3223, grad_fn=<AddBackward0>) 

tensor(0.5550, grad_fn=<AddBackward0>) 

tensor(0.3572, grad_fn=<AddBackward0>) 

tensor(0.3575, grad_fn=<AddBackward0>) 

tensor(0.1687, grad_fn=<AddBackward0>) 

tensor(0.1787, grad_fn=<AddBackward0>) 

tensor(0.5765, grad_fn=<AddBackward0>) 

tensor(0.2314, grad_fn=<AddBackward0>) 

tensor(0.2913, grad_fn=<AddBackward0>) 

tensor(0.3424, grad_fn=<AddBackward0>) 

tensor(0.4420, grad_fn=<AddBackward0>) 

tensor(0.5227, grad_fn=<AddBackward0>) 

tensor(0.3094, grad_fn=<AddBackward0>) 

tensor(0.2129, grad_fn=<AddBackward0>) 

tensor(0.3231, grad_fn=<AddBackward0>) 

tensor(0.8486, grad_fn=<AddBackward0>) 

tensor(0.3771, grad_fn=<AddBackward0>) 

tensor(0.3835, grad_fn=<AddBackward0>) 

tensor(0.2505, grad_fn=<AddBackward0>) 

tensor(0.1998, grad_fn=<AddBackward0>) 

tensor(0.2733, g

In [26]:
len(doc_vec_list)

10000

## 测试BiLSTM

In [23]:
# [('O',0), ('B-LOC', 1), ('I-LOC', 2), ('B-PER', 3), ('I-PER', 4), ('B-ORG', 5), ('I-ORG', 6)])
trainY = [[my_dic[tag] for tag in sentence] for sentence in trainY]
print(trainY[0], '\n')

trainX = doc_vec_list
print(trainX[0], '\n')

[0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0] 

[7, 12, 6, 10, 4, 8, 3, 2, 13, 0, 11, 13, 1, 14, 9, 7, 5] 

