# BiLSTM + CRF for NER
BiLSTM的输入时文档向量, 并不需要对文档向量padding到一致的长度, 因为对应1个文档, BiLSTM输出1个feats矩阵(len_doc, num_tag), 将feats矩阵输入CRF, CRF最主要是要训练其转移矩阵(tag->tag), 所以即使feats矩阵行数不一, CRF都能用feats训练转移矩阵

官方实现: https://pytorch.apachecn.org/docs/1.0/nlp_advanced_tutorial.html?h=Bidirection

In [1]:
import keras
from keras.models import Sequential
from keras.layers import *
import nltk
import torch.nn as nn
import torch

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 数据准备

In [2]:
num_sentence = 10000
with open(r'D:/CS/dataset/NLP/命名实体识别/单个字特征数据集/example.train', 'r', 
          encoding='utf-8') as f:
    i = 0;
    a = []
    while(i < num_sentence):     # 读取了10000句预料
        content = f.readline()
        if content == '\n':
            i += 1
        a.append(content)

In [3]:
def get_sentences(temp_list):
    index_list = []
    for i in range(len(temp_list)):
        if temp_list[i] == '\n':
            index_list.append(i)
    
    pointer = 0
    sentence_list = []
    for index in index_list:
        sentence_list.append(temp_list[pointer : index])
        pointer = index + 1
    return sentence_list

In [4]:
sentence_list = get_sentences(a)

In [5]:
sentence_list[-2]   # 最后一句没有句号

['但 O\n',
 '是 O\n',
 '， O\n',
 '金 O\n',
 '融 O\n',
 '危 O\n',
 '机 O\n',
 '也 O\n',
 '可 O\n',
 '能 O\n',
 '来 O\n',
 '自 O\n',
 '内 O\n',
 '部 O\n',
 '。 O\n']

In [6]:
# 去掉'\n'和句号
for sentence in sentence_list:
    try:
        sentence.remove('。 O\n')
    except:
        pass
    
    for i in range(len(sentence)):
        sentence[i] = tuple(sentence[i].strip('\n').split(' '))

In [7]:
sentence_list[-1]

[('首', 'B-ORG'),
 ('届', 'I-ORG'),
 ('立', 'I-ORG'),
 ('法', 'I-ORG'),
 ('会', 'I-ORG'),
 ('选', 'O'),
 ('举', 'O'),
 ('是', 'O'),
 ('“', 'O'),
 ('港', 'B-LOC'),
 ('人', 'O'),
 ('治', 'O'),
 ('港', 'B-LOC'),
 ('”', 'O'),
 ('重', 'O'),
 ('要', 'O'),
 ('一', 'O'),
 ('步', 'O')]

In [8]:
# 去除特殊符号
stop_words = '[a-zA-Z0-9’\n·\s＊!"：#$%&\'()◆●（）＠②*+,-./:;<=>?@，。?★、…【】《》？——“”‘’！[\$$^_`{|}~]+'
sentence_list = [[(word, tag) for word, tag in sentence if word not in stop_words] for sentence in sentence_list]

In [9]:
sentence_list[-1]

[('首', 'B-ORG'),
 ('届', 'I-ORG'),
 ('立', 'I-ORG'),
 ('法', 'I-ORG'),
 ('会', 'I-ORG'),
 ('选', 'O'),
 ('举', 'O'),
 ('是', 'O'),
 ('港', 'B-LOC'),
 ('人', 'O'),
 ('治', 'O'),
 ('港', 'B-LOC'),
 ('重', 'O'),
 ('要', 'O'),
 ('一', 'O'),
 ('步', 'O')]

## 准备数据

In [10]:
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence



In [11]:
docs = [[word for word, tag in sentence] for sentence in sentence_list]

In [12]:
# 总词数
dic = Dictionary(docs)
vocab_size = max(dic.keys()) + 1
print('总词数: ', vocab_size)     # 词id从0开始, 最大id 3818

# 寻找最大句长
sentences_len = [len(doc) for doc in docs]
max_len_a_doc = max(sentences_len)
print('最大文档长度', max_len_a_doc)

总词数:  3819
最大文档长度 550


查看sentences_len发现绝大多数文档长度在120以内

In [13]:
max_len_a_doc = 120
trainY = [[tag for word, tag in sentence] for sentence in sentence_list]    # padding之前

my_dic = dict([('O',0), ('B-LOC', 1), ('I-LOC', 2),
            ('B-PER', 3), ('I-PER', 4), ('B-ORG', 5), ('I-ORG', 6)])
def tag2idx(tag):
    return my_dic[tag]

# 先将tag映射为idx, 再对idx序列padding, 由于'O'对应idx为0, 所以句子padding部分都视作'O', 对应数字0
# 不能用padding, 会使 B-LOC, I-LOC 都变成 B, I
trainY_vec = [[my_dic[tag] for tag in sentence] for sentence in trainY]
trainY_vec = sequence.pad_sequences(trainY_vec, maxlen=max_len_a_doc, value=0, padding='post')
trainY_vec = torch.LongTensor(trainY_vec)
trainY_vec[0]

tensor([0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [14]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np

In [15]:
# 文本向量化
doc_vec_list = [dic.doc2idx(doc) for doc in docs]
# 不能填充-1 因为embedding矩阵中没有-1索引
padded_doc_vec_list = sequence.pad_sequences(doc_vec_list, maxlen=120, value=3819, padding='post')
trainX = np.array(padded_doc_vec_list)
trainX[0]

array([   7,   12,    6,   10,    4,    8,    3,    2,   13,    0,   11,
         13,    1,   14,    9,    7,    5, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
       3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819])

## BiLSTM
获取BiLSTM的输出feats

In [25]:
from torch import nn
import torch

embedding_dim = 128
lstm_dim = 128
num_tag = 7

class Net(torch.nn.Module):
    def __init__(self, vocab_size, num_tag, embedding_dim, lstm_dim):  # vocab_size + padding的-1
        super(Net, self).__init__()
        self.vocab_size = vocab_size
        self.num_tag = num_tag
        self.embedding_dim = embedding_dim
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_dim, num_layers=1, bidirectional=True)
        self.lstm2tag = nn.Linear(lstm_dim*2, num_tag)   # BiLSTM 会把正反向lstm每个时间步的输出拼接为1行
#         self.lstm2tag = nn.Softmax(num_tag)
    def forward(self, x):
        x = self.word_embeds(x)
        x, hidden = self.lstm(x)
        x = self.lstm2tag(x)
        return x

In [17]:
trainX = torch.LongTensor(trainX)
trainX[0]

tensor([   7,   12,    6,   10,    4,    8,    3,    2,   13,    0,   11,   13,
           1,   14,    9,    7,    5, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819,
        3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819, 3819])

In [26]:
net = Net(vocab_size+1, num_tag, embedding_dim, lstm_dim)
feats = net(trainX[0:1])
feats

tensor([[[-0.0591,  0.1506, -0.0562, -0.0606,  0.0003,  0.1228, -0.0325],
         [-0.0272,  0.0508,  0.0757,  0.1086,  0.0345,  0.0979, -0.0765],
         [ 0.0990, -0.0354,  0.0378, -0.0061,  0.1308, -0.0080,  0.1091],
         [-0.0752,  0.1256, -0.0580,  0.0247, -0.0555,  0.0964, -0.0149],
         [-0.0490,  0.0461, -0.0128, -0.0365,  0.1063, -0.0036, -0.0747],
         [ 0.0525,  0.0404, -0.0667, -0.0469,  0.0178,  0.0238, -0.0066],
         [ 0.0265,  0.1006, -0.0326, -0.1025,  0.0963, -0.0026,  0.0924],
         [-0.0206, -0.0020,  0.0400,  0.0441, -0.0567, -0.0271, -0.0354],
         [-0.0612,  0.0293, -0.0967, -0.0140,  0.0030,  0.0160,  0.1477],
         [ 0.0110,  0.0351, -0.0472,  0.0341,  0.0400,  0.0741, -0.0266],
         [-0.0146,  0.0733, -0.0524,  0.0058,  0.0016,  0.0129, -0.0721],
         [-0.0612,  0.0293, -0.0967, -0.0140,  0.0030,  0.0160,  0.1477],
         [-0.1427,  0.0983,  0.0901,  0.0338,  0.0947,  0.0186, -0.0573],
         [-0.1687, -0.0063, -0.1076, -

## 尝试单独使用BiLSTM

In [27]:
trainY_vec.shape

torch.Size([10000, 120])

In [28]:
trainX.shape

torch.Size([10000, 120])

In [29]:
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as func

class myLoss(nn.Module):
    def __init__(self):
        super(myLoss, self).__init__()
        self.crossentropy = nn.CrossEntropyLoss()

    def forward(self, scores, true_y):
        the_loss = 0.0
        for i in range(true_y.shape[0]):
            the_loss += self.crossentropy(scores[i], true_y[i])
        return the_loss

criterion = myLoss()
optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)

loss = 0.0
for i in range(500):   # 训练500个句子
    optimizer.zero_grad()
    x = trainX[i:i+1]
    out = net(x)
    loss = criterion(out, trainY_vec[i:i+1])
    print(loss, '\n')
    loss.backward()
    optimizer.step()
    loss = 0.0

tensor(1.9825, grad_fn=<AddBackward0>) 

tensor(1.9800, grad_fn=<AddBackward0>) 

tensor(1.9855, grad_fn=<AddBackward0>) 

tensor(1.9586, grad_fn=<AddBackward0>) 

tensor(1.9411, grad_fn=<AddBackward0>) 

tensor(1.9384, grad_fn=<AddBackward0>) 

tensor(1.9062, grad_fn=<AddBackward0>) 

tensor(1.8852, grad_fn=<AddBackward0>) 

tensor(1.8607, grad_fn=<AddBackward0>) 

tensor(1.9253, grad_fn=<AddBackward0>) 

tensor(1.8418, grad_fn=<AddBackward0>) 

tensor(1.8266, grad_fn=<AddBackward0>) 

tensor(1.7961, grad_fn=<AddBackward0>) 

tensor(1.9335, grad_fn=<AddBackward0>) 

tensor(1.9405, grad_fn=<AddBackward0>) 

tensor(1.7170, grad_fn=<AddBackward0>) 

tensor(1.6551, grad_fn=<AddBackward0>) 

tensor(1.7952, grad_fn=<AddBackward0>) 

tensor(1.7041, grad_fn=<AddBackward0>) 

tensor(1.6509, grad_fn=<AddBackward0>) 

tensor(1.5832, grad_fn=<AddBackward0>) 

tensor(1.5476, grad_fn=<AddBackward0>) 

tensor(1.5641, grad_fn=<AddBackward0>) 

tensor(1.5255, grad_fn=<AddBackward0>) 

tensor(1.4335, g

tensor(0.3807, grad_fn=<AddBackward0>) 

tensor(0.6486, grad_fn=<AddBackward0>) 

tensor(0.7679, grad_fn=<AddBackward0>) 

tensor(0.4429, grad_fn=<AddBackward0>) 

tensor(0.4441, grad_fn=<AddBackward0>) 

tensor(0.4016, grad_fn=<AddBackward0>) 

tensor(0.4155, grad_fn=<AddBackward0>) 

tensor(0.5124, grad_fn=<AddBackward0>) 

tensor(0.6908, grad_fn=<AddBackward0>) 

tensor(0.3979, grad_fn=<AddBackward0>) 

tensor(0.4145, grad_fn=<AddBackward0>) 

tensor(0.2447, grad_fn=<AddBackward0>) 

tensor(0.6809, grad_fn=<AddBackward0>) 

tensor(0.3869, grad_fn=<AddBackward0>) 

tensor(0.3440, grad_fn=<AddBackward0>) 

tensor(0.3594, grad_fn=<AddBackward0>) 

tensor(0.6581, grad_fn=<AddBackward0>) 

tensor(0.3635, grad_fn=<AddBackward0>) 

tensor(0.4351, grad_fn=<AddBackward0>) 

tensor(0.2650, grad_fn=<AddBackward0>) 

tensor(0.3864, grad_fn=<AddBackward0>) 

tensor(0.2123, grad_fn=<AddBackward0>) 

tensor(0.5630, grad_fn=<AddBackward0>) 

tensor(0.2631, grad_fn=<AddBackward0>) 

tensor(0.5534, g

tensor(0.4553, grad_fn=<AddBackward0>) 

tensor(0.2663, grad_fn=<AddBackward0>) 

tensor(0.6760, grad_fn=<AddBackward0>) 

tensor(0.3471, grad_fn=<AddBackward0>) 

tensor(0.2873, grad_fn=<AddBackward0>) 

tensor(0.3050, grad_fn=<AddBackward0>) 

tensor(0.5858, grad_fn=<AddBackward0>) 

tensor(0.3543, grad_fn=<AddBackward0>) 

tensor(0.3524, grad_fn=<AddBackward0>) 

tensor(0.1967, grad_fn=<AddBackward0>) 

tensor(0.1747, grad_fn=<AddBackward0>) 

tensor(0.6016, grad_fn=<AddBackward0>) 

tensor(0.2534, grad_fn=<AddBackward0>) 

tensor(0.3347, grad_fn=<AddBackward0>) 

tensor(0.3545, grad_fn=<AddBackward0>) 

tensor(0.4738, grad_fn=<AddBackward0>) 

tensor(0.5240, grad_fn=<AddBackward0>) 

tensor(0.3342, grad_fn=<AddBackward0>) 

tensor(0.2459, grad_fn=<AddBackward0>) 

tensor(0.3102, grad_fn=<AddBackward0>) 

tensor(0.8702, grad_fn=<AddBackward0>) 

tensor(0.4035, grad_fn=<AddBackward0>) 

tensor(0.4035, grad_fn=<AddBackward0>) 

tensor(0.2742, grad_fn=<AddBackward0>) 

tensor(0.2093, g

In [30]:
output = net(trainX[:500])

In [35]:
output[100]

tensor([[ 1.2129, -0.1440, -0.2700, -0.3013, -0.2119, -0.1916, -0.0448],
        [ 1.1364, -0.2074, -0.2025, -0.2082, -0.1840, -0.2551, -0.1368],
        [ 1.4429, -0.1783, -0.2016, -0.2114, -0.2896, -0.2035, -0.1007],
        [ 1.8817, -0.1221, -0.3430, -0.4221, -0.2591, -0.3507, -0.2658],
        [ 1.4655, -0.2979, -0.1774, -0.1898, -0.1961, -0.1219, -0.1612],
        [ 1.6152, -0.2786, -0.1586, -0.3163, -0.2152, -0.3426, -0.1885],
        [ 2.0086, -0.3190, -0.1961, -0.3322,  0.0572, -0.2588, -0.2307],
        [ 1.9222, -0.2518, -0.2900, -0.4293, -0.3825, -0.3854, -0.1496],
        [ 1.7112, -0.2136, -0.3931, -0.3807, -0.3521, -0.3792, -0.2158],
        [ 1.7968, -0.3339, -0.2345, -0.2662, -0.2781, -0.3498, -0.2335],
        [ 1.9462, -0.2802, -0.3822, -0.4253, -0.3978, -0.4022, -0.2506],
        [ 1.1774, -0.2321, -0.1444, -0.2305, -0.1390, -0.0753, -0.1590],
        [ 1.5405, -0.1377, -0.2857, -0.2663, -0.1958, -0.2769, -0.1474],
        [ 1.1932, -0.2359, -0.1249, -0.2710, -0.086

In [32]:
trainY_vec[0]

tensor([0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])