# BiLSTM + CRF for NER
BiLSTM的输入时文档向量, 并不需要对文档向量padding到一致的长度, 因为对应1个文档, BiLSTM输出1个feats矩阵(len_doc, num_tag), 将feats矩阵输入CRF, CRF最主要是要训练其转移矩阵(tag->tag), 所以即使feats矩阵行数不一, CRF都能用feats训练转移矩阵

官方实现: https://pytorch.apachecn.org/docs/1.0/nlp_advanced_tutorial.html?h=Bidirection

In [1]:
import keras
from keras.models import Sequential
from keras.layers import *
import nltk
import torch.nn as nn
import torch

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


## 数据准备

In [2]:
num_sentence = 10000
with open(r'D:/CS/dataset/NLP/命名实体识别/单个字特征数据集/example.train', 'r', 
          encoding='utf-8') as f:
    i = 0;
    a = []
    while(i < num_sentence):     # 读取了10000句预料
        content = f.readline()
        if content == '\n':
            i += 1
        a.append(content)

In [3]:
def get_sentences(temp_list):
    index_list = []
    for i in range(len(temp_list)):
        if temp_list[i] == '\n':
            index_list.append(i)
    
    pointer = 0
    sentence_list = []
    for index in index_list:
        sentence_list.append(temp_list[pointer : index])
        pointer = index + 1
    return sentence_list

In [4]:
sentence_list = get_sentences(a)

In [5]:
sentence_list[-2]   # 最后一句没有句号

['但 O\n',
 '是 O\n',
 '， O\n',
 '金 O\n',
 '融 O\n',
 '危 O\n',
 '机 O\n',
 '也 O\n',
 '可 O\n',
 '能 O\n',
 '来 O\n',
 '自 O\n',
 '内 O\n',
 '部 O\n',
 '。 O\n']

In [6]:
# 去掉'\n'和句号
for sentence in sentence_list:
    try:
        sentence.remove('。 O\n')
    except:
        pass
    
    for i in range(len(sentence)):
        sentence[i] = tuple(sentence[i].strip('\n').split(' '))

In [7]:
sentence_list[-1]

[('首', 'B-ORG'),
 ('届', 'I-ORG'),
 ('立', 'I-ORG'),
 ('法', 'I-ORG'),
 ('会', 'I-ORG'),
 ('选', 'O'),
 ('举', 'O'),
 ('是', 'O'),
 ('“', 'O'),
 ('港', 'B-LOC'),
 ('人', 'O'),
 ('治', 'O'),
 ('港', 'B-LOC'),
 ('”', 'O'),
 ('重', 'O'),
 ('要', 'O'),
 ('一', 'O'),
 ('步', 'O')]

In [8]:
# 去除特殊符号
stop_words = '[a-zA-Z0-9’\n·\s＊!"：#$%&\'()◆●（）＠②*+,-./:;<=>?@，。?★、…【】《》？——“”‘’！[\$$^_`{|}~]+'
sentence_list = [[(word, tag) for word, tag in sentence if word not in stop_words] for sentence in sentence_list]

In [9]:
sentence_list[-1]

[('首', 'B-ORG'),
 ('届', 'I-ORG'),
 ('立', 'I-ORG'),
 ('法', 'I-ORG'),
 ('会', 'I-ORG'),
 ('选', 'O'),
 ('举', 'O'),
 ('是', 'O'),
 ('港', 'B-LOC'),
 ('人', 'O'),
 ('治', 'O'),
 ('港', 'B-LOC'),
 ('重', 'O'),
 ('要', 'O'),
 ('一', 'O'),
 ('步', 'O')]

## 准备数据

In [10]:
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence



In [11]:
docs = [[word for word, tag in sentence] for sentence in sentence_list]

In [12]:
# 总词数
dic = Dictionary(docs)
vocab_size = max(dic.keys()) + 1
print('总词数: ', vocab_size)     # 词id从0开始, 最大id 3818

# 寻找最大句长
sentences_len = [len(doc) for doc in docs]
max_len_a_doc = max(sentences_len)
print('最大文档长度', max_len_a_doc)

总词数:  3819
最大文档长度 550


查看sentences_len发现绝大多数文档长度在120以内

In [13]:
max_len_a_doc = 120
trainY = [[tag for word, tag in sentence] for sentence in sentence_list]    # padding之前

my_dic = dict([('O',0), ('B-LOC', 1), ('I-LOC', 2),
            ('B-PER', 3), ('I-PER', 4), ('B-ORG', 5), ('I-ORG', 6)])
def tag2idx(tag):
    return my_dic[tag]

# 先将tag映射为idx, 再对idx序列padding, 由于'O'对应idx为0, 所以句子padding部分都视作'O', 对应数字0
# 不能用padding, 会使 B-LOC, I-LOC 都变成 B, I
trainY_vec = [[my_dic[tag] for tag in sentence] for sentence in trainY]
trainY = trainY_vec
# trainY_vec = sequence.pad_sequences(trainY_vec, maxlen=max_len_a_doc, value=0, padding='post')
# trainY_vec = torch.LongTensor(trainY_vec)
# trainY_vec[0]

In [14]:
from sklearn.preprocessing import LabelBinarizer
import numpy as np

In [15]:
# 文本向量化
doc_vec_list = [dic.doc2idx(doc) for doc in docs]
trainX = doc_vec_list
# trainX = torch.LongTensor(doc_vec_list)
# trainY = trainY_vec
# 不能填充-1 因为embedding矩阵中没有-1索引
# padded_doc_vec_list = sequence.pad_sequences(doc_vec_list, maxlen=120, value=3819, padding='post')
# trainX = np.array(padded_doc_vec_list)
# trainX[0]

## BiLSTM
获取BiLSTM的输出feats

In [16]:
from torch import nn
import torch
from sklearn.preprocessing import LabelBinarizer

# 每次丢入1个句子训练
class BiLSTM_CRF(torch.nn.Module):
    def __init__(self, vocab_size, num_tag, embedding_dim, lstm_dim):  # vocab_size + padding的-1
        super(BiLSTM_CRF, self).__init__()
        self.vocab_size = vocab_size
        self.tag2idx_dict = dict([('O',0), ('B-LOC', 1), ('I-LOC', 2), ('B-PER', 3), ('I-PER', 4), ('B-ORG', 5), ('I-ORG', 6)])
        self.num_tag = len(self.tag2idx_dict.keys())
        
        # transitions[i][j] : 从标签j转移到标签i的分数, 为什么不是从i到j?
        # 因为每个时间步都要对某标签取上一时间步所有标签对该标签的转移分数, 转置后方便取一行
        # 不转置的话就要取transitions的1列
        self.transitions = nn.Parameter(torch.randn(self.num_tag, self.num_tag))
        
        # BiLSTM部分
        self.embedding_dim = embedding_dim
        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, lstm_dim, num_layers=1, bidirectional=True)
        self.lstm2tag = nn.Linear(lstm_dim*2, num_tag)   # BiLSTM 会把正反向lstm每个时间步的输出拼接为1行

#     def forward(self, x):
#         x = self.word_embeds(x)
#         x, hidden = self.lstm(x)
#         x = self.lstm2tag(x)
#         return x

    # 总的发射分数 == feats矩阵 * num_tag, 每个时间步的每个tag会被num_tag条路径经过
    # 总的转移分数
    
    def get_lstm_out(self, sentence_in):     # 发射分数
        embeds = self.word_embeds(sentence_in)
        lstm_out, lstm_hidden = self.lstm(embeds)
        feats = self.lstm2tag(lstm_out)
        # LSTM的输出是3D, feats[i]对应第i个句子的feats, 1次只训练1个句子, 所以直接取0转化为2D
        return feats[0]
    
    def get_loss(self, sentence_in, true_tags):
        feats = self.get_lstm_out(sentence_in)
        loss = self.global_score(feats) - self.given_tags_score(feats, true_tags)
        return loss
    
    def given_tags_score(self, feats, true_tags):      # 计算正确路径所得分数
        # 发射分数的计算采用矩阵
        '''
        true_tags: [0,4,2,1,5,6]   1个句子的标签序列, list
        '''
        lb = LabelBinarizer().fit(list(range(self.num_tag)))   # fit [0,1,2,3,4,5,6]
        tag_mat = torch.Tensor(lb.transform(true_tags))    # 每一行对应标签为1, 其余0
        emit_score = torch.mul(feats, tag_mat)         # 标签矩阵与feats对应点相乘
        emit_score = torch.sum(emit_score)
        
        trans_score = 0.0
        for i in range(1, len(true_tags)):
            a = true_tags[i]
            b = true_tags[i-1]
            trans_score += self.transitions[a][b]
            
        return emit_score + trans_score
    
    def log_exp(self, scores):
        return torch.log(torch.exp(scores))
    
    def global_score(self, feats):
        step_scores = feats[0]   # 初始化为第一个时间步的各标签发射概率
        for i in range(1, feats.shape[0]):
            # 每一时间步的每个标签都会加上发射分数
            step_scores += feats[i]
            for tag_idx in range(self.num_tag):
                # 每一时间步每个标签都加上上一个时间步每个标签对该标签的转移分数
                step_scores[tag_idx] += torch.sum(self.transitions[tag_idx])
        
        return torch.sum(step_scores)   # 会溢出

$$
\operatorname{Loss}=\log \left(\sum_{\hat{y} \in Y_{X}} e^{S(X, \hat{y})}\right)-S(X, y)
$$
global_scores()返回等号右边第一项, 经过了数学变换所以与公式上的看起来不一样

## 训练BiLSTM + CRF

In [20]:
embedding_dim = 64
lstm_dim = 128
num_tag = 7
BiLSTM_CRF_model = BiLSTM_CRF(vocab_size, num_tag, embedding_dim, lstm_dim)

optimizer = torch.optim.RMSprop(BiLSTM_CRF_model.parameters(), lr=0.01, weight_decay=1e-4)
epochs = 1

# 由于是1句1句训练的, 句子不定长, 每次取出句子训练要先化为tnsor
def sentence2tensor(non_tensor_sentence):
    return torch.LongTensor(non_tensor_sentence)

loss = 0.0
for i in range(epochs):
    for j in range(len(trainX[:2000])):
        x = sentence2tensor(trainX[j:j+1])
        loss = BiLSTM_CRF_model.get_loss(x, trainY[j])
        print(loss,'\n')
        loss.backward()
        optimizer.step()

tensor(-70.9691, grad_fn=<SubBackward0>) 

tensor(-521.4917, grad_fn=<SubBackward0>) 

tensor(-3068.4033, grad_fn=<SubBackward0>) 

tensor(-2653.6165, grad_fn=<SubBackward0>) 

tensor(-3972.8896, grad_fn=<SubBackward0>) 

tensor(-5465.9619, grad_fn=<SubBackward0>) 

tensor(-5095.4146, grad_fn=<SubBackward0>) 

tensor(-5512.6152, grad_fn=<SubBackward0>) 

tensor(-8703.5879, grad_fn=<SubBackward0>) 

tensor(-28062.7656, grad_fn=<SubBackward0>) 

tensor(-20681.3496, grad_fn=<SubBackward0>) 

tensor(-27650.3477, grad_fn=<SubBackward0>) 

tensor(-19349.7793, grad_fn=<SubBackward0>) 

tensor(-97163.6016, grad_fn=<SubBackward0>) 

tensor(-73273.3281, grad_fn=<SubBackward0>) 

tensor(-36614.6094, grad_fn=<SubBackward0>) 

tensor(-21915.1445, grad_fn=<SubBackward0>) 

tensor(-55478.2461, grad_fn=<SubBackward0>) 

tensor(-68601.2656, grad_fn=<SubBackward0>) 

tensor(-60168.0547, grad_fn=<SubBackward0>) 

tensor(-42618.2656, grad_fn=<SubBackward0>) 

tensor(-39760.6953, grad_fn=<SubBackward0>) 



tensor(-384155.6875, grad_fn=<SubBackward0>) 

tensor(-182206.7812, grad_fn=<SubBackward0>) 

tensor(-131651.1250, grad_fn=<SubBackward0>) 

tensor(-70834.1328, grad_fn=<SubBackward0>) 

tensor(-238906.7500, grad_fn=<SubBackward0>) 

tensor(-253330.4844, grad_fn=<SubBackward0>) 

tensor(-143605.4688, grad_fn=<SubBackward0>) 

tensor(-146482.8281, grad_fn=<SubBackward0>) 

tensor(-468878.3438, grad_fn=<SubBackward0>) 

tensor(-450307.8125, grad_fn=<SubBackward0>) 

tensor(-230747.2188, grad_fn=<SubBackward0>) 

tensor(-97046.2969, grad_fn=<SubBackward0>) 

tensor(-124639.1797, grad_fn=<SubBackward0>) 

tensor(-238847.6719, grad_fn=<SubBackward0>) 

tensor(-197050.6562, grad_fn=<SubBackward0>) 

tensor(-186992.1250, grad_fn=<SubBackward0>) 

tensor(-329714.8125, grad_fn=<SubBackward0>) 

tensor(-216126.8906, grad_fn=<SubBackward0>) 

tensor(-311599.8125, grad_fn=<SubBackward0>) 

tensor(-427322.1875, grad_fn=<SubBackward0>) 

tensor(-237040.9844, grad_fn=<SubBackward0>) 

tensor(-266540.

tensor(-431703.0312, grad_fn=<SubBackward0>) 

tensor(-401466.1875, grad_fn=<SubBackward0>) 

tensor(-324255.2812, grad_fn=<SubBackward0>) 

tensor(-369853.5312, grad_fn=<SubBackward0>) 

tensor(-361330.0312, grad_fn=<SubBackward0>) 

tensor(-242824.6875, grad_fn=<SubBackward0>) 

tensor(-287638.2500, grad_fn=<SubBackward0>) 

tensor(-281915.0625, grad_fn=<SubBackward0>) 

tensor(-434241.7812, grad_fn=<SubBackward0>) 

tensor(-353812.5000, grad_fn=<SubBackward0>) 

tensor(-229805.7188, grad_fn=<SubBackward0>) 

tensor(-576881.3125, grad_fn=<SubBackward0>) 

tensor(-485315.6875, grad_fn=<SubBackward0>) 

tensor(-325552.4062, grad_fn=<SubBackward0>) 

tensor(-262652.9062, grad_fn=<SubBackward0>) 

tensor(-686923.3750, grad_fn=<SubBackward0>) 

tensor(-295386.7500, grad_fn=<SubBackward0>) 

tensor(-237473.4219, grad_fn=<SubBackward0>) 

tensor(-285767.8438, grad_fn=<SubBackward0>) 

tensor(-190240.3125, grad_fn=<SubBackward0>) 

tensor(-462438.6875, grad_fn=<SubBackward0>) 

tensor(-60835

tensor(-399955.8438, grad_fn=<SubBackward0>) 

tensor(-186718.7188, grad_fn=<SubBackward0>) 

tensor(-309079.1250, grad_fn=<SubBackward0>) 

tensor(-307275.8438, grad_fn=<SubBackward0>) 

tensor(-392381.8125, grad_fn=<SubBackward0>) 

tensor(-144343.3750, grad_fn=<SubBackward0>) 

tensor(-362475.7188, grad_fn=<SubBackward0>) 

tensor(-293735.3750, grad_fn=<SubBackward0>) 

tensor(-473177.1562, grad_fn=<SubBackward0>) 

tensor(-536311.4375, grad_fn=<SubBackward0>) 

tensor(-461449.1250, grad_fn=<SubBackward0>) 

tensor(-419019.8125, grad_fn=<SubBackward0>) 

tensor(-242874.2812, grad_fn=<SubBackward0>) 

tensor(-139791.0938, grad_fn=<SubBackward0>) 

tensor(-979331.6250, grad_fn=<SubBackward0>) 

tensor(-644518.4375, grad_fn=<SubBackward0>) 

tensor(-572185.1875, grad_fn=<SubBackward0>) 

tensor(-344547.0938, grad_fn=<SubBackward0>) 

tensor(-340495.9062, grad_fn=<SubBackward0>) 

tensor(-696043.6250, grad_fn=<SubBackward0>) 

tensor(-895150.6250, grad_fn=<SubBackward0>) 

tensor(-10065

tensor(-810611.1875, grad_fn=<SubBackward0>) 

tensor(-507038.5625, grad_fn=<SubBackward0>) 

tensor(-811012.5000, grad_fn=<SubBackward0>) 

tensor(-783969.5000, grad_fn=<SubBackward0>) 

tensor(-316187.6250, grad_fn=<SubBackward0>) 

tensor(-561022.1875, grad_fn=<SubBackward0>) 

tensor(-491588.9688, grad_fn=<SubBackward0>) 

tensor(-820904.8750, grad_fn=<SubBackward0>) 

tensor(-401981.4062, grad_fn=<SubBackward0>) 

tensor(-419407.6875, grad_fn=<SubBackward0>) 

tensor(-2743053., grad_fn=<SubBackward0>) 

tensor(-426824.7812, grad_fn=<SubBackward0>) 

tensor(-361092.6250, grad_fn=<SubBackward0>) 

tensor(-484389.6250, grad_fn=<SubBackward0>) 

tensor(-328666.9062, grad_fn=<SubBackward0>) 

tensor(-345019.5000, grad_fn=<SubBackward0>) 

tensor(-568103.6250, grad_fn=<SubBackward0>) 

tensor(-612214.5625, grad_fn=<SubBackward0>) 

tensor(-320689., grad_fn=<SubBackward0>) 

tensor(-414406.4375, grad_fn=<SubBackward0>) 

tensor(-647053.0625, grad_fn=<SubBackward0>) 

tensor(-1346886.2500

tensor(-361996.9375, grad_fn=<SubBackward0>) 

tensor(-1070950.1250, grad_fn=<SubBackward0>) 

tensor(-1378694.8750, grad_fn=<SubBackward0>) 

tensor(-688594.6250, grad_fn=<SubBackward0>) 

tensor(-145050.6094, grad_fn=<SubBackward0>) 

tensor(-577514.5000, grad_fn=<SubBackward0>) 

tensor(-694434.0625, grad_fn=<SubBackward0>) 

tensor(-1096010.5000, grad_fn=<SubBackward0>) 

tensor(-410612.4375, grad_fn=<SubBackward0>) 

tensor(-1247628.7500, grad_fn=<SubBackward0>) 

tensor(-883536.2500, grad_fn=<SubBackward0>) 

tensor(-364376.4688, grad_fn=<SubBackward0>) 

tensor(-232061.2500, grad_fn=<SubBackward0>) 

tensor(-965774.7500, grad_fn=<SubBackward0>) 

tensor(-699139.8750, grad_fn=<SubBackward0>) 

tensor(-635118.3750, grad_fn=<SubBackward0>) 

tensor(-948576.3750, grad_fn=<SubBackward0>) 

tensor(-1525157., grad_fn=<SubBackward0>) 

tensor(-472377.6875, grad_fn=<SubBackward0>) 

tensor(-673317.3750, grad_fn=<SubBackward0>) 

tensor(-819558.4375, grad_fn=<SubBackward0>) 

tensor(-6011

tensor(-1225769.3750, grad_fn=<SubBackward0>) 

tensor(-764841.5000, grad_fn=<SubBackward0>) 

tensor(-1226290.1250, grad_fn=<SubBackward0>) 

tensor(-1095404., grad_fn=<SubBackward0>) 

tensor(-498537.1875, grad_fn=<SubBackward0>) 

tensor(-414022.7812, grad_fn=<SubBackward0>) 

tensor(-227201.7500, grad_fn=<SubBackward0>) 

tensor(-966571.1875, grad_fn=<SubBackward0>) 

tensor(-429761.1562, grad_fn=<SubBackward0>) 

tensor(-998861.6250, grad_fn=<SubBackward0>) 

tensor(-939837.5625, grad_fn=<SubBackward0>) 

tensor(-416449.7500, grad_fn=<SubBackward0>) 

tensor(-995704.1250, grad_fn=<SubBackward0>) 

tensor(-500498.5938, grad_fn=<SubBackward0>) 

tensor(-1150174.5000, grad_fn=<SubBackward0>) 

tensor(-439631.7812, grad_fn=<SubBackward0>) 

tensor(-1261213.1250, grad_fn=<SubBackward0>) 

tensor(-1509352.2500, grad_fn=<SubBackward0>) 

tensor(-551171.4375, grad_fn=<SubBackward0>) 

tensor(-1038227., grad_fn=<SubBackward0>) 

tensor(-1153566.8750, grad_fn=<SubBackward0>) 

tensor(-10909

tensor(-253104.0156, grad_fn=<SubBackward0>) 

tensor(-970721.5625, grad_fn=<SubBackward0>) 

tensor(-709278.8750, grad_fn=<SubBackward0>) 

tensor(-687896.2500, grad_fn=<SubBackward0>) 

tensor(-971301.5625, grad_fn=<SubBackward0>) 

tensor(-937080., grad_fn=<SubBackward0>) 

tensor(-911666.5625, grad_fn=<SubBackward0>) 

tensor(-2469486.7500, grad_fn=<SubBackward0>) 

tensor(-658304.2500, grad_fn=<SubBackward0>) 

tensor(-343144.0625, grad_fn=<SubBackward0>) 

tensor(-1286660.6250, grad_fn=<SubBackward0>) 

tensor(-591960.6875, grad_fn=<SubBackward0>) 

tensor(-1436739.1250, grad_fn=<SubBackward0>) 

tensor(-1239123., grad_fn=<SubBackward0>) 

tensor(-692600.1875, grad_fn=<SubBackward0>) 

tensor(-966563.8750, grad_fn=<SubBackward0>) 

tensor(-783284.5625, grad_fn=<SubBackward0>) 

tensor(-1465532.6250, grad_fn=<SubBackward0>) 

tensor(-968535.6250, grad_fn=<SubBackward0>) 

tensor(-282325.5625, grad_fn=<SubBackward0>) 

tensor(-1093769.3750, grad_fn=<SubBackward0>) 

tensor(-1057403

tensor(-938357.9375, grad_fn=<SubBackward0>) 

tensor(-706397.4375, grad_fn=<SubBackward0>) 

tensor(-789103.8125, grad_fn=<SubBackward0>) 

tensor(-1008617.8125, grad_fn=<SubBackward0>) 

tensor(-817379.0625, grad_fn=<SubBackward0>) 

tensor(-653342.5625, grad_fn=<SubBackward0>) 

tensor(-1387160.2500, grad_fn=<SubBackward0>) 

tensor(-1489343.1250, grad_fn=<SubBackward0>) 

tensor(-599507., grad_fn=<SubBackward0>) 

tensor(-1715100.5000, grad_fn=<SubBackward0>) 

tensor(-514502.5625, grad_fn=<SubBackward0>) 

tensor(-1067274.2500, grad_fn=<SubBackward0>) 

tensor(-1137264.6250, grad_fn=<SubBackward0>) 

tensor(-656236.3750, grad_fn=<SubBackward0>) 

tensor(-1902830., grad_fn=<SubBackward0>) 

tensor(-866398.6875, grad_fn=<SubBackward0>) 

tensor(-1051643.6250, grad_fn=<SubBackward0>) 

tensor(-1184871.5000, grad_fn=<SubBackward0>) 

tensor(-1636696.5000, grad_fn=<SubBackward0>) 

tensor(-560105.1250, grad_fn=<SubBackward0>) 

tensor(-717593.1250, grad_fn=<SubBackward0>) 

tensor(-714

tensor(-479541.5000, grad_fn=<SubBackward0>) 

tensor(-942674.4375, grad_fn=<SubBackward0>) 

tensor(-745216.3750, grad_fn=<SubBackward0>) 

tensor(-445330.5938, grad_fn=<SubBackward0>) 

tensor(-1104974.2500, grad_fn=<SubBackward0>) 

tensor(-2591671.5000, grad_fn=<SubBackward0>) 

tensor(-837574.0625, grad_fn=<SubBackward0>) 

tensor(-1843144.7500, grad_fn=<SubBackward0>) 

tensor(-988585.1875, grad_fn=<SubBackward0>) 

tensor(-971929.8125, grad_fn=<SubBackward0>) 

tensor(-416754.7812, grad_fn=<SubBackward0>) 

tensor(-1105476.1250, grad_fn=<SubBackward0>) 

tensor(-1261982.7500, grad_fn=<SubBackward0>) 

tensor(-2074854.1250, grad_fn=<SubBackward0>) 

tensor(-867444.6250, grad_fn=<SubBackward0>) 

tensor(-1544216.1250, grad_fn=<SubBackward0>) 

tensor(-850732.3750, grad_fn=<SubBackward0>) 

tensor(-629853.1250, grad_fn=<SubBackward0>) 

tensor(-1363972.5000, grad_fn=<SubBackward0>) 

tensor(-1128097.5000, grad_fn=<SubBackward0>) 

tensor(-418826.7812, grad_fn=<SubBackward0>) 

tens

tensor(-938973., grad_fn=<SubBackward0>) 

tensor(-685381., grad_fn=<SubBackward0>) 

tensor(-873888.8125, grad_fn=<SubBackward0>) 

tensor(-2088688.2500, grad_fn=<SubBackward0>) 

tensor(-3961685.7500, grad_fn=<SubBackward0>) 

tensor(-1006436., grad_fn=<SubBackward0>) 

tensor(-1271649.7500, grad_fn=<SubBackward0>) 

tensor(-555409.9375, grad_fn=<SubBackward0>) 

tensor(-426879.5000, grad_fn=<SubBackward0>) 

tensor(-843994.5000, grad_fn=<SubBackward0>) 

tensor(-1672224.2500, grad_fn=<SubBackward0>) 

tensor(-1076432.5000, grad_fn=<SubBackward0>) 

tensor(-2319914.5000, grad_fn=<SubBackward0>) 

tensor(-657329.6875, grad_fn=<SubBackward0>) 

tensor(-655484.7500, grad_fn=<SubBackward0>) 

tensor(-458797.3438, grad_fn=<SubBackward0>) 

tensor(-880504.7500, grad_fn=<SubBackward0>) 

tensor(-683920.7500, grad_fn=<SubBackward0>) 

tensor(-1144916.8750, grad_fn=<SubBackward0>) 

tensor(-1173665.3750, grad_fn=<SubBackward0>) 

tensor(-1798610.1250, grad_fn=<SubBackward0>) 

tensor(-886062.

tensor(-879739.3750, grad_fn=<SubBackward0>) 

tensor(-1180061.7500, grad_fn=<SubBackward0>) 

tensor(-3024347.7500, grad_fn=<SubBackward0>) 

tensor(-487591.9688, grad_fn=<SubBackward0>) 

tensor(-1963481.1250, grad_fn=<SubBackward0>) 

tensor(-468307.3750, grad_fn=<SubBackward0>) 

tensor(-594100.8750, grad_fn=<SubBackward0>) 

tensor(-480290.0625, grad_fn=<SubBackward0>) 

tensor(-2241008., grad_fn=<SubBackward0>) 

tensor(-1901159.2500, grad_fn=<SubBackward0>) 

tensor(-1691153.3750, grad_fn=<SubBackward0>) 

tensor(-2802173., grad_fn=<SubBackward0>) 

tensor(-1090745.3750, grad_fn=<SubBackward0>) 

tensor(-1424656.8750, grad_fn=<SubBackward0>) 

tensor(-1673474.7500, grad_fn=<SubBackward0>) 

tensor(-3785390., grad_fn=<SubBackward0>) 

tensor(-809283.4375, grad_fn=<SubBackward0>) 

tensor(-1520550.3750, grad_fn=<SubBackward0>) 

tensor(-845440.0625, grad_fn=<SubBackward0>) 

tensor(-1505564.3750, grad_fn=<SubBackward0>) 

tensor(-2412583.7500, grad_fn=<SubBackward0>) 

tensor(-119

In [21]:
torch.save(BiLSTM_CRF_model, r'D:\BiLSTM_CRF_model')

  "type " + obj.__name__ + ". It won't be checked "


将transtion矩阵初始化为每个时间步各标签概率和为1