# #4命名实体标注

In [1]:
%matplotlib inline

In [114]:
import torch
import torch.autograd as autograd
import torch.nn as nn
import torch.optim as optim
import pickle
import numpy as np
from sklearn.utils import shuffle
torch.manual_seed(1)

<torch._C.Generator at 0x112e3f0d0>

Helper functions to make the code more readable.



In [162]:
# 找到最大值
def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()

# 将seq数据转换成index用于模型
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

# 将seq数据转换成index用于模型，
# 此处支持batch处理以及padding操作
def prepare_sequence_with_batch_and_pad(seqs, to_ix, labels, tag_to_ix):
    max_length = 0
    idxs = []
    label_idxs = []
    for seq,label_seq in zip(seqs, labels):
        if max_length <= len(seq):
            max_length = len(seq)
        idxs.append([to_ix[w] for w in seq])
        label_idxs.append([tag_to_ix[w] for w in label_seq])
    # 补完整
    #print(max_length)
    for i in range(len(idxs)):
        idxs[i] = idxs[i] + [to_ix["<PAD>"]] * (max_length-len(idxs[i]))
        label_idxs[i] = label_idxs[i] + [tag_to_ix["<PAD>"]] * (max_length-len(label_idxs[i]))
        #print([tag_to_ix["<PAD>"]] * (max_length-len(label_idxs[i])))
 
    #print(idxs)
    #print(label_idxs)
    return torch.tensor(idxs, dtype=torch.long), torch.tensor(label_idxs, dtype=torch.long)

# Compute log sum exp in a numerically stable way for the forward algorithm
'''
max_score维度是1，max_score(1,-1)的维度是（1，1），加上expand变成（1，tag_size）
'''
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    # 
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + \
        torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

Create model



In [188]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        super(BiLSTM_CRF, self).__init__()
        
        '''
        定义模型需要用到的参数
        embedding_dim: 嵌入层维度
        hidden_dim:    RNN单元的hidden维度
        voacb_size:    词表大小
        tag_to_ix:     标注->index的映射
        tagset_size:   标注词表大小
        '''
        self.embedding_dim = embedding_dim   
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        
        # 使用了双向LSTM，因此hidden_dim要缩小至1/2
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)

        # 将LSTM的输出映射到标注空间
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # 转移矩阵的参数.  Entry i,j表示从j->i的转移
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))

        # 定义start和end标注的限制：
        # 1）不可能从其他标注转移到start
        # 2）不可能从end转移到其他标注
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        # 初始化RNN中的state
        #self.hidden = self.init_hidden()

    def init_hidden(self, sentence_len):
        return (torch.randn(2, sentence_len, self.hidden_dim // 2),
                torch.randn(2, sentence_len, self.hidden_dim // 2))

    # 预测序列的分数
    def _forward_alg(self, feats, tags):
        scores = []
        batchsize = tags.size(0)
        length = tags.size(1)       
        feats = feats.view(batchsize,length,-1)
        for sen_id in range(batchsize):
            # Do the forward algorithm to compute the partition function
            # 初始化，全部为-10000
            init_alphas = torch.full((1, self.tagset_size), -10000.)
            # START_TAG has all of the score.
            init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

            # Wrap in a variable so that we will get automatic backprop
            forward_var = init_alphas

            # 遍历整句话的每个word
            # feats: [-1, tag_size], -1表示batchsize * max_len
            for feat in feats[sen_id]:
                alphas_t = []  # The forward tensors at this timestep
                for next_tag in range(self.tagset_size):
                    # broadcast the emission score: it is the same regardless of
                    # the previous tag
                    # 用feat（分类层）对下一个标注的输出作为发射分数
                    emit_score = feat[next_tag].view(
                        1, -1).expand(1, self.tagset_size)

                    # 第i个entry的转移分数，是从i->下一个标注的转移分数
                    # trans_score所有其他标注到next_tag的分数
                    trans_score = self.transitions[next_tag].view(1, -1) # (1,tag_size)
                    # 第i个entry的next_tag_var，是在log-sum-exp计算之前，边i->下一个标注的值
                    next_tag_var = forward_var + trans_score + emit_score
                    # The forward variable for this tag is log-sum-exp of all the
                    # scores.
                    alphas_t.append(log_sum_exp(next_tag_var).view(1))
                forward_var = torch.cat(alphas_t).view(1, -1)
            terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
            alpha = log_sum_exp(terminal_var)
        
        scores.append(alpha)
        #print(alpha.size())
        return sum(scores)

    # 输入数据并且计算至分类层的输出，CRF算法中定义这里为发射矩阵
    def _get_lstm_features(self, sentence):
        self.hidden = self.init_hidden(len(sentence[0]))
        embeds = self.word_embeds(sentence).view(len(sentence), sentence.size(1), -1)
        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out = lstm_out.view(-1, self.hidden_dim)
        lstm_feats = self.hidden2tag(lstm_out)
        return lstm_feats

    # 根据真实的标签算出score
    def _score_sentence(self, feats, tags):
        scores = []
        batchsize = tags.size(0)
        length = tags.size(1)
        feats = feats.view(batchsize,length,-1)
        for sen_id in range(batchsize):
            score = torch.zeros(1)
            #temp = torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long).expand(tags.size(0),1)
            cur_tags = torch.cat([torch.tensor([self.tag_to_ix[START_TAG]], dtype=torch.long), tags[sen_id].view(-1)])
            #tags = tags.view(-1)
            for i, feat in enumerate(feats[sen_id]):
                # score等于当前score加上从当前tags转移至下一个tags的转移概率再加上下一个tags的分类层概率之和
                score = score + \
                    self.transitions[cur_tags[i + 1], cur_tags[i]] + feat[cur_tags[i + 1]]
            # 再加上从最后一个tags到stop的转移概率
            score = score + self.transitions[self.tag_to_ix[STOP_TAG], cur_tags[-1]]
        scores.append(score)
        return sum(scores)

    # 解码，得到预测的序列，以及预测序列的得分
    def _viterbi_decode(self, feats):
        backpointers = []

        # 和forward_alg部分的初始化一样
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0

        # forward_var at step i holds the viterbi variables for step i-1
        # fowward_var在第 i step存储了第 i-1 step的viterbi结果
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning
                # from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            # 这里的feat和之前的虽然有区别，但是直接加上了emit的全部序列的值
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        best_path.reverse()
        return path_score, best_path

    
    # 计算负对数似然函数
    def neg_log_likelihood(self, sentence, tags):
        # feats作为经过LSTM特征抽取至分类层的抽象输出
        feats = self._get_lstm_features(sentence)
        # 预测序列的score
        forward_score = self._forward_alg(feats, tags)
        # 计算真实的score
        gold_score = self._score_sentence(feats, tags)
        return forward_score - gold_score
    
    # 用在验证环节
    def forward(self, sentence):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        return score, tag_seq

Run training



In [92]:
new_train_data_dict = {'sentence_list':[],'label_list':[]}
for sentence_seq, label_seq in zip(raw_data["sentence_list"], raw_data["label_list"]):
    newsentence_seq = ' '.join(sentence_seq.split(" ")[1:-1])
    newlabel_seq = ' '.join(label_seq.split(" ")[1:-1])
    new_train_data_dict['sentence_list'].append(newsentence_seq)
    new_train_data_dict['label_list'].append(newlabel_seq)

In [109]:
tokens_level_sen = []
tokens_level_label = []
for sen, label in zip(raw_data['sentence_list'],raw_data['label_list']):
    temp_sen = sen.split()
    temp_label = label.split()
    tokens_level_sen.append(temp_sen)
    tokens_level_label.append(temp_label)
new_train_data_dict['tokens_level_sen'] = tokens_level_sen
new_train_data_dict['tokens_level_label'] = tokens_level_label

In [131]:
with open('source/conll-corpora/CoNLL-2003/generated/generated_eng.train','rb') as f:
    raw_data = pickle.load(f)
with open('source/conll-corpora/CoNLL-2003/generated/label_vocab','rb') as f:
    tag_to_ix, ix_to_tag = pickle.load(f)
with open('source/conll-corpora/CoNLL-2003/generated/sentence_vocab','rb') as f:
    word_to_ix, ix_to_word = pickle.load(f)

In [180]:
dataset_sen = raw_data["tokens_level_sen"]
dataset_label = raw_data["tokens_level_label"]
dataset_sen, dataset_label = shuffle(dataset_sen, dataset_label)
length = len(dataset_sen)
train = 0.8
valid = 0.1
test = 0.1
train_data_x, train_data_y = dataset_sen[:int(length*train)], dataset_label[:int(length*train)]
valid_data_x, valid_data_y = dataset_sen[int(length*train):int(length*(train+valid))], dataset_label[int(length*train):int(length*(train+valid))]
train_data_x, train_data_y = dataset_sen[:int(-length*test)], dataset_label[:int(-length*test)]


In [191]:
START_TAG = "<START>"
STOP_TAG = "<END>"
PAD_TAG = "<PAD>"
EMBEDDING_DIM = 100
HIDDEN_DIM = 128
BATCHSIZE = 32
EPOCH = 10
# Make up some training data
# training_data = [[(
#     "the wall street journal reported today that apple corporation made money".split(),
#     "B I I I O O O B I O O".split()
# ),(
#     "the wall street journal reported today that apple corporation made money".split(),
#     "B I I I O O O B I O O".split()
# ),(
#     "the wall street journal reported today that apple corporation made money".split(),
#     "B I I I O O O B I O O".split()
# ) ]]

# tt_train = ["the wall street journal reported today that apple corporation made money".split(),
#            "the wall street journal reported today that apple corporation made money".split(),
#            "the wall street journal reported today that apple corporation made money".split()]
# tt_label = ["B I I I O O O B I O O".split(),"B I I I O O O B I O O".split(),
#            "B I I I O O O B I O O".split()]

#(
#    "georgia tech is a university in georgia".split(),
#    "B I O O O O B".split()
#)
# word_to_ix = {}
# for sentence, tags in training_data[0]:
#     for word in sentence:
#         if word not in word_to_ix:
#             word_to_ix[word] = len(word_to_ix)

#tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}

model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
optimizer = optim.SGD(model.parameters(), lr=0.01, weight_decay=1e-4)

# Check predictions before training
# with torch.no_grad():
#     precheck_sent = prepare_sequence_with_batch(tt_train, word_to_ix)
#     precheck_tags = torch.tensor([[tag_to_ix[t] for t in tag_seq] for tag_seq in tt_label], dtype=torch.long)
#     print(model(precheck_sent))

total_step = len(train_data_x) // BATCHSIZE + 1
for epo in range(EPOCH):  
    
    for step in range(total_step):
        sentence = train_data_x[step * BATCHSIZE : (step + 1) * BATCHSIZE]
        tags = train_data_y[step * BATCHSIZE : (step + 1) * BATCHSIZE]
        # step 1.  清空梯度
        model.zero_grad()

        # step 2.  准备数据，包括word2index以及padding
        sentence_in, targets = prepare_sequence_with_batch_and_pad(sentence, word_to_ix, tags, tag_to_ix)
        print(sentence_in.size())
        # Step 3.  forward
        loss = model.neg_log_likelihood(sentence_in, targets)

        # Step 4. 计算loss、梯度以及更新参数
        loss.backward()
        optimizer.step()
        
        if(step+1) % 100 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epo+1, EPOCH, step+1, total_step, loss.item())) 
    
    valid_loss = 0.0
    valid_steps = 0
    print("Validation...")
    for step in range(len(valid_data_x) // BATCHSIZE + 1):
        sentence = valid_data_x[step * BATCHSIZE : (step + 1) * BATCHSIZE]
        tags = valid_data_y[step * BATCHSIZE : (step + 1) * BATCHSIZE]  
        
        sentence_in = prepare_sequence_with_batch(sentence, word_to_ix)
        targets = torch.tensor([[tag_to_ix[t] for t in tag_seq] for tag_seq in tags], dtype=torch.long)
        
        loss = model.neg_log_likelihood(sentence_in, targets)
        valid_loss += loss
        valid_steps += 1
    
    avg_valid_loss = float('%.4f' %(valid_loss / valid_steps))
    print ('Epoch [{}/{}], Valid_Loss: {:.4f}' 
           .format(epo+1, EPOCH, step+1, total_step, avg_valid_loss)) 
        
        
# 测试
with torch.no_grad():
    precheck_sent = prepare_sequence(training_data[0][0], word_to_ix)
    print(model(precheck_sent))


torch.Size([32, 38])
torch.Size([32, 47])
torch.Size([32, 39])


KeyboardInterrupt: 

In [None]:
def compute_matrixs(predict, answers, index):
    

Exercise: A new loss function for discriminative tagging
--------------------------------------------------------

It wasn't really necessary for us to create a computation graph when
doing decoding, since we do not backpropagate from the viterbi path
score. Since we have it anyway, try training the tagger where the loss
function is the difference between the Viterbi path score and the score
of the gold-standard path. It should be clear that this function is
non-negative and 0 when the predicted tag sequence is the correct tag
sequence. This is essentially *structured perceptron*.

This modification should be short, since Viterbi and score\_sentence are
already implemented. This is an example of the shape of the computation
graph *depending on the training instance*. Although I haven't tried
implementing this in a static toolkit, I imagine that it is possible but
much less straightforward.

Pick up some real data and do a comparison!


