In [1]:
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence

import torch
import torch.nn as nn
import torch.optim as optim       # 模型優化器模塊
import torch.autograd as autograd # torch中自動計算梯度模塊
import torch.nn.functional as F   # 神經網絡模塊中的常用功能 

import numpy as np
import pickle, math, datetime, time, os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

is_cuda = torch.cuda.is_available()

In [2]:
# from tensorboardX import SummaryWriter
# writer = SummaryWriter('log')

In [3]:
from utils.preprocess import get_sentence_target, group_data, split_dataset

In [15]:
class BiLSTM_CRF(nn.Module):

    def __init__(self, embedding_dim, embedding_weights,
                 hidden_dim, tag_to_ix, dropout, num_layers, bidirectional):

        super(BiLSTM_CRF, self).__init__()
        
        self.direction = 2 if bidirectional else 1
        self.hidden_dim = hidden_dim // self.direction
        self.num_layers = num_layers
        self.embedding_dim = embedding_dim
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        weights = torch.cuda.FloatTensor(embedding_weights) if is_cuda else torch.FloatTensor(embedding_weights)
        self.word_embeddings = nn.Embedding.from_pretrained(weights, freeze=True)
        
        self.lstm = nn.LSTM(embedding_dim, self.hidden_dim, 
                            dropout=dropout, num_layers=self.num_layers,
                            bidirectional=bidirectional,
                            batch_first=True)
        
        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(self.hidden_dim * self.direction, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of transitioning *to* i *from* j.
        init_transitions = torch.randn(self.tagset_size, self.tagset_size)
   
        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag
        init_transitions.data[tag_to_ix[START_TAG], :] = -10000.0
        init_transitions.data[:, tag_to_ix[STOP_TAG]] = -10000.0
        
        if is_cuda: init_transitions = init_transitions.cuda()

        self.transitions = nn.Parameter(init_transitions)

        
    def init_hidden(self, batch_size):
#         h_states = autograd.Variable(torch.zeros(self.num_layers * self.direction, batch_size, self.hidden_dim))
#         c_states = autograd.Variable(torch.zeros(self.num_layers * self.direction, batch_size, self.hidden_dim))

        h_states = torch.randn(self.num_layers * self.direction, batch_size, self.hidden_dim)
        c_states = torch.randn(self.num_layers * self.direction, batch_size, self.hidden_dim)
        
        return (h_states.cuda(), c_states.cuda()) if is_cuda else (h_states, c_states)

    
    def _forward_alg(self, feats):
        # Do the forward algorithm to compute the partition function
        init_alphas = torch.full((1, self.tagset_size), -10000.)
        if is_cuda: init_alphas = init_alphas.cuda()
        
        # START_TAG has all of the score.
        init_alphas[0][self.tag_to_ix[START_TAG]] = 0.

        # Wrap in a variable so that we will get automatic backprop
        forward_var = init_alphas

        # Iterate through the sentence
        for feat in feats:
            alphas_t = []  # The forward tensors at this timestep
            for next_tag in range(self.tagset_size):
                
                # broadcast the emission score: it is the same regardless of the previous tag
                emit_score = feat[next_tag].view(1, -1).expand(1, self.tagset_size)
                
                # the ith entry of trans_score is the score of transitioning to next_tag from i
                trans_score = self.transitions[next_tag].view(1, -1)
                
                # The ith entry of next_tag_var is the value for the
                # edge (i -> next_tag) before we do log-sum-exp
                next_tag_var = forward_var + trans_score + emit_score
                
                # The forward variable for this tag is log-sum-exp of all the scores.
                alphas_t.append(log_sum_exp(next_tag_var).view(1))
                
            forward_var = torch.cat(alphas_t).view(1, -1)
            
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        alpha = log_sum_exp(terminal_var)
        
        return alpha
    

    def _get_lstm_features(self, sentence, lengths):
        batch_size, seq_len = sentence.shape
        self.hidden = self.init_hidden(batch_size)

        embeds = self.word_embeddings(sentence)
        embeds = pack_padded_sequence(embeds, lengths, batch_first=True)

        lstm_out, self.hidden = self.lstm(embeds, self.hidden)
        lstm_out, lengths = pad_packed_sequence(lstm_out, batch_first=True)
        
#         lstm_out = lstm_out.view(len(sentence), self.hidden_dim)
        lstm_out = lstm_out.contiguous().view(batch_size * seq_len, -1)
        lstm_feats = self.hidden2tag(lstm_out) 
        
        return lstm_feats

    
    def _score_sentence(self, feats, tags):
        # Gives the score of a provided tag sequence
        score = torch.zeros(1)
        if is_cuda: 
            score = torch.zeros(1).cuda()
#             [tensor([3], device='cuda:0'), tensor([[0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2]], device='cuda:0')]
            print(torch.cuda.LongTensor([self.tag_to_ix[START_TAG]]))
            print(tags)
            tags = torch.cat([torch.cuda.LongTensor([self.tag_to_ix[START_TAG]]), tags])
        else:
            score = torch.zeros(1)
            tags = torch.cat([torch.LongTensor([self.tag_to_ix[START_TAG]]), tags])
            
        for i, feat in enumerate(feats):
            score = score + self.transitions[tags[i + 1], tags[i]] + feat[tags[i + 1]]
            
        score = score + self.transitions[self.tag_to_ix[STOP_TAG], tags[-1]]
        
        return score

    
    def _viterbi_decode(self, feats):
        backpointers = []

        # Initialize the viterbi variables in log space
        init_vvars = torch.full((1, self.tagset_size), -10000.)
        init_vvars[0][self.tag_to_ix[START_TAG]] = 0
        if is_cuda: init_vvars = init_vvars.cuda()
            
        # forward_var at step i holds the viterbi variables for step i-1
        forward_var = init_vvars
        for feat in feats:
            bptrs_t = []  # holds the backpointers for this step
            viterbivars_t = []  # holds the viterbi variables for this step

            for next_tag in range(self.tagset_size):
                # next_tag_var[i] holds the viterbi variable for tag i at the
                # previous step, plus the score of transitioning from tag i to next_tag.
                # We don't include the emission scores here because the max
                # does not depend on them (we add them in below)
                next_tag_var = forward_var + self.transitions[next_tag]
                best_tag_id = argmax(next_tag_var)
                bptrs_t.append(best_tag_id)
                viterbivars_t.append(next_tag_var[0][best_tag_id].view(1))
                
            # Now add in the emission scores, and assign forward_var to the set
            # of viterbi variables we just computed
            forward_var = (torch.cat(viterbivars_t) + feat).view(1, -1)
            backpointers.append(bptrs_t)

        # Transition to STOP_TAG
        terminal_var = forward_var + self.transitions[self.tag_to_ix[STOP_TAG]]
        best_tag_id = argmax(terminal_var)
        path_score = terminal_var[0][best_tag_id]

        # Follow the back pointers to decode the best path.
        best_path = [best_tag_id]
        for bptrs_t in reversed(backpointers):
            best_tag_id = bptrs_t[best_tag_id]
            best_path.append(best_tag_id)
            
        # Pop off the start tag (we dont want to return that to the caller)
        start = best_path.pop()
        
        assert start == self.tag_to_ix[START_TAG]  # Sanity check
        
        best_path.reverse()
        
        return path_score, best_path

    
    def neg_log_likelihood(self, sentence, tags, lengths):
        feats = self._get_lstm_features(sentence, lengths)
        forward_score = self._forward_alg(feats)
        gold_score = self._score_sentence(feats, tags)
        
        return forward_score - gold_score

    
    def forward(self, sentence, lengths):  # dont confuse this with _forward_alg above.
        # Get the emission scores from the BiLSTM
        lstm_feats = self._get_lstm_features(sentence, lengths)

        # Find the best path, given the features.
        score, tag_seq = self._viterbi_decode(lstm_feats)
        
        return score, tag_seq

In [5]:
def sequence_to_ixs(seq, to_ix):
    ixs = [to_ix[w] if w in to_ix else to_ix[UNK_TOKEN] for w in seq]
    return torch.cuda.LongTensor(ixs) if is_cuda else torch.LongTensor(ixs)


def ixs_to_sequence(seq, to_word):
    tokens = [to_word[ix] for ix in seq]
    return tokens


def argmax(vec):
    # return the argmax as a python int
    _, idx = torch.max(vec, 1)
    return idx.item()


# Compute log sum exp in a numerically stable way for the forward algorithm
def log_sum_exp(vec):
    max_score = vec[0, argmax(vec)]
    max_score_broadcast = max_score.view(1, -1).expand(1, vec.size()[1])
    return max_score + torch.log(torch.sum(torch.exp(vec - max_score_broadcast)))

In [6]:
def train(training_data):
    total_num = len(training_data)
    batch_num = math.ceil(total_num / batch_size)

    for epoch in range(epochs):
        
        for i in range(batch_num):
            model.zero_grad()

            data = training_data[i * batch_size : (i+1) * batch_size]

            x = list(map(lambda pair: sequence_to_ixs(pair[0], word_to_ix), data))
            y = list(map(lambda pair: sequence_to_ixs(pair[1], tag_to_ix), data))

            assert len(x) == len(y)

            lengths = list(map(lambda x: x.shape[0], x))

            padded_seqs = pad_sequence(x, batch_first=True)
            padded_tags = pad_sequence(y, batch_first=True)

            loss_function = model.neg_log_likelihood(padded_seqs, padded_tags, lengths)
            # predict_tags = model(padded_seqs, lengths)
            
            # true_tags = padded_tags.view(-1)

            # loss = loss_function(predict_tags, true_tags)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 5 == 0:
            print("epoch: {}, loss: {}".format(epoch+1, loss))
            
            # writer.add_scalar('Train/Loss'.format(epoch), loss.data[0], epoch)

In [7]:
from utils.evaluate import evaluate

def test(test_data):
    with torch.no_grad():
        data = test_data
        
        x = list(map(lambda pair: sequence_to_ixs(pair[0], word_to_ix), data))
        y = list(map(lambda pair: sequence_to_ixs(pair[1], tag_to_ix), data))

        lengths = list(map(lambda x: x.shape[0], x))

        padded_seqs = pad_sequence(x, batch_first=True)
        y_predicts = model(padded_seqs, lengths)
        y_predicts = torch.max(y_predicts, 1)[1].view([len(lengths), -1])

        y_trues = y
        y_predicts = [y_[:lengths[i]] for i, y_ in enumerate(y_predicts)]

        result = evaluate(y_predicts, y_trues)
        
        return result, (y_predicts, y_trues)

In [8]:
# Constant
UNK_TOKEN = '<UNK>'
START_TAG = "<START>"
STOP_TAG = "<STOP>"

# Data 
file_name = 'dataset/ese.txt'

# Store model
model_path = 'models/' + datetime.datetime.utcfromtimestamp(time.time()).strftime("%Y%m%d_%H%M") + '.model'

# Word embeddings
source = 'glove'

# Model hyper-parameters
embedding_dim = 300
hidden_dim = 100
learning_rate = 0.01
momentum = 0.7
dropout = 0
num_layers = 3
bidirectional = True
batch_size = 1
epochs = 200

In [9]:
### Get Word Embeddings
with open(f'dataset/{source}.pickle', 'rb') as handle:
    word_vectors, embedding_weights, word_to_ix, ix_to_word = pickle.load(handle)

### Manual Tag
tag_to_ix = {"B": 0, "I": 1, "O": 2, START_TAG: 3, STOP_TAG: 4}
ix_to_tag = {0: "B", 1: "I", 2: "O", 3: START_TAG, 4: STOP_TAG}

In [16]:
# model = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)
model = BiLSTM_CRF(embedding_dim, embedding_weights,
                   hidden_dim, tag_to_ix, 
                   dropout=dropout,num_layers=num_layers,
                   bidirectional=bidirectional)

model.cuda()

train_data = [(
    "the wall street journal reported today that apple corporation made money".split(),
    "B I I I O O O B I O O".split()
), (
    "georgia tech is a university in georgia".split(),
    "B I O O O O B".split()
)]

train(train_data)


# # Check predictions after training
# with torch.no_grad():
#     precheck_sent = sequence_to_ixs(training_data[0][0], word_to_ix)
#     print(model(precheck_sent))

tensor([3], device='cuda:0')
tensor([[0, 1, 1, 1, 2, 2, 2, 0, 1, 2, 2]], device='cuda:0')


RuntimeError: invalid argument 0: Tensors must have same number of dimensions: got 2 and 1 at /pytorch/aten/src/THC/generic/THCTensorMath.cu:78

In [None]:
best_result = 0
results = []
for num in range(10):
    print("10-fold:", num, "="*50)
    
    # Get Data and split
    documents = group_data(file_name)
    train_data, test_data, dev_data = split_dataset(documents, num)

    # Create Model
    model = LSTMTagger(embedding_dim, embedding_weights,
                       hidden_dim, tag_to_ix, 
                       dropout=dropout,num_layers=num_layers,
                       bidirectional=bidirectional)

    if is_cuda: model.cuda()
        
    loss_function = nn.NLLLoss()

    optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=learning_rate, momentum=momentum)

    train(train_data)
    
    result, _ = test(test_data)
    
    if result['proportional']['f1'] >= best_result:
        best_result = result['proportional']['f1']        
        torch.save(model.state_dict(), model_path)
        print("Store Model with score: {}".format(best_result))
        
    results.append(result)

In [None]:
bin_result = { 'precision': .0, 'recall': .0, 'f1': .0 }
prop_result = { 'precision': .0, 'recall': .0, 'f1': .0 }

for i, result in enumerate(results):
    for key in result['binary']: bin_result[key] += (result['binary'][key] / len(results))
    for key in result['proportional']: prop_result[key] += (result['proportional'][key] / len(results))
    
    print("10-fold: {}".format(i))
    print("Binary Overlap\t\tPrecision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}".format(**result['binary']))
    print("Proportional Overlap\tPrecision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}".format(**result['proportional']))

print("\nAverage", "="*70)
print("Binary Overlap\t\tPrecision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}".format(**bin_result))
print("Proportional Overlap\tPrecision: {precision:.3f}, Recall: {recall:.3f}, F1: {f1:.3f}".format(**prop_result))


print("\nParams", "=" * 70)
print(f'''model_path = {model_path}
file_name = {file_name}
source = {source}
embedding_dim = {embedding_dim}
hidden_dim = {hidden_dim}
learning_rate = {learning_rate}
momentum = {momentum}
dropout = {dropout}
num_layers = {num_layers}
bidirectional = {bidirectional}
batch_size = {batch_size}
epochs = {epochs}''')

### Load model and observe the prediction

In [39]:
# model_path = 'models/20181031_0613.model'
# fname = 'ese'

# # Get Data and split
# documents = group_data(file_name)
# train_data, test_data, dev_data = split_dataset(documents, 0)


# # Create Model
# model = LSTMTagger(embedding_dim, embedding_weights,
#                    hidden_dim, 
#                    len(tag_to_ix), 
#                    dropout=dropout,
#                    num_layers=num_layers,
#                    bidirectional=bidirectional)

# model.load_state_dict(torch.load(model_path))

# if is_cuda: model.cuda()

# result, y_pair = test(test_data)

# print(result)

Train size: 7507, Test size: 734, Dev size: 2870
{'binary': {'precision': 0.7154255319148937, 'recall': 0.6078184110970997, 'f1': 0.6572466284289151}, 'proportional': {'precision': 0.6565918973166316, 'recall': 0.4351816027896689, 'f1': 0.523435885236527}}


In [40]:
# ys_, ys = y_pair

# ws = open(f'dataset/failure_{fname}.txt', 'w', encoding='utf8')
# correct = 0
# for (tks, tags), y_, y in zip(test_data, ys_, ys):
#     if sum(torch.eq(y_, y)) == len(tks):
#         correct += 1
#     else:
#         sents, trues, bios = [], [], []
#         for i, tk in enumerate(tks):
#             length = len(tk)
#             sents.append(tk)
#             bios.append('{:>{length}s}'.format(ix_to_tag[int(y_[i])], length=length))
#             trues.append('{:>{length}s}'.format(ix_to_tag[int(y[i])], length=length))
            
#         print(' '.join(sents), file=ws)
#         print(' '.join(bios), file=ws)
#         print(' '.join(trues), file=ws)
#         print("="*20, file=ws)
        
# ws.close()
# print(correct / len(test_data))

0.4032697547683924


### Calculate the number of parameters

In [None]:
# for name, param in model.named_parameters():
#     print( name, param.shape)
    
# total_param = sum(p.numel() for p in model.parameters())
# print(total_param)