### Extract features by Bert

In [None]:
"""Extract pre-computed feature vectors from a PyTorch BERT model."""

import argparse
import collections
import logging
import json
import re
import os

import locale
locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

os.environ["CUDA_VISIBLE_DEVICES"]="1"

import torch
import torch.nn as nn
import torch.nn.functional as F   # 神經網絡模塊中的常用功能 
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer

from utils.evaluate import *

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.WARNING)
logger = logging.getLogger(__name__)

In [None]:
LEFT_PARAENTHESIS = ['-LRB-', '-LSB-', '-LCB-']
RIGHT_PARAENTHESIS = ['-RRB-', '-RSB-', '-RCB-']
START_TAG = "[CLS]"
STOP_TAG = "[SEP]"

class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, unique_id, text, label=None):
        """Constructs a InputExample.

        Args:
          unique_id: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
          label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.unique_id = unique_id
        self.text = text
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, tokens, input_ids, input_mask, input_type_ids, label_ids):
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids
        self.label_ids = label_ids
        #self.label_mask = label_mask
        
        
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_label_map(self):
        """Gets the mapping of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_data(cls, input_file):
        """Reads a BIO data."""
        with open(input_file, 'r', encoding='utf8') as f:
            lines, words, labels = [], [], []

            for line in f:
                contents = line.strip()
                word = contents.split('\t')[0]
                label = contents.split('\t')[-1]


                if len(contents) == 0: # and words[-1] == '.':
                    w = ' '.join([word for word in words if len(word) > 0])
                    l = ' '.join([label for label in labels if len(label) > 0])
                    lines.append([w, l])
                    words = []
                    labels = []
                    continue

                if word in LEFT_PARAENTHESIS: word = '('
                elif word in RIGHT_PARAENTHESIS: word = ')'
                    
                words.append(word)
                labels.append(label)

        return lines


class NerProcessor(DataProcessor):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        
    def get_train_examples(self):
        return self._create_example(
            self._read_data(os.path.join(self.data_dir, "train.txt")), "train")

    def get_dev_examples(self):
        return self._create_example(
            self._read_data(os.path.join(self.data_dir, "dev.txt")), "dev")

    def get_test_examples(self):
        return self._create_example(
            self._read_data(os.path.join(self.data_dir, "test.txt")), "test")

    def get_label_map(self):
        return {"O": 0, "B-DSE": 1, "I-DSE": 2, START_TAG: 3, STOP_TAG: 4}
        # return {"O": 0, "B-DSE": 1, "I-DSE": 2, "[CLS]": 3, "[SEP]": 4}

    def _create_example(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            unique_id = "%s-%s" % (set_type, i)
            text, label = line
            examples.append(InputExample(unique_id=unique_id, text=text, label=label))
        return examples

In [None]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
            

def convert_examples_to_features(examples, max_seq_length, tokenizer, label_map):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        ### tokenize data
        text_list = example.text.split(' ')
        label_list = example.label.split(' ')
        
        assert len(text_list) == len(label_list)
        
        tokens, labels = [], []
        for i, (w, l) in enumerate(zip(text_list, label_list)):
            tk = tokenizer.tokenize(w)
            tokens.extend(tk)
            for m in range(len(tk)):
                if m == 0:         labels.append(l)
                elif l == "B-DSE": labels.append("I-DSE")
                else:              labels.append("O")

        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0 : (max_seq_length - 2)]
            labels = labels[0 : (max_seq_length - 2)]

        ### insert CLS and SEP
        # label_ids append("O") or append("[CLS]") not sure!
        ntokens, input_type_ids, label_ids = ["[CLS]"], [0], [label_map["O"]]
        for i, (tk, l) in enumerate(zip(tokens, labels)):
            ntokens.append(tk)
            input_type_ids.append(0)
            label_ids.append(label_map[l])
            
        ntokens.append("[SEP]")
        input_type_ids.append(0)
        # append("O") or append("[SEP]") not sure!
        label_ids.append(label_map["O"])
        
        ### convert to ids
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        
        ### create mask
        input_mask = [1] * len(input_ids)

        ### padding to max seq len
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)
            # we don't concerned about it!
            label_ids.append(0)
        
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(input_type_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        
        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in ntokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
            logger.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
            
        features.append(
            InputFeatures(
                tokens=ntokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                label_ids=label_ids))
        
    features = sorted(features, key=lambda x: len(x.tokens), reverse=True)
    return features

In [None]:
def convert_features_to_dataloader(features, local_rank, batch_size):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.float)
    all_input_labels = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_input_labels, all_example_index)
    
    if local_rank == -1: sampler = SequentialSampler(dataset)
    else:                sampler = DistributedSampler(dataset)
        
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    return dataloader

In [None]:
# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
# from pytorch_pretrained_bert.modeling import BertModel

In [3]:
from layers.RNN import LSTM
from layers.layers import CRF

import torch
import torch.nn as nn

BATCH_SIZE = args.batch_size
EMBED_SIZE = 300
HIDDEN_SIZE = 1000
NUM_LAYERS = 2
DROPOUT = 0.5
BIDIRECTIONAL = True
NUM_DIRS = 2 if BIDIRECTIONAL else 1
LEARNING_RATE = 0.01
WEIGHT_DECAY = 1e-4
SAVE_EVERY = 1

PAD = "<PAD>" # padding
SOS = "<SOS>" # start of sequence
EOS = "<EOS>" # end of sequence
UNK = "<UNK>" # unknown token

PAD_IDX = 0
SOS_IDX = 1
EOS_IDX = 2
UNK_IDX = 3

torch.manual_seed(1)
CUDA = torch.cuda.is_available()

class LSTM_CRF(nn.Module):
    
    def __init__(self, embedding_size, num_tags):
        super().__init__()
        
        self.lstm = LSTM(embedding_size, num_tags)
        self.crf = CRF(num_tags)
        
    # for training
    def forward(self, x, mask, y): 
        # mask = x.data.gt(0).float()
        h = self.lstm(x, mask)
        Z = self.crf.forward(h, mask)
        score = self.crf.score(h, y, mask)
        return Z - score # NLL loss

    def decode(self, x): # for prediction
        mask = x.data.gt(0).float()
        h = self.lstm(x, mask)
        return self.crf.decode(h, mask)

NameError: name 'args' is not defined

In [None]:
from pytorch_pretrained_bert.modeling import BertModel

class BertTagger(nn.Module):
 
    def __init__(self, bert_model, label_map,
                 is_frozen=True, mode="last"):
        super(BertTagger, self).__init__()
        
        self.label_map = label_map
        self.loss_function = loss_function
        self.tagset_size = len(self.label_map)
        self.mode = mode
        
        self.bert_model = BertModel.from_pretrained(bert_model)
    
        self.embedding_size = self.bert_model.config.hidden_size
        self.vocab_size = self.bert_model.config.vocab_size

        if is_frozen: self.bert_model.eval()

        
    def _forward_alg(self, input_ids, input_mask):
        batch_size, max_seq_len = input_ids.shape
        
        all_encoder_layers, _ = self.bert_model(input_ids, 
                                                token_type_ids=None, 
                                                attention_mask=input_mask)
        
        if self.mode == "last":
            all_encoder_layers = all_encoder_layers[-1]
#         elif args.mode == "weighted":
#             all_encoder_layers = torch.stack([a * b for a, b in zip(all_encoder_layers, self.bert_weights)])
#             return self.bert_gamma * torch.sum(all_encoder_layers, dim=0)
        
        y_pred = self.lstm_crf(all_encoder_layers)
        

        ### not sure mask
#         y_ = torch.mul(tag_scores, mask.unsqueeze(-1).expand([batch_size, seq_len, self.tagset_size]))
        y_pred = y_pred.view(-1, self.tagset_size)
        
        return y_pred

        
    def forward(self, input_ids, input_mask, input_labels):
        y_pred = self._forward_alg(input_ids, input_mask)
        y_true = input_labels.view(-1)

        loss = self.lstm_crf.neg_log_likelihood(y_pred, y_true)

        return y_pred, loss

In [None]:
def main(args):
    if args.no_cuda:
        device = torch.device("cpu")
        n_gpu = 1
    elif args.local_rank == -1:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        n_gpu = torch.cuda.device_count() # 1
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        
    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    processor = NerProcessor(args.input_dir)
    label_map = processor.get_label_map()
    examples = processor.get_train_examples()

    features = convert_examples_to_features(examples=examples, 
                                            max_seq_length=args.max_seq_length, 
                                            tokenizer=tokenizer, 
                                            label_map=label_map)
    
    dataloader = convert_features_to_dataloader(features=features, 
                                                local_rank=args.local_rank, 
                                                batch_size=args.batch_size)

#     model = BertTagger(args.bert_model, label_map=label_map, mode="last", 
#                        loss_function=loss_function)
    model = BiLSTM_CRF(args.bert_model, device, mode="last", 
                       hidden_dim=100, dropout=0, num_layers=1,
                       bidirectional=True, tag_to_ix=label_map)
    model.to(device)
    
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), 
                              lr=args.learning_rate, momentum=args.momentum)
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, 
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    ######### TRAIN
    best_count, tmp_loss = 0, 0
    for epoch in range(args.epochs):
        print("Epoch:", epoch)
        total_loss = 0.
        
        for input_ids, input_mask, input_labels, example_indices in dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            input_labels = input_labels.to(device)

            optimizer.zero_grad()

            loss = model(input_ids, input_mask, input_labels, is_training=True)
    
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                # rescale loss for fp16 training
                # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
                
            loss = loss.mean()
            total_loss += loss

            loss.backward()
            optimizer.step()

        print("Loss:", total_loss)
        
        if sum(total_loss) > tmp_loss:
            best_count += 1
        else:
            tmp_loss = sum(total_loss)
            best_count = 0
            
        if best_count == args.early_stop:
            torch.save(model.state_dict(), os.path.join(args.output_dir, args.save_model_name))
            print("EARLY STOPPED")
            break
   

    ######### TEST
    model.load_state_dict(torch.load(os.path.join(args.output_dir, "store.model")))
    
    examples = processor.get_test_examples()
    
    features = convert_examples_to_features(examples=examples, 
                                             max_seq_length=args.max_seq_length, 
                                             tokenizer=tokenizer, 
                                             label_map=label_map)

    dataloader = convert_features_to_dataloader(features=features, 
                                                 local_rank=args.local_rank, 
                                                 batch_size=args.batch_size)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", args.batch_size)

    y_preds_tk_basis, y_trues_tk_basis = [], []
    y_preds, y_trues = [], []
    total_loss = 0.
   
    for input_ids, input_mask, input_labels, example_indices in dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        input_labels = input_labels.to(device)

        with torch.no_grad():
            y_pred, loss = model(input_ids, input_mask, input_labels, is_training=False)
            total_loss += loss

#             y_pred = torch.max(y_pred, 2)[1].view([len(input_ids), -1])
            y_pred = torch.max(y_pred, 1)[1].view([len(input_ids), -1])

            ### 用 * mask 
            # y_pred = torch.mul(y_pred, input_mask) # [batch, max_seq_len]
            # y_true = torch.mul(input_labels, input_mask)
            y_pred = [y_[ : sum(input_mask[i])] for i, y_ in enumerate(y_pred)]
            y_true = [y[ : sum(input_mask[i])] for i, y in enumerate(input_labels)]
          
            y_preds.extend(y_pred)
            y_trues.extend(y_true)
            y_preds_tk_basis.extend([el for y_ in y_pred for el in y_])
            y_trues_tk_basis.extend([el for y_ in y_true for el in y_])
            

    print("loss:", total_loss)
    
    result = token_basis_evaluate(y_preds_tk_basis, y_trues_tk_basis, label_map, output_dict=False)
    print(result)

    result = overlap_evaluate(y_preds, y_trues, label_map)
    print(result)    

    import pickle
    with open(os.path.join(args.output_dir, "results.txt"), "wb") as file:
        pickle.dump([y_preds, y_trues, y_preds_tk_basis, y_trues_tk_basis], file)

In [None]:
class Args(object):
    """A single set of features of data."""

    def __init__(self):
        self.input_dir = './dse/'
        self.output_dir = '.'
        self.save_model_name = 'bert_bilstm_crf.model'
        self.bert_model = 'bert-base-uncased'
        self.mode = 'last'
        self.max_seq_length = 128
        self.epochs = 5
        self.batch_size = 32
        self.learning_rate = 3e-1
        self.momentum = 0.7
        self.is_frozen = True
        self.local_rank = -1
        self.no_cuda = False
        self.fp16 = False
        self.loss_scale = 128.
        self.gradient_accumulation_steps = 1
        self.early_stop = 5
        
main(Args())

In [None]:
# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()

#     ## Required parameters
#     parser.add_argument("--input_dir", default=None, type=str, required=True)
#     parser.add_argument("--output_dir", default=None, type=str, required=True)
#     parser.add_argument("--bert_model", default=None, type=str, required=True,
#                         help="Bert pre-trained model selected in the list: bert-base-uncased, "
#                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

#     ## Other parameters
#     parser.add_argument("--mode", default="last", type=str)
#     parser.add_argument("--max_seq_length", default=128, type=int,
#                         help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
#                             "than this will be truncated, and sequences shorter than this will be padded.")
#     parser.add_argument("--epochs", default=200, type=int, help="Number of epoch.")
#     parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
#     parser.add_argument("--learning_rate", default=1e-4, type=float, help="Learning rate for gradient.")
#     parser.add_argument("--momentum", default=0.7, type=float)
#     parser.add_argument("--is_frozen", default=True, type=bool)
#     parser.add_argument("--local_rank",
#                         type=int,
#                         default=-1,
#                         help = "local_rank for distributed training on gpus")
#     parser.add_argument("--no_cuda",
#                         default=False,
#                         action='store_true',
#                         help="Whether not to use CUDA when available")
#     parser.add_argument('--fp16',
#                         default=False,
#                         action='store_true',
#                         help="Whether to use 16-bit float precision instead of 32-bit")
#     parser.add_argument('--loss_scale',
#                         type=float, default=128,
#                         help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
# parser.add_argument('--gradient_accumulation_steps',
#                         type=int,
#                         default=1,
#                         help="Number of updates steps to accumualte before performing a backward/update pass.")          

#     args = parser.parse_args()

#     main(args)