### Extract features by Bert

In [11]:
"""Extract pre-computed feature vectors from a PyTorch BERT model."""

import argparse
import collections
import logging
import json
import re
import os
import locale

locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')

import torch
import torch.nn as nn
import torch.nn.functional as F   # 神經網絡模塊中的常用功能 
from torch.utils.data import TensorDataset, DataLoader, SequentialSampler
from torch.utils.data.distributed import DistributedSampler

from pytorch_pretrained_bert.tokenization import BertTokenizer

from utils.evaluate import *

logging.basicConfig(format = '%(asctime)s - %(levelname)s - %(name)s -   %(message)s', 
                    datefmt = '%m/%d/%Y %H:%M:%S',
                    level = logging.WARNING)
logger = logging.getLogger(__name__)

In [4]:
class InputExample(object):
    """A single training/test example for simple sequence classification."""

    def __init__(self, unique_id, text, label=None):
        """Constructs a InputExample.

        Args:
          unique_id: Unique id for the example.
          text_a: string. The untokenized text of the first sequence. For single
            sequence tasks, only this sequence must be specified.
          label: (Optional) string. The label of the example. This should be
            specified for train and dev examples, but not for test examples.
        """
        self.unique_id = unique_id
        self.text = text
        self.label = label


class InputFeatures(object):
    """A single set of features of data."""

    def __init__(self, tokens, input_ids, input_mask, input_type_ids, label_ids):
        self.tokens = tokens
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.input_type_ids = input_type_ids
        self.label_ids = label_ids
        #self.label_mask = label_mask
        
        
class DataProcessor(object):
    """Base class for data converters for sequence classification data sets."""

    def get_train_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the train set."""
        raise NotImplementedError()

    def get_dev_examples(self, data_dir):
        """Gets a collection of `InputExample`s for the dev set."""
        raise NotImplementedError()

    def get_label_map(self):
        """Gets the mapping of labels for this data set."""
        raise NotImplementedError()

    @classmethod
    def _read_data(cls, input_file):
        """Reads a BIO data."""
        with open(input_file, 'r', encoding='utf8') as f:
            lines, words, labels = [], [], []
            for line in f:
                contents = line.strip()
                word = contents.split('\t')[0]
                label = contents.split('\t')[-1]

                if len(contents) == 0 and words[-1] == '.':
                    w = ' '.join([word for word in words if len(word) > 0])
                    l = ' '.join([label for label in labels if len(label) > 0])
                    lines.append([w, l])
                    words = []
                    labels = []
                    continue
                    
                words.append(word)
                labels.append(label)
            return lines


class NerProcessor(DataProcessor):
    def __init__(self, data_dir):
        self.data_dir = data_dir
        
    def get_train_examples(self):
        return self._create_example(
            self._read_data(os.path.join(self.data_dir, "train.txt")), "train")

    def get_dev_examples(self):
        return self._create_example(
            self._read_data(os.path.join(self.data_dir, "dev.txt")), "dev")

    def get_test_examples(self):
        return self._create_example(
            self._read_data(os.path.join(self.data_dir, "test.txt")), "test")

    def get_label_map(self):
        return {"O": 0, "B-DSE": 1, "I-DSE": 2}
        # return {"O": 0, "B-DSE": 1, "I-DSE": 2, "[CLS]": 3, "[SEP]": 4}

    def _create_example(self, lines, set_type):
        examples = []
        for (i, line) in enumerate(lines):
            unique_id = "%s-%s" % (set_type, i)
            text, label = line
            examples.append(InputExample(unique_id=unique_id, text=text, label=label))
        return examples

In [24]:
def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""
    
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()
            

def convert_examples_to_features(examples, max_seq_length, tokenizer, label_map):
    """Loads a data file into a list of `InputBatch`s."""

    features = []
    for (ex_index, example) in enumerate(examples):
        ### tokenize data
        text_list = example.text.split(' ')
        label_list = example.label.split(' ')
        
        tokens, labels = [], []
        for i, (w, l) in enumerate(zip(text_list, label_list)):
            tk = tokenizer.tokenize(w)
            tokens.extend(tk)            
            labels.extend([l if m == 0 else "I-DSE" for m in range(len(tk))])

        if len(tokens) > max_seq_length - 2:
            tokens = tokens[0 : (max_seq_length - 2)]
            labels = labels[0 : (max_seq_length - 2)]

        ### insert CLS and SEP
        # label_ids append("O") or append("[CLS]") not sure!
        ntokens, input_type_ids, label_ids = ["[CLS]"], [0], [label_map["O"]]
        for i, (tk, l) in enumerate(zip(tokens, labels)):
            ntokens.append(tk)
            input_type_ids.append(0)
            label_ids.append(label_map[l])
            
        ntokens.append("[SEP]")
        input_type_ids.append(0)
        # append("O") or append("[SEP]") not sure!
        label_ids.append(label_map["O"])
        
        ### convert to ids
        input_ids = tokenizer.convert_tokens_to_ids(ntokens)
        
        ### create mask
        input_mask = [1] * len(input_ids)

        ### padding to max seq len
        while len(input_ids) < max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            input_type_ids.append(0)
            # we don't concerned about it!
            label_ids.append(0)
        
        assert len(input_ids) == max_seq_length
        assert len(input_mask) == max_seq_length
        assert len(input_type_ids) == max_seq_length
        assert len(label_ids) == max_seq_length

        
        if ex_index < 2:
            logger.info("*** Example ***")
            logger.info("unique_id: %s" % (example.unique_id))
            logger.info("tokens: %s" % " ".join([str(x) for x in ntokens]))
            logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
            logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
            logger.info("input_type_ids: %s" % " ".join([str(x) for x in input_type_ids]))
            logger.info("label_ids: %s" % " ".join([str(x) for x in label_ids]))
            
        features.append(
            InputFeatures(
                tokens=ntokens,
                input_ids=input_ids,
                input_mask=input_mask,
                input_type_ids=input_type_ids,
                label_ids=label_ids))
    return features

In [25]:
def convert_features_to_dataloader(features, local_rank, batch_size):
    all_input_ids = torch.tensor([f.input_ids for f in features], dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in features], dtype=torch.long)
    all_input_labels = torch.tensor([f.label_ids for f in features], dtype=torch.long)
    all_example_index = torch.arange(all_input_ids.size(0), dtype=torch.long)

    dataset = TensorDataset(all_input_ids, all_input_mask, all_input_labels, all_example_index)
    
    if local_rank == -1: sampler = SequentialSampler(dataset)
    else:                sampler = DistributedSampler(dataset)
        
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    return dataloader

In [40]:
from pytorch_pretrained_bert.modeling import BertModel

class BertTagger(nn.Module):
 
    def __init__(self, bert_model, hidden_dim, label_map, loss_function,
                 is_frozen=True, mode="last"):
        super(BertTagger, self).__init__()
        
        self.hidden_dim = hidden_dim
        self.label_map = label_map
        self.loss_function = loss_function
        self.tagset_size = len(self.label_map)
        self.mode = mode
        
        self.bert_model = BertModel.from_pretrained(bert_model)
        
        if is_frozen: self.bert_model.eval()
            
        self.linear_model = torch.nn.Linear(self.hidden_dim, self.tagset_size)

        
    def _forward_alg(self, input_ids, input_mask):
        batch_size, max_seq_len = input_ids.shape
        
        all_encoder_layers, _ = self.bert_model(input_ids, 
                                                token_type_ids=None, 
                                                attention_mask=input_mask)
        
        if self.mode == "last":
            all_encoder_layers = all_encoder_layers[-1]
#         elif args.mode == "weighted":
#             all_encoder_layers = torch.stack([a * b for a, b in zip(all_encoder_layers, self.bert_weights)])
#             return self.bert_gamma * torch.sum(all_encoder_layers, dim=0)
        
        y_pred = self.linear_model(all_encoder_layers)
        y_pred = F.log_softmax(y_pred, dim=2)

        ### not sure mask
#         y_ = torch.mul(tag_scores, mask.unsqueeze(-1).expand([batch_size, seq_len, self.tagset_size]))
        y_pred = y_pred.view(-1, self.tagset_size)
        
        return y_pred

        
    def forward(self, input_ids, input_mask, input_labels):
        y_pred = self._forward_alg(input_ids, input_mask)
        y_true = input_labels.view(-1)

        loss = self.loss_function(y_pred, y_true)

        return y_pred, loss

In [65]:
def main(args):
    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
        
    logger.info("device: {} n_gpu: {} distributed training: {}".format(device, n_gpu, bool(args.local_rank != -1)))

    tokenizer = BertTokenizer.from_pretrained(args.bert_model)

    processor = NerProcessor(args.input_dir)
    label_map = processor.get_label_map()
    examples = processor.get_train_examples()

    features = convert_examples_to_features(examples=examples, 
                                            max_seq_length=args.max_seq_length, 
                                            tokenizer=tokenizer, 
                                            label_map=label_map)
    
    dataloader = convert_features_to_dataloader(features=features, 
                                                local_rank=args.local_rank, 
                                                batch_size=args.batch_size)

    
    loss_function = torch.nn.NLLLoss()
    
    model = BertTagger(args.bert_model, hidden_dim=768, 
                       label_map=label_map, mode="last",
                      loss_function=loss_function)
    model.to(device)
    
    optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), 
                              lr=args.learning_rate, momentum=args.momentum)
    # optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, 
                                                          device_ids=[args.local_rank],
                                                          output_device=args.local_rank)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    
    ######### TRAIN
    for epoch in range(args.epochs):
        print("Epoch:", epoch)
        
        for input_ids, input_mask, input_labels, example_indices in dataloader:
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            input_labels = input_labels.to(device)

            optimizer.zero_grad()
            
            _, loss = model(input_ids, input_mask, input_labels)
            
            if n_gpu > 1:
                loss = loss.mean() # mean() to average on multi-gpu.
            if args.fp16 and args.loss_scale != 1.0:
                # rescale loss for fp16 training
                # see https://docs.nvidia.com/deeplearning/sdk/mixed-precision-training/index.html
                loss = loss * args.loss_scale
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
                
            loss.backward()
            optimizer.step()
    

    torch.save(model.state_dict(), os.path.join(args.output_dir, "model"))
    
    
    ######### TEST
    examples = processor.get_dev_examples()
    features = convert_examples_to_features(examples=examples, 
                                             max_seq_length=args.max_seq_length, 
                                             tokenizer=tokenizer, 
                                             label_map=label_map)

    batch_size = len(examples)
    dataloader = convert_features_to_dataloader(features=features, 
                                                 local_rank=args.local_rank, 
                                                 batch_size=batch_size)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(examples))
    logger.info("  Batch size = %d", batch_size)

   
    # should be only once
    for input_ids, input_mask, input_labels, example_indices in dataloader:
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        input_labels = input_labels.to(device)

        with torch.no_grad():
            y_pred, loss = model(input_ids, input_mask, input_labels)

#             y_predicts = torch.max(y_pred, 2)[1].view([batch_size, -1])
            y_predicts = torch.max(y_pred, 1)[1].view([batch_size, -1])

            ### 先暫時用 * mask 
            # y_predicts = [y_[:len(y_trues[i])] for i, y_ in enumerate(y_predicts)]

            y_predicts = torch.mul(y_predicts, input_mask)
            y_trues = torch.mul(input_labels, input_mask)

            result = evaluate(y_predicts, y_trues, label_map)


    with open(os.path.join(args.output_dir, "eval_results.txt"), "w") as writer:
        print(result)
        writer.write(json.dumps(result))

In [66]:
class Args(object):
    """A single set of features of data."""

    def __init__(self):
        self.input_dir = './dse/'
        self.output_dir = '.'
        self.bert_model = 'bert-base-uncased'
        self.mode = 'last'
        self.max_seq_length = 128
        self.epochs = 200
        self.batch_size = 32
        self.learning_rate = 1e-4
        self.momentum = 0.7
        self.isfrozen = True
        self.local_rank = -1
        self.no_cuda = False
        self.fp16 = False
        self.loss_scale = 128.
        self.gradient_accumulation_steps = 1
        
main(Args())

11/30/2018 06:31:37 - INFO - __main__ -   device: cuda n_gpu: 2 distributed training: False
11/30/2018 06:31:38 - INFO - pytorch_pretrained_bert.tokenization -   loading vocabulary file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-vocab.txt from cache at /root/.pytorch_pretrained_bert/26bc1ad6c0ac742e9b52263248f6d0f00068293b33709fae12320c0e35ccfbbb.542ce4285a40d23a559526243235df47c5f75c197f04f37d1a0c124c32c9a084
11/30/2018 06:31:38 - INFO - __main__ -   *** Example ***
11/30/2018 06:31:38 - INFO - __main__ -   unique_id: train-0
11/30/2018 06:31:38 - INFO - __main__ -   tokens: [CLS] on monday 28 january , the us national security council convened for an uncomfortable meeting ; the main issue to be debated was known as early as the week before , following an unprecedented split that had emerged in the us administration . [SEP]
11/30/2018 06:31:38 - INFO - __main__ -   input_ids: 101 2006 6928 2654 2254 1010 1996 2149 2120 3036 2473 19596 2005 2019 8796 3116 102

Epoch: 0




Epoch: 1
Epoch: 2
Epoch: 3
Epoch: 4
Epoch: 5
Epoch: 6
Epoch: 7
Epoch: 8
Epoch: 9


11/30/2018 06:55:05 - INFO - __main__ -   *** Example ***
11/30/2018 06:55:05 - INFO - __main__ -   unique_id: dev-0
11/30/2018 06:55:05 - INFO - __main__ -   tokens: [CLS] international condemnation of mug ##abe ' s win mounted yesterday with us president george bush and british foreign secretary jack straw delivering further criticism . [SEP]
11/30/2018 06:55:05 - INFO - __main__ -   input_ids: 101 2248 26248 1997 14757 16336 1005 1055 2663 5614 7483 2007 2149 2343 2577 5747 1998 2329 3097 3187 2990 13137 12771 2582 6256 1012 102 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
11/30/2018 06:55:05 - INFO - __main__ -   input_mask: 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 

{'binary': {'precision': 0.8148959474260679, 'recall': 0.1624119718309859, 'f1': 0.270843722946926}, 'proportional': {'precision': 0.7605330412559325, 'recall': 0.09975792253521129, 'f1': 0.1763803164472085}}


In [None]:
# if __name__ == "__main__":
#     parser = argparse.ArgumentParser()

#     ## Required parameters
#     parser.add_argument("--input_dir", default=None, type=str, required=True)
#     parser.add_argument("--output_dir", default=None, type=str, required=True)
#     parser.add_argument("--bert_model", default=None, type=str, required=True,
#                         help="Bert pre-trained model selected in the list: bert-base-uncased, "
#                              "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")

#     ## Other parameters
#     parser.add_argument("--mode", default="last", type=str)
#     parser.add_argument("--max_seq_length", default=128, type=int,
#                         help="The maximum total input sequence length after WordPiece tokenization. Sequences longer "
#                             "than this will be truncated, and sequences shorter than this will be padded.")
#     parser.add_argument("--epochs", default=200, type=int, help="Number of epoch.")
#     parser.add_argument("--batch_size", default=32, type=int, help="Batch size for predictions.")
#     parser.add_argument("--learning_rate", default=1e-4, type=float, help="Learning rate for gradient.")
#     parser.add_argument("--momentum", default=0.7, type=float)
#     parser.add_argument("--isfrozen", default=True, type=bool)
#     parser.add_argument("--local_rank",
#                         type=int,
#                         default=-1,
#                         help = "local_rank for distributed training on gpus")
#     parser.add_argument("--no_cuda",
#                         default=False,
#                         action='store_true',
#                         help="Whether not to use CUDA when available")
#     parser.add_argument('--fp16',
#                         default=False,
#                         action='store_true',
#                         help="Whether to use 16-bit float precision instead of 32-bit")
#     parser.add_argument('--loss_scale',
#                         type=float, default=128,
#                         help='Loss scaling, positive power of 2 values can improve fp16 convergence.')
# parser.add_argument('--gradient_accumulation_steps',
#                         type=int,
#                         default=1,
#                         help="Number of updates steps to accumualte before performing a backward/update pass.")          

#     args = parser.parse_args()

#     main(args)