# Demo
- Semantic Meaning Extraction

In [1]:
"""BERT finetuning runner."""

from __future__ import absolute_import, division, print_function, unicode_literals

import argparse
import logging
import os
import random
from io import open

import json
import numpy as np
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss
from torch.utils.data import DataLoader, Dataset, RandomSampler
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_transformers.modeling_bert import BertPreTrainedModel, BertConfig, BertModel
from pytorch_transformers.tokenization_bert import BertTokenizer
from pytorch_transformers.optimization import AdamW, WarmupLinearSchedule

logging.basicConfig(format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
                    datefmt='%m/%d/%Y %H:%M:%S',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

In [2]:
class BertForParaphraseDetection(BertPreTrainedModel):
    def __init__(self, config: BertConfig):
        super(BertPreTrainedModel, self).__init__(config)
        self.bert = BertModel(config)
        self.ffn = nn.Linear(config.hidden_size, 2)
        self.apply(self.init_weights)

    def forward(self, input_ids: torch.Tensor, token_type_ids: torch.Tensor, input_mask: torch.Tensor,
                is_paraphrase: torch.Tensor = None):
        outputs = self.bert(input_ids, token_type_ids, input_mask)
        pooled_output = outputs[1]
        paraphrase_score = self.ffn(pooled_output)
        outputs = (paraphrase_score,) + outputs[2:]

        if is_paraphrase is not None:
            loss_fct = CrossEntropyLoss(ignore_index=-1)
            is_paraphrase_loss = loss_fct(paraphrase_score.view(-1, 2), is_paraphrase.view(-1))
            outputs = (is_paraphrase_loss,) + outputs

        return outputs

    def save_pretrained(self, save_directory: str, only_bert: bool = True):
        print('SAVING ONLY BERT...')
        model_to_save = self.bert if only_bert else self

        model_to_save.config.save_pretrained(save_directory)
        output_model_file = os.path.join(save_directory, 'pytorch_model.bin')
        torch.save(model_to_save.state_dict(), output_model_file)


class PPDBDataset(Dataset):
    def __init__(self, corpus_path: str, tokenizer: BertTokenizer, seq_len: int, encoding: str = 'utf-8'):
        self.vocab = tokenizer.vocab
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.corpus_path = corpus_path
        self.encoding = encoding

        self.samples = []
        self.sample_counter = 0
        self.count = 0

        with open(corpus_path, mode='r', encoding='utf-8') as fp:
            self.samples = json.load(fp)
            self.count = len(self.samples)

    def __len__(self):
        return self.count

    def __getitem__(self, item):
        curr_id = self.sample_counter
        self.sample_counter += 1

        text1 = self.samples[item]['text1']
        text2 = self.samples[item]['text2']

        if random.random() > 0.5:
            label = 1
        else:
            rand_idx = item
            while rand_idx == item:
                rand_idx = random.randint(0, self.count - 1)

            text2 = self.samples[rand_idx]['text2']
            label = 0

        tokens_a = self.tokenizer.tokenize(text1)
        tokens_b = self.tokenizer.tokenize(text2)
        # print(tokens_a)

        curr_example = PPDBInputExample(curr_id, tokens_a, tokens_b, label)
        curr_features = convert_ppdb_example_to_features(curr_example, self.seq_len, self.tokenizer)
        curr_tensors = (torch.tensor(curr_features.input_ids),
                        torch.tensor(curr_features.input_mask),
                        torch.tensor(curr_features.segment_ids),
                        torch.tensor(curr_features.is_paraphrase))
        return curr_tensors


class PPDBInputExample(object):
    def __init__(self, guid: int, tokens_a: [str], tokens_b: [str], is_paraphrase: int):
        self.guid = guid
        self.tokens_a = tokens_a
        self.tokens_b = tokens_b
        self.is_paraphrase = is_paraphrase


class PPDBInputFeatures(object):
    def __init__(self, input_ids: [int], input_mask: [int], segment_ids: [int], is_paraphrase: int):
        self.input_ids = input_ids
        self.input_mask = input_mask
        self.segment_ids = segment_ids
        self.is_paraphrase = is_paraphrase


def convert_ppdb_example_to_features(example, max_seq_length, tokenizer) -> PPDBInputFeatures:
    tokens_a = example.tokens_a
    tokens_b = example.tokens_b
    # Modifies `tokens_a` and `tokens_b` in place so that the total
    # length is less than the specified length.
    # Account for [CLS], [SEP], [SEP] with "- 3"
    _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3)

    tokens = []
    segment_ids = []
    tokens.append("[CLS]")
    segment_ids.append(0)
    for token in tokens_a:
        tokens.append(token)
        segment_ids.append(0)
    tokens.append("[SEP]")
    segment_ids.append(0)

    assert len(tokens_b) > 0
    for token in tokens_b:
        tokens.append(token)
        segment_ids.append(1)
    tokens.append("[SEP]")
    segment_ids.append(1)

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # The mask has 1 for real tokens and 0 for padding tokens. Only real
    # tokens are attended to.
    input_mask = [1] * len(input_ids)

    # Zero-pad up to the sequence length.
    while len(input_ids) < max_seq_length:
        input_ids.append(0)
        input_mask.append(0)
        segment_ids.append(0)

    assert len(input_ids) == max_seq_length
    assert len(input_mask) == max_seq_length
    assert len(segment_ids) == max_seq_length

    if example.guid < 5:
        logger.info("*** Example ***")
        logger.info("guid: %s" % example.guid)
        logger.info("tokens: %s" % " ".join(
                [str(x) for x in tokens]))
        logger.info("input_ids: %s" % " ".join([str(x) for x in input_ids]))
        logger.info("input_mask: %s" % " ".join([str(x) for x in input_mask]))
        logger.info(
                "segment_ids: %s" % " ".join([str(x) for x in segment_ids]))

    features = PPDBInputFeatures(input_ids=input_ids,
                             input_mask=input_mask,
                             segment_ids=segment_ids,
                             is_paraphrase=example.is_paraphrase)
    return features


In [3]:
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument("--train_corpus",
                        default=None,
                        type=str,
                        required=True,
                        help="The input train corpus.")
    parser.add_argument("--bert_model", default=None, type=str, required=True,
                        help="Bert pre-trained model selected in the list: bert-base-uncased, "
                             "bert-large-uncased, bert-base-cased, bert-base-multilingual, bert-base-chinese.")
    parser.add_argument("--output_dir",
                        default=None,
                        type=str,
                        required=True,
                        help="The output directory where the model checkpoints will be written.")

    ## Other parameters
    parser.add_argument("--max_seq_length",
                        default=128,
                        type=int,
                        help="The maximum total input sequence length after WordPiece tokenization. \n"
                             "Sequences longer than this will be truncated, and sequences shorter \n"
                             "than this will be padded.")
    parser.add_argument('--do_debug', action='store_true')
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--learning_rate",
                        default=3e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument("--warmup_steps",
                        default=0,
                        type=int,
                        help="Linear warmup over warmup_steps.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--on_memory",
                        action='store_true',
                        help="Whether to load train samples into memory or use disk")
    parser.add_argument("--do_lower_case",
                        action='store_true',
                        help="Whether to lower case the input text. True for uncased models, False for cased models.")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument('--gradient_accumulation_steps',
                        type=int,
                        default=1,
                        help="Number of updates steps to accumualte before performing a backward/update pass.")
    parser.add_argument('--fp16',
                        action='store_true',
                        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument('--loss_scale',
                        type = float, default = 0,
                        help = "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
                        "0 (default value): dynamic loss scaling.\n"
                        "Positive power of 2: static loss scaling value.\n")

    args = parser.parse_args()

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available() and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of sychronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info("device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".format(
        device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, should be >= 1".format(
                            args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)

    if not args.do_train:
        raise ValueError("Training is currently the only implemented execution option. Please set `do_train`.")

    if os.path.exists(args.output_dir) and os.listdir(args.output_dir):
        raise ValueError("Output directory ({}) already exists and is not empty.".format(args.output_dir))
    if not os.path.exists(args.output_dir) and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
        os.makedirs(args.output_dir)

    tokenizer = BertTokenizer.from_pretrained(args.bert_model, do_lower_case=args.do_lower_case)

    #train_examples = None
    num_train_optimization_steps = None
    if args.do_train:
        print("Loading Train Dataset", args.train_corpus)
        train_dataset = PPDBDataset(args.train_corpus, tokenizer, seq_len=args.max_seq_length)
        num_train_optimization_steps = int(
            len(train_dataset) / args.train_batch_size / args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size()

    # Prepare model
    model = BertForParaphraseDetection.from_pretrained(args.bert_model)
    if args.fp16:
        model.half()
    model.to(device)
    if args.local_rank != -1:
        try:
            from apex.parallel import DistributedDataParallel as DDP
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")
        model = DDP(model)
    elif n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Prepare optimizer
    if args.do_train:
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)], 'weight_decay': 0.01},
            {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
            ]

        if args.fp16:
            try:
                from apex.optimizers import FP16_Optimizer
                from apex.optimizers import FusedAdam
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use distributed and fp16 training.")

            optimizer = FusedAdam(optimizer_grouped_parameters,
                                  lr=args.learning_rate,
                                  bias_correction=False,
                                  max_grad_norm=1.0)
            if args.loss_scale == 0:
                optimizer = FP16_Optimizer(optimizer, dynamic_loss_scale=True)
            else:
                optimizer = FP16_Optimizer(optimizer, static_loss_scale=args.loss_scale)

        else:
            optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        scheduler = WarmupLinearSchedule(optimizer, warmup_steps=args.warmup_steps, t_total=num_train_optimization_steps)

    global_step = 0
    if args.do_train:
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataset))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)

        if args.local_rank == -1:
            train_sampler = RandomSampler(train_dataset)
        else:
            #TODO: check if this works with current data generator from disk that relies on next(file)
            # (it doesn't return item back by index)
            train_sampler = DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

        model.train()
        for epoch in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0.
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(tqdm(train_dataloader, desc="Iteration")):
                if args.do_debug:
                    if nb_tr_steps == 10:
                        break
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, is_paraphrase = batch
                outputs = model(input_ids, segment_ids, input_mask, is_paraphrase)
                loss = outputs[0]
                if n_gpu > 1:
                    loss = loss.mean()
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps
                if args.fp16:
                    optimizer.backward(loss)
                else:
                    loss.backward()
                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    optimizer.zero_grad()
                    global_step += 1

            logger.info('Total loss at epoch %d: %.5f' % (epoch, tr_loss))
            logger.info('Avrg  loss at epoch %d: %.5f' % (epoch, tr_loss / nb_tr_examples))

        # Save a trained model
        if args.do_train and (args.local_rank == -1 or torch.distributed.get_rank() == 0):
            logger.info("** ** * Saving fine - tuned model ** ** * ")
            model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)


def _truncate_seq_pair(tokens_a, tokens_b, max_length):
    """Truncates a sequence pair in place to the maximum length."""

    # This is a simple heuristic which will always truncate the longer sequence
    # one token at a time. This makes more sense than truncating an equal percent
    # of tokens from each, since if one sequence is very short then each token
    # that's truncated likely contains more information than a longer sequence.
    while True:
        total_length = len(tokens_a) + len(tokens_b)
        if total_length <= max_length:
            break
        if len(tokens_a) > len(tokens_b):
            tokens_a.pop()
        else:
            tokens_b.pop()


def accuracy(out, labels):
    outputs = np.argmax(out, axis=1)
    return np.sum(outputs == labels)


if __name__ == "__main__":
    main()

usage: ipykernel_launcher.py [-h] --train_corpus TRAIN_CORPUS --bert_model
                             BERT_MODEL --output_dir OUTPUT_DIR
                             [--max_seq_length MAX_SEQ_LENGTH] [--do_debug]
                             [--do_train]
                             [--train_batch_size TRAIN_BATCH_SIZE]
                             [--learning_rate LEARNING_RATE]
                             [--adam_epsilon ADAM_EPSILON]
                             [--num_train_epochs NUM_TRAIN_EPOCHS]
                             [--warmup_steps WARMUP_STEPS] [--no_cuda]
                             [--on_memory] [--do_lower_case]
                             [--local_rank LOCAL_RANK] [--seed SEED]
                             [--gradient_accumulation_steps GRADIENT_ACCUMULATION_STEPS]
                             [--fp16] [--loss_scale LOSS_SCALE]
ipykernel_launcher.py: error: the following arguments are required: --train_corpus, --bert_model, --output_dir


SystemExit: 2

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [5]:
def func(inputs: int) -> float:
    return 1.2

In [6]:

func(2)

1.2

In [10]:
file = "./data/ppdb-2.0-s-all"

In [11]:
fp = open(file, mode='r', encoding='utf-8')


In [13]:
for line in fp:
    print(line)

[NN] ||| transplant ||| transplantation ||| PPDB2.0Score=5.24981 PPDB1.0Score=3.295900 -logp(LHS|e1)=0.18597 -logp(LHS|e2)=0.14031 -logp(e1|LHS)=11.83583 -logp(e1|e2)=1.80507 -logp(e1|e2,LHS)=1.46728 -logp(e2|LHS)=11.47593 -logp(e2|e1)=1.49083 -logp(e2|e1,LHS)=1.10738 AGigaSim=0.63439 Abstract=0 Adjacent=0 CharCountDiff=5 CharLogCR=0.40547 ContainsX=0 Equivalence=0.371472 Exclusion=0.000344 GlueRule=0 GoogleNgramSim=0.03067 Identity=0 Independent=0.078161 Lex(e1|e2)=9.64663 Lex(e2|e1)=59.48919 Lexical=1 LogCount=4.67283 MVLSASim=NA Monotonic=1 OtherRelated=0.372735 PhrasePenalty=1 RarityPenalty=0 ForwardEntailment=0.177287 SourceTerminalsButNoTarget=0 SourceWords=1 TargetComplexity=0.98821 TargetFormality=0.98464 TargetTerminalsButNoSource=0 TargetWords=1 UnalignedSource=0 UnalignedTarget=0 WordCountDiff=0 WordLenDiff=5.00000 WordLogCR=0 ||| 0-0 ||| OtherRelated

[JJ] ||| <www.un.org/depts/dgacm/docs/crp/aconf212crp1/russian.pdf> ||| <www.un.org/depts/dgacm/docs/crp/aconf212crp1/arabic

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[VBD] ||| describing ||| description ||| PPDB2.0Score=3.88688 PPDB1.0Score=9.356100 -logp(LHS|e1)=6.00379 -logp(LHS|e2)=6.68777 -logp(e1|LHS)=14.06064 -logp(e1|e2)=5.89296 -logp(e1|e2,LHS)=4.03887 -logp(e2|LHS)=12.31479 -logp(e2|e1)=3.46314 -logp(e2|e1,LHS)=2.29302 AGigaSim=0.17096 Abstract=0 Adjacent=0 CharCountDiff=1 CharLogCR=0.09531 ContainsX=0 Equivalence=0.159442 Exclusion=0.000171 GlueRule=0 GoogleNgramSim=0.03681 Identity=0 Independent=0.330518 Lex(e1|e2)=62.90141 Lex(e2|e1)=62.90141 Lexical=1 LogCount=0 MVLSASim=NA Monotonic=1 OtherRelated=0.396846 PhrasePenalty=1 RarityPenalty=0.00674 ForwardEntailment=0.113022 SourceTerminalsButNoTarget=0 SourceWords=1 TargetComplexity=0.99346 TargetFormality=0.98302 TargetTerminalsButNoSource=0 TargetWords=1 UnalignedSource=0 UnalignedTarget=0 WordCountDiff=0 WordLenDiff=1.00000 WordLogCR=0 ||| 0-0 ||| OtherRelated

[NNS] ||| fertilisers ||| fertilizers ||| PPDB2.0Score=3.88687 PPDB1.0Score=4.064620 -logp(LHS|e1)=0.10655 -logp(LHS|e2)=0.090

KeyboardInterrupt: 

In [14]:
import heapq

In [27]:
queue = [4,2,3,1]
heapq.heapify(queue)

In [26]:
heapq.heappop(queue)

1

In [20]:
queue

[2, 3, 4]

In [22]:
queue.pop()

4