In [1]:
from fairseq.models.bart import BARTModel

2021-10-08 02:52:21.336975: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2021-10-08 02:52:21.337034: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# # MBart Test
# mbart = BARTModel.from_pretrained('mbart.cc25.v2', checkpoint_file='model.pt')
# mbart.eval()

In [5]:
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

import logging
import os
from argparse import Namespace

import numpy as np
from fairseq import utils
from fairseq.data import (
    ConcatSentencesDataset,
    Dictionary,
    IdDataset,
    NestedDictionaryDataset,
    NumelDataset,
    NumSamplesDataset,
    OffsetTokensDataset,
    PrependTokenDataset,
    RawLabelDataset,
    RightPadDataset,
    RollDataset,
    SortDataset,
    StripTokenDataset,
    data_utils,
)
from fairseq.data.shorten_dataset import maybe_shorten_dataset
from fairseq.tasks import FairseqTask, register_task

logger = logging.getLogger(__name__)


class LegacyFairseqTask(FairseqTask):
    """Task base class that is configured from a plain ``argparse.Namespace``.

    The parsed arguments are kept on the instance and are handed unchanged to
    the model and criterion factories.
    """

    def __init__(self, args: Namespace):
        # FairseqTask.__init__ is not invoked here; the attributes below are
        # the only state this constructor sets up.
        self.args = args
        self.datasets = dict()
        self.dataset_to_epoch_iter = dict()

    @classmethod
    def setup_task(cls, args: Namespace, **kwargs):
        """Create the task instance (e.g., after loading dictionaries).
        Args:
            args (argparse.Namespace): parsed command-line arguments
        """
        task = cls(args, **kwargs)
        return task

    def has_sharded_data(self, split):
        """Return True when ``args.data`` contains ``os.pathsep``.

        ``split`` is accepted but not consulted.
        """
        data_spec = getattr(self.args, "data", "")
        return os.pathsep in data_spec

    def build_model(self, args: Namespace):
        """
        Construct the :class:`~fairseq.models.BaseFairseqModel` instance used
        by this task.
        Args:
            args (argparse.Namespace): parsed command-line arguments
        Returns:
            a :class:`~fairseq.models.BaseFairseqModel` instance
        """
        from fairseq import models, quantization_utils

        built = models.build_model(args, self)
        # Only touch the TPU hook when the flag is present and truthy.
        wants_tpu = getattr(args, "tpu", False)
        if wants_tpu:
            built.prepare_for_tpu_()
        return quantization_utils.quantize_model_scalar(built, args)

    def build_criterion(self, args: Namespace):
        """
        Construct the :class:`~fairseq.criterions.FairseqCriterion` instance
        used by this task.
        Args:
            args (argparse.Namespace): parsed command-line arguments
        Returns:
            a :class:`~fairseq.criterions.FairseqCriterion` instance
        """
        from fairseq import criterions

        criterion = criterions.build_criterion(args, self)
        return criterion


@register_task("plbart_sentence_prediction")
class BARTSentencePredictionTask(LegacyFairseqTask):
    """
    Classification or regression over a sentence (or a pair of sentences).

    The task is registered under the name ``plbart_sentence_prediction``.
    Args:
        args: parsed command-line arguments
        data_dictionary (Dictionary): the dictionary for the input of the task
        label_dictionary (Dictionary): the dictionary for the labels
    """

    @staticmethod
    def add_args(parser):
        """Register the command-line options understood by this task."""
        add = parser.add_argument

        add("data", metavar="FILE", help="file prefix for data")
        add("--num-classes", type=int, default=-1,
            help="number of classes or regression targets")
        add("--init-token", type=int, default=None,
            help="add token at the beginning of each batch item")
        add("--separator-token", type=int, default=None,
            help="add separator token between inputs")
        add("--regression-target", action="store_true", default=False)
        add("--no-shuffle", action="store_true", default=False)
        add("--shorten-method", default="none",
            choices=["none", "truncate", "random_crop"],
            help="if not none, shorten sequences that exceed --tokens-per-sample")
        add("--shorten-data-split-list", default="",
            help="comma-separated list of dataset splits to apply shortening to, "
                 'e.g., "train,valid" (default: all dataset splits)')
        add("--add-prev-output-tokens", action="store_true", default=False,
            help="add prev_output_tokens to sample, used for encoder-decoder arch")

        # Options below are additions on top of the generic sentence-prediction set.
        add("--max-positions", type=int,
            help="number of positional embeddings to learn")
        add("--langs", required=True, metavar="LANG",
            help='comma-separated list of monolingual language, '
                 'for example, "en,de,fr". These should match the '
                 'langs from pretraining (and be in the same order). '
                 'You should always add all pretraining language idx '
                 'during finetuning.')

    def __init__(self, args, data_dictionary, label_dictionary):
        super().__init__(args)
        self.dictionary = data_dictionary
        self._label_dictionary = label_dictionary

        # NOTE(review): the branch tests for attribute *presence* only, so an
        # ``args.max_positions`` of None is used as-is — confirm that is intended.
        if hasattr(args, "max_positions"):
            positions = args.max_positions
        else:
            positions = (args.max_source_positions, args.max_target_positions)
        self._max_positions = positions
        args.tokens_per_sample = positions

    @classmethod
    def load_dictionary(cls, args, filename, source=True):
        """Load a dictionary file and append the language and mask symbols.
        Args:
            args: parsed arguments; ``args.langs`` is a comma-separated list
            filename (str): the filename
            source (bool): accepted but not consulted here
        """
        vocab = Dictionary.load(filename)
        # One "[lang]" symbol per configured language, in the order given.
        for lang in args.langs.split(","):
            vocab.add_symbol(f"[{lang}]")
        vocab.add_symbol("<mask>")
        return vocab

    @classmethod
    def setup_task(cls, args, **kwargs):
        """Load the input (and, for classification, label) dictionaries and build the task."""
        assert args.num_classes > 0, "Must set --num-classes"

        input_dict_path = os.path.join(args.data, "input0", "dict.txt")
        data_dict = cls.load_dictionary(args, input_dict_path, source=True)
        logger.info(f"[input] dictionary: {len(data_dict)} types")

        if args.regression_target:
            # Regression reuses the input dictionary as the label dictionary.
            label_dict = data_dict
        else:
            label_dict_path = os.path.join(args.data, "label", "dict.txt")
            label_dict = cls.load_dictionary(args, label_dict_path, source=False)
            logger.info(f"[label] dictionary: {len(label_dict)} types")

        return cls(args, data_dict, label_dict)

    def load_dataset(self, split, combine=False, **kwargs):
        """Load a given dataset split (e.g., train, valid, test)."""
        args = self.args

        def split_path(field):
            # <data>/<field>/<split>
            return os.path.join(args.data, field, split)

        def read_indexed(field, vocab):
            return data_utils.load_indexed_dataset(
                split_path(field),
                vocab,
                args.dataset_impl,
                combine=combine,
            )

        first_input = read_indexed("input0", self.source_dictionary)
        assert first_input is not None, "could not find dataset: {}".format(
            split_path("input0")
        )
        # The second input is optional; its absence means single-sentence data.
        second_input = read_indexed("input1", self.source_dictionary)

        if args.init_token is not None:
            first_input = PrependTokenDataset(first_input, args.init_token)

        if second_input is not None:
            if args.separator_token is not None:
                second_input = PrependTokenDataset(second_input, args.separator_token)
            src_tokens = ConcatSentencesDataset(first_input, second_input)
        else:
            src_tokens = first_input

        # The permutation is drawn under a fixed seed, before any shortening.
        with data_utils.numpy_seed(args.seed):
            shuffle_order = np.random.permutation(len(src_tokens))

        src_tokens = maybe_shorten_dataset(
            src_tokens,
            split,
            args.shorten_data_split_list,
            args.shorten_method,
            self.max_positions(),
            args.seed,
        )

        pad_idx = self.source_dictionary.pad()
        net_input = {
            "src_tokens": RightPadDataset(src_tokens, pad_idx=pad_idx),
            "src_lengths": NumelDataset(src_tokens, reduce=False),
        }
        if args.add_prev_output_tokens:
            # Decoder input: the source tokens rolled by one position.
            net_input["prev_output_tokens"] = RightPadDataset(
                RollDataset(src_tokens, 1),
                pad_idx=pad_idx,
            )

        dataset = {
            "id": IdDataset(),
            "net_input": net_input,
            "nsentences": NumSamplesDataset(),
            "ntokens": NumelDataset(src_tokens, reduce=True),
        }

        if args.regression_target:
            label_path = "{0}.label".format(split_path("label"))
            if os.path.exists(label_path):
                targets = []
                with open(label_path) as handle:
                    for line_no, raw_line in enumerate(handle):
                        line = raw_line.strip()
                        fields = line.split()
                        assert (
                            len(fields) == self.args.num_classes
                        ), f'expected num_classes={self.args.num_classes} regression target values on line {line_no}, found: "{line}"'
                        targets.append([float(field) for field in fields])
                dataset["target"] = RawLabelDataset(targets)
        else:
            label_dataset = read_indexed("label", self.label_dictionary)
            if label_dataset is not None:
                # Drop the eos token, then shift ids down by the number of
                # special symbols in the label dictionary.
                stripped = StripTokenDataset(
                    label_dataset,
                    id_to_strip=self.label_dictionary.eos(),
                )
                dataset["target"] = OffsetTokensDataset(
                    stripped,
                    offset=-self.label_dictionary.nspecial,
                )

        nested_dataset = NestedDictionaryDataset(
            dataset,
            sizes=[src_tokens.sizes],
        )

        if args.no_shuffle:
            final_dataset = nested_dataset
        else:
            final_dataset = SortDataset(nested_dataset, sort_order=[shuffle_order])

        logger.info("Loaded {0} with #samples: {1}".format(split, len(final_dataset)))

        self.datasets[split] = final_dataset
        return self.datasets[split]

    def build_model(self, args):
        """Build the model and attach a classification head with ``num_classes`` outputs."""
        from fairseq import models

        model = models.build_model(args, self)

        head_name = getattr(
            args, "classification_head_name", "sentence_classification_head"
        )
        model.register_classification_head(
            head_name,
            num_classes=self.args.num_classes,
        )
        return model

    def max_positions(self):
        """Return the value stored as ``_max_positions`` by the constructor."""
        return self._max_positions

    @property
    def source_dictionary(self):
        """The input dictionary."""
        return self.dictionary

    @property
    def target_dictionary(self):
        """Same object as :attr:`source_dictionary`."""
        return self.dictionary

    @property
    def label_dictionary(self):
        """The label dictionary handed to the constructor."""
        return self._label_dictionary


In [6]:
# Load the PLBART checkpoint through fairseq's BARTModel.from_pretrained.
# NOTE(review): the directory is relative to the current working directory —
# confirm the checkpoint lives there before running.
plbart = BARTModel.from_pretrained('PLBART/plbart-c-cpp-defect-detection', checkpoint_file='model.pt')
# eval() is the cell's last expression, so its return value is what gets displayed.
plbart.eval()

BARTHubInterface(
  (model): BARTModel(
    (encoder): TransformerEncoder(
      (dropout_module): FairseqDropout()
      (embed_tokens): Embedding(50005, 768, padding_idx=1)
      (embed_positions): LearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layernorm_embedding): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (layers): ModuleList(
        (0): TransformerEncoderLayer(
          (self_attn): MultiheadAttention(
            (dropout_module): FairseqDropout()
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout_module): FairseqDropout()
          (activation_dropout_module): FairseqDropout()
     

In [7]:
len(plbart.task.source_dictionary)

50005

In [8]:
# Spot-check the first four entries of the source dictionary and the last four
# (indices 50001-50004 of a 50005-entry dictionary).
for i in [0,1,2,3,50001,50002, 50003, 50004]:
    print(i, plbart.task.source_dictionary[i])

0 <s>
1 <pad>
2 </s>
3 <unk>
50001 [java]
50002 [python]
50003 [en_XX]
50004 <mask>


In [9]:
fs_model = plbart.model

In [10]:
plbart.args

Namespace(activation_dropout=0.0, activation_fn='gelu', adam_betas='(0.9, 0.98)', adam_eps=1e-08, adaptive_input=False, adaptive_softmax_cutoff=None, adaptive_softmax_dropout=0, add_prev_output_tokens=True, all_gather_list_size=16384, arch='mbart_base', attention_dropout=0.1, best_checkpoint_metric='accuracy', bf16=False, bpe='gpt2', broadcast_buffers=False, bucket_cap_mb=25, checkpoint_suffix='', classification_head_name='sentence_classification_head', clip_norm=1.0, cpu=False, criterion='sentence_prediction', cross_self_attention=False, curriculum=0, data='/home/crocoder/Desktop/transformers/PLBART/plbart-c-cpp-defect-detection', data_buffer_size=10, dataset_impl=None, ddp_backend='no_c10d', decoder_attention_heads=12, decoder_embed_dim=768, decoder_embed_path=None, decoder_ffn_embed_dim=3072, decoder_input_dim=768, decoder_layerdrop=0, decoder_layers=6, decoder_layers_to_keep=None, decoder_learned_pos=True, decoder_normalize_before=False, decoder_output_dim=768, device_id=0, disable

In [11]:
from transformers import PLBartConfig, PLBartForSequenceClassification

In [12]:
hf_model = PLBartForSequenceClassification.from_pretrained('plbart-c-cpp-defect-detection')

## Inputs

In [13]:
import sentencepiece as spm

In [14]:
# Load the SentencePiece model file into a fresh processor.
vocab_filepath = "./PLBART/plbart_orig_pretrained_ckpt/sentencepiece.bpe.model"
tokenizer = spm.SentencePieceProcessor()
tokenizer.Load(vocab_filepath)
# NOTE(review): an empty extra-options string presumably disables any extra
# markers (bos/eos) during encoding — confirm against the SentencePiece docs.
tokenizer.SetEncodeExtraOptions("")

True

In [15]:
# Two toy sentences, each encoded to SentencePiece ids.
texts = ["This is a sample text", "Another example here"]
text_0_tokens = tokenizer.EncodeAsIds(texts[0].strip())
text_1_tokens = tokenizer.EncodeAsIds(texts[1].strip())
# TODO: check how the original tokenizer works.

In [16]:
# Right-pad the second sequence to the length of the first so both rows fit in
# one rectangular tensor. The pad count is derived from the lengths, so
# re-running this cell adds nothing once the lengths match.
PAD_TOKEN_ID = 1  # index printed as <pad> in the dictionary spot-check above
text_1_tokens = text_1_tokens + [PAD_TOKEN_ID] * (len(text_0_tokens) - len(text_1_tokens))

In [17]:
import numpy as np
import torch
import torch.nn as nn

In [18]:
# Build the batch directly as an int64 tensor. Going through np.array would
# inherit NumPy's platform-dependent default integer width.
input_ids = torch.tensor([text_0_tokens, text_1_tokens], dtype=torch.long)
attention_mask = torch.ones_like(input_ids)
# The last two positions of the second row are the padding added in the previous cell.
attention_mask[1, -2:] = 0
token_type_ids = torch.zeros_like(input_ids)

# Encoder Embeddings

## Fairseq

In [19]:
fs_embeds, embed = fs_model.encoder.forward_embedding(input_ids)

In [20]:
# fs_model.encoder.quant_noise # None
# fs_model.encoder.layernorm_embedding #  not None
# fs_model.encoder.embed_positions # not None

## HuggingFace

In [21]:
hf_model.model.encoder.training

False

In [22]:
# Reproduce the embedding stage of the HuggingFace encoder step by step:
# scaled token embeddings + positional embeddings -> layer norm -> dropout.
_hf_enc = hf_model.model.encoder
input_shape = input_ids.size()
inputs_embeds = _hf_enc.embed_tokens(input_ids) * _hf_enc.embed_scale
embed_pos = _hf_enc.embed_positions(input_shape)
hf_embeds = _hf_enc.layernorm_embedding(inputs_embeds + embed_pos)
hf_embeds = nn.functional.dropout(
    hf_embeds,
    p=_hf_enc.dropout,
    training=_hf_enc.training,
)


In [23]:
torch.allclose(hf_embeds[0], fs_embeds[0], atol=1e-5)

True

In [24]:
torch.allclose(hf_embeds[1,:-2], fs_embeds[1, :-2], atol=1e-5)

True

In [25]:
hf_embeds[1, -2:]

tensor([[-0.8661,  0.3116,  1.0590,  ..., -0.0160,  0.5052, -1.4175],
        [-0.9077,  0.2613,  0.8916,  ...,  0.3555,  0.6211, -1.4142]],
       grad_fn=<SliceBackward>)

In [26]:
fs_embeds[1, -2:] # Fairseq handles padding tokens differently, so they won't match.

tensor([[-0.6240,  0.1483,  1.1706,  ...,  0.1917,  0.5521, -1.0526],
        [-0.6240,  0.1483,  1.1706,  ...,  0.1917,  0.5521, -1.0526]],
       grad_fn=<SliceBackward>)

In [27]:
torch.allclose(hf_embeds[1,-2:], fs_embeds[1, -2:], atol=1e-5)

False

# Encoder Model

## Fairseq

In [28]:
fs_encoder = fs_model.encoder

In [29]:
# Per-sequence lengths handed to the fairseq encoder. text_1_tokens was padded
# in an earlier cell, so its length here includes the pad positions.
src_lengths = torch.tensor([len(text_0_tokens), len(text_1_tokens)])

In [30]:
fs_encoder_out = fs_encoder(input_ids, src_lengths)

In [31]:
fs_encoder_out_encoder_out = fs_encoder_out.encoder_out.permute(1, 0, 2)

In [32]:
fs_encoder_out_encoder_out.shape

torch.Size([2, 5, 768])

## HuggingFace

In [33]:
hf_encoder = hf_model.model.encoder

In [34]:
hf_encoder_out = hf_encoder(input_ids, attention_mask)

In [35]:
hf_encoder_out = hf_encoder_out.last_hidden_state

In [36]:
torch.allclose(hf_encoder_out[0], fs_encoder_out_encoder_out[0], atol=1e-5)

True

In [37]:
torch.allclose(hf_encoder_out[1,:-2], fs_encoder_out_encoder_out[1,:-2], atol=1e-5)

True

# Decoder

## Fairseq

In [38]:
fs_decoder = fs_model.decoder

In [39]:
fs_decoder_out = fs_decoder(input_ids, fs_encoder_out)

In [40]:
# len(...) is not the cell's last expression, so its value is not displayed.
len(fs_decoder_out)
# Keep the first element of the decoder output; its shape is inspected in a later cell.
fs_decoder_out_decoder_out = fs_decoder_out[0]

In [41]:
fs_decoder_out[1].keys()

dict_keys(['attn', 'inner_states'])

In [42]:
#  fs_decoder_out[1]['inner_states'][-1].permute(1, 0, 2).shape

In [43]:
fs_decoder_out_inner_state = fs_decoder_out[1]['inner_states'][-1].permute(1, 0, 2)

In [44]:
fs_decoder_out_decoder_out.shape

torch.Size([2, 5, 50005])

## HuggingFace

In [45]:
hf_decoder = hf_model.model.decoder

In [46]:
hf_decoder_out = hf_decoder(input_ids, attention_mask=attention_mask, encoder_hidden_states=hf_encoder_out, encoder_attention_mask=attention_mask)

In [47]:
hf_decoder_out.last_hidden_state.shape

torch.Size([2, 5, 768])

In [48]:
torch.allclose(hf_decoder_out.last_hidden_state[0], fs_decoder_out_inner_state[0], atol=1e-5)

True

In [49]:
torch.allclose(hf_decoder_out.last_hidden_state[1, :-2], fs_decoder_out_inner_state[1, :-2], atol=1e-5)

True

In [50]:
torch.allclose(hf_decoder_out.last_hidden_state[1], fs_decoder_out_inner_state[1], atol=1e-5)

True

# Classification Head

## Fairseq

In [51]:
# Feed the decoder state at the last position of each sequence to the head.
# NOTE(review): for the padded second sequence the last position is a pad
# token; fairseq's own forward presumably pools at the eos token instead —
# confirm this shortcut is acceptable for the parity check.
logits = fs_model.classification_heads.sentence_classification_head(fs_decoder_out_inner_state[:, -1])

In [52]:
logits.shape

torch.Size([2, 2])

## HuggingFace

In [53]:
clf_logits = hf_model.classification_head(hf_decoder_out.last_hidden_state[:, -1])

In [54]:
hf_decoder_out.last_hidden_state[:, -1]

tensor([[-0.0129, -0.0352, -0.0964,  ..., -0.0557,  0.2081,  0.3102],
        [ 0.1923,  0.1242, -0.0606,  ...,  0.0995,  0.1881,  0.5741]],
       grad_fn=<SelectBackward>)

In [55]:
fs_decoder_out_inner_state[:, -1]

tensor([[-0.0129, -0.0352, -0.0964,  ..., -0.0557,  0.2081,  0.3102],
        [ 0.1923,  0.1242, -0.0606,  ...,  0.0995,  0.1881,  0.5741]],
       grad_fn=<SelectBackward>)

In [56]:
clf_logits.shape

torch.Size([2, 2])

In [57]:
torch.allclose(clf_logits[0], logits[0], atol=1e-5)

True

In [58]:
torch.allclose(clf_logits[1], logits[1], atol=1e-5)

True