In [1]:
from pathlib import Path
from typing import Tuple

import numpy as np
from onnxruntime import InferenceSession
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers.convert_graph_to_onnx import convert

In Transformers v4.0.0, the default path to cache downloaded models changed from '~/.cache/torch/transformers' to '~/.cache/huggingface/transformers'. Since you don't seem to have overridden and '~/.cache/torch/transformers' is a directory that exists, we're moving it to '~/.cache/huggingface/transformers' to avoid redownloading models you have already in the cache. You should only see this message once.


In [2]:
# Where the exported ONNX graph lives, and the Hugging Face checkpoint it is built from.
onnx_model_name = "./onnx_model/roberta-base-squad2.onnx"
model_name = "deepset/roberta-base-squad2"

# Tokenizer that belongs to the checkpoint above.
tokenizer = AutoTokenizer.from_pretrained(model_name)

Downloading: 100%|██████████| 571/571 [00:00<00:00, 289kB/s]
Downloading: 100%|██████████| 899k/899k [00:00<00:00, 1.44MB/s]
Downloading: 100%|██████████| 456k/456k [00:00<00:00, 873kB/s]
Downloading: 100%|██████████| 772/772 [00:00<00:00, 327kB/s]
Downloading: 100%|██████████| 79.0/79.0 [00:00<00:00, 17.9kB/s]


In [3]:
# Load the question-answering checkpoint; this in-memory model is what gets handed to the ONNX export below.
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

Downloading: 100%|██████████| 496M/496M [01:37<00:00, 5.10MB/s]


In [4]:
def convert_for_onnx(model_path, onnx_model_name, tokenizer):
    """Export a question-answering model to ONNX unless the target file is already there.

    Args:
        model_path: Forwarded unchanged as the ``model`` argument of ``convert``.
        onnx_model_name: Filesystem location of the ONNX file to create.
        tokenizer: Forwarded unchanged as the ``tokenizer`` argument of ``convert``.

    Returns:
        None. When the target file already exists a notice is printed and
        nothing is exported.
    """
    target = Path(onnx_model_name)
    if not target.exists():
        # PyTorch framework, question-answering pipeline, ONNX opset 12.
        convert(
            framework="pt",
            model=model_path,
            tokenizer=tokenizer,
            output=target,
            pipeline_name="question-answering",
            opset=12,
        )
        return
    print("ONNX input exists")

In [5]:
# Export the loaded model (note: `model_path` receives the model object itself here, not a path).
# Re-running is cheap: the function returns early once the ONNX file exists.
convert_for_onnx(model_path=model, onnx_model_name=onnx_model_name, tokenizer=tokenizer)

ONNX opset version set to: 12
Loading pipeline (model: RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, 

In [77]:
# Raw QA sample; the argument handler below turns it into an example object
# exposing `context_text` and `question_text`.
example_dict = {
    "context": "In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space",
    "question": "where is the businesses choosing to go?",
}

In [86]:
# Tokenization settings consumed by the feature-building cells below.
padding = "longest"        # forwarded as the tokenizer's `padding` argument
max_seq_len = 384          # forwarded as `max_length` / `max_seq_length`
doc_stride = 128           # forwarded as `stride` / `doc_stride`
max_question_len = 64      # only read by the squad_convert_examples_to_features attempt

In [68]:
from transformers.data.processors import squad_convert_examples_to_features

In [70]:
from transformers.pipelines import QuestionAnsweringArgumentHandler

In [71]:
# Argument handler taken from transformers.pipelines; called below on the raw question/context dict.
_args_parser = QuestionAnsweringArgumentHandler()

In [78]:
# Iterable of example objects; later cells read `.question_text` and `.context_text` from each one.
examples=_args_parser(example_dict)

In [79]:
from transformers.tokenization_utils_base import PaddingStrategy

In [87]:
# NOTE: this legacy feature-extraction attempt fails with the tokenizer loaded above:
#   TypeError: _batch_encode_plus() got an unexpected keyword argument 'add_prefix_space'
# (see the recorded output). The manual encoding loop further down replaces it, so this
# cell is only a record of the failed attempt and breaks a Restart & Run All.
features_list = [
                squad_convert_examples_to_features(
                    examples=[example],
                    tokenizer=tokenizer,
                    max_seq_length=max_seq_len,
                    doc_stride=doc_stride,
                    max_query_length=max_question_len,
                    padding_strategy=PaddingStrategy.MAX_LENGTH.value,
                    is_training=False,
                    tqdm_enabled=False,
                )
                for example in examples
            ]

TypeError: _batch_encode_plus() got an unexpected keyword argument 'add_prefix_space'

In [88]:
from transformers.data.processors import SquadFeatures

In [91]:
# Build, for every example, the list of per-span SquadFeatures by calling the tokenizer directly.
features_list = []
for example in examples:
    # With right-side padding the question is passed as the first segment and the
    # context as the second (and vice versa otherwise); truncation is restricted
    # to whichever segment holds the context.
    question_first = tokenizer.padding_side == "right"
    encoded_inputs = tokenizer(
        text=example.question_text if question_first else example.context_text,
        text_pair=example.context_text if question_first else example.question_text,
        padding=padding,
        truncation="only_second" if question_first else "only_first",
        max_length=max_seq_len,
        stride=doc_stride,
        return_tensors="np",
        return_token_type_ids=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )

    # One row of input_ids per span produced by the tokenizer call above.
    num_spans = len(encoded_inputs["input_ids"])

    # p_mask marks tokens that may NOT be part of an answer: with the question first,
    # every token whose sequence id is not 1 (i.e. everything outside the second segment).
    p_mask = np.asarray(
        [
            [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
            for span_id in range(num_spans)
        ]
    )

    # Keep the cls_token unmasked (some models use it to indicate unanswerable questions).
    # Compare against None explicitly: the CLS id of this tokenizer is 0, which a plain
    # truthiness test would treat as "no CLS token" and silently skip.
    if tokenizer.cls_token_id is not None:
        cls_index = np.nonzero(encoded_inputs["input_ids"] == tokenizer.cls_token_id)
        p_mask[cls_index] = 0

    features = []
    for span_idx in range(num_spans):
        features.append(
            SquadFeatures(
                input_ids=encoded_inputs["input_ids"][span_idx],
                attention_mask=encoded_inputs["attention_mask"][span_idx],
                token_type_ids=encoded_inputs["token_type_ids"][span_idx],
                p_mask=p_mask[span_idx].tolist(),
                encoding=encoded_inputs[span_idx],
                # We don't use the rest of the values - and actually
                # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
                cls_index=None,
                token_to_orig_map={},
                example_index=0,
                unique_id=0,
                paragraph_len=0,
                token_is_max_context=0,
                tokens=[],
                start_position=0,
                end_position=0,
                is_impossible=False,
                qas_id=None,
            )
        )

    # Append once per example, after all spans are collected. Appending inside the
    # span loop would add the same list once per span and break the 1:1 pairing
    # with `examples` used by zip() later on.
    features_list.append(features)

In [92]:
# Inspect the result: one inner list per example, holding that example's span features.
features_list

[[<transformers.data.processors.squad.SquadFeatures at 0x7f775802cc10>]]

In [112]:
# For each example, gather the model inputs by name across all of its span features.
# (After the loop, `fw_args` holds the inputs of the last example only.)
for features, example in zip(features_list, examples):
    model_input_names = tokenizer.model_input_names + ["input_ids"]
    fw_args = {
        input_name: [vars(span_feature)[input_name] for span_feature in features]
        for input_name in model_input_names
    }

In [113]:
# Inspect the feed dict built above (prints the full arrays).
fw_args

{'attention_mask': [array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])],
 'input_ids': [array([    0,  8569,    16,     5,  1252,  8348,     7,   213,   116,
             2,     2,  1121,    63,   419,   107,     6,     5,    92,
          8825,  1312,  1447,     7,   972,  6856,     8,   903,  2113,
         31274,  1092,   742,   870,  5241,     6,   171, 10087,  1739,
          1252,    58,  8348,     5,   203,  2514,  8033, 33666,   824,
            11,   764,  2659,    81,     5,   764,  3071,  9127,   824,
           528,     7,     5,  5442,    18,  1804,   980,     4,    83,
          5250,  2450,     7, 

In [114]:
# Compact check of the feed dict: container type and which input names it carries.
type(fw_args), fw_args.keys()

(dict, dict_keys(['attention_mask', 'input_ids']))

In [115]:
# Open an ONNX Runtime session on the exported graph file.
session = InferenceSession(onnx_model_name)

In [137]:
# Print every input the exported graph declares, to compare against the keys of `fw_args`.
for inp in session.get_inputs():
    print(inp)

NodeArg(name='input_ids', type='tensor(int64)', shape=['batch', 'sequence'])
NodeArg(name='attention_mask', type='tensor(int64)', shape=['batch', 'sequence'])


In [138]:
# Run the graph on the feed dict; no output names are requested explicitly (None),
# and the recorded result below shows two outputs coming back.
output = session.run(None, fw_args)

In [139]:
# Shape check: number of outputs, container type, element type, and length of the first output's leading axis.
len(output), type(output), type(output[0]), len(output[0])

(2, list, numpy.ndarray, 1)

In [155]:
# Take row 0 of each output, i.e. the values for the first (here: only) span.
# NOTE(review): assumes output[0] holds start logits and output[1] end logits - confirm against the exported graph.
start=output[0][0]
end=output[1][0]

In [156]:
# Confirm the per-span slice is a numpy array, which is what decode() is annotated to take.
type(start)

numpy.ndarray

In [157]:
def decode(self, start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
    """
    Score every (start, end) token pair and return the ``topk`` best answer spans.

    The score of a span is the product of its start probability and its end probability. Spans whose end
    precedes their start, or that cover more than ``max_answer_len`` tokens, get a score of zero before ranking.

    Args:
        self: Unused. Kept so the signature stays call-compatible with existing callers; pass :obj:`None`
            when calling this as a plain function.
        start (:obj:`np.ndarray`): Individual start probabilities for each token (1-D, or 2-D with a leading
            batch axis of size 1).
        end (:obj:`np.ndarray`): Individual end probabilities for each token (same layout as ``start``).
        topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output.

    Returns:
        A 3-tuple ``(starts, ends, scores)`` of equal-length arrays: start token index, end token index and
        score of each selected span, ordered from best to worst.
    """
    # Ensure we have batch axis
    if start.ndim == 1:
        start = start[None]

    if end.ndim == 1:
        end = end[None]

    # Compute the score of each tuple(start, end) to be the real answer:
    # (batch, seq, 1) @ (batch, 1, seq) -> (batch, seq, seq), rows = start index, columns = end index.
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

    # Zero out candidates with end < start (triu) and with end - start >= max_answer_len (tril)
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) <= topk:
        # `<=` rather than `<`: np.argpartition(a, kth) requires kth < len(a), so asking for exactly as many
        # spans as there are candidates must take the full-sort path too.
        idx_sort = np.argsort(-scores_flat)
    else:
        # Partial selection of the topk best, then sort just those.
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    # Drop the batch coordinate; keep (start index, end index).
    start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
    return start, end, candidates[0, start, end]


In [None]:
# Longest answer span (in tokens) decode() may return.
# NOTE(review): 15 is believed to match the transformers QA pipeline default - adjust as needed.
max_answer_len = 15

# One session serves every example; reloading the ONNX graph inside the loop is wasted work.
session = InferenceSession(onnx_model_name)

all_answers = []
for features, example in zip(features_list, examples):
    model_input_names = tokenizer.model_input_names + ["input_ids"]
    fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}
    output = session.run(None, fw_args)

    # Keep the leading span axis so that zip() below pairs each feature with its own
    # row of logits. Indexing [0] here would make the loop iterate over single scalars.
    start, end = output[:2]

    min_null_score = 1000000  # large and positive
    answers = []
    for (feature, start_, end_) in zip(features, start, end):
        # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
        undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask

        # Generate mask
        undesired_tokens_mask = undesired_tokens == 0.0

        # Make sure non-context indexes in the tensor cannot contribute to the softmax
        start_ = np.where(undesired_tokens_mask, -10000.0, start_)
        end_ = np.where(undesired_tokens_mask, -10000.0, end_)

        # Normalize logits and spans to retrieve the answer
        start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
        end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

        # Mask CLS
        start_[0] = end_[0] = 0.0

        # decode() is defined with a leading, unused `self` parameter, so fill that slot explicitly.
        starts, ends, scores = decode(None, start_, end_, 1, max_answer_len)