In [1]:
from onnxruntime import InferenceSession
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from pathlib import Path
from transformers.convert_graph_to_onnx import convert

In [2]:
onnx_model_name="./onnx_model/roberta-base-squad2.onnx"
model_name="deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [5]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [3]:
def convert_for_onnx(model_path, onnx_model_name, tokenizer):
    if Path(onnx_model_name).exists():
        print("ONNX input exists")
        return
    convert(
        framework="pt",
        model=model_path,
        tokenizer=tokenizer,
        output=Path(onnx_model_name),
        pipeline_name="question-answering",
        opset=12
    ) 

In [6]:
convert_for_onnx(model_path=model, onnx_model_name=onnx_model_name, tokenizer=tokenizer)

ONNX input exists


In [7]:
# need example.context_text and example.question_text
example_dict={"context": "In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space", "question": "where is the businesses choosing to go?"}

In [35]:
padding="longest"
max_seq_len=384
doc_stride=128
max_question_len=64
max_answer_len=64

In [9]:
from transformers.data.processors import squad_convert_examples_to_features
from transformers.pipelines import QuestionAnsweringArgumentHandler

In [11]:
_arg_parser=QuestionAnsweringArgumentHandler()

In [12]:
examples= _arg_parser(example_dict)

In [13]:
from transformers.data.processors import SquadFeatures

In [14]:
import numpy as np

In [61]:
import torch 
device=torch.device("cpu")

In [54]:
features_list = []
for example in examples:
    # Define the side we want to truncate / pad and the text/pair sorting
    question_first = bool(tokenizer.padding_side == "right")
    encoded_inputs = tokenizer(
        text=example.question_text if question_first else example.context_text,
        text_pair=example.context_text if question_first else example.question_text,
        padding=padding,
        truncation="only_second" if question_first else "only_first",
        max_length=max_seq_len,
        stride=doc_stride,
        return_tensors="np",
        return_token_type_ids=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
    )
    
    num_spans = len(encoded_inputs["input_ids"])
    p_mask = np.asarray(
        [
            [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
            for span_id in range(num_spans)
        ]
    )

    # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
    if tokenizer.cls_token_id:
        cls_index = np.nonzero(encoded_inputs["input_ids"] == tokenizer.cls_token_id)
        p_mask[cls_index] = 0

    features = []
    for span_idx in range(num_spans):
        features.append(
            SquadFeatures(
                input_ids=encoded_inputs["input_ids"][span_idx],
                attention_mask=encoded_inputs["attention_mask"][span_idx],
                token_type_ids=encoded_inputs["token_type_ids"][span_idx],
                p_mask=p_mask[span_idx].tolist(),
                encoding=encoded_inputs[span_idx],
                # We don't use the rest of the values - and actually
                # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
                cls_index=None,
                token_to_orig_map={},
                example_index=0,
                unique_id=0,
                paragraph_len=0,
                token_is_max_context=0,
                tokens=[],
                start_position=0,
                end_position=0,
                is_impossible=False,
                qas_id=None,
                )
            )
        features_list.append(features)

In [55]:
for features, example in zip(features_list, examples):
    model_input_names = tokenizer.model_input_names + ["input_ids"]
    fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}

In [18]:
fw_args

{'attention_mask': [array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])],
 'input_ids': [array([    0,  8569,    16,     5,  1252,  8348,     7,   213,   116,
             2,     2,  1121,    63,   419,   107,     6,     5,    92,
          8825,  1312,  1447,     7,   972,  6856,     8,   903,  2113,
         31274,  1092,   742,   870,  5241,     6,   171, 10087,  1739,
          1252,    58,  8348,     5,   203,  2514,  8033, 33666,   824,
            11,   764,  2659,    81,     5,   764,  3071,  9127,   824,
           528,     7,     5,  5442,    18,  1804,   980,     4,    83,
          5250,  2450,     7, 

In [19]:
session = InferenceSession(onnx_model_name)

In [20]:
for inp in session.get_inputs():
    print(inp)

NodeArg(name='input_ids', type='tensor(int64)', shape=['batch', 'sequence'])
NodeArg(name='attention_mask', type='tensor(int64)', shape=['batch', 'sequence'])


In [21]:
from typing import Tuple

In [22]:
def decode(start: np.ndarray, end: np.ndarray, topk: int, max_answer_len: int) -> Tuple:
    """
    Take the output of any :obj:`ModelForQuestionAnswering` and will generate probabilities for each span to be the
    actual answer.
    In addition, it filters out some unwanted/impossible cases like answer len being greater than max_answer_len or
    answer end position being before the starting position. The method supports output the k-best answer through
    the topk argument.
    Args:
        start (:obj:`np.ndarray`): Individual start probabilities for each token.
        end (:obj:`np.ndarray`): Individual end probabilities for each token.
        topk (:obj:`int`): Indicates how many possible answer span(s) to extract from the model output.
        max_answer_len (:obj:`int`): Maximum size of the answer to extract from the model's output.
    """
    # Ensure we have batch axis
    if start.ndim == 1:
        start = start[None]

    if end.ndim == 1:
        end = end[None]

    # Compute the score of each tuple(start, end) to be the real answer
    outer = np.matmul(np.expand_dims(start, -1), np.expand_dims(end, 1))

    # Remove candidate with end < start and end - start > max_answer_len
    candidates = np.tril(np.triu(outer), max_answer_len - 1)

    #  Inspired by Chen & al. (https://github.com/facebookresearch/DrQA)
    scores_flat = candidates.flatten()
    if topk == 1:
        idx_sort = [np.argmax(scores_flat)]
    elif len(scores_flat) < topk:
        idx_sort = np.argsort(-scores_flat)
    else:
        idx = np.argpartition(-scores_flat, topk)[0:topk]
        idx_sort = idx[np.argsort(-scores_flat[idx])]

    start, end = np.unravel_index(idx_sort, candidates.shape)[1:]
    return start, end, candidates[0, start, end]


In [74]:
def predict_qa(model_type, features_list, examples):
    all_answers = []
    for features, example in zip(features_list, examples):
        model_input_names = tokenizer.model_input_names + ["input_ids"]
        fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}
        if model_type=="onnx":
            session = InferenceSession(onnx_model_name)
            output = session.run(None, fw_args)
            start=output[0]
            end=output[1]
        else:
            with torch.no_grad():
                fw_args = {k: torch.tensor(v, device=device) for (k, v) in fw_args.items()}
                # On Windows, the default int type in numpy is np.int32 so we get some non-long tensors.
                fw_args = {k: v.long() if v.dtype == torch.int32 else v for (k, v) in fw_args.items()}
                start, end = model_type(**fw_args)[:2]
                start, end = start.cpu().numpy(), end.cpu().numpy()


        min_null_score = 1000000  # large and positive
        answers = []
        for (feature, start_, end_) in zip(features, start, end):
            # Ensure padded tokens & question tokens cannot belong to the set of candidate answers.
            undesired_tokens = np.abs(np.array(feature.p_mask) - 1) & feature.attention_mask
    
            # Generate mask
            undesired_tokens_mask = undesired_tokens == 0.0

            # Make sure non-context indexes in the tensor cannot contribute to the softmax
            start_ = np.where(undesired_tokens_mask, -10000.0, start_)
            end_ = np.where(undesired_tokens_mask, -10000.0, end_)

            # Normalize logits and spans to retrieve the answer
            start_ = np.exp(start_ - np.log(np.sum(np.exp(start_), axis=-1, keepdims=True)))
            end_ = np.exp(end_ - np.log(np.sum(np.exp(end_), axis=-1, keepdims=True)))

            # Mask CLS
            start_[0] = end_[0] = 0.0
            starts, ends, scores = decode(start=start, end=end,topk=1, max_answer_len=max_answer_len)
            if not tokenizer.is_fast:
                char_to_word = np.array(example.char_to_word_offset)
                answers += [
                    {
                        "score": score.item(),
                        "start": np.where(char_to_word == feature.token_to_orig_map[s])[0][0].item(),
                        "end": np.where(char_to_word == feature.token_to_orig_map[e])[0][-1].item(),
                        "answer": " ".join(
                            example.doc_tokens[feature.token_to_orig_map[s] : feature.token_to_orig_map[e] + 1]
                            ),
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]
            else:
                question_first = bool(tokenizer.padding_side == "right")
                enc = feature.encoding
                # Sometimes the max probability token is in the middle of a word so:
                # - we start by finding the right word containing the token with `token_to_word`
                # - then we convert this word in a character span with `word_to_chars`
                answers += [
                    {
                        "score": score.item(),
                        "start": enc.word_to_chars(
                        enc.token_to_word(s), sequence_index=1 if question_first else 0 )[0],
                        "end": enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[1],
                        "answer": example.context_text[
                            enc.word_to_chars(enc.token_to_word(s), sequence_index=1 if question_first else 0)[0] : enc.word_to_chars(enc.token_to_word(e), sequence_index=1 if question_first else 0)[1]],
                    }
                    for s, e, score in zip(starts, ends, scores)
                ]
            answers = sorted(answers, key=lambda x: x["score"], reverse=True)[: 1]
            all_answers += answers
        if len(all_answers) == 1:
            return all_answers[0]
        return all_answers


In [75]:
pred_onnx=predict_qa(model_type="onnx", features_list=features_list, examples=examples)

In [58]:
pred_model=predict_qa(model_type=model, features_list=features_list, examples=examples)

In [76]:
pred_onnx

{'score': 96.2432861328125, 'start': 438, 'end': 443, 'answer': ', a $'}

In [60]:
pred_model

{'score': 96.2432861328125, 'start': 438, 'end': 443, 'answer': ', a $'}

In [62]:
for features, example in zip(features_list, examples):
    model_input_names = tokenizer.model_input_names + ["input_ids"]
    fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}
    with torch.no_grad():
        fw_args = {k: torch.tensor(v, device=device) for (k, v) in fw_args.items()}
        # On Windows, the default int type in numpy is np.int32 so we get some non-long tensors.
        fw_args = {k: v.long() if v.dtype == torch.int32 else v for (k, v) in fw_args.items()}
        start, end = model(**fw_args)[:2]
        start, end = start.cpu().numpy(), end.cpu().numpy()



In [69]:
for features, example in zip(features_list, examples):
    model_input_names = tokenizer.model_input_names + ["input_ids"]
    fw_args = {k: [feature.__dict__[k] for feature in features] for k in model_input_names}
    session = InferenceSession(onnx_model_name)
    output = session.run(None, fw_args)
    start_onnx=output[0]
    end_onnx=output[1]

In [72]:
start, start_onnx

(array([[ 2.5174756 , -9.336974  , -9.503601  , -9.283567  , -9.2332    ,
         -9.0281725 , -9.042689  , -8.827452  , -9.803331  , -9.7623415 ,
         -9.869641  , -7.7731953 , -8.48362   , -8.002133  , -9.462732  ,
         -9.973475  , -7.5483236 , -6.3274603 , -4.1583757 , -7.7243757 ,
         -8.653463  , -9.206422  , -9.023609  , -8.492439  , -9.467815  ,
         -8.710339  , -9.111895  , -8.046997  , -6.6119637 , -6.567126  ,
         -1.41188   , -4.6289377 , -7.875642  ,  0.84093297,  2.0125396 ,
         -4.012975  , -2.4589577 , -4.748082  , -0.12348899,  3.4789903 ,
          1.2759614 , -0.641837  ,  5.559165  , -2.6643484 , -2.3954556 ,
         -1.4975166 ,  3.0133183 , -1.9406399 , -3.4758637 ,  0.09910806,
          1.7524412 , -3.6651056 , -3.6433187 , -5.0818367 , -6.8084307 ,
         -8.037044  , -4.3071117 , -6.0028043 , -8.769691  , -4.5208206 ,
         -5.7817235 , -6.567108  , -7.3898325 , -7.8200207 , -9.358879  ,
         -9.039545  , -8.446899  , -8.

In [77]:
end, end_onnx

(array([[ 2.8266854 , -8.526317  , -8.608975  , -8.922647  , -8.779562  ,
         -8.767787  , -8.910081  , -8.093397  , -8.119008  , -8.366886  ,
         -8.068167  , -9.368933  , -9.19462   , -9.218822  , -7.867637  ,
         -7.989144  , -9.538399  , -9.151335  , -7.176478  , -3.3620288 ,
         -9.053052  , -8.856573  , -8.985335  , -8.094151  , -8.784493  ,
         -8.71572   , -7.2514143 , -5.2117157 , -5.2346363 , -0.91592056,
         -7.121268  , -3.8923995 , -3.0051386 , -5.377213  , -4.2859216 ,
          1.814283  , -1.9127815 , -6.796349  , -3.922427  , -3.5612164 ,
         -5.677236  , -1.9109424 , -2.496266  , -0.76523185,  5.523105  ,
         -2.8118653 , -2.6718936 ,  5.498702  , -1.9994283 , -5.6354527 ,
         -5.228826  , -0.06761378, -3.9976761 ,  3.1309412 , -3.9512823 ,
         -6.956884  , -7.705128  , -5.520048  , -7.4978147 , -7.50962   ,
         -2.060118  , -0.91590536, -9.25655   , -8.93781   , -7.879832  ,
         -9.095184  , -9.046648  , -9.

In [36]:
fw_args

{'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1]]),
 'input_ids': tensor([[    0,  8569,    16,     5,  1252,  8348,     7,   213,   116,     2,
              2,  1121,    63,   419,   107,     6,     5,    92,  8825,  1312,
           1447,     7,   972,  6856,     8,   903,  2113, 31274,  1092,   742,
            870,  5241,     6,   171, 10087,  1739,  1252,    58,  8348,     5,
            203,  2514,  8033, 33666,   824,    11,   764,  2659,    81,     5,
            764,  3071,  9127,   824,   528,     7,     5,  5442,    18,  1804,
            980,     4,    83,  5250,  2450,    

In [37]:
model(**fw_args)[:2]

(tensor([[ 2.5175, -9.3370, -9.5036, -9.2836, -9.2332, -9.0282, -9.0427, -8.8275,
          -9.8033, -9.7623, -9.8696, -7.7732, -8.4836, -8.0021, -9.4627, -9.9735,
          -7.5483, -6.3275, -4.1584, -7.7244, -8.6535, -9.2064, -9.0236, -8.4924,
          -9.4678, -8.7103, -9.1119, -8.0470, -6.6120, -6.5671, -1.4119, -4.6289,
          -7.8756,  0.8409,  2.0125, -4.0130, -2.4590, -4.7481, -0.1235,  3.4790,
           1.2760, -0.6418,  5.5592, -2.6643, -2.3955, -1.4975,  3.0133, -1.9406,
          -3.4759,  0.0991,  1.7524, -3.6651, -3.6433, -5.0818, -6.8084, -8.0370,
          -4.3071, -6.0028, -8.7697, -4.5208, -5.7817, -6.5671, -7.3898, -7.8200,
          -9.3589, -9.0395, -8.4469, -8.5160, -7.7395, -9.0783, -8.1258, -4.2110,
          -8.8292, -8.5163, -9.1294, -8.8214, -8.7204, -8.5580, -7.5532, -8.9369,
          -8.8408, -8.8931, -9.1339, -8.8616, -6.5671, -8.4583, -7.9381, -8.5961,
          -9.6361, -4.6034, -3.3610, -6.8178, -8.8042, -6.1084, -3.2126, -7.5068,
          -9.879

In [None]:
def predict(*args, **kwargs):
    kwargs.setdefault("padding", "longest")
    kwargs.setdefault("topk", 1)
    kwargs.setdefault("doc_stride", 128)
    kwargs.setdefault("max_answer_len", 15)
    kwargs.setdefault("max_seq_len", 384)
    kwargs.setdefault("max_question_len", 64)
    kwargs.setdefault("handle_impossible_answer", False)
    examples = _args_parser(*args, **kwargs)

    features_list = [squad_convert_examples_to_features(
        examples=[example],
        tokenizer=tokenizer,
        max_seq_length=kwargs["max_seq_len"],
        doc_stride=kwargs["doc_stride"],
        max_query_length=kwargs["max_question_len"],
        padding_strategy=PaddingStrategy.MAX_LENGTH.value,
        is_training=False,
        tqdm_enabled=False)
        for example in examples]