In [1]:
from onnxruntime import InferenceSession
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from pathlib import Path
from transformers.convert_graph_to_onnx import convert

In [2]:
onnx_model_name="./onnx_model/roberta-base-squad2.onnx"
model_name="deepset/roberta-base-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

In [4]:
def convert_for_onnx(model_path, onnx_model_name, tokenizer):
    if Path(onnx_model_name).exists():
        print("ONNX input exists")
        return
    convert(
        framework="pt",
        model=model_path,
        tokenizer=tokenizer,
        output=Path(onnx_model_name),
        pipeline_name="question-answering",
        opset=12
    ) 

In [5]:
convert_for_onnx(model_path=model, onnx_model_name=onnx_model_name, tokenizer=tokenizer)

ONNX input exists


In [7]:
# need example.context_text and example.question_text
examples={"context_text": "In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space", "question_text": "where is the businesses choosing to go?"}

In [6]:
padding="longest"
max_seq_len=384
doc_stride=128


In [9]:
examples["question_text"]

'where is the businesses choosing to go?'

In [None]:
features_list = []
for example in examples:
    # Define the side we want to truncate / pad and the text/pair sorting
    question_first = bool(tokenizer.padding_side == "right")
    encoded_inputs = tokenizer(
        text=example.question_text if question_first else example.context_text,
        text_pair=example["context_text"] if question_first else example["question_text"],
        padding=padding,
        truncation="only_second" if question_first else "only_first",
        max_length=max_seq_len,
        stride=doc_stride,
        return_tensors="np",
        return_token_type_ids=True,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        return_special_tokens_mask=True,
        )
    
    # When the input is too long, it's converted in a batch of inputs with overflowing tokens
    # and a stride of overlap between the inputs. If a batch of inputs is given, a special output
    # "overflow_to_sample_mapping" indicate which member of the encoded batch belong to which original batch sample.
    # Here we tokenize examples one-by-one so we don't need to use "overflow_to_sample_mapping".
    # "num_span" is the number of output samples generated from the overflowing tokens.
    num_spans = len(encoded_inputs["input_ids"])

    # p_mask: mask with 1 for token than cannot be in the answer (0 for token which can be in an answer)
    # We put 0 on the tokens from the context and 1 everywhere else (question and special tokens)
    p_mask = np.asarray(
                    [
                        [tok != 1 if question_first else 0 for tok in encoded_inputs.sequence_ids(span_id)]
                        for span_id in range(num_spans)
                    ]
                )

                # keep the cls_token unmasked (some models use it to indicate unanswerable questions)
                if self.tokenizer.cls_token_id:
                    cls_index = np.nonzero(encoded_inputs["input_ids"] == self.tokenizer.cls_token_id)
                    p_mask[cls_index] = 0

                features = []
                for span_idx in range(num_spans):
                    features.append(
                        SquadFeatures(
                            input_ids=encoded_inputs["input_ids"][span_idx],
                            attention_mask=encoded_inputs["attention_mask"][span_idx],
                            token_type_ids=encoded_inputs["token_type_ids"][span_idx],
                            p_mask=p_mask[span_idx].tolist(),
                            encoding=encoded_inputs[span_idx],
                            # We don't use the rest of the values - and actually
                            # for Fast tokenizer we could totally avoid using SquadFeatures and SquadExample
                            cls_index=None,
                            token_to_orig_map={},
                            example_index=0,
                            unique_id=0,
                            paragraph_len=0,
                            token_is_max_context=0,
                            tokens=[],
                            start_position=0,
                            end_position=0,
                            is_impossible=False,
                            qas_id=None,
                        )
                    )
                features_list.append(features)

In [15]:
from transformers.data.processors.squad import squad_convert_example_to_features

examples = [{'context_text': "In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space",
'question_text': "where is the businesses choosing to go?"}]

In [20]:
from transformers.tokenization_utils_base import PaddingStrategy

In [22]:
squad_convert_example_to_features(examples=[examples], tokenizer=tokenizer, max_seq_length=384, doc_stride=128, max_query_length=64, is_training=False, return_dataset=False)

TypeError: squad_convert_example_to_features() got an unexpected keyword argument 'examples'

In [21]:
features_list=squad_convert_example_to_features(examples=[examples],
        tokenizer=tokenizer,
        max_seq_length=384,
        doc_stride=128,
        max_query_length=64,
        padding_strategy=PaddingStrategy.MAX_LENGTH.value,
        is_training=False,
        tqdm_enabled=False,
    )

TypeError: squad_convert_example_to_features() got an unexpected keyword argument 'examples'

In [24]:
model_name="deepset/roberta-base-squad2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForQuestionAnswering.from_pretrained(model_name)

from transformers import pipeline
nlp=pipeline(task="question-answering", model=model, tokenizer=tokenizer)

In [26]:
from transformers import pipelines

In [27]:
ArgumentHandler(ABS)

NameError: name 'ArgumentHandler' is not defined

In [25]:
transformers_query = {"context": examples["context_text"], "question": examples["question_text"]}
predictions = nlp(transformers_query, topk=1, max_seq_len=384, doc_stride=128)

TypeError: list indices must be integers or slices, not str

In [7]:
session = InferenceSession(onnx_model_name)

In [8]:
for input_meta in session.get_inputs():
    print(input_meta)

NodeArg(name='input_ids', type='tensor(int64)', shape=['batch', 'sequence'])
NodeArg(name='attention_mask', type='tensor(int64)', shape=['batch', 'sequence'])


In [9]:
context="In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space"
tokens_context = tokenizer(context, return_attention_mask=True, return_tensors="np", truncation=True)

In [10]:
question="where is the businesses choosing to go?"
tokens_question = tokenizer(question, return_attention_mask=True, return_tensors="np", truncation=True)

In [12]:
tokens_question.__dict__["data"]

{'input_ids': array([[   0, 8569,   16,    5, 1252, 8348,    7,  213,  116,    2]]),
 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [13]:
tokens_context.__dict__["data"]

{'input_ids': array([[    0,  1121,    63,   419,   107,     6,     5,    92,  8825,
          1312,  1447,     7,   972,  6856,     8,   903,  2113, 31274,
          1092,   742,   870,  5241,     6,   171, 10087,  1739,  1252,
            58,  8348,     5,   203,  2514,  8033, 33666,   824,    11,
           764,  2659,    81,     5,   764,  3071,  9127,   824,   528,
             7,     5,  5442,    18,  1804,   980,     4,    83,  5250,
          2450,     7,  2879,    41,  2919,  1241,    10,  2303,   629,
          1447,     7,  1338,     5,  1552,    80,    12, 10224,  1647,
             7,  1323,     4,    96,   502,  4013,     6,  2711,   764,
          3071,  1490,     5,   391,  1631,     6,    10,    68,   401,
             4,  4718,   153,     6,  2440,     8,  1104, 10178,     6,
          1271,  1812,     6,   151,  3925,  1730,    36,   406,     6,
          4017,   475,   176,    43,     9,  8483,   980,     2]]),
 'attention_mask': array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1