#### Cloning the onnxruntime git repo

In [11]:
# Fetch the onnxruntime source tree into the current working directory.
!git clone https://github.com/microsoft/onnxruntime.git

In [14]:
# Move up one directory; relative paths used by later cells resolve against the new working directory.
cd ..

C:\Users\Nandan\OneDrive\Documents\kilmb


In [1]:
import sys
if sys.platform in ["darwin", "win32"]:  # Mac or Windows
    !{sys.executable} -m pip install torch -q
else:
    !{sys.executable} -m pip install install torch --index-url https://download.pytorch.org/whl/cpu -q

!{sys.executable} -m pip install onnxruntime transformers==4.18 onnx psutil pandas py-cpuinfo py3nvml netron coloredlogs --no-warn-script-location -q

In [1]:
import os
import sys

# Turn off the Hugging Face Hub symlink warning on Windows.
if sys.platform == "win32":
    os.environ["HF_HUB_DISABLE_SYMLINKS_WARNING"] = "1"

# Directory (relative to the working directory) that stores the pretrained model.
cache_dir = os.path.join(".", "cache_models")
# exist_ok=True keeps the cell re-runnable and avoids a separate check-then-create step.
os.makedirs(cache_dir, exist_ok=True)

### Loading the GPT-2 model from Hugging Face

In [2]:
from onnxruntime.transformers.models.gpt2.gpt2_helper import Gpt2Helper, MyGPT2LMHeadModel
from transformers import AutoConfig
import torch

# Checkpoint name and the CPU device used by the rest of the notebook.
model_name_or_path = "gpt2"
device = torch.device("cpu")

# Load the configuration first, then the LM-head wrapper that reuses it.
config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
model = MyGPT2LMHeadModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
model.eval()
model.to(device)

print(model.config)

# Architecture sizes read from the loaded configuration.
num_layer = model.config.n_layer
num_attention_heads = model.config.n_head
hidden_size = model.config.n_embd

GPT2Config {
  "_name_or_path": "gpt2",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.18.0",
  "use_cache": true,
  "vocab_size": 50257
}



### Converting the model to ONNX format

In [None]:
!python -m onnxruntime.transformers.models.gpt2.convert_to_onnx -m gpt2 --output gpt_2.onnx -o -p fp32

### Comparing the PyTorch and ONNX Runtime outputs

In [105]:
from transformers import AutoTokenizer

# Default prompts for get_example_inputs; they tokenize to different lengths, so the batch is padded.
EXAMPLE_Text = ["best hotel in bay area", "here is an example of gpt2 model"]

def get_tokenizer(model_name_or_path, cache_dir):
    """Load the tokenizer for ``model_name_or_path`` and configure its padding.

    Padding goes on the left of each sequence, and the end-of-sequence token
    is reused as the padding token.
    """
    tok = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
    # Pad with EOS, on the left side.
    tok.pad_token = tok.eos_token
    tok.padding_side = "left"
    return tok

def get_example_inputs(prompt_text=EXAMPLE_Text):
    """Tokenize ``prompt_text`` and build the inputs for the first generation step.

    Args:
        prompt_text: list of prompt strings. The default is the ``EXAMPLE_Text``
            list that existed when this function was defined.

    Returns:
        A tuple ``(input_ids, attention_mask, position_ids, empty_past)``. The
        first three are int32 tensors; ``empty_past`` is a list with one float32
        tensor per layer whose past-sequence dimension has length 0.
    """
    tokenizer = get_tokenizer(model_name_or_path, cache_dir)
    encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)

    input_ids = torch.tensor(encodings_dict["input_ids"], dtype=torch.int32)
    attention_mask = torch.tensor(encodings_dict["attention_mask"], dtype=torch.int32)

    # Positions count attended tokens only; slots that would come out as -1 are set to 0.
    position_ids = attention_mask.long().cumsum(-1) - 1
    position_ids.masked_fill_(position_ids < 0, 0)
    position_ids = position_ids.to(torch.int32)

    # Empty past state for generating the first word.
    batch_size = input_ids.size(0)
    past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]
    empty_past = [torch.empty(past_shape, dtype=torch.float32, device=device) for _ in range(num_layer)]

    return input_ids, attention_mask, position_ids, empty_past

In [106]:
# Path of the exported ONNX model, relative to the working directory.
onnx_model_path = "gpt_2.onnx"

In [31]:
# Run the converter with the kernel's interpreter; stdout and stderr are redirected to export_output.txt.
!{sys.executable} -m onnxruntime.transformers.models.gpt2.convert_to_onnx -m $model_name_or_path --output $onnx_model_path -o -p fp32 -t 10 >export_output.txt 2>&1

In [107]:
# Show only the operator-summary line from the export log.
# The context manager closes the file even if reading fails.
with open("export_output.txt", "r") as export_log:
    for line in export_log:
        if "Optimized operators" in line:
            print(line)

Optimized operators:{'EmbedLayerNormalization': 1, 'Attention': 12, 'MultiHeadAttention': 0, 'Gelu': 0, 'FastGelu': 12, 'BiasGelu': 0, 'GemmFastGelu': 0, 'LayerNormalization': 0, 'SkipLayerNormalization': 24, 'QOrderedAttention': 0, 'QOrderedGelu': 0, 'QOrderedLayerNormalization': 0, 'QOrderedMatMul': 0}



### PyTorch output

In [108]:
from transformers import GPT2LMHeadModel

# GPT-2 LM-head model on CPU, used as the PyTorch side of the comparison.
device = torch.device("cpu")
torch_model = GPT2LMHeadModel.from_pretrained(model_name_or_path, config=config, cache_dir=cache_dir)
torch_model.eval()
torch_model.to(device)

# Build the example batch and show the three integer inputs.
input_ids, attention_mask, position_ids, empty_past = get_example_inputs()
for label, tensor in (
    ("input_ids", input_ids),
    ("attention_mask", attention_mask),
    ("position_ids", position_ids),
):
    print(label, tensor)

input_ids tensor([[50256, 50256, 50256, 50256, 13466,  7541,   287, 15489,  1989],
        [ 1456,   318,   281,  1672,   286,   308,   457,    17,  2746]],
       dtype=torch.int32)
attention_mask tensor([[0, 0, 0, 0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]], dtype=torch.int32)
position_ids tensor([[0, 0, 0, 0, 0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4, 5, 6, 7, 8]], dtype=torch.int32)
[tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64)), tensor([], size=(2, 2, 12, 0, 64))]


In [109]:
# Keyword inputs for the reference forward pass.
reference_inputs = dict(
    past_key_values=empty_past,
    attention_mask=attention_mask,
    position_ids=position_ids,
)
# Gradients are not needed for the comparison.
with torch.no_grad():
    torch_output = torch_model(input_ids, **reference_inputs)

### ONNX Runtime output

In [110]:
import onnxruntime
import numpy

input_ids, attention_mask, position_ids, empty_past = get_example_inputs()

# CPU-only session over the exported model.
session = onnxruntime.InferenceSession(onnx_model_path, providers=["CPUExecutionProvider"])

# Hand each torch tensor to ONNX Runtime as a contiguous NumPy array keyed by input name.
ort_inputs = {
    input_name: numpy.ascontiguousarray(tensor.cpu().numpy())
    for input_name, tensor in (
        ("input_ids", input_ids),
        ("attention_mask", attention_mask),
        ("position_ids", position_ids),
    )
}
ort_inputs

{'input_ids': array([[50256, 50256, 50256, 50256, 13466,  7541,   287, 15489,  1989],
        [ 1456,   318,   281,  1672,   286,   308,   457,    17,  2746]]),
 'attention_mask': array([[0, 0, 0, 0, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1]]),
 'position_ids': array([[0, 0, 0, 0, 0, 1, 2, 3, 4],
        [0, 1, 2, 3, 4, 5, 6, 7, 8]])}

In [111]:
# Add one past-state input per entry of empty_past, named past_0, past_1, ...
ort_inputs.update(
    (f"past_{i}", numpy.ascontiguousarray(past_i.cpu().numpy()))
    for i, past_i in enumerate(empty_past)
)
# No explicit output names are passed to the session.
ort_outputs = session.run(None, ort_inputs)

#### Output Comparison

In [112]:
# Multiply by the attention mask so positions with mask 0 contribute nothing,
# then take the largest absolute difference.
padding_mask = attention_mask.unsqueeze(2)
logits_masked_diff = (torch_output[0] - ort_outputs[0]) * padding_mask
max_logits_diff = logits_masked_diff.abs().max()
print("max logits diff (ignored padding)", max_logits_diff)

max logits diff (ignored padding) tensor(0.0002)


### Inference with torch and onnx runtime (text generation)

In [113]:
from typing import List, Dict
from onnxruntime import InferenceSession

from onnxruntime.transformers.io_binding_helper import TypeHelper
from onnxruntime.transformers.io_binding_helper import IOBindingHelper


def inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, past):
    """Run one forward pass through ``session`` using I/O binding.

    Output shapes are derived from the batch size and length of ``input_ids``
    and from the sequence length already held in ``past``. The bound output
    buffers are read back with ``return_numpy=False``.
    """
    batch_size, sequence_length = input_ids.size(0), input_ids.size(1)
    shapes = Gpt2Helper.get_output_shapes(
        batch_size=batch_size,
        past_sequence_length=past[0].size(3),
        sequence_length=sequence_length,
        config=config,
    )
    buffers = Gpt2Helper.get_output_buffers(shapes, device)

    # Bind the inputs and the pre-allocated output buffers, then execute.
    binding = IOBindingHelper.prepare_io_binding(
        session, input_ids, position_ids, attention_mask, past, buffers, shapes
    )
    session.run_with_iobinding(binding)

    return Gpt2Helper.get_outputs_from_io_binding_buffer(session, buffers, shapes, return_numpy=False)

In [114]:
def test_generation(tokenizer, input_text, ort_session=None, num_tokens_to_produce=30):
    """Greedily generate text for ``input_text`` and print the decoded result.

    Args:
        tokenizer: tokenizer that supplies the EOS id and decodes the generated ids.
        input_text: list containing exactly one prompt string.
        ort_session: ONNX Runtime session to run; when ``None`` the module-level
            ``torch_model`` is used instead.
        num_tokens_to_produce: maximum number of tokens appended to the prompt.

    Raises:
        AssertionError: if ``input_text`` does not contain exactly one prompt.
    """
    assert len(input_text) == 1  # This function requires batch_size==1
    use_onnxruntime = ort_session is not None
    print("Text generation using", "OnnxRuntime" if use_onnxruntime else "PyTorch", "...")
    eos_token_id = tokenizer.eos_token_id

    input_ids, attention_mask, position_ids, past = get_example_inputs(input_text)
    batch_size = input_ids.size(0)
    # Integer type of the encoded prompt; reused for every later step's input_ids.
    token_dtype = input_ids.dtype

    has_eos = torch.zeros(batch_size, dtype=torch.bool)

    all_token_ids = input_ids.clone()

    for _ in range(num_tokens_to_produce):
        if use_onnxruntime:
            outputs = inference_with_io_binding(ort_session, config, input_ids, position_ids, attention_mask, past)
        else:
            # Inference only: without no_grad the returned past would keep autograd history alive.
            with torch.no_grad():
                outputs = torch_model(
                    input_ids, attention_mask=attention_mask, position_ids=position_ids, past_key_values=past
                )

        next_token_logits = outputs[0][:, -1, :]
        # Greedy approach is used here. You can easily extend it to use beam search and sampling to pick next tokens.
        # dim=-1 selects one token per batch row rather than one index into the flattened logits.
        next_tokens = torch.argmax(next_token_logits, dim=-1)

        has_eos = has_eos | (next_tokens == eos_token_id)
        tokens_to_add = next_tokens.masked_fill(has_eos, eos_token_id)
        all_token_ids = torch.cat([all_token_ids, tokens_to_add.unsqueeze(-1)], dim=-1)

        # Update input_ids, attention_mask, position_ids and past.
        # argmax yields int64; cast back so each step feeds the same integer type as the first.
        input_ids = tokens_to_add.clone().detach().reshape([batch_size, 1]).to(device=device, dtype=token_dtype)
        position_ids = (position_ids[:, -1] + 1).reshape(batch_size, 1)
        attention_mask = torch.cat([attention_mask, torch.ones([batch_size, 1]).type_as(attention_mask)], 1).to(device)

        if use_onnxruntime:
            # outputs[0] holds the logits; the next num_layer entries are the per-layer past states.
            past = []
            for layer in range(num_layer):
                layer_past = outputs[layer + 1]
                if isinstance(layer_past, numpy.ndarray):
                    layer_past = torch.from_numpy(layer_past)
                else:
                    layer_past = layer_past.clone().detach()
                past.append(layer_past.to(device))
        else:
            past = list(outputs[1])  # past in torch output is tuple

        if torch.all(has_eos):
            break

    for output in all_token_ids:
        print("------------")
        print(tokenizer.decode(output, skip_special_tokens=True))

In [115]:
# Single prompt for the generation demo (rebinds the earlier two-prompt list); the inner single quotes are part of the prompt text.
EXAMPLE_Text = ["'I enjoy walking with my cute dog'"]

In [170]:
# Tokenizer and prompt list that are passed to test_generation in the timing cells.
tokenizer = get_tokenizer(model_name_or_path, cache_dir)
input_text = EXAMPLE_Text

In [178]:
import time  # imported in this cell so it runs on a fresh kernel

# Time greedy generation with the PyTorch model (no ort_session is passed).
start_time = time.time()

test_generation(tokenizer, input_text)

end_time = time.time()

elapsed_time_torch = end_time - start_time

Text generation using PyTorch ...
------------
'I enjoy walking with my cute dog'

The dog, named "Bunny", was born with a rare genetic condition called Down syndrome.

The condition causes the dog to have


In [177]:
import time

# Time the same generation call, this time routed through the ONNX Runtime session.
onnx_started_at = time.time()
test_generation(tokenizer, input_text, ort_session=session)
elapsed_time_onnx = time.time() - onnx_started_at

Text generation using OnnxRuntime ...
------------
'I enjoy walking with my cute dog'

The dog, named "Bunny", was born with a rare genetic condition called Down syndrome.

The condition causes the dog to have


In [179]:
# Report the elapsed wall-clock time (seconds, from time.time()) measured for each backend.
print(f"The time taken by onnx_runtime is : {elapsed_time_onnx}")
print(f"The time taken by torch runtime is : {elapsed_time_torch}")

The time taken by onnx_runtime is : 5.105276107788086
The time taken by torch runtime is : 7.234872341156006
