In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, os

import onnx
import onnxruntime as ort
from onnxruntime.quantization import quantize_dynamic, QuantType

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model_name = "meta-llama/Llama-3.1-8B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.11it/s]


In [3]:
save_dir = "/mnt/2T/Codes/models/quantized_model"
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, "llama3.1-8B-instruct.onnx")
quantized_model_path = os.path.join(save_dir, "llama3.1-8B-instruct-quantized.onnx")

In [4]:
model.eval()

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(128256, 4096)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
          (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (up_proj): Linear(in_features=4096, out_features=14336, bias=False)
          (down_proj): Linear(in_features=14336, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
        (post_attention_layernorm): LlamaRMSNorm((4096,), eps=1e-05)
      )
    )
    (n

In [5]:
input_names = ["input_ids"]
output_names = ["output"]

In [6]:
input_text = "This is a sample input."
inputs = tokenizer(input_text, return_tensors="pt")

In [16]:
print(inputs)

{'input_ids': tensor([[128000,   2028,    374,    264,   6205,   1988,     13]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1]])}


In [8]:
outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])

We detected that you are passing `past_key_values` as a tuple and this is deprecated and will be removed in v4.43. Please use an appropriate `Cache` class (https://huggingface.co/docs/transformers/v4.41.3/en/internal/generation_utils#transformers.Cache)


In [9]:
print(outputs.keys())

odict_keys(['logits', 'past_key_values'])


In [21]:
print(outputs.logits.shape)
print(outputs.past_key_values)

torch.Size([1, 7, 128256])
((tensor([[[[ 5.5868e-01,  9.8483e-01,  1.0634e+00,  ...,  1.3285e+00,
           -9.2118e-02,  4.1570e-01],
          [ 9.2978e-01,  1.2283e-01, -6.6437e-02,  ..., -1.4198e+00,
           -1.7475e+00, -1.5809e+00],
          [-3.5403e+00, -2.1394e+00, -1.9625e+00,  ...,  3.9188e-01,
           -2.0951e+00, -1.8113e+00],
          ...,
          [-2.7057e+00, -2.3835e+00, -2.6107e+00,  ...,  3.7589e-01,
           -1.6461e+00, -1.5153e+00],
          [ 4.0626e+00,  2.2509e-01, -1.2498e+00,  ...,  1.1277e+00,
           -1.6242e+00, -1.7077e+00],
          [ 2.5035e+00,  9.8973e-01, -5.5071e-01,  ...,  6.0556e-01,
           -1.5994e+00, -9.7771e-01]],

         [[ 1.7906e-02, -7.5300e-02,  1.9733e-02,  ..., -1.1907e+00,
            8.1819e-01,  6.0337e-02],
          [ 9.3476e-01, -2.2744e+00,  1.5036e+00,  ...,  2.2894e+00,
           -2.3290e+00, -2.1614e-01],
          [ 8.2517e-01, -1.3896e+00,  9.7922e-01,  ...,  1.8071e+00,
           -9.9578e-01, -1.58

In [None]:
output = model.generate(**inputs)


In [11]:
print(output)

tensor([[128000,   2028,    374,    264,   6205,   1988,     13,   1472,    649,
           2349,    433,    311,   7937,    701,   3966,    382,  14196,   4077,
            517,    220]])


In [13]:
output_text = tokenizer.decode(output[0], skip_special_tokens=True)

In [14]:
print(output_text)

This is a sample input. You can change it to suit your needs.

```
{
 
