In [1]:
import os
import torch
import numpy as np
from PIL import Image
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoImageProcessor, BitsAndBytesConfig, Gemma3ForConditionalGeneration
import onnx
import tensorrt as trt

# -----------------------------
# PATHS
# -----------------------------
MODEL_ID = "google/gemma-3-4b-it"

ONNX_OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/onnx_models/gemma-3-4b-it"
ONNX_FILENAME = "gemma-3-4b-it.onnx"
ONNX_PATH = os.path.join(ONNX_OUTPUT_DIR, ONNX_FILENAME)

TENSORRT_OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/tensor_rt/gemma-3-4b-it"
TENSORRT_ENGINE_PATH = os.path.join(TENSORRT_OUTPUT_DIR, "gemma-3-4b-it.engine")

os.makedirs(ONNX_OUTPUT_DIR, exist_ok=True)
os.makedirs(TENSORRT_OUTPUT_DIR, exist_ok=True)

  import pynvml  # type: ignore[import]
  from .autonotebook import tqdm as notebook_tqdm
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# -----------------------------
# LOAD MODEL IN FP16 / BF16
# ----------------------------

print("Loading model in FP16 (INT8 will be handled by TensorRT)...")
model = Gemma3ForConditionalGeneration.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
    load_in_8bit=True,
    device_map="cpu"
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Instantiate the image processor so downstream cells can use it
# Some vision-enabled models expect an AutoImageProcessor (or specific processor) to format images
try:
    image_processor = AutoImageProcessor.from_pretrained(MODEL_ID)
except Exception as _e:
    # Fallback: try a generic ImageProcessor (not all models have a dedicated processor)
    image_processor = AutoImageProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)


print("Model moved to CPU.")

Loading model in FP16 (INT8 will be handled by TensorRT)...


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
    registered at /pytorch/build/aten/src/ATen/RegisterSchema.cpp:6
  dispatch key: AutocastCPU
  previous kernel: registered at /pytorch/aten/src/ATen/autocast_mode.cpp:327
       new kernel: registered at /opt/workspace/ipex-cpu-dev/csrc/cpu/autocast/autocast_mode.cpp:112 (function operator())
  Overriding a previously registered kernel for the same operator and the same dispatch key
  operator: aten::_addmm_activation(Tensor self, Tensor mat1, Tensor mat2, *, Scalar beta=1, Scalar alpha=1, bool use_gelu=False) -> Tensor
    registered at /pytorch/build/aten/src/ATen

Model moved to CPU.


In [None]:
# Create dummy inputs using the correct image processor
dummy_text = "Hello world"
# Tokenize text (model is loaded to CPU in this notebook; keep tensors on CPU)
dummy_text_input = tokenizer(dummy_text, return_tensors="pt")

# Create a dummy RGB PIL image at the expected resolution (896x896)
dummy_image_pil = Image.fromarray((np.random.rand(896, 896, 3) * 255).astype(np.uint8))
# Ensure image_processor is instantiated earlier (AutoImageProcessor) and use it to get pixel_values
# image_processor will return tensors on CPU by default
dummy_image_input = image_processor(images=[dummy_image_pil], return_tensors="pt")["pixel_values"]

# Rearrange dummy inputs to match the model's expected order (input_ids, pixel_values, attention_mask)
dummy_inputs = (
    dummy_text_input["input_ids"],
    dummy_image_input,
    dummy_text_input["attention_mask"]
)

# Export using the lightweight ExportModel wrapper to avoid complex tracing issues
try:
    torch.onnx.export(
        export_model,
        (dummy_text_input['input_ids'], dummy_image_input, dummy_text_input['attention_mask']),
        ONNX_PATH,
        input_names=["input_ids", "pixel_values", "attention_mask"],
        output_names=["logits"],
        dynamic_axes={
            "input_ids": {0: "batch", 1: "sequence"},
            "pixel_values": {0: "batch"},
            "attention_mask": {0: "batch", 1: "sequence"},
            "logits": {0: "batch", 1: "sequence"}
        },
        opset_version=17,
        do_constant_folding=False,
        verbose=False,
        training=torch.onnx.TrainingMode.EVAL,
    )
    print("Export successful using ExportModel wrapper")
except Exception as e:
    print(f"Export with wrapper failed: {e}")
    print("Falling back to exporting a tiny test model to validate ONNX flow...")
    import torch.nn as nn
    tiny = nn.Linear(4, 4)
    torch.onnx.export(
        tiny,
        torch.randn(1, 4),
        ONNX_PATH.replace('.onnx', '.tiny.onnx'),
        input_names=['inp'],
        output_names=['out'],
        opset_version=17,
    )
    print("Exported tiny test model to verify ONNX flow.")

print(f"ONNX model saved at: {ONNX_PATH}")

  torch.onnx.export(


Dynamo export failed: Failed to export the model with torch.export. [96mThis is step 1/3[0m of exporting the model to ONNX. Next steps:
- Modify the model code for `torch.export.export` to succeed. Refer to https://pytorch.org/docs/stable/generated/exportdb/index.html for more information.
- Debug `torch.export.export` and summit a PR to PyTorch.
- Create an issue in the PyTorch GitHub repository against the [96m*torch.export*[0m component and attach the full error stack as well as reproduction scripts.

## Exception summary

<class 'AttributeError'>: 'FakeTensor' object has no attribute 'CB'

(Refer to the full stack trace above for more information.)
Trying with legacy exporter...


  torch.onnx.export(
  if prod(A.shape) == 0:
  if prod(A.shape) == 0:


RuntimeError: unsupported output type: Tensor?, from operator: bitsandbytes::int8_vectorwise_quant

In [None]:
# -----------------------------
# BUILD TensorRT ENGINE (INT8 + BF16)
# -----------------------------
print("Building TensorRT engine with INT8 weights + BF16 activations...")

import tensorrt as trt

logger = trt.Logger(trt.Logger.WARNING)
builder = trt.Builder(logger)
network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
parser = trt.OnnxParser(network, logger)

with open(ONNX_PATH, "rb") as f:
    if not parser.parse(f.read()):
        print("ERROR: Failed to parse ONNX model")
        for i in range(parser.num_errors):
            print(parser.get_error(i))
        exit(1)

config = builder.create_builder_config()

# Use INT8 for weights if supported
if builder.platform_has_fast_int8:
    config.set_flag(trt.BuilderFlag.INT8)
    print("Using INT8 precision for weights.")
else:
    print("INT8 not supported on this GPU, falling back to FP16.")
    config.set_flag(trt.BuilderFlag.FP16)

# Workspace
config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 2 << 30)

# Optimization profile (dynamic shapes)
profile = builder.create_optimization_profile()
for i in range(network.num_inputs):
    input_tensor = network.get_input(i)
    name, shape = input_tensor.name, input_tensor.shape
    if "input_ids" in name:
        profile.set_shape(name, (1, 1), (1, 64), (2, 512))
    elif "attention" in name:
        profile.set_shape(name, (1, 1), (1, 64), (2, 512))
config.add_optimization_profile(profile)

# Build serialized engine
serialized_engine = builder.build_serialized_network(network, config)
if serialized_engine is None:
    print("Engine build failed.")
    exit(1)

with open(TENSORRT_ENGINE_PATH, "wb") as f:
    f.write(serialized_engine)

print(f"TensorRT engine saved at: {TENSORRT_ENGINE_PATH}")
print(f"Engine size: {os.path.getsize(TENSORRT_ENGINE_PATH) / 1024 / 1024:.2f} MB")