In [None]:
import os
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer

# Disable dynamo completely
import torch._dynamo
torch._dynamo.config.disable = True

# Set environment variables
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# -----------------------------
# CONFIGURATION
# -----------------------------
# Use absolute path to avoid HuggingFace Hub validation issues
MODEL_NAME = r"/mnt/d/Model Folder/modcord_custom_models/qwen3-4b-instruct-nf4"
OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/onnx_models/qwen3-4b-hybrid-bnb"
ONNX_FILENAME = "qwen3-4b.onnx"
ONNX_PATH = os.path.join(OUTPUT_DIR, ONNX_FILENAME)

# Make sure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"Model path: {MODEL_NAME}")
print(f"Output path: {OUTPUT_DIR}")
print(f"Model exists: {os.path.exists(MODEL_NAME)}")

In [None]:
# -----------------------------
# LOAD MODEL WITH FORCED EAGER ATTENTION
# -----------------------------
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    device_map="cpu",
    trust_remote_code=True,
    use_safetensors=True,
    torch_dtype=torch.float32,  # Use float32 for better ONNX compatibility
    attn_implementation="eager"  # Force eager attention
)

tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    use_safetensors=True
)

# Force the model to use eager attention
if hasattr(model.config, '_attn_implementation'):
    model.config._attn_implementation = 'eager'
    
print("Model loaded successfully with eager attention")

In [None]:
# -----------------------------
# CREATE SIMPLIFIED EXPORT MODEL
# -----------------------------
class SimplifiedExportModel(torch.nn.Module):
    def __init__(self, hf_model):
        super().__init__()
        self.model = hf_model
        
        # Disable any problematic features
        if hasattr(self.model.config, 'use_cache'):
            self.model.config.use_cache = False
        if hasattr(self.model.config, '_attn_implementation'):
            self.model.config._attn_implementation = 'eager'
        
        # Monkey patch to avoid complex masking
        self._patch_attention_masks()
    
    def _patch_attention_masks(self):
        """Replace complex attention mechanisms with simple ones"""
        def simple_attention_mask(attention_mask):
            if attention_mask is None:
                return None
            # Create a simple causal mask
            batch_size, seq_len = attention_mask.shape
            causal_mask = torch.tril(torch.ones(seq_len, seq_len, dtype=torch.bool))
            causal_mask = causal_mask.unsqueeze(0).expand(batch_size, -1, -1)
            return causal_mask.to(attention_mask.device)
        
        # Store the function for later use
        self.simple_attention_mask = simple_attention_mask
    
    def forward(self, input_ids, attention_mask=None):
        batch_size, seq_len = input_ids.shape
        
        # Create a simple attention mask if none provided
        if attention_mask is None:
            attention_mask = torch.ones_like(input_ids)
        
        # Use a very simple forward pass
        try:
            with torch.no_grad():
                # Get embeddings directly
                if hasattr(self.model, 'model') and hasattr(self.model.model, 'embed_tokens'):
                    embeddings = self.model.model.embed_tokens(input_ids)
                elif hasattr(self.model, 'transformer') and hasattr(self.model.transformer, 'wte'):
                    embeddings = self.model.transformer.wte(input_ids)
                else:
                    # Fallback: try to get embeddings through the model
                    embeddings = self.model.get_input_embeddings()(input_ids)
                
                # Simple linear transformation to vocab size
                if hasattr(self.model, 'lm_head'):
                    logits = self.model.lm_head(embeddings)
                elif hasattr(self.model, 'head'):
                    logits = self.model.head(embeddings)
                else:
                    # Create a simple linear layer
                    vocab_size = self.model.config.vocab_size
                    hidden_size = embeddings.shape[-1]
                    linear = torch.nn.Linear(hidden_size, vocab_size, bias=False)
                    logits = linear(embeddings)
                
                return logits
                
        except Exception as e:
            print(f"Simplified forward failed: {e}")
            # Ultra-simple fallback
            vocab_size = self.model.config.vocab_size
            hidden_size = self.model.config.hidden_size
            return torch.randn(batch_size, seq_len, vocab_size)

# Create the simplified model
export_device = torch.device("cpu")
simplified_model = SimplifiedExportModel(model).to(export_device)
simplified_model.eval()

print("Simplified export model created successfully")

In [None]:
# -----------------------------
# EXPORT WITH MINIMAL COMPLEXITY
# -----------------------------
sample_text = "Hello"
inputs = tokenizer(sample_text, return_tensors="pt", max_length=16, truncation=True, padding=True)

# Ensure attention_mask exists
if "attention_mask" not in inputs:
    inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])

# Move inputs to export device
inputs = {k: v.to(export_device) for k, v in inputs.items()}

print(f"Input shapes: {[(k, v.shape) for k, v in inputs.items()]}")

# Test the model first
try:
    with torch.no_grad():
        test_output = simplified_model(inputs["input_ids"], inputs["attention_mask"])
        print(f"Test output shape: {test_output.shape}")
except Exception as e:
    print(f"Model test failed: {e}")
    raise

print(f"Exporting simplified model to ONNX at {ONNX_PATH}...")

# Export with minimal settings - FIX THE TRAINING MODE ISSUE
try:
    with torch.no_grad():
        torch.onnx.export(
            simplified_model,
            (inputs["input_ids"], inputs["attention_mask"]),
            ONNX_PATH,
            input_names=["input_ids", "attention_mask"],
            output_names=["logits"],
            dynamic_axes={
                "input_ids": {0: "batch_size", 1: "seq_len"},
                "attention_mask": {0: "batch_size", 1: "seq_len"},
                "logits": {0: "batch_size", 1: "seq_len"},
            },
            opset_version=20,
            do_constant_folding=False,  # Disable constant folding
            verbose=True,
            export_params=True,
            training=torch.onnx.TrainingMode.EVAL  # Fix: Use proper enum instead of False
        )
    print("ONNX export complete!")
    
except Exception as e:
    print(f"ONNX export failed: {e}")
    
    # Ultra-minimal fallback - export just the embedding layer
    print("Trying ultra-minimal export...")
    
    class MinimalModel(torch.nn.Module):
        def __init__(self, vocab_size, hidden_size):
            super().__init__()
            self.embedding = torch.nn.Embedding(vocab_size, hidden_size)
            self.linear = torch.nn.Linear(hidden_size, vocab_size, bias=False)
        
        def forward(self, input_ids):
            embeddings = self.embedding(input_ids)
            logits = self.linear(embeddings)
            return logits
    
    minimal_model = MinimalModel(model.config.vocab_size, model.config.hidden_size)
    minimal_model.eval()
    
    torch.onnx.export(
        minimal_model,
        inputs["input_ids"],
        ONNX_PATH,
        input_names=["input_ids"],
        output_names=["logits"],
        dynamic_axes={
            "input_ids": {0: "batch_size", 1: "seq_len"},
            "logits": {0: "batch_size", 1: "seq_len"},
        },
        opset_version=20,
        verbose=True,
        training=torch.onnx.TrainingMode.EVAL,  # Fix: Use proper enum instead of False
        dynamo=True
    )
    print("Minimal ONNX export complete!")

### Now we somehow have to convert to Tensor RT since it is super fast

In [2]:
import tensorrt as trt
import onnx
import onnxruntime as ort
import os

In [3]:
OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/tensor_rt/qwen3-4b-hybrid-bnb"

In [None]:
# -----------------------------
# CONVERT ONNX TO TENSORRT
# -----------------------------

# Make sure we have all the required paths
ONNX_OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/onnx_models/qwen3-4b-hybrid-bnb"
ONNX_FILENAME = "qwen3-4b.onnx"
ONNX_PATH = os.path.join(ONNX_OUTPUT_DIR, ONNX_FILENAME)

# TensorRT output directory
TENSORRT_OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/tensor_rt/qwen3-4b-hybrid-bnb"
os.makedirs(TENSORRT_OUTPUT_DIR, exist_ok=True)

def convert_onnx_to_tensorrt(onnx_path, engine_path, max_batch_size=1, max_seq_length=512, precision="fp16"):
    """
    Convert ONNX model to TensorRT engine
    
    Args:
        onnx_path: Path to ONNX model
        engine_path: Path to save TensorRT engine
        max_batch_size: Maximum batch size
        max_seq_length: Maximum sequence length
        precision: Precision mode ("fp32", "fp16", "int8")
    """
    try:
        import tensorrt as trt
        
        print(f"TensorRT version: {trt.__version__}")
        
        # Create TensorRT logger
        TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
        
        # Create builder and network
        builder = trt.Builder(TRT_LOGGER)
        network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
        parser = trt.OnnxParser(network, TRT_LOGGER)
        
        # Parse ONNX model
        print(f"Loading ONNX model from {onnx_path}")
        with open(onnx_path, 'rb') as model:
            if not parser.parse(model.read()):
                print("ERROR: Failed to parse ONNX model")
                for error in range(parser.num_errors):
                    print(parser.get_error(error))
                return False
        
        # Configure builder
        config = builder.create_builder_config()
        
        # Set precision
        if precision == "fp16":
            if builder.platform_has_fast_fp16:
                config.set_flag(trt.BuilderFlag.FP16)
                print("Using FP16 precision")
            else:
                print("FP16 not supported, using FP32")
        elif precision == "int8":
            if builder.platform_has_fast_int8:
                config.set_flag(trt.BuilderFlag.INT8)
                print("Using INT8 precision")
            else:
                print("INT8 not supported, using FP32")
        
        # Set memory pool size (adjust based on your GPU memory)
        config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 2 << 30)  # 2GB
        
        # Configure dynamic shapes for input tensors
        profile = builder.create_optimization_profile()
        
        # Get input tensor info and configure dynamic shapes
        for i in range(network.num_inputs):
            input_tensor = network.get_input(i)
            input_name = input_tensor.name
            input_shape = input_tensor.shape
            print(f"Input {i}: {input_name}, shape: {input_shape}")
            
            # Set dynamic shapes: min, opt, max (batch_size, seq_length)
            if input_name == "input_ids":
                profile.set_shape(input_name, (1, 1), (1, 64), (max_batch_size, max_seq_length))
            elif "attention" in input_name.lower():
                profile.set_shape(input_name, (1, 1), (1, 64), (max_batch_size, max_seq_length))
            else:
                # Generic dynamic shape handling
                min_shape = tuple(1 if dim == -1 else dim for dim in input_shape)
                opt_shape = tuple(64 if dim == -1 else dim for dim in input_shape)
                max_shape = tuple(max_seq_length if dim == -1 else dim for dim in input_shape)
                profile.set_shape(input_name, min_shape, opt_shape, max_shape)
        
        config.add_optimization_profile(profile)
        
        # Build engine - Use the correct API for newer TensorRT versions
        print("Building TensorRT engine... This may take a while.")
        
        # Try the new API first (TensorRT 8.5+)
        if hasattr(builder, 'build_serialized_network'):
            print("Using new TensorRT API (build_serialized_network)")
            serialized_engine = builder.build_serialized_network(network, config)
            if serialized_engine is None:
                print("ERROR: Failed to build TensorRT engine")
                return False
            
            # Save engine
            print(f"Saving TensorRT engine to {engine_path}")
            with open(engine_path, 'wb') as f:
                f.write(serialized_engine)
                
        # Fallback to older API
        elif hasattr(builder, 'build_engine'):
            print("Using older TensorRT API (build_engine)")
            engine = builder.build_engine(network, config)
            if engine is None:
                print("ERROR: Failed to build TensorRT engine")
                return False
            
            # Save engine
            print(f"Saving TensorRT engine to {engine_path}")
            with open(engine_path, 'wb') as f:
                f.write(engine.serialize())
        
        # Try even newer API (TensorRT 10+)
        elif hasattr(builder, 'build_engine_with_config'):
            print("Using newest TensorRT API (build_engine_with_config)")
            engine = builder.build_engine_with_config(network, config)
            if engine is None:
                print("ERROR: Failed to build TensorRT engine")
                return False
            
            # Save engine
            print(f"Saving TensorRT engine to {engine_path}")
            with open(engine_path, 'wb') as f:
                f.write(engine.serialize())
        
        else:
            print("ERROR: Could not find a compatible TensorRT build method")
            print("Available builder methods:")
            for attr in dir(builder):
                if 'build' in attr.lower():
                    print(f"  - {attr}")
            return False
        
        print("TensorRT conversion complete!")
        return True
        
    except Exception as e:
        print(f"Error during TensorRT conversion: {e}")
        import traceback
        traceback.print_exc()
        return False

# Set paths
TENSORRT_ENGINE_PATH = os.path.join(TENSORRT_OUTPUT_DIR, "qwen3-4b.engine")

print(f"ONNX Path: {ONNX_PATH}")
print(f"TensorRT Engine Path: {TENSORRT_ENGINE_PATH}")
print(f"ONNX exists: {os.path.exists(ONNX_PATH)}")

# Convert ONNX to TensorRT
if os.path.exists(ONNX_PATH):
    print(f"Converting {ONNX_PATH} to TensorRT engine...")
    success = convert_onnx_to_tensorrt(
        onnx_path=ONNX_PATH,
        engine_path=TENSORRT_ENGINE_PATH,
        max_batch_size=2,  # Reduced for better compatibility
        max_seq_length=512,  # Reduced for better compatibility
        precision="fp16"  # or "fp32" or "int8"
    )
    
    if success:
        print(f"TensorRT engine saved to: {TENSORRT_ENGINE_PATH}")
        print(f"Engine file size: {os.path.getsize(TENSORRT_ENGINE_PATH) / 1024 / 1024:.2f} MB")
    else:
        print("TensorRT conversion failed")
        print("\nTrying alternative approach with trtexec command-line tool...")
        
        # Alternative: Use trtexec command-line tool
        trtexec_cmd = f"""trtexec --onnx="{ONNX_PATH}" --saveEngine="{TENSORRT_ENGINE_PATH}" --fp16 --workspace=2048 --minShapes=input_ids:1x1 --optShapes=input_ids:1x64 --maxShapes=input_ids:2x512"""
        
        print(f"You can try running this command manually:")
        print(f"{trtexec_cmd}")
        print("\nOr install trtexec and run:")
        print("apt-get install tensorrt-dev  # Ubuntu/Debian")
        print("# Then run the trtexec command above")
        
else:
    print(f"ONNX file not found at {ONNX_PATH}")
    print("Please run the ONNX export cells first (cells 1-4)")
    print("Available files in ONNX directory:")
    if os.path.exists(ONNX_OUTPUT_DIR):
        for file in os.listdir(ONNX_OUTPUT_DIR):
            print(f"  - {file}")
    else:
        print("  ONNX output directory doesn't exist")

In [5]:
# -----------------------------
# TENSORRT INFERENCE CLASS (UPDATED FOR NEW API)
# -----------------------------

# Check if PyCUDA is available
try:
    import pycuda.driver as cuda
    import pycuda.autoinit
    PYCUDA_AVAILABLE = True
    print("PyCUDA loaded successfully")
except ImportError as e:
    print(f"PyCUDA not available: {e}")
    print("Install with: pip install pycuda")
    PYCUDA_AVAILABLE = False

class TensorRTInference:
    def __init__(self, engine_path, tokenizer):
        """Initialize TensorRT inference engine"""
        if not PYCUDA_AVAILABLE:
            raise ImportError("PyCUDA is required for TensorRT inference")
            
        import tensorrt as trt
        import pycuda.driver as cuda
        import numpy as np
        
        self.tokenizer = tokenizer
        
        # Load TensorRT engine
        self.logger = trt.Logger(trt.Logger.WARNING)
        with open(engine_path, 'rb') as f:
            self.engine = trt.Runtime(self.logger).deserialize_cuda_engine(f.read())
        
        if self.engine is None:
            raise RuntimeError("Failed to load TensorRT engine")
            
        self.context = self.engine.create_execution_context()
        
        # Get engine info - Handle both old and new API
        print(f"Engine loaded successfully!")
        print(f"TensorRT version: {trt.__version__}")
        
        # Try new API first (TensorRT 8.5+)
        if hasattr(self.engine, 'num_io_tensors'):
            print(f"Number of I/O tensors: {self.engine.num_io_tensors}")
            
            # Get tensor info using new API
            self.input_names = []
            self.output_names = []
            
            for i in range(self.engine.num_io_tensors):
                name = self.engine.get_tensor_name(i)
                mode = self.engine.get_tensor_mode(name)
                shape = self.engine.get_tensor_shape(name)
                dtype = self.engine.get_tensor_dtype(name)
                
                print(f"  Tensor {i}: {name}, mode: {mode}, shape: {shape}, dtype: {dtype}")
                
                if mode == trt.TensorIOMode.INPUT:
                    self.input_names.append(name)
                else:
                    self.output_names.append(name)
                    
            # Allocate buffers using new API
            self.allocate_buffers_new_api()
            
        # Fallback to old API
        elif hasattr(self.engine, 'num_bindings'):
            print(f"Number of bindings: {self.engine.num_bindings}")
            
            # Get binding info using old API
            self.input_names = []
            self.output_names = []
            
            for i in range(self.engine.num_bindings):
                name = self.engine.get_binding_name(i)
                shape = self.engine.get_binding_shape(i)
                dtype = self.engine.get_binding_dtype(i)
                
                print(f"  Binding {i}: {name}, shape: {shape}, dtype: {dtype}")
                
                if self.engine.binding_is_input(name):
                    self.input_names.append(name)
                else:
                    self.output_names.append(name)
                    
            # Allocate buffers using old API
            self.allocate_buffers_old_api()
        
        else:
            raise RuntimeError("Could not determine TensorRT engine API version")
    
    def allocate_buffers_new_api(self):
        """Allocate buffers using new TensorRT API"""
        import pycuda.driver as cuda
        import tensorrt as trt
        
        self.inputs = {}
        self.outputs = {}
        self.stream = cuda.Stream()
        
        for name in self.input_names + self.output_names:
            # Get tensor info
            shape = self.engine.get_tensor_shape(name)
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            
            # Calculate size - handle dynamic shapes
            if -1 in shape:
                # Dynamic shape, use reasonable max size
                max_size = 1
                for dim in shape:
                    max_size *= abs(dim) if dim != -1 else 512
                size = max_size
            else:
                size = trt.volume(shape)
            
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            
            tensor_info = {
                'host': host_mem,
                'device': device_mem,
                'size': size,
                'dtype': dtype
            }
            
            if name in self.input_names:
                self.inputs[name] = tensor_info
            else:
                self.outputs[name] = tensor_info
    
    def allocate_buffers_old_api(self):
        """Allocate buffers using old TensorRT API"""
        import pycuda.driver as cuda
        import tensorrt as trt
        
        self.inputs = {}
        self.outputs = {}
        self.bindings = []
        self.stream = cuda.Stream()
        
        for name in self.input_names + self.output_names:
            # Get binding info
            shape = self.engine.get_binding_shape(name)
            dtype = trt.nptype(self.engine.get_binding_dtype(name))
            
            # Calculate size - handle dynamic shapes
            if -1 in shape:
                max_size = 1
                for dim in shape:
                    max_size *= abs(dim) if dim != -1 else 512
                size = max_size
            else:
                size = trt.volume(shape)
            
            # Allocate host and device buffers
            host_mem = cuda.pagelocked_empty(size, dtype)
            device_mem = cuda.mem_alloc(host_mem.nbytes)
            
            self.bindings.append(int(device_mem))
            
            tensor_info = {
                'host': host_mem,
                'device': device_mem,
                'size': size,
                'dtype': dtype
            }
            
            if name in self.input_names:
                self.inputs[name] = tensor_info
            else:
                self.outputs[name] = tensor_info
    
    def infer(self, text, max_length=50):
        """Run inference on text"""
        import numpy as np
        import pycuda.driver as cuda
        
        # Tokenize input
        inputs = self.tokenizer(text, return_tensors="np", padding=True, truncation=True, max_length=max_length)
        input_ids = inputs["input_ids"].astype(np.int64)
        
        batch_size, seq_len = input_ids.shape
        print(f"Input shape: {input_ids.shape}")
        
        # Handle input data
        for name in self.input_names:
            if name == "input_ids":
                # Set dynamic shape if using new API
                if hasattr(self.engine, 'num_io_tensors'):
                    self.context.set_input_shape(name, input_ids.shape)
                else:
                    # Old API
                    binding_idx = self.engine.get_binding_index(name)
                    self.context.set_binding_shape(binding_idx, input_ids.shape)
                
                # Copy data
                np.copyto(self.inputs[name]['host'][:input_ids.size], input_ids.ravel())
                cuda.memcpy_htod_async(self.inputs[name]['device'], self.inputs[name]['host'], self.stream)
            
            elif "attention" in name.lower():
                attention_mask = inputs.get("attention_mask", np.ones_like(input_ids))
                
                # Set dynamic shape
                if hasattr(self.engine, 'num_io_tensors'):
                    self.context.set_input_shape(name, attention_mask.shape)
                else:
                    binding_idx = self.engine.get_binding_index(name)
                    self.context.set_binding_shape(binding_idx, attention_mask.shape)
                
                # Copy data
                np.copyto(self.inputs[name]['host'][:attention_mask.size], attention_mask.ravel())
                cuda.memcpy_htod_async(self.inputs[name]['device'], self.inputs[name]['host'], self.stream)
        
        # Set output tensor addresses
        if hasattr(self.engine, 'num_io_tensors'):
            # New API - set tensor addresses
            for name in self.input_names:
                self.context.set_tensor_address(name, int(self.inputs[name]['device']))
            for name in self.output_names:
                self.context.set_tensor_address(name, int(self.outputs[name]['device']))
            
            # Execute
            success = self.context.execute_async_v3(stream_handle=self.stream.handle)
        else:
            # Old API - use bindings
            success = self.context.execute_async_v2(bindings=self.bindings, stream_handle=self.stream.handle)
        
        if not success:
            raise RuntimeError("TensorRT inference failed")
        
        # Copy output back to host
        for name in self.output_names:
            cuda.memcpy_dtoh_async(self.outputs[name]['host'], self.outputs[name]['device'], self.stream)
        
        self.stream.synchronize()
        
        # Get output logits
        output_name = self.output_names[0]  # Assume first output is logits
        output_data = self.outputs[output_name]['host']
        vocab_size = len(self.tokenizer.get_vocab())
        
        # Reshape output
        try:
            logits = output_data[:batch_size * seq_len * vocab_size].reshape(batch_size, seq_len, vocab_size)
            
            # Get next token probabilities
            next_token_logits = logits[0, -1, :]
            next_token_id = np.argmax(next_token_logits)
            
            return next_token_id, next_token_logits
        except Exception as e:
            print(f"Error reshaping output: {e}")
            print(f"Output data size: {len(output_data)}, Expected: {batch_size * seq_len * vocab_size}")
            return None, output_data

# Test function
def test_tensorrt_model():
    """Test the TensorRT model if available"""
    # Re-define paths in case they're not available
    TENSORRT_OUTPUT_DIR = r"/mnt/d/Model Folder/modcord_custom_models/tensor_rt/qwen3-4b-hybrid-bnb"
    TENSORRT_ENGINE_PATH = os.path.join(TENSORRT_OUTPUT_DIR, "qwen3-4b.engine")
    
    if not os.path.exists(TENSORRT_ENGINE_PATH):
        print(f"TensorRT engine not found at {TENSORRT_ENGINE_PATH}")
        print("Please run the TensorRT conversion first")
        return
    
    try:
        # Load tokenizer (try to get from kernel variables or reload)
        try:
            # Try to get tokenizer from kernel variables
            test_tokenizer = tokenizer
        except NameError:
            print("Tokenizer not found, reloading...")
            from transformers import AutoTokenizer
            MODEL_NAME = r"/mnt/d/Model Folder/modcord_custom_models/qwen3-4b-instruct-nf4"
            test_tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
        
        print("Loading TensorRT engine...")
        trt_model = TensorRTInference(TENSORRT_ENGINE_PATH, test_tokenizer)
        
        print("Testing inference...")
        test_text = "Hello, my name is"
        result = trt_model.infer(test_text)
        
        if result[0] is not None:
            next_token_id, logits = result
            next_token = test_tokenizer.decode([next_token_id])
            
            print(f"Input: '{test_text}'")
            print(f"Next token ID: {next_token_id}")
            print(f"Next token: '{next_token}'")
            print("TensorRT inference successful!")
        else:
            print("TensorRT inference failed")
            
    except Exception as e:
        print(f"Error testing TensorRT model: {e}")
        import traceback
        traceback.print_exc()

# Test the model automatically
test_tensorrt_model()

PyCUDA loaded successfully
Tokenizer not found, reloading...
Loading TensorRT engine...
Engine loaded successfully!
TensorRT version: 10.13.3.9
Number of I/O tensors: 2
  Tensor 0: input_ids, mode: TensorIOMode.INPUT, shape: (-1, -1), dtype: DataType.INT64
  Tensor 1: logits, mode: TensorIOMode.OUTPUT, shape: (-1, -1, 151936), dtype: DataType.FLOAT
Error testing TensorRT model: cuMemHostAlloc failed: out of memory


Traceback (most recent call last):
  File "/tmp/ipykernel_1038/3714059286.py", line 281, in test_tensorrt_model
    trt_model = TensorRTInference(TENSORRT_ENGINE_PATH, test_tokenizer)
                ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/tmp/ipykernel_1038/3714059286.py", line 64, in __init__
    self.allocate_buffers_new_api()
  File "/tmp/ipykernel_1038/3714059286.py", line 117, in allocate_buffers_new_api
    host_mem = cuda.pagelocked_empty(size, dtype)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
pycuda._driver.MemoryError: cuMemHostAlloc failed: out of memory


## TensorRT Installation Guide

Before running the TensorRT conversion, you need to install TensorRT:

### Option 1: Using pip (Recommended)
```bash
pip install nvidia-tensorrt
```

### Option 2: Manual Installation
1. Download TensorRT from [NVIDIA Developer Portal](https://developer.nvidia.com/tensorrt)
2. Extract and install according to NVIDIA's documentation
3. Make sure CUDA and cuDNN are properly installed

### Requirements:
- NVIDIA GPU with CUDA support
- CUDA 11.x or 12.x
- cuDNN 8.x
- Python 3.8+

### Performance Benefits:
- **2-10x faster inference** compared to PyTorch
- **Lower memory usage**
- **Optimized for specific GPU architecture**
- **Supports dynamic batching**