In [1]:
import requests

# Source of the ONNX model and the local file it is saved to.
file_url = "https://huggingface.co/datasets/nhotin/segment-text/resolve/main/model.onnx"
file_path = "model.onnx"

# `timeout` prevents the cell from hanging forever on a stalled connection, and the
# `with` block releases the streamed connection even if writing the file fails.
with requests.get(file_url, stream=True, timeout=60) as response:
    if response.status_code == 200:
        with open(file_path, "wb") as f:
            # 1 MiB chunks: far fewer Python-level iterations than 1 KiB for a model file.
            for chunk in response.iter_content(chunk_size=1 << 20):
                f.write(chunk)
        print(f"Đã tải file ONNX thành công: {file_path}")
    else:
        print(f"Lỗi khi tải file: {response.status_code}")


Đã tải file ONNX thành công: model.onnx


In [2]:
!pip install numpy onnx
!pip install tensorrt==10.7.0
!pip install pycuda



In [8]:
import os

# CUDA_VISIBLE_DEVICES is read when the CUDA driver is initialised, and
# `import pycuda.autoinit` initialises CUDA as a side effect of the import.
# The variable therefore has to be set before the CUDA-related imports below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

import time

import numpy as np
import onnx
import pycuda.driver as cuda
import pycuda.autoinit
import tensorrt as trt

In [4]:
def check_tensorrt():
    """Print the installed TensorRT version and the output of ``nvcc --version``.

    If the ``tensorrt`` package cannot be imported, a short notice is printed
    instead and nothing else is reported.

    Returns:
        None. All information is written to stdout.
    """
    try:
        # The import lives inside the `try` so that a missing package actually
        # reaches the handler; merely reading `trt.__version__` can never raise
        # ModuleNotFoundError.
        import tensorrt as trt
    except ModuleNotFoundError:
        print("TensorRT chưa được cài")
        return

    print(f"TensorRT version: {trt.__version__}")
    # Close the pipe explicitly instead of leaking it until garbage collection.
    with os.popen('nvcc --version') as pipe:
        print(f"CUDA version: {pipe.read().strip()}")

check_tensorrt()

TensorRT version: 10.7.0
CUDA version: nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0


In [5]:
def check_onnx_model(onnx_path):
    """Load an ONNX file and print its graph name and the I/O tensor element types.

    Args:
        onnx_path: Path of the ONNX model file passed to ``onnx.load``.

    Returns:
        None. One line is printed for the graph name, then one line per graph
        input followed by one line per graph output.
    """
    graph = onnx.load(onnx_path).graph
    print(f"Model name: {graph.name}")

    # Inputs are listed before outputs; both are formatted the same way.
    for label, tensors in (("Input", graph.input), ("Output", graph.output)):
        for tensor in tensors:
            elem_type = tensor.type.tensor_type.elem_type
            dtype = onnx.TensorProto.DataType.Name(elem_type)
            print(f"{label}: {tensor.name}, Type: {dtype}")

onnx_path = "/kaggle/working/model.onnx"
check_onnx_model(onnx_path)

Model name: main_graph
Input: input_ids, Type: INT32
Input: attention_mask, Type: FLOAT16
Output: logits, Type: FLOAT16


In [9]:
# import tensorrt as trt

# def build_engine(onnx_file_path, engine_file_path, dynamic_shape=True, fp16=True):
#     TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
#     builder = trt.Builder(TRT_LOGGER)
#     network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH))
#     parser = trt.OnnxParser(network, TRT_LOGGER)

#     # Đọc mô hình ONNX
#     with open(onnx_file_path, 'rb') as model:
#         if not parser.parse(model.read()):
#             print('Lỗi: Không thể parse file ONNX')
#             for error in range(parser.num_errors):
#                 print(parser.get_error(error))
#             return None

#     config = builder.create_builder_config()
#     config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)  # 1GB workspace

#     if fp16 and builder.platform_has_fast_fp16:
#         config.set_flag(trt.BuilderFlag.FP16)
#         print("Đang sử dụng FP16 để tăng tốc")

#     if dynamic_shape:
#         profile = builder.create_optimization_profile()
#         # Thêm dynamic shape cho cả input_ids và attention_mask
#         profile.set_shape("input_ids", (1, 16), (1, 128), (1, 512))  # min, opt, max
#         profile.set_shape("attention_mask", (1, 16), (1, 128), (1, 512))  # min, opt, max
#         config.add_optimization_profile(profile)
#         print("Hỗ trợ Dynamic Shape cho `input_ids` và `attention_mask`!")

#     engine = builder.build_serialized_network(network, config)

#     if engine is None:
#         print("Lỗi: Không thể build engine TensorRT.")
#         return None

#     with open(engine_file_path, 'wb') as f:
#         f.write(engine)

#     print(f"Model converted successfully! Saved as {engine_file_path}")
#     return engine

# engine_model = "model.engine"
# build_engine(onnx_path, engine_model, dynamic_shape=True, fp16=True)

def build_engine(onnx_file_path, engine_file_path, dynamic_shape=True, fp16=True):
    """Parse an ONNX model, build a serialized TensorRT engine and write it to disk.

    Args:
        onnx_file_path: Path of the ONNX model to parse.
        engine_file_path: Destination path for the serialized engine.
        dynamic_shape: When true, register one optimization profile for the
            ``input_ids`` and ``attention_mask`` inputs with min/opt/max shapes
            (1, 16) / (1, 128) / (1, 512).
        fp16: When true and the builder reports fast FP16 support, set the
            FP16 builder flag.

    Returns:
        The object returned by ``build_serialized_network``, or None when the
        ONNX file cannot be parsed or the build fails.
    """
    logger = trt.Logger(trt.Logger.WARNING)
    builder = trt.Builder(logger)
    explicit_batch = 1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
    network = builder.create_network(explicit_batch)
    parser = trt.OnnxParser(network, logger)

    with open(onnx_file_path, 'rb') as onnx_file:
        parsed_ok = parser.parse(onnx_file.read())
    if not parsed_ok:
        print('Lỗi: Không thể parse file ONNX')
        for index in range(parser.num_errors):
            print(parser.get_error(index))
        return None

    config = builder.create_builder_config()
    # 1 << 30 bytes = 1 GiB workspace memory pool.
    config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 1 << 30)
    config.set_flag(trt.BuilderFlag.GPU_FALLBACK)

    use_fp16 = fp16 and builder.platform_has_fast_fp16
    if use_fp16:
        config.set_flag(trt.BuilderFlag.FP16)
        print("FP16")

    if dynamic_shape:
        min_shape, opt_shape, max_shape = (1, 16), (1, 128), (1, 512)
        profile = builder.create_optimization_profile()
        for tensor_name in ("input_ids", "attention_mask"):
            profile.set_shape(tensor_name, min_shape, opt_shape, max_shape)
        config.add_optimization_profile(profile)
        print("Hỗ trợ Dynamic Shape!")

    serialized = builder.build_serialized_network(network, config)
    if serialized is None:
        print("Lỗi: Không thể build engine TensorRT.")
        return None

    with open(engine_file_path, 'wb') as engine_file:
        engine_file.write(serialized)

    print(f"Model converted successfully! Saved as {engine_file_path}")
    return serialized

engine_model = "model.engine"
build_engine(onnx_path, engine_model, dynamic_shape=True, fp16=True)


FP16
Hỗ trợ Dynamic Shape!


KeyboardInterrupt: 

In [None]:
def load_engine(engine_path):
    """Deserialize a TensorRT engine from a serialized engine file.

    Args:
        engine_path: Path of the serialized engine file to read.

    Returns:
        The engine returned by ``Runtime.deserialize_cuda_engine``.

    Raises:
        RuntimeError: If TensorRT cannot deserialize the file (the runtime
            returns None in that case, e.g. for a truncated or incompatible
            engine).
    """
    TRT_LOGGER = trt.Logger(trt.Logger.WARNING)
    with open(engine_path, "rb") as f, trt.Runtime(TRT_LOGGER) as runtime:
        engine = runtime.deserialize_cuda_engine(f.read())
    # Fail here rather than printing a success message and crashing later on
    # `None.create_execution_context()`.
    if engine is None:
        raise RuntimeError(f"Failed to deserialize TensorRT engine from {engine_path!r}")
    return engine

engine = load_engine("model.engine")
print("TensorRT Engine đã load thành công!")

In [None]:
def benchmark_inference(engine, input_shape=(1, 128), warmup_runs=3, timed_runs=10):
    """Run the engine on synthetic inputs and print the mean inference time.

    Every input tensor of the engine is given ``input_shape``. Buffers are
    created with the dtype the engine reports for each tensor, and the output
    buffers are sized from the shapes the execution context resolves once the
    input shapes are set.

    Args:
        engine: A deserialized TensorRT engine.
        input_shape: Shape applied to every input tensor; for a dynamic-shape
            engine it must lie inside the optimization profile.
        warmup_runs: Untimed executions performed before measuring, so that
            one-off initialisation cost is excluded.
        timed_runs: Number of executions averaged for the reported time
            (must be at least 1).

    Returns:
        None. Prints ``Inference time: <seconds> s`` (mean per execution).

    Raises:
        ValueError: If ``timed_runs`` is below 1 or an input shape is rejected.
        RuntimeError: If an output shape is unresolved or execution fails.
    """
    if timed_runs < 1:
        raise ValueError("timed_runs must be at least 1")

    context = engine.create_execution_context()

    tensor_names = [engine.get_tensor_name(i) for i in range(engine.num_io_tensors)]
    input_names = {
        name for name in tensor_names
        if engine.get_tensor_mode(name) == trt.TensorIOMode.INPUT
    }

    # Input shapes must be fixed on the context before output shapes can be queried.
    for name in tensor_names:
        if name in input_names and not context.set_input_shape(name, input_shape):
            raise ValueError(f"Shape {input_shape} rejected for input {name!r}")

    device_buffers = []
    try:
        # Buffers are appended in I/O tensor order, which is the order of `bindings`.
        for name in tensor_names:
            dtype = np.dtype(trt.nptype(engine.get_tensor_dtype(name)))
            shape = tuple(context.get_tensor_shape(name))
            if name in input_names:
                if name == "attention_mask":
                    # All-ones mask, in the dtype the engine expects
                    # (FLOAT16 for this model, not int32).
                    host_data = np.ones(shape, dtype=dtype)
                else:
                    host_data = np.random.randint(0, 100, size=shape).astype(dtype)
                buffer = cuda.mem_alloc(host_data.nbytes)
                device_buffers.append(buffer)
                cuda.memcpy_htod(buffer, host_data)
            else:
                if any(dim < 0 for dim in shape):
                    raise RuntimeError(f"Unresolved shape {shape} for output {name!r}")
                # Size the output from its real shape and dtype; it is not the
                # same size as the input.
                buffer = cuda.mem_alloc(trt.volume(shape) * dtype.itemsize)
                device_buffers.append(buffer)

        bindings = [int(buffer) for buffer in device_buffers]

        for _ in range(warmup_runs):
            if not context.execute_v2(bindings):
                raise RuntimeError("TensorRT execution failed during warm-up")

        # perf_counter is monotonic and higher resolution than time.time().
        start_time = time.perf_counter()
        for _ in range(timed_runs):
            if not context.execute_v2(bindings):
                raise RuntimeError("TensorRT execution failed")
        elapsed = time.perf_counter() - start_time
    finally:
        # Release device memory deterministically instead of waiting for GC.
        for buffer in device_buffers:
            buffer.free()

    print(f"Inference time: {elapsed / timed_runs:.6f} s")

benchmark_inference(engine)

In [None]:
from huggingface_hub import HfApi
from kaggle_secrets import UserSecretsClient

# Upload the built engine file to a Hugging Face dataset repository.
# Get the Hugging Face token from Kaggle Secrets
user_secrets = UserSecretsClient()
HF_TOKEN = user_secrets.get_secret("HF_UPLOAD_TOKEN")  # Requires a token with `Write` permission

# Your dataset repo
repo_id = "nhotin/segment-text"  # Replace with your own repo
file_path = "/kaggle/working/model.engine"  # The engine file that was built
upload_path = "model.engine"  # File name after upload to HF

# Initialize the API and upload the file to the Hugging Face dataset
api = HfApi()
api.upload_file(
    path_or_fileobj=file_path,
    path_in_repo=upload_path,
    repo_id=repo_id,
    repo_type="dataset",  # Required for dataset repos
    token=HF_TOKEN
)

print(f"Đã upload {file_path} lên Hugging Face Dataset: {repo_id}/{upload_path}")