3 changes: 2 additions & 1 deletion setup.py
@@ -283,7 +283,8 @@ def extract_from_precompiled(precompiled_location: str, package_data: List[str],
'trtllm-refit=tensorrt_llm.commands.refit:main',
'trtllm-bench=tensorrt_llm.commands.bench:main',
'trtllm-serve=tensorrt_llm.commands.serve:main',
'trtllm-eval=tensorrt_llm.commands.eval:main'
'trtllm-eval=tensorrt_llm.commands.eval:main',
'trtllm-configure=tensorrt_llm.commands.configure:main'
],
},
scripts=['tensorrt_llm/llmapi/trtllm-llmapi-launch'],
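This change registers a new trtllm-configure console script alongside the existing commands. As a rough sketch (assuming standard setuptools console-script behavior, not part of the diff), the generated launcher is equivalent to:

from tensorrt_llm.commands.configure import main

# setuptools generates a small wrapper script that simply calls the registered entry point.
if __name__ == "__main__":
    raise SystemExit(main())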
9 changes: 9 additions & 0 deletions tensorrt_llm/commands/configure.py
@@ -0,0 +1,9 @@
from tensorrt_llm.configure.cli import TRTLLMConfigure


def main():
TRTLLMConfigure().run()


if __name__ == "__main__":
main()
Empty file.
125 changes: 125 additions & 0 deletions tensorrt_llm/configure/cli.py
@@ -0,0 +1,125 @@
from enum import StrEnum
from pathlib import Path
from typing import Optional

import yaml
from pydantic import AliasChoices, Field, model_validator
from pydantic_settings import BaseSettings, CliSubCommand, SettingsConfigDict, get_subcommand

from tensorrt_llm.configure.constraints import BaseConstraints, BenchmarkConstraints
from tensorrt_llm.configure.profile import PROFILE_REGISTRY


def generate_subcommand_description(constraints_cls: type[BaseConstraints]) -> str:
"""Generate a description of the subcommand for the given constraints class."""
profiles = PROFILE_REGISTRY[constraints_cls]
description = constraints_cls._get_cli_description() + "\n\n"

description += (
"The --profile flag can be used to specify which profile to use. A profile defines the strategy used to "
"generate the optimized config. The available profiles are:\n\n"
)

for profile in profiles:
metadata = profile._get_metadata()
description += f"- {metadata.cli_name}: {metadata.description}\n"

return description


def create_subcommand(constraints_cls: type[BaseConstraints]) -> CliSubCommand:
"""Create a Pydantic CLI subcommand for the given constraints class."""
profiles = PROFILE_REGISTRY[constraints_cls]
default_profile = next(profile for profile in profiles if profile._get_metadata().is_default)
ProfileEnum = StrEnum(
"ProfileEnum",
[profile._get_metadata().cli_name for profile in profiles],
)

class SubCommand(constraints_cls):
# The docstring is shown as the help message for the subcommand
__doc__ = generate_subcommand_description(constraints_cls)

# Common options for all subcommands
output: Optional[Path] = Field(
default=None,
description="YAML file path where the optimized config will be written.",
validation_alias=AliasChoices("output", "o"),
)

profile: ProfileEnum = Field(
default=default_profile._get_metadata().cli_name,
description="Name of the profile to use, which defines the strategy used to generate the optimized config. "
"See above for a description of the available profiles.",
)

@model_validator(mode="after")
def validate_output(self) -> "SubCommand":
"""Verify that output file is a valid YAML file path and does not already exist."""
if self.output is not None:
if self.output.suffix != ".yaml":
raise ValueError(f"Output file must be a YAML file. Got '{self.output}'.")
if self.output.exists():
print(f"Output file '{self.output}' already exists, will overwrite it.")
return self

def run(self) -> None:
# Dispatch to the appropriate profile
profiles = PROFILE_REGISTRY[constraints_cls]
profile_cls = next(
profile for profile in profiles if profile._get_metadata().cli_name == self.profile
)
config = profile_cls().get_config(self)
print(f"Found optimized config: \n\n{yaml.safe_dump(config)}")

if self.output is None:
print(
"No output file specified. To write the optimized config to a file, use the --output / -o flag."
)
else:
with open(self.output, "w") as f:
f.write(yaml.safe_dump(config))
print(f"Optimized config written to {self.output}")
print("To serve the model with optimized settings, run the following command:\n")
print(f"trtllm-serve {self.model} --config {self.output}")

return CliSubCommand[SubCommand]


BenchmarkSubCommand = create_subcommand(BenchmarkConstraints)
# TODO: add support for throughput/latency subcommand
# ThroughputLatencySubCommand = create_subcommand(ThroughputLatencyConstraints)


class TRTLLMConfigure(BaseSettings):
# The docstring below is used to generate the CLI help message
"""The trtllm-configure CLI tool allows you to optimize the configuration of TensorRT LLM for your specific
inference scenario.
""" # noqa: D205

model_config = SettingsConfigDict(
cli_parse_args=True,
cli_prog_name="trtllm-configure",
cli_enforce_required=True, # Make required fields enforced at CLI level
cli_implicit_flags=True, # Boolean fields will be exposed as e.g. --flag and --no-flag
cli_avoid_json=True, # Do not expose JSON string options for nested models
)

benchmark: BenchmarkSubCommand = Field(description=BenchmarkConstraints._get_cli_description())
# TODO: add support for throughput/latency SLA subcommand
# throughput_latency: CliSubCommand[ThroughputLatencySubCommand] = Field(
# description=ThroughputLatencySubCommand.__doc__
# )

def run(self) -> None:
"""Main entrypoint for the trtllm-configure CLI tool."""
subcommand = get_subcommand(self)
subcommand.run()


def main():
TRTLLMConfigure().run()


if __name__ == "__main__":
main()
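For orientation, here is a minimal sketch of how the generated benchmark subcommand is exercised, mirroring the unit test added below. The model ID, flag values, and output file name are illustrative, and the exact shell flag spelling is determined by pydantic-settings:

# Approximate shell equivalent (sketch):
#   trtllm-configure benchmark --model meta-llama/Llama-3.1-8B --gpu H200_SXM \
#       --num_gpus 1 --isl 1000 --osl 2000 --concurrency 64 --tp_size 1 \
#       --profile inferencemax -o optimized.yaml
from tensorrt_llm.configure.cli import BenchmarkSubCommand, TRTLLMConfigure

cmd = BenchmarkSubCommand(
    model="meta-llama/Llama-3.1-8B",  # illustrative HuggingFace model ID
    gpu="H200_SXM",
    num_gpus=1,
    isl=1000,
    osl=2000,
    concurrency=64,
    tp_size=1,
    profile="inferencemax",
)
# TRTLLMConfigure resolves the selected subcommand via get_subcommand() and calls its run().
TRTLLMConfigure(benchmark=cmd).run()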
66 changes: 66 additions & 0 deletions tensorrt_llm/configure/constraints.py
@@ -0,0 +1,66 @@
from abc import ABC, abstractmethod
from enum import StrEnum
from typing import Optional

from pydantic import BaseModel, Field, NonNegativeInt, PositiveFloat, PositiveInt, model_validator


class GPU(StrEnum):
GB200 = "GB200"
H200_SXM = "H200_SXM"


class BaseConstraints(BaseModel, ABC):
"""Base class for all constraints containing common fields.

A set of constraints fully defines the requirements for a specific inference workload
and the goals for optimization (e.g. SLA targets).
"""

model: str = Field(description="HuggingFace ID of the model being deployed")
gpu: GPU = Field(description="GPU SKU used in the deployment")
num_gpus: PositiveInt = Field(description="Number of GPUs available in the deployment")

@classmethod
@abstractmethod
def _get_cli_description(cls) -> str:
"""Get a description of the constraints which will be shown in CLI help messages."""


class BenchmarkConstraints(BaseConstraints):
isl: NonNegativeInt = Field(description="Target input sequence length")
osl: NonNegativeInt = Field(description="Target output sequence length")
concurrency: PositiveInt = Field(description="Target number of concurrent requests")
# TODO: make this optional and add logic to choose best parallelization mapping automatically
tp_size: PositiveInt = Field(description="Specific tensor parallel size that should be used")

@classmethod
def _get_cli_description(cls) -> str:
return "Optimize TensorRT LLM for a benchmark workload with a specific number of concurrent requests."


class ThroughputLatencyConstraints(BaseConstraints):
tps_per_gpu: Optional[PositiveFloat] = Field(
default=None,
description="Target minimum throughput per GPU in tokens per second",
)
tps_per_user: Optional[PositiveFloat] = Field(
default=None, description="Target minimum throughput per user in tokens per second."
)
ttft: Optional[PositiveFloat] = Field(
default=None,
description="Target maximum time to first token in seconds.",
)

@classmethod
def _get_cli_description(cls) -> str:
return "Optimize TensorRT LLM to meet a throughput and/or latency SLA."

@model_validator(mode="after")
def validate_has_at_least_one_constraint(self) -> "ThroughputLatencyConstraints":
if not any([self.tps_per_gpu, self.tps_per_user, self.ttft]):
raise ValueError(
"At least one of target throughput per GPU, target throughput per user, or target time to first token "
"must be specified."
)
return self
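As a small illustration of the SLA validator (the model ID and numbers are made up), constructing ThroughputLatencyConstraints with no targets is rejected, while a single target is enough:

from pydantic import ValidationError

from tensorrt_llm.configure.constraints import GPU, ThroughputLatencyConstraints

# One SLA target satisfies validate_has_at_least_one_constraint.
ok = ThroughputLatencyConstraints(
    model="meta-llama/Llama-3.1-8B",  # illustrative model ID
    gpu=GPU.H200_SXM,
    num_gpus=8,
    tps_per_user=50.0,
)

try:
    ThroughputLatencyConstraints(model="meta-llama/Llama-3.1-8B", gpu=GPU.H200_SXM, num_gpus=8)
except ValidationError as e:
    print(e)  # raised because no throughput or latency target was given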
82 changes: 82 additions & 0 deletions tensorrt_llm/configure/profile.py
@@ -0,0 +1,82 @@
from abc import ABC, abstractmethod
from collections import defaultdict
from typing import Any

from pydantic import BaseModel, Field

from tensorrt_llm.configure.constraints import BaseConstraints, BenchmarkConstraints


class ProfileMetadata(BaseModel):
"""Metadata about a profile."""

constraints_cls: type[BaseConstraints] = Field(
description="The constraints class that this profile is compatible with."
)
cli_name: str = Field(description="The name of the profile as it will be exposed in the CLI.")
description: str = Field(
description="A description of the profile which will be shown in the CLI help message."
)
is_default: bool = Field(
default=False,
description="Whether this profile is the default profile for the constraints class. There can only be one "
"default profile per constraints class.",
)


class BaseProfile(ABC):
"""Base class for all profiles.

A profile defines a particular strategy used to find an optimized config for a given set of constraints
(e.g. database lookup, heuristics, etc.)

Each profile is compatible with a specific type of constraints.
"""

@classmethod
@abstractmethod
def _get_metadata(cls) -> ProfileMetadata:
"""Get the metadata associated with this profile."""

@abstractmethod
def get_config(self, constraints: BaseConstraints) -> dict[str, Any]:
"""Retrieve or generate the optimal config for the given constraints."""

def __init_subclass__(cls, **kwargs):
super().__init_subclass__(**kwargs)

# Validate that there is only one default profile per constraints class
metadata = cls._get_metadata()
if metadata.is_default:
for other_profile in PROFILE_REGISTRY[metadata.constraints_cls]:
other_metadata = other_profile._get_metadata()
if other_metadata.is_default:
raise ValueError(
f"Multiple default profiles found for constraints class {metadata.constraints_cls.__name__}: "
f"{other_profile.__name__} and {cls.__name__}"
)

# Add this class to the profile registry
PROFILE_REGISTRY[metadata.constraints_cls].append(cls)


# Maps constraints classes to the list of profiles compatible with those constraints
PROFILE_REGISTRY: defaultdict[type[BaseConstraints], list[type[BaseProfile]]] = defaultdict(list)


class InferenceMaxProfile(BaseProfile):
@classmethod
def _get_metadata(cls) -> ProfileMetadata:
return ProfileMetadata(
constraints_cls=BenchmarkConstraints,
cli_name="inferencemax",
description=(
"Retrieve optimized settings from a database of configs used for SemiAnalysis InferenceMax "
"benchmarks."
),
is_default=True,
)

def get_config(self, constraints: BenchmarkConstraints) -> dict[str, Any]:
# TODO: add logic to retrieve optimal config from database
return {}
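To illustrate the registration mechanism, a hypothetical profile (not part of this PR) only needs to subclass BaseProfile; __init_subclass__ adds it to PROFILE_REGISTRY and enforces the one-default-per-constraints-class rule:

from typing import Any

from tensorrt_llm.configure.constraints import BenchmarkConstraints
from tensorrt_llm.configure.profile import PROFILE_REGISTRY, BaseProfile, ProfileMetadata


class HeuristicProfile(BaseProfile):
    """Hypothetical profile that derives a config from simple rules of thumb."""

    @classmethod
    def _get_metadata(cls) -> ProfileMetadata:
        return ProfileMetadata(
            constraints_cls=BenchmarkConstraints,
            cli_name="heuristic",
            description="Illustrative profile that derives a config from simple heuristics.",
            # is_default stays False: InferenceMaxProfile already claims the default slot.
        )

    def get_config(self, constraints: BenchmarkConstraints) -> dict[str, Any]:
        # Toy heuristic: cap the batch size at the requested concurrency.
        return {"max_batch_size": min(constraints.concurrency, 256)}


assert HeuristicProfile in PROFILE_REGISTRY[BenchmarkConstraints]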
43 changes: 43 additions & 0 deletions tests/unittest/configure/test_configure.py
@@ -0,0 +1,43 @@
from pathlib import Path
from unittest.mock import patch

import yaml

from tensorrt_llm.configure.cli import BenchmarkSubCommand, TRTLLMConfigure
from tensorrt_llm.configure.profile import InferenceMaxProfile


def test_trtllm_configure_subcommand_basic(tmp_path: Path):
output_path = tmp_path / "test_config.yaml"

mock_config = {
"max_batch_size": 1,
"kv_cache_free_gpu_memory_fraction": 0.9,
}

cmd = BenchmarkSubCommand(
model="meta-llama/Llama-3.1-8B",
gpu="H200_SXM",
num_gpus=1,
isl=1000,
osl=2000,
concurrency=64,
tp_size=1,
output=output_path,
profile="inferencemax",
)

trtllm_configure = TRTLLMConfigure(benchmark=cmd)

# Mock get_config to return our mock config
with patch.object(InferenceMaxProfile, "get_config", return_value=mock_config):
trtllm_configure.run()

assert output_path.exists()
with open(output_path, "r") as f:
loaded_config = yaml.safe_load(f)

assert "max_batch_size" in loaded_config
assert loaded_config["max_batch_size"] == 1
assert "kv_cache_free_gpu_memory_fraction" in loaded_config
assert loaded_config["kv_cache_free_gpu_memory_fraction"] == 0.9