Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 5 additions & 5 deletions demo/Diffusion/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ This demo application ("demoDiffusion") showcases the acceleration of Stable Dif
### Clone the TensorRT OSS repository

```bash
git clone git@github.com:NVIDIA/TensorRT.git -b release/10.13 --single-branch
git clone git@github.com:NVIDIA/TensorRT.git -b release/sd35 --single-branch
cd TensorRT
```

Expand Down Expand Up @@ -210,7 +210,7 @@ Run the command below to generate an image using Stable Diffusion 3 and Stable D
python3 demo_txt2img_sd3.py "A vibrant street wall covered in colorful graffiti, the centerpiece spells \"SD3 MEDIUM\", in a storm of colors" --version sd3 --hf-token=$HF_TOKEN

# Stable Diffusion 3.5-medium
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-medium --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN --bf16
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-medium --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN --bf16 --download-onnx-models

# Stable Diffusion 3.5-large
python3 demo_txt2img_sd35.py "a beautiful photograph of Mt. Fuji during cherry blossom" --version=3.5-large --denoising-steps=30 --guidance-scale 3.5 --hf-token=$HF_TOKEN --bf16 --download-onnx-models
Expand All @@ -234,13 +234,13 @@ Note that a denoising-percentage is applied to the number of denoising-steps when

```bash
# Depth
python3 demo_controlnet_sd35.py "a photo of a man" --controlnet-type depth --hf-token=$HF_TOKEN --denoising-steps 40 --guidance-scale 4.5 --bf16
python3 demo_controlnet_sd35.py "a photo of a man" --controlnet-type depth --hf-token=$HF_TOKEN --denoising-steps 40 --guidance-scale 4.5 --bf16 --download-onnx-models

# Canny
python3 demo_controlnet_sd35.py "A Night time photo taken by Leica M11, portrait of a Japanese woman in a kimono, looking at the camera, Cherry blossoms" --controlnet-type canny --hf-token=$HF_TOKEN --denoising-steps 60 --guidance-scale 3.5 --bf16
python3 demo_controlnet_sd35.py "A Night time photo taken by Leica M11, portrait of a Japanese woman in a kimono, looking at the camera, Cherry blossoms" --controlnet-type canny --hf-token=$HF_TOKEN --denoising-steps 60 --guidance-scale 3.5 --bf16 --download-onnx-models

# Blur
python3 demo_controlnet_sd35.py "generated ai art, a tiny, lost rubber ducky in an action shot close-up, surfing the humongous waves, inside the tube, in the style of Kelly Slater" --controlnet-type blur --hf-token=$HF_TOKEN --denoising-steps 60 --guidance-scale 3.5 --bf16
python3 demo_controlnet_sd35.py "generated ai art, a tiny, lost rubber ducky in an action shot close-up, surfing the humongous waves, inside the tube, in the style of Kelly Slater" --controlnet-type blur --hf-token=$HF_TOKEN --denoising-steps 60 --guidance-scale 3.5 --bf16 --download-onnx-models
```

### Generate a video guided by an initial image using Stable Video Diffusion
Expand Down
67 changes: 28 additions & 39 deletions demo/Diffusion/demo_controlnet_sd35.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ def parseArgs():
parser.add_argument(
"--max-sequence-length",
type=int,
default=77,
default=256,
help="Maximum sequence length to use with the prompt.",
)
parser.add_argument(
Expand All @@ -55,17 +55,15 @@ def parseArgs():
)
parser.add_argument(
"--controlnet-type",
nargs="+",
type=str,
default=["canny"],
help="Controlnet type, can be `None`, `str` or `str` list from ['canny', 'depth', 'blur']",
default="canny",
help="Controlnet type (single type only), can be 'canny', 'depth', 'blur', etc.",
)
parser.add_argument(
"--controlnet-scale",
nargs="+",
type=float,
default=[1.0],
help="The outputs of the controlnet are multiplied by `controlnet_scale` before they are added to the residual in the original unet, can be `None`, `float` or `float` list",
default=1.0,
help="The outputs of the controlnet are multiplied by `controlnet_scale` before they are added to the residual in the original Transformer",
)
return parser.parse_args()

Expand Down Expand Up @@ -99,48 +97,39 @@ def process_demo_args(args):
)

# Controlnet configuration
if not isinstance(args.controlnet_type, list):
raise ValueError(
f"`--controlnet-type` must be of type `str` or `str` list, but is {type(args.controlnet_type)}"
)
if not isinstance(args.controlnet_type, str):
raise ValueError(f"`--controlnet-type` must be of type `str`, but is {type(args.controlnet_type)}")

# Controlnet configuration
if not isinstance(args.controlnet_scale, list):
raise ValueError(
f"`--controlnet-scale`` must be of type `float` or `float` list, but is {type(args.controlnet_scale)}"
)

# Check number of ControlNets to ControlNet scales
if len(args.controlnet_type) != len(args.controlnet_scale):
raise ValueError(
f"Numbers of ControlNets {len(args.controlnet_type)} should be equal to number of ControlNet scales {len(args.controlnet_scale)}."
)
if not isinstance(args.controlnet_scale, float):
raise ValueError(f"`--controlnet-scale` must be of type `float`, but is {type(args.controlnet_scale)}")

# Convert controlnet scales to tensor
controlnet_scale = torch.FloatTensor(args.controlnet_scale)
controlnet_scale = torch.FloatTensor([args.controlnet_scale])

# Check images
input_images = []
if len(args.control_image) > 0:
for image in args.control_image:
input_images.append(Image.open(image))
else:
for controlnet in args.controlnet_type:
if controlnet == "canny":
canny_image = image_module.download_image("https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/canny.png")
input_images.append(canny_image.resize((args.height, args.width)))
elif controlnet == "depth":
depth_image = image_module.download_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"
)
input_images.append(depth_image.resize((args.height, args.width)))
elif controlnet == "blur":
blur_image = image_module.download_image(
"https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/blur.png"
)
input_images.append(blur_image.resize((args.height, args.width)))
else:
raise ValueError(f"You should implement the conditonal image of this controlnet: {controlnet}")
if args.controlnet_type == "canny":
canny_image = image_module.download_image(
"https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/canny.png"
)
input_images.append(canny_image.resize((args.height, args.width)))
elif args.controlnet_type == "depth":
depth_image = image_module.download_image(
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/marigold/marigold_einstein_lcm_depth.png"
)
input_images.append(depth_image.resize((args.height, args.width)))
elif args.controlnet_type == "blur":
blur_image = image_module.download_image(
"https://huggingface.co/datasets/diffusers/diffusers-images-docs/resolve/main/blur.png"
)
input_images.append(blur_image.resize((args.height, args.width)))
else:
raise ValueError(f"You should implement the conditonal image of this controlnet: {args.controlnet_type}")
assert len(input_images) > 0

kwargs_run_demo = {
Expand All @@ -149,7 +138,7 @@ def process_demo_args(args):
"height": args.height,
"width": args.width,
"control_image": input_images,
"controlnet_scales": controlnet_scale,
"controlnet_scale": controlnet_scale,
"batch_count": args.batch_count,
"num_warmup_runs": args.num_warmup_runs,
"use_cuda_graph": args.use_cuda_graph,
Expand Down
4 changes: 2 additions & 2 deletions demo/Diffusion/demo_diffusion/dd_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -310,7 +310,7 @@ def process_pipeline_args(args: argparse.Namespace) -> Tuple[Dict[str, Any], Dic
# int8 support
if args.int8 and not any(args.version.startswith(prefix) for prefix in ("xl", "1.4", "1.5", "2.1")):
raise ValueError("int8 quantization is only supported for SDXL, SD1.4, SD1.5 and SD2.1 pipelines.")

# fp8 support validation
if args.fp8:
# Check version compatibility
Expand Down Expand Up @@ -339,7 +339,7 @@ def process_pipeline_args(args: argparse.Namespace) -> Tuple[Dict[str, Any], Dic
raise ValueError(
"Native FP8 quantization is not supported for SD3.5-large. Please pass --download-onnx-models."
)

# TensorRT ModelOpt quantization level
if args.quantization_level == 0.0:
def override_quant_level(level: float, dtype_str: str):
Expand Down
4 changes: 2 additions & 2 deletions demo/Diffusion/demo_diffusion/model/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@
FluxTransformerModel,
SD3_MMDiTModel,
SD3TransformerModel,
SD3TransformerModelControlNet,
)
from demo_diffusion.model.controlnet import SD3ControlNet
from demo_diffusion.model.gan import VQGANModel
from demo_diffusion.model.load import unload_torch_model
from demo_diffusion.model.lora import FLUXLoraLoader, SDLoraLoader, merge_loras
Expand Down Expand Up @@ -71,7 +71,7 @@
"SD3_MMDiTModel",
"FluxTransformerModel",
"SD3TransformerModel",
"SD3TransformerModelControlNet",
"SD3ControlNet",
# gan
"VQGANModel",
# lora
Expand Down
2 changes: 2 additions & 0 deletions demo/Diffusion/demo_diffusion/model/base_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ def __init__(
bf16=False,
int8=False,
fp8=False,
fp4=False,
max_batch_size=16,
text_maxlen=77,
embedding_dim=768,
Expand All @@ -63,6 +64,7 @@ def __init__(
self.bf16 = bf16
self.int8 = int8
self.fp8 = fp8
self.fp4 = fp4

self.compression_factor = compression_factor
self.min_batch = 1
Expand Down
Loading