# Setup Dependencies

In [None]:
# Expose only the GPU with index 1 to CUDA for this kernel session.
# NOTE(review): the device index is hard-coded — presumably specific to the
# author's multi-GPU machine; use 0 (or remove this cell) on a single-GPU host.
%env CUDA_VISIBLE_DEVICES=1

In [None]:
# Install diffusers for demo
!pip install diffusers
!pip install torch
!pip install pytorch-lightning

In [None]:
# Import modules
from IPython.display import display, Image
import PIL

import pytorch_lightning as L
from diffusers import (
    StableDiffusion3Pipeline,
    StableDiffusion3ControlNetPipeline,
)
from diffusers.models import SD3ControlNetModel
from diffusers.utils import load_image
import numpy as np
import torch
from tqdm import tqdm

# Fix the global random seed once at start-up (via Lightning's `seed_everything`)
# so that sampling in the cells below is repeatable from a fresh kernel.
L.seed_everything(2025)

# Load Stable Diffusion 3 and Generate Images from Text

In [None]:
# Text prompts to render; the pipeline is called with this whole list,
# so every entry yields its own image.
prompts = [
    "a photo of an astronaut riding a horse on mars",
]

In [None]:
# Load the Stable Diffusion 3 (medium) text-to-image pipeline with fp16 weights.
model_id = "stabilityai/stable-diffusion-3-medium-diffusers"
pipe = StableDiffusion3Pipeline.from_pretrained(
    model_id,
    torch_dtype=torch.float16,
)
# NOTE: This option is necessary for VRAM-efficient inference
pipe.enable_model_cpu_offload()

In [None]:
# Text-to-image generation (takes less than a minute on an RTX 3090).
# Re-seed right before sampling so this cell gives the same images on every run.
L.seed_everything(2025)

generation_output = pipe(
    prompts,
    negative_prompt="",
    num_inference_steps=28,
    guidance_scale=7.0,
)
images = generation_output.images

In [None]:
# Show every generated image inline (one per prompt).
for image in images:
    display(image)

# Playing with Classifier-Free Guidance Scale

In [None]:
# Sweep the classifier-free guidance (CFG) scale while keeping the prompt,
# the number of steps and the random seed fixed.
cfg_scales = [0.0, 7.0, 14.0, 21.0]
cfg_results = []
for cfg_scale in tqdm(cfg_scales, desc="CFG sweep"):
    # A fresh, identically seeded generator per scale: every run starts from the
    # same noise, so differences between the images come from the guidance scale
    # alone, and re-running this cell does not depend on leftover global RNG state.
    cfg_generator = torch.Generator(device="cpu").manual_seed(2025)
    images = pipe(
        prompts,
        negative_prompt="",
        num_inference_steps=28,
        guidance_scale=cfg_scale,
        generator=cfg_generator,
    ).images
    # One image per prompt is appended, in the order of `cfg_scales`.
    cfg_results.extend(images)

In [None]:
# Lay the CFG-sweep images out side by side (left to right in the order they
# were generated) and display the resulting strip as a single image.
cfg_results_ = np.hstack([np.array(image) for image in cfg_results])
display(PIL.Image.fromarray(cfg_results_))

# Depth-Guided Image Generation with ControlNet

In [None]:
# NOTE: https://huggingface.co/InstantX/SD3-Controlnet-Depth
model_id = "stabilityai/stable-diffusion-3-medium-diffusers"

# load pipeline
# Load both the ControlNet and the base pipeline in fp16, matching the
# text-to-image pipeline above; without `torch_dtype` the weights are loaded
# in the default (higher) precision and need considerably more memory.
controlnet = SD3ControlNetModel.from_pretrained(
    "InstantX/SD3-Controlnet-Depth",
    torch_dtype=torch.float16,
)
pipe_controlnet = StableDiffusion3ControlNetPipeline.from_pretrained(
    model_id,
    controlnet=controlnet,
    torch_dtype=torch.float16,
)
pipe_controlnet.enable_model_cpu_offload()

# config
control_image = load_image("https://huggingface.co/InstantX/SD3-Controlnet-Depth/resolve/main/images/depth.jpeg")
prompt = "a panda cub, captured in a close-up, in forest, is perched on a tree trunk. good composition, Photography, the cub's ears, a fluffy black, are tucked behind its head, adding a touch of whimsy to its appearance. a lush tapestry of green leaves in the background. depth of field, National Geographic"
n_prompt = "bad hands, blurry, NSFW, nude, naked, porn, ugly, bad quality, worst quality"
control_image = control_image.resize((512, 512))  # resize to reduce VRAM usage

# to reproduce result in our example
# (an explicit CPU generator makes the run independent of the global RNG state)
generator = torch.Generator(device="cpu").manual_seed(2025)
image = pipe_controlnet(
    prompt,
    negative_prompt=n_prompt,
    control_image=control_image,
    controlnet_conditioning_scale=0.5,
    guidance_scale=7.0,
    generator=generator,
).images[0]
# Bring the result to the same 512x512 size as the control image so the two
# can be concatenated horizontally below.
image = image.resize((512, 512))
image.save('image.jpg')

# show the results: depth map (left) next to the generated image (right)
summary = np.concatenate(
    [np.array(control_image), np.array(image)],
    axis=1
)
summary = PIL.Image.fromarray(summary)
display(summary)

In [None]:
# NOTE: https://huggingface.co/InstantX/SD3-Controlnet-Depth
# Ablation of the depth-guided run: identical prompt, seed and control image,
# but with the ControlNet conditioning scale set to 0.
#
# This cell reuses `pipe_controlnet`, `control_image`, `prompt` and `n_prompt`
# from the previous cell instead of loading the ControlNet and the pipeline a
# second time, so the previous cell must have been run first.

# to reproduce result in our example
# (same seed as the conditioned run, so only the conditioning scale differs)
generator = torch.Generator(device="cpu").manual_seed(2025)
image = pipe_controlnet(
    prompt,
    negative_prompt=n_prompt,
    control_image=control_image,
    controlnet_conditioning_scale=0.0,  # NOTE: Set to 0 to disable controlnet
    guidance_scale=7.0,
    generator=generator,
).images[0]
# Match the 512x512 control image so both can be concatenated horizontally.
image = image.resize((512, 512))
image.save('image_no_control.jpg')

# show the results: depth map (left) next to the unconditioned image (right)
summary = np.concatenate(
    [np.array(control_image), np.array(image)],
    axis=1
)
summary = PIL.Image.fromarray(summary)
display(summary)