In [None]:
import torch
from diffusers import PixArtAlphaPipeline, DiffusionPipeline, StableVideoDiffusionPipeline, StableDiffusion3Pipeline
from diffusers.utils import load_image, export_to_video, make_image_grid
from cache_diffusion import cachify
from cache_diffusion.utils import SVD_DEFAULT_CONFIG, SDXL_DEFAULT_CONFIG, PIXART_DEFAULT_CONFIG, SD3_DEFAULT_CONFIG

# SDXL

Let's load the model.

In [None]:
# Load the fp16 SDXL base checkpoint (safetensors) and move the pipeline to the GPU.
pipe = DiffusionPipeline.from_pretrained(
    "stabilityai/stable-diffusion-xl-base-1.0",
    variant="fp16",
    torch_dtype=torch.float16,
    use_safetensors=True,
).to("cuda")

In [None]:
# Text prompt shared by the baseline and cached SDXL runs below.
prompt = (
    "beautiful lady, (freckles), big smile, blue eyes, short hair, dark makeup, "
    "hyperdetailed photography, soft light, head and shoulders portrait, cover"
)
# Step count for the full-length SDXL runs (baseline and cached).
num_inference_steps = 20

Our pipeline requires just a single API call to perform caching.

Let's disable the caching and run the baseline model

In [None]:
# Register the SDXL default cache configuration on the pipeline ...
cachify.prepare(pipe, SDXL_DEFAULT_CONFIG)
# ... then switch caching off so the following cells run the uncached baseline.
cachify.disable(pipe)

In [None]:
# Same seed as the other SDXL runs in this section, so the images are comparable.
generator = torch.Generator(device="cuda").manual_seed(2946901)
# Baseline: caching disabled, full step count.
baseline_img_20_steps = pipe(
    prompt=prompt,
    generator=generator,
    num_inference_steps=num_inference_steps,
).images[0]

We can also reduce the number of steps to achieve latency similar to that of cache diffusion. However, you will notice that the image quality is not as good.

In [None]:
# Re-create the generator so this run starts from the same seed as the 20-step baseline.
generator = torch.Generator(device="cuda").manual_seed(2946901)
# Baseline with caching still disabled, but only 11 denoising steps.
baseline_img_11_steps = pipe(
    prompt=prompt,
    generator=generator,
    num_inference_steps=11,
).images[0]

Let's enable the caching

In [None]:
# Turn caching back on for the pipeline prepared earlier (it was disabled for the baseline runs).
cachify.enable(pipe)

In [None]:
# Same seed as both baselines, so the cached image can be compared with them directly.
generator = torch.Generator(device="cuda").manual_seed(2946901)

# Run the full step count through the pipeline object yielded by cachify.infer.
with cachify.infer(pipe) as cached_pipe:
    cache_img = cached_pipe(
        prompt=prompt,
        generator=generator,
        num_inference_steps=num_inference_steps,
    ).images[0]

In [None]:
# One row, left to right: 20-step baseline | cached run | 11-step baseline.
make_image_grid(
    [baseline_img_20_steps, cache_img, baseline_img_11_steps],
    rows=1,
    cols=3,
)

# PixArt-Alpha

In [None]:
# Settings for the PixArt-Alpha run.
prompt = "a small cactus with a happy face in the Sahara desert"
num_inference_steps = 30

# Load PixArt-XL in fp16 and move the pipeline to the GPU.
pipe = PixArtAlphaPipeline.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS",
    torch_dtype=torch.float16,
).to("cuda")

In [None]:
# Register the PixArt default cache configuration on the pipeline.
# NOTE(review): unlike the SD3 section, no explicit cachify.enable follows — presumably caching is on after prepare; confirm.
cachify.prepare(pipe, PIXART_DEFAULT_CONFIG)

In [None]:
# Seeded CUDA generator for a reproducible sample.
generator = torch.Generator(device="cuda").manual_seed(2946901)

# Generate one image through the pipeline object yielded by cachify.infer.
with cachify.infer(pipe) as cached_pipe:
    img = cached_pipe(
        prompt=prompt,
        num_inference_steps=num_inference_steps,
        generator=generator,
    ).images[0]

In [None]:
# Display the PixArt image produced in the previous cell.
img

# SVD

In [None]:
# Load the fp16 variant of the SVD image-to-video (xt) checkpoint.
pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    variant="fp16",
    torch_dtype=torch.float16,
)
# This section uses model CPU offload instead of pipe.to("cuda").
pipe.enable_model_cpu_offload()

In [None]:
# Fetch the conditioning image and resize it to 1024x576 in a single expression.
image = load_image(
    "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/svd/rocket.png"
).resize((1024, 576))

In [None]:
# Step count for the SVD run.
num_inference_steps = 25
# torch.manual_seed seeds the default generator and returns it; keep the handle for the pipeline call.
generator = torch.manual_seed(42)

In [None]:
# Register the SVD default cache configuration on the video pipeline.
cachify.prepare(pipe, SVD_DEFAULT_CONFIG)

In [None]:
# Generate the video frames through the pipeline object yielded by cachify.infer.
# num_inference_steps is passed explicitly so the value set in the configuration cell
# actually takes effect; previously it was defined but never used, and the pipeline
# silently fell back to its own default.
with cachify.infer(pipe) as cached_pipe:
    frames = cached_pipe(
        image,
        decode_chunk_size=8,
        num_inference_steps=num_inference_steps,
        generator=generator,
    ).frames[0]

# Write the frames of the first (only) generated video to disk at 7 fps.
export_to_video(frames, "generated.mp4", fps=7)

# SD3-Medium

In [None]:
# Step count for the SD3 run.
num_inference_steps = 28

# Load SD3-Medium in fp16 and move the pipeline to the GPU.
pipe = StableDiffusion3Pipeline.from_pretrained(
    "stabilityai/stable-diffusion-3-medium-diffusers",
    torch_dtype=torch.float16,
).to("cuda")

In [None]:
# Register the SD3 default cache configuration on the pipeline ...
cachify.prepare(pipe, SD3_DEFAULT_CONFIG)
# ... and explicitly enable caching before inference.
cachify.enable(pipe)

In [None]:
# Seeded CUDA generator for a reproducible sample.
generator = torch.Generator(device="cuda").manual_seed(2946901)

# Call the object yielded by cachify.infer (cached_pipe), as the other sections do;
# the original cell called `pipe` directly and left `cached_pipe` unused.
# The step count comes from the variable set when the pipeline was loaded rather
# than a duplicated literal.
with cachify.infer(pipe) as cached_pipe:
    cached_img = cached_pipe(
        "A cat holding a sign that says hello world",
        negative_prompt="",
        num_inference_steps=num_inference_steps,
        guidance_scale=7.0,
        generator=generator,
    ).images[0]

# Display the generated image as the cell output.
cached_img