In [None]:
# Create the host directories for model checkpoints and for input/output images.
sudo mkdir -p /data/models/stable-diffusion /data/images/stable-diffusion

In [None]:
# Download the Stable Diffusion v1.5 checkpoint (only needed the first time).
# The download is skipped when a non-empty checkpoint already exists, and it is
# written to a temporary ".part" file that is renamed only after wget succeeds,
# so an interrupted download never leaves a truncated file at the final path.
CKPT=/data/models/stable-diffusion/sd-v1-5.ckpt
if [ ! -s "$CKPT" ]; then
  sudo wget \
    https://huggingface.co/runwayml/stable-diffusion-v1-5/resolve/main/v1-5-pruned-emaonly.ckpt \
    -O "$CKPT.part" \
    && sudo mv "$CKPT.part" "$CKPT"
fi

In [None]:
# Fetch the container image, then start an interactive, auto-removed container
# on the host network with the host model and image directories mounted at the
# same paths inside the container.
sudo docker pull dustynv/stable-diffusion:r35.4.1
sudo docker run --runtime=nvidia --interactive --tty --rm --network host \
  --volume /data/models/stable-diffusion:/data/models/stable-diffusion \
  --volume /data/images/stable-diffusion:/data/images/stable-diffusion \
  dustynv/stable-diffusion:r35.4.1

In [None]:
# Change to the Stable Diffusion directory; the later cells use paths relative to it
# (presumably this runs inside the container started above - confirm).
cd /opt/stable-diffusion

In [None]:
# Run img2img on the sample input image with a random seed in [1, 2147483646].
python3 scripts/img2img.py \
  --ckpt /data/models/stable-diffusion/sd-v1-5.ckpt \
  --init-img "/data/images/stable-diffusion/samples/input.png" \
  --outdir "/data/images/stable-diffusion/test" \
  --prompt "high quality, sharp focus, crisp street details, natural lighting, balanced contrast, accurate colors, clean edges, realistic textures, photorealistic" \
  --strength 0.3 \
  --n_samples 1 \
  --n_iter 1 \
  --ddim_steps 50 \
  --seed $(python3 -c 'import secrets; print(1 + secrets.randbelow(2147483646))')

In [None]:
# Optional: to record the latency of each inference stage, open scripts/img2img.py
# in an editor and use the instrumented script from the following cell.
nano scripts/img2img.py

In [None]:
#!/usr/bin/env python3
"""img2img with 4-stage latency logging (model_load / encode / denoise / decode)."""

import argparse, os, time, csv
import PIL
import torch
import numpy as np
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm, trange
from itertools import islice
from einops import rearrange, repeat
from torchvision.utils import make_grid
from torch import autocast
from contextlib import nullcontext
from pytorch_lightning import seed_everything

from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from ldm.models.diffusion.plms import PLMSSampler


def chunk(it, size):
    """Return an iterator over consecutive tuples of up to ``size`` items.

    The final tuple may be shorter than ``size``; iteration ends as soon as an
    empty tuple would be produced.
    """
    source = iter(it)

    def take_group():
        # Pull the next ``size`` items (fewer at the end) from the shared iterator.
        return tuple(islice(source, size))

    # Two-argument iter(): call take_group repeatedly until it returns ().
    return iter(take_group, ())

def load_model_from_config(config, ckpt, verbose=False):
    """Instantiate the model described by ``config`` and load weights from ``ckpt``.

    Args:
        config: Configuration object whose ``model`` attribute is passed to
            ``instantiate_from_config``.
        ckpt: Path to a checkpoint file that contains a ``"state_dict"`` entry.
        verbose: When true, print the missing and unexpected keys reported by
            ``load_state_dict``.

    Returns:
        The instantiated model in eval mode. It is moved to CUDA when a CUDA
        device is available and left on the CPU otherwise.

    Raises:
        KeyError: If the checkpoint has no ``"state_dict"`` entry.
    """
    print(f"Loading model from {ckpt}")
    # NOTE(review): torch.load unpickles the file, which can execute arbitrary
    # code; only load checkpoints obtained from a trusted source.
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    # strict=False: key mismatches are reported back instead of raising.
    missing, unexpected = model.load_state_dict(sd, strict=False)
    if verbose and len(missing) > 0:
        print("missing keys:", missing)
    if verbose and len(unexpected) > 0:
        print("unexpected keys:", unexpected)
    # Only move to the GPU when one exists, so CPU-only hosts do not crash here.
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    return model

def load_img(path):
    """Load an image file as a float tensor of shape ``(1, 3, H, W)`` in ``[-1, 1]``.

    The image is converted to RGB and each side is rounded down to a multiple
    of 32 before resizing with Lanczos resampling.

    Args:
        path: Path of the image file to load.

    Returns:
        A ``torch.Tensor`` of dtype float32 with a leading batch axis of size 1,
        channels first, and values scaled from ``[0, 255]`` to ``[-1, 1]``.

    Raises:
        ValueError: If either side of the image is smaller than 32 pixels, since
            it would round down to zero.
    """
    # Use a context manager so the underlying file handle is closed promptly;
    # convert() returns an independent, fully loaded copy.
    with Image.open(path) as source:
        image = source.convert("RGB")
    w, h = image.size
    print(f"loaded input image of size ({w}, {h}) from {path}")
    # Round each side down to an integer multiple of 32.
    w, h = (side - side % 32 for side in (w, h))
    if w == 0 or h == 0:
        raise ValueError(
            f"image {path} is too small: both sides must be at least 32 pixels"
        )
    image = image.resize((w, h), resample=PIL.Image.LANCZOS)
    arr = np.array(image).astype(np.float32) / 255.0
    # Add a batch axis and reorder (1, H, W, C) -> (1, C, H, W).
    arr = arr[None].transpose(0, 3, 1, 2)
    tensor = torch.from_numpy(arr)
    # Map [0, 1] to [-1, 1].
    return 2. * tensor - 1.

class StageTimer:
    """Accumulate wall-clock time per named stage.

    Exactly one stage is active at a time. ``start`` begins timing a stage,
    ``switch`` credits the elapsed time to the active stage and begins the next
    one, and ``stop`` credits the elapsed time and leaves no stage active.
    Totals are kept in ``metrics`` (seconds, keyed by stage name) and add up
    when a stage is entered more than once.
    """

    def __init__(self):
        # Accumulated seconds per stage; the four pipeline stages start at zero.
        self.metrics = {k: 0.0 for k in ("model_load", "encode", "denoise", "decode")}
        # Name of the stage currently being timed, or None when idle.
        self.cur = None
        # Timestamp at which the current stage started, or None when idle.
        self.t0 = None

    def _now(self):
        """Return the current reading of the monotonic performance counter."""
        return time.perf_counter()

    def _accumulate(self, now):
        """Credit the time elapsed since ``t0`` to the active stage, if any."""
        if self.cur is not None:
            # .get() lets stage names outside the preset four be timed as well.
            self.metrics[self.cur] = self.metrics.get(self.cur, 0.0) + (now - self.t0)

    def start(self, stage):
        """Begin timing ``stage``; time of any stage already active is discarded."""
        self.cur = stage
        self.t0 = self._now()

    def switch(self, next_stage):
        """Close the active stage and immediately begin timing ``next_stage``."""
        now = self._now()
        self._accumulate(now)
        self.cur = next_stage
        self.t0 = now

    def stop(self):
        """Close the active stage; calling ``stop`` again is a no-op."""
        self._accumulate(self._now())
        # Clear the active stage so a repeated stop() cannot count time twice.
        self.cur = None
        self.t0 = None


def main():
    """Run img2img from the command line and record per-stage latencies.

    Stages timed (wall-clock seconds, with a CUDA synchronize before each
    stage boundary when CUDA is available):

    * ``model_load`` - loading the config and checkpoint and moving the model
      to the device.
    * ``encode`` - loading the init image, encoding it to a latent, building
      the DDIM schedule and preparing the prompt batches.
    * ``denoise`` - prompt conditioning, ``stochastic_encode`` and the sampler's
      ``decode`` call for every batch.
    * ``decode`` - ``decode_first_stage``, saving the individual samples and
      building/saving the grid image.

    The latencies are written to ``<outdir>/<n>.csv`` where ``n`` is one more
    than the highest existing numeric CSV name in ``outdir`` (0 if none).

    Raises:
        NotImplementedError: If ``--plms`` is requested.
        SystemExit: Via ``parser.error`` when ``--init-img`` is missing or does
            not point to an existing file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--prompt",       type=str, nargs="?", default="a painting of a virus monster playing guitar")
    parser.add_argument("--init-img",     type=str, nargs="?", help="path to the input image")
    parser.add_argument("--outdir",       type=str, nargs="?", default="outputs/img2img-samples")
    parser.add_argument("--skip_grid",    action='store_true')
    parser.add_argument("--skip_save",    action='store_true')
    parser.add_argument("--ddim_steps",   type=int, default=50)
    parser.add_argument("--plms",         action='store_true')
    parser.add_argument("--fixed_code",   action='store_true')
    parser.add_argument("--ddim_eta",     type=float, default=0.0)
    parser.add_argument("--n_iter",       type=int, default=1)
    parser.add_argument("--C",            type=int, default=4)
    parser.add_argument("--f",            type=int, default=8)
    parser.add_argument("--n_samples",    type=int, default=2)
    parser.add_argument("--n_rows",       type=int, default=0)
    parser.add_argument("--scale",        type=float, default=5.0)
    parser.add_argument("--strength",     type=float, default=0.75)
    parser.add_argument("--from-file",    type=str)
    parser.add_argument("--config",       type=str, default="configs/stable-diffusion/v1-inference.yaml")
    parser.add_argument("--ckpt",         type=str, default="models/ldm/stable-diffusion-v1/model.ckpt")
    parser.add_argument("--seed",         type=int, default=42)
    parser.add_argument("--precision",    type=str, choices=["full", "autocast"], default="autocast")
    opt = parser.parse_args()

    # Reject unusable arguments up front, before the expensive model load.
    if opt.plms:
        raise NotImplementedError("PLMS sampler not (yet) supported")
    if opt.init_img is None:
        parser.error("--init-img is required")
    if not os.path.isfile(opt.init_img):
        parser.error(f"{opt.init_img} not found")

    # Number of DDIM steps used for both noising the latent and denoising it.
    t_enc = int(opt.strength * opt.ddim_steps)

    seed_everything(opt.seed)
    os.makedirs(opt.outdir, exist_ok=True)
    sample_path = os.path.join(opt.outdir, "samples")
    os.makedirs(sample_path, exist_ok=True)

    precision_scope = autocast if opt.precision == "autocast" else nullcontext
    cuda_available = torch.cuda.is_available()

    def sync():
        # Wait for queued GPU work so stage boundaries reflect finished work.
        if cuda_available:
            torch.cuda.synchronize()

    timer = StageTimer()

    # ---- Stage: model_load -------------------------------------------------
    timer.start("model_load")
    config = OmegaConf.load(f"{opt.config}")
    model  = load_model_from_config(config, opt.ckpt)
    device = torch.device("cuda") if cuda_available else torch.device("cpu")
    model  = model.to(device)
    sync()
    timer.switch("encode")

    # ---- Stage: encode -----------------------------------------------------
    init_image = load_img(opt.init_img).to(device)
    # Replicate the single input image across the batch dimension.
    init_image = repeat(init_image, '1 ... -> b ...', b=opt.n_samples)

    with precision_scope("cuda"):
        init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))

    sampler = DDIMSampler(model)
    sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)

    if not opt.from_file:
        data = [opt.n_samples * [opt.prompt]]
    else:
        with open(opt.from_file, "r") as f:
            lines = f.read().splitlines()
        data = list(chunk(lines, opt.n_samples))

    sync()
    timer.switch("denoise")

    # ---- Stages: denoise / decode (alternating per batch) ------------------
    all_batches = []
    with torch.no_grad(), precision_scope("cuda"), model.ema_scope():
        for _ in trange(opt.n_iter, desc="Sampling"):
            for prompts in tqdm(data, desc="data"):
                # No unconditional branch is needed when guidance scale is 1.
                uc = None if opt.scale == 1.0 else model.get_learned_conditioning(opt.n_samples * [""])
                if isinstance(prompts, tuple):
                    prompts = list(prompts)
                c = model.get_learned_conditioning(prompts)

                z_enc = sampler.stochastic_encode(
                    init_latent,
                    torch.tensor([t_enc] * opt.n_samples).to(device)
                )
                samples = sampler.decode(
                    z_enc, c, t_enc,
                    unconditional_guidance_scale=opt.scale, unconditional_conditioning=uc
                )
                sync()
                timer.switch("decode")

                x_samples = model.decode_first_stage(samples)
                # Map from [-1, 1] to [0, 1] for saving.
                x_samples_vis = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)

                if not opt.skip_save:
                    # Continue numbering after the files already in the samples dir.
                    base_count = len(os.listdir(sample_path))
                    for x in x_samples_vis:
                        img = (255. * rearrange(x.cpu().numpy(), 'c h w -> h w c')).astype(np.uint8)
                        Image.fromarray(img).save(os.path.join(sample_path, f"{base_count:05}.png"))
                        base_count += 1

                all_batches.append(x_samples_vis)

                sync()
                # The next batch starts with conditioning, counted as denoise.
                timer.switch("denoise")

    # Grid assembly is accounted to the decode stage.
    timer.switch("decode")
    if not opt.skip_grid and all_batches:
        grid = torch.stack([x for batch in all_batches for x in batch], 0)
        grid = make_grid(grid, nrow=(opt.n_rows or opt.n_samples))
        grid = (255. * rearrange(grid, 'c h w -> h w c').cpu().numpy()).astype(np.uint8)
        Image.fromarray(grid).save(os.path.join(opt.outdir, 'grid.png'))

    sync()
    timer.stop()

    # Pick the next free "<n>.csv" name so earlier runs are not overwritten.
    existing = [f for f in os.listdir(opt.outdir) if f.endswith('.csv') and f[:-4].isdigit()]
    indices  = sorted(int(f[:-4]) for f in existing)
    next_idx = indices[-1] + 1 if indices else 0
    csv_path = os.path.join(opt.outdir, f"{next_idx}.csv")

    with open(csv_path, "w", newline="") as f:
        w = csv.writer(f)
        w.writerow(["stage","delay"])
        for stage in ("model_load", "encode", "denoise", "decode"):
            w.writerow([stage, f"{timer.metrics[stage]:.6f}"])

    print(f"[+] Latencies written to {csv_path}")
    print(f"Your samples are ready here: {opt.outdir}")

# Run the pipeline only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
