# nb-sd-quickstart.ipynb - SD1.5/SDXL 基礎推論入門

## Goals（學習目標）
1. 建立 SD1.5/SDXL 基礎推論能力，掌握兩大主流 SD 模型的載入與生圖流程
2. 理解記憶體管理策略，學會 FP16、attention slicing、CPU offload 等 Low-VRAM 技術  
3. 掌握核心參數調優，理解 steps、CFG scale、seed 對圖像品質的影響
4. 建立可重現的實驗流程，配置外化、結果記錄、A/B 對照
5. 為後續進階學習奠基，熟悉 diffusers 生態與 HuggingFace Hub 整合

## Prerequisites（前置條件）
- **VRAM**: SD1.5 最低 4GB、SDXL 最低 6GB；推薦 8GB+
- **Storage**: ~10GB for models cache
- **Packages**: diffusers[torch], transformers, accelerate, xformers
- **Optional**: wandb (實驗記錄)；compel (prompt weighting) — 注意：本 notebook 的安裝與 import cell 會無條件安裝並匯入 compel


In [None]:
# %% [1] Shared Cache Bootstrap
import os, pathlib, torch
import sys
from datetime import datetime

# Shared cache configuration (copy this cell into every notebook).
# Every cache-related environment variable below points under one root directory.
AI_CACHE_ROOT = os.environ.get("AI_CACHE_ROOT", "../ai_warehouse/cache")

_hf_root = f"{AI_CACHE_ROOT}/hf"
_cache_env = {
    "HF_HOME": _hf_root,
    "TRANSFORMERS_CACHE": f"{_hf_root}/transformers",
    "HF_DATASETS_CACHE": f"{_hf_root}/datasets",
    "HUGGINGFACE_HUB_CACHE": f"{_hf_root}/hub",
    "TORCH_HOME": f"{AI_CACHE_ROOT}/torch",
}

# Export each variable and make sure the directory it names exists.
for _env_name, _cache_dir in _cache_env.items():
    os.environ[_env_name] = _cache_dir
    pathlib.Path(_cache_dir).mkdir(parents=True, exist_ok=True)

print("[Cache]", AI_CACHE_ROOT, "| GPU:", torch.cuda.is_available())

Cell 2: Package Installation & Imports

In [None]:
# Install required packages (run once)
import subprocess, sys


def install_if_missing(package, import_name=None):
    """Install *package* with pip unless its module can already be imported.

    Only importability is checked; an installed version that is older than
    the requirement's version floor is NOT upgraded.

    Args:
        package: A pip requirement string such as ``"diffusers[torch]>=0.30.0"``.
            It is passed to pip unchanged.
        import_name: Module name to probe. Defaults to the requirement's
            leading distribution name (extras, version specifiers and
            environment markers stripped, ``-`` replaced by ``_``). Pass it
            explicitly when the import name differs from the distribution name.

    Raises:
        subprocess.CalledProcessError: If the pip command exits non-zero.
    """
    import importlib
    import re

    if import_name is None:
        # Cut at the first character that starts extras ("["), a version
        # specifier ("<>=!~"), a marker (";"), a direct reference ("@") or
        # whitespace, so "transformers>=4.42" probes "transformers".
        dist_name = re.split(r"[\[<>=!~;@\s]", package.strip(), maxsplit=1)[0]
        import_name = dist_name.replace("-", "_")

    try:
        importlib.import_module(import_name)
    except ImportError:
        print(f"Installing {package}...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])


# Requirement strings; each one is handed to install_if_missing, which passes
# it verbatim to pip when an install is needed.
packages = [
    "diffusers[torch]>=0.30.0",
    "transformers>=4.42",
    "accelerate>=0.33",
    "xformers",  # For memory efficiency
    "compel",  # For prompt weighting
]

# Probe (and install if needed) every requirement, in list order.
for pkg in packages:
    install_if_missing(pkg)

print("✅ All packages ready!")

In [None]:
# Core imports
import torch
from diffusers import (
    StableDiffusionPipeline,
    StableDiffusionXLPipeline,
    DPMSolverMultistepScheduler,
    EulerAncestralDiscreteScheduler,
)
from compel import Compel
import time
from datetime import datetime
import json
from pathlib import Path
import matplotlib.pyplot as plt
from PIL import Image
import numpy as np

# Runtime configuration: half precision on CUDA, full precision on CPU.
if torch.cuda.is_available():
    DEVICE, DTYPE = "cuda", torch.float16
else:
    DEVICE, DTYPE = "cpu", torch.float32

# Only the literal value "true" (any letter case) switches smoke mode on.
SMOKE_MODE = os.environ.get("SMOKE_MODE", "false").lower() == "true"

print(f"Device: {DEVICE}, Dtype: {DTYPE}, Smoke Mode: {SMOKE_MODE}")

Cell 3: SD1.5 Pipeline Setup (Low-VRAM Optimized)

In [None]:
# SD1.5 Pipeline with memory optimizations
# NOTE(review): this Hub repo id is what create_sd15_pipeline passes to
# from_pretrained. The runwayml-hosted repo may no longer be available; a
# mirror is published as "stable-diffusion-v1-5/stable-diffusion-v1-5".
# TODO confirm the id still resolves before relying on it.
MODEL_ID_SD15 = "runwayml/stable-diffusion-v1-5"


def create_sd15_pipeline(enable_optimizations=True):
    """Build the SD1.5 text-to-image pipeline used by this notebook.

    Loads ``MODEL_ID_SD15`` in ``DTYPE`` (requesting the ``fp16`` weight
    variant when ``DTYPE`` is float16), optionally turns on memory
    optimizations, moves the pipeline to ``DEVICE`` and replaces its scheduler
    with a DPM-Solver multistep scheduler using Karras sigmas.

    Args:
        enable_optimizations: When true and ``DEVICE`` is ``"cuda"``, enable
            attention slicing, VAE slicing and (best effort) xFormers
            attention. Ignored on CPU.

    Returns:
        The configured ``StableDiffusionPipeline``.
    """
    pipe = StableDiffusionPipeline.from_pretrained(
        MODEL_ID_SD15,
        torch_dtype=DTYPE,
        use_safetensors=True,
        variant="fp16" if DTYPE == torch.float16 else None,
    )

    if enable_optimizations and DEVICE == "cuda":
        # Memory optimizations for low-VRAM GPUs
        pipe.enable_attention_slicing()
        pipe.enable_vae_slicing()

        # xFormers is an optional extra. Enabling it can fail for reasons other
        # than a missing package (e.g. an incompatible build), so any failure
        # is reported and the pipeline continues without it.
        try:
            pipe.enable_xformers_memory_efficient_attention()
        except Exception as exc:
            print(
                "⚠️  xFormers not available, using default attention "
                f"({type(exc).__name__}: {exc})"
            )
        else:
            print("✅ xFormers enabled")

    pipe = pipe.to(DEVICE)

    # Swap in the DPM-Solver multistep scheduler, reusing the loaded config
    pipe.scheduler = DPMSolverMultistepScheduler.from_config(
        pipe.scheduler.config, use_karras_sigmas=True
    )

    return pipe


# Create SD1.5 pipeline and measure how long loading takes (wall-clock seconds)
print("Loading SD1.5 pipeline...")
start_time = time.time()
sd15_pipe = create_sd15_pipeline()
load_time = time.time() - start_time
print(f"✅ SD1.5 loaded in {load_time:.2f}s")

# Check VRAM usage: torch's currently allocated CUDA bytes, shown in units of 1e9 bytes
if DEVICE == "cuda":
    memory_used = torch.cuda.memory_allocated() / 1e9
    print(f"📊 VRAM used: {memory_used:.2f}GB")

Cell 4: SD1.5 Basic Inference

In [None]:
# SD1.5 basic inference with parameter exploration
def generate_sd15(
    prompt,
    negative_prompt="",
    width=512,
    height=512,
    num_inference_steps=20,
    guidance_scale=7.5,
    seed=42,
    num_images=1,
):
    """Run one SD1.5 generation through the notebook-level ``sd15_pipe``.

    Args:
        prompt: Text prompt.
        negative_prompt: Negative text prompt.
        width: Output width passed to the pipeline.
        height: Output height passed to the pipeline.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Classifier-free guidance scale.
        seed: Seed for the ``torch.Generator`` handed to the pipeline.
        num_images: Forwarded as ``num_images_per_prompt``.

    Returns:
        A dict with ``"images"`` (the pipeline's ``images``) and
        ``"metadata"`` (the call settings plus ``inference_time`` in seconds).
    """
    # Seeded generator on the target device, so a given seed can be replayed
    rng = torch.Generator(device=DEVICE)
    rng.manual_seed(seed)

    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": width,
        "height": height,
        "num_inference_steps": num_inference_steps,
        "guidance_scale": guidance_scale,
        "generator": rng,
        "num_images_per_prompt": num_images,
        "return_dict": True,
    }

    # Time only the pipeline call itself
    started = time.time()
    with torch.no_grad():
        output = sd15_pipe(**call_kwargs)
    elapsed = time.time() - started

    metadata = {
        "model": "SD1.5",
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": width,
        "height": height,
        "steps": num_inference_steps,
        "cfg": guidance_scale,
        "seed": seed,
        "inference_time": elapsed,
    }
    return {"images": output.images, "metadata": metadata}


# Prompt pair for the first SD1.5 render
test_prompt = "a serene landscape with mountains and lake, golden hour lighting, photorealistic"
test_negative = "blurry, low quality, distorted"

print("🎨 Generating SD1.5 test image...")

# Smoke mode uses fewer steps so CI runs stay short
if SMOKE_MODE:
    steps = 5
else:
    steps = 20

result_sd15 = generate_sd15(
    test_prompt,
    test_negative,
    num_inference_steps=steps,
    guidance_scale=7.5,
    seed=42,
)

print(f"✅ Generated in {result_sd15['metadata']['inference_time']:.2f}s")

# Show the first image on a single frameless axes
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(result_sd15["images"][0])
ax.set_title(f"SD1.5 - Steps: {steps}, CFG: 7.5")
ax.set_axis_off()
fig.tight_layout()
plt.show()

# Dump the run metadata as indented JSON
print("📋 Generation metadata:")
print(json.dumps(result_sd15["metadata"], indent=2))

Cell 5: SDXL Pipeline Setup (Advanced Memory Management)

In [None]:
# SDXL Pipeline with aggressive memory optimizations
MODEL_ID_SDXL = "stabilityai/stable-diffusion-xl-base-1.0"


def create_sdxl_pipeline(enable_cpu_offload=False):
    """Build the SDXL base pipeline used by this notebook.

    Loads ``MODEL_ID_SDXL`` in ``DTYPE`` (requesting the ``fp16`` weight
    variant when ``DTYPE`` is float16). On CUDA it enables attention slicing,
    VAE slicing and (best effort) xFormers attention, then either enables
    sequential CPU offload or moves the pipeline to ``DEVICE``. On CPU the
    pipeline is left where ``from_pretrained`` put it. Finally the scheduler
    is replaced with an Euler-ancestral scheduler built from the loaded config.

    Args:
        enable_cpu_offload: On CUDA, use sequential CPU offload instead of
            moving the whole pipeline to the GPU. The caller chooses the VRAM
            threshold at which to request this. Ignored on CPU.

    Returns:
        The configured ``StableDiffusionXLPipeline``.
    """
    pipe = StableDiffusionXLPipeline.from_pretrained(
        MODEL_ID_SDXL,
        torch_dtype=DTYPE,
        use_safetensors=True,
        variant="fp16" if DTYPE == torch.float16 else None,
    )

    if DEVICE == "cuda":
        pipe.enable_attention_slicing()
        pipe.enable_vae_slicing()

        # xFormers is an optional extra. Enabling it can fail for reasons other
        # than a missing package (e.g. an incompatible build), so any failure
        # is reported and the pipeline continues without it.
        try:
            pipe.enable_xformers_memory_efficient_attention()
        except Exception as exc:
            print(f"⚠️  xFormers not available ({type(exc).__name__}: {exc})")
        else:
            print("✅ xFormers enabled for SDXL")

        if enable_cpu_offload:
            # Offload and an explicit .to(DEVICE) are alternatives here:
            # exactly one of the two branches runs.
            pipe.enable_sequential_cpu_offload()
            print("✅ CPU offload enabled")
        else:
            pipe = pipe.to(DEVICE)

    # Swap in the Euler-ancestral scheduler, reusing the loaded config
    pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

    return pipe


# CPU offload is requested only on CUDA when device 0 reports under 10 (x1e9 bytes) of total memory
use_cpu_offload = False
if DEVICE == "cuda":
    total_vram = torch.cuda.get_device_properties(0).total_memory / 1e9
    use_cpu_offload = total_vram < 10
    print(f"📊 Total VRAM: {total_vram:.1f}GB, CPU offload: {use_cpu_offload}")

# Build the SDXL pipeline and report the wall-clock load time
print("Loading SDXL pipeline...")
start_time = time.time()
sdxl_pipe = create_sdxl_pipeline(use_cpu_offload)
load_time = time.time() - start_time
print(f"✅ SDXL loaded in {load_time:.2f}s")

Cell 6: SDXL Basic Inference

In [None]:
# SDXL inference function
def generate_sdxl(
    prompt,
    negative_prompt="",
    width=1024,
    height=1024,
    num_inference_steps=20,
    guidance_scale=5.0,
    seed=42,
    num_images=1,
):
    """Run one SDXL generation through the notebook-level ``sdxl_pipe``.

    Args:
        prompt: Text prompt.
        negative_prompt: Negative text prompt.
        width: Output width passed to the pipeline.
        height: Output height passed to the pipeline.
        num_inference_steps: Number of denoising steps.
        guidance_scale: Classifier-free guidance scale (this function's
            default, 5.0, is lower than the SD1.5 helper's 7.5).
        seed: Seed for the ``torch.Generator`` handed to the pipeline.
        num_images: Forwarded as ``num_images_per_prompt``.

    Returns:
        A dict with ``"images"`` (the pipeline's ``images``) and
        ``"metadata"`` (the call settings plus ``inference_time`` in seconds).
    """
    # Seeded generator on the target device, so a given seed can be replayed
    rng = torch.Generator(device=DEVICE)
    rng.manual_seed(seed)

    call_kwargs = {
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": width,
        "height": height,
        "num_inference_steps": num_inference_steps,
        "guidance_scale": guidance_scale,
        "generator": rng,
        "num_images_per_prompt": num_images,
        "return_dict": True,
    }

    # Time only the pipeline call itself
    started = time.time()
    with torch.no_grad():
        output = sdxl_pipe(**call_kwargs)
    elapsed = time.time() - started

    metadata = {
        "model": "SDXL",
        "prompt": prompt,
        "negative_prompt": negative_prompt,
        "width": width,
        "height": height,
        "steps": num_inference_steps,
        "cfg": guidance_scale,
        "seed": seed,
        "inference_time": elapsed,
    }
    return {"images": output.images, "metadata": metadata}


# Render the same prompt pair with SDXL so the two models can be compared
print("🎨 Generating SDXL test image...")

# Smoke mode shrinks both the resolution and the step count
if SMOKE_MODE:
    width, height, steps = 512, 512, 5
else:
    width, height, steps = 1024, 1024, 20

result_sdxl = generate_sdxl(
    test_prompt,
    test_negative,
    width=width,
    height=height,
    num_inference_steps=steps,
    guidance_scale=5.0,
    seed=42,
)

print(f"✅ Generated in {result_sdxl['metadata']['inference_time']:.2f}s")

# Show the first image on a single frameless axes
fig, ax = plt.subplots(figsize=(8, 8))
ax.imshow(result_sdxl["images"][0])
ax.set_title(f"SDXL - {width}x{height}, Steps: {steps}, CFG: 5.0")
ax.set_axis_off()
fig.tight_layout()
plt.show()

# Dump the run metadata as indented JSON
print("📋 SDXL metadata:")
print(json.dumps(result_sdxl["metadata"], indent=2))

Cell 7: Side-by-Side Comparison

In [None]:
# Two-panel figure: SD1.5 on the left, SDXL on the right
fig, axes = plt.subplots(ncols=2, figsize=(16, 8))
left_ax, right_ax = axes

left_ax.imshow(result_sd15["images"][0])
left_ax.set_title(
    f"SD1.5 (512x512)\nSteps: {result_sd15['metadata']['steps']}, CFG: {result_sd15['metadata']['cfg']}"
)
left_ax.set_axis_off()

right_ax.imshow(result_sdxl["images"][0])
right_ax.set_title(
    f"SDXL ({result_sdxl['metadata']['width']}x{result_sdxl['metadata']['height']})\nSteps: {result_sdxl['metadata']['steps']}, CFG: {result_sdxl['metadata']['cfg']}"
)
right_ax.set_axis_off()

fig.tight_layout()
plt.show()

# Inference-time summary: absolute seconds per model, then SDXL time divided by SD1.5 time
print("\n📊 Performance Comparison:")
print(f"SD1.5:  {result_sd15['metadata']['inference_time']:.2f}s")
print(f"SDXL:   {result_sdxl['metadata']['inference_time']:.2f}s")
print(
    f"Ratio:  {result_sdxl['metadata']['inference_time'] / result_sd15['metadata']['inference_time']:.1f}x slower"
)

Cell 8: Parameter Exploration (Steps & CFG Scale)

In [None]:
# Parameter exploration: Steps vs Quality vs Speed
def parameter_exploration(
    pipe_func, model_name, base_prompt, step_values=None, cfg_values=None
):
    """Generate one image per (steps, CFG) combination and collect timings.

    Every call uses 512x512 resolution and the fixed seed 12345, so only the
    step count and guidance scale vary between results.

    Args:
        pipe_func: Generation function called with the keyword arguments
            ``prompt``, ``num_inference_steps``, ``guidance_scale``, ``width``,
            ``height`` and ``seed``. It must return a dict with ``"images"``
            (a sequence) and ``"metadata"`` containing ``"inference_time"``.
        model_name: Label used only in the progress messages.
        base_prompt: Prompt used for every combination.
        step_values: Step counts to try. Defaults to ``[5, 10]`` in smoke
            mode and ``[10, 15, 20, 28]`` otherwise.
        cfg_values: Guidance scales to try. Defaults to ``[3.5, 7.5]`` in
            smoke mode and ``[3.5, 5.0, 7.5, 10.0]`` otherwise.

    Returns:
        A dict keyed ``"steps_{steps}_cfg_{cfg}"`` whose values hold the first
        generated ``image``, the inference ``time``, and the ``steps`` / ``cfg``
        used, in iteration order (steps outer, CFG inner).
    """
    # SMOKE_MODE is only consulted for a grid the caller did not supply.
    if step_values is None:
        step_values = [5, 10] if SMOKE_MODE else [10, 15, 20, 28]
    if cfg_values is None:
        cfg_values = [3.5, 7.5] if SMOKE_MODE else [3.5, 5.0, 7.5, 10.0]
    # The CFG sequence is re-iterated for every step count; materialize it so
    # a one-shot iterable works too.
    cfg_values = list(cfg_values)

    results = {}

    for steps in step_values:
        for cfg in cfg_values:
            print(f"🔄 Testing {model_name}: steps={steps}, cfg={cfg}")

            result = pipe_func(
                prompt=base_prompt,
                num_inference_steps=steps,
                guidance_scale=cfg,
                width=512,
                height=512,  # Keep resolution consistent
                seed=12345,  # Fixed seed for comparison
            )

            results[f"steps_{steps}_cfg_{cfg}"] = {
                "image": result["images"][0],
                "time": result["metadata"]["inference_time"],
                "steps": steps,
                "cfg": cfg,
            }

    return results


# Explore SD1.5 parameters
exploration_prompt = (
    "a cyberpunk cityscape at night, neon lights, detailed architecture"
)

print("🔬 SD1.5 Parameter Exploration...")
sd15_exploration = parameter_exploration(generate_sd15, "SD1.5", exploration_prompt)

# Derive the grid axes from the results themselves instead of repeating the
# lists hard-coded inside parameter_exploration, so the two cannot drift apart.
step_values = sorted({entry["steps"] for entry in sd15_exploration.values()})
cfg_values = sorted({entry["cfg"] for entry in sd15_exploration.values()})
n_steps = len(step_values)
n_cfg = len(cfg_values)

# squeeze=False always returns a 2-D array of Axes, so axes[i][j] is valid for
# any grid shape, including a single row or a single column.
fig, axes = plt.subplots(n_steps, n_cfg, figsize=(16, 12), squeeze=False)

# Rows are step counts, columns are CFG values
for i, steps in enumerate(step_values):
    for j, cfg in enumerate(cfg_values):
        key = f"steps_{steps}_cfg_{cfg}"
        if key in sd15_exploration:
            result = sd15_exploration[key]
            axes[i][j].imshow(result["image"])
            axes[i][j].set_title(f"Steps: {steps}, CFG: {cfg}\n{result['time']:.1f}s")
            axes[i][j].axis("off")

plt.suptitle("SD1.5 Parameter Exploration", fontsize=16)
plt.tight_layout()
plt.show()

# Print timing analysis
print("\n⏱️  Timing Analysis:")
for key, result in sd15_exploration.items():
    print(f"{key}: {result['time']:.2f}s")

Cell 9: Save Results & Configuration

In [None]:
# Save results and configuration for reproducibility
from datetime import datetime
import json

# Per-run output directory: outputs/sd_quickstart/<YYYYmmdd_HHMMSS>
run_stamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = Path("outputs", "sd_quickstart", run_stamp)
output_dir.mkdir(parents=True, exist_ok=True)


# Save images with metadata
def save_result(result, filename_prefix):
    """Write a result's first image and its metadata into ``output_dir``.

    Args:
        result: Dict with ``"images"`` (only the first one is saved, via its
            ``save`` method) and ``"metadata"`` (a JSON-serializable mapping).
        filename_prefix: Base name; produces ``<prefix>.png`` and
            ``<prefix>_metadata.json``.

    Returns:
        Path of the saved image file.
    """
    first_image, run_metadata = result["images"][0], result["metadata"]

    # Image goes to <prefix>.png
    image_path = output_dir / f"{filename_prefix}.png"
    first_image.save(image_path)

    # Metadata goes next to it as indented JSON
    meta_path = output_dir / f"{filename_prefix}_metadata.json"
    with meta_path.open("w") as meta_file:
        json.dump(run_metadata, meta_file, indent=2)

    print(f"💾 Saved: {image_path}")
    return image_path


import importlib.util

# Save main results
sd15_path = save_result(result_sd15, "sd15_basic")
sdxl_path = save_result(result_sdxl, "sdxl_basic")

# Save exploration results (sample: the first four entries); skipped in smoke mode
if not SMOKE_MODE:
    for key, result in list(sd15_exploration.items())[:4]:
        save_path = output_dir / f"exploration_{key}.png"
        result["image"].save(save_path)

# The pipeline builders only turn the memory optimizations on when running on
# CUDA, so record what was actually requested rather than a constant True.
_on_cuda = DEVICE == "cuda"

# Save experiment configuration
config = {
    "experiment": "SD Quickstart Comparison",
    "timestamp": datetime.now().isoformat(),
    "models": {"sd15": MODEL_ID_SD15, "sdxl": MODEL_ID_SDXL},
    "device": DEVICE,
    "dtype": str(DTYPE),
    "optimizations": {
        "attention_slicing": _on_cuda,
        "vae_slicing": _on_cuda,
        # Best available signal: xFormers was attempted (CUDA) and the package
        # is installed. NOTE(review): the builders do not report whether
        # enabling it succeeded, so this can still over-report.
        "xformers": _on_cuda and importlib.util.find_spec("xformers") is not None,
        "cpu_offload": use_cpu_offload,
    },
    "test_prompt": test_prompt,
    "smoke_mode": SMOKE_MODE,
}

config_path = output_dir / "experiment_config.json"
with open(config_path, "w", encoding="utf-8") as f:
    json.dump(config, f, indent=2)

print(f"\n📁 Results saved to: {output_dir}")
print(f"📄 Config saved to: {config_path}")

Cell 10: Memory Cleanup

In [None]:
import gc


def cleanup_memory():
    """Drop the notebook-level pipelines and release cached GPU memory.

    Removes the global names ``sd15_pipe`` and ``sdxl_pipe`` (if present),
    runs the garbage collector and, on CUDA, empties torch's CUDA cache and
    prints the memory still allocated. Safe to call more than once.
    """
    # The pipelines are globals. A bare `del sd15_pipe` inside a function makes
    # the name local and raises UnboundLocalError, so remove the names from the
    # module namespace explicitly. pop(..., None) keeps a re-run harmless.
    for pipeline_name in ("sd15_pipe", "sdxl_pipe"):
        globals().pop(pipeline_name, None)

    # Force garbage collection so the dropped pipelines are actually freed
    gc.collect()

    if DEVICE == "cuda":
        torch.cuda.empty_cache()
        memory_after = torch.cuda.memory_allocated() / 1e9
        print(f"🧹 Memory after cleanup: {memory_after:.2f}GB")


# Release both pipelines; cells after this one can no longer rely on them being loaded.
cleanup_memory()
print("✅ Memory cleanup completed")


## Summary（完成摘要）

✅ **完成項目**:
- SD1.5/SDXL 基礎推論流程建立
- 記憶體優化策略實作（FP16、attention slicing、CPU offload）
- 核心參數探索（steps、CFG scale）
- 可重現實驗框架（seed、配置記錄）
- 效能對比分析

## Key Concepts（核心概念）

1. **記憶體管理**: attention_slicing、vae_slicing、cpu_offload 的適用情境
2. **參數調優**: Steps (推論品質 vs 速度)、CFG Scale (創意性 vs 一致性)
3. **模型差異**: SD1.5 (512px, 快速) vs SDXL (1024px, 高品質)
4. **可重現性**: Generator seeds、配置外化的重要性

## Common Pitfalls（常見問題）

1. **VRAM 不足**: 未啟用記憶體優化選項
2. **速度緩慢**: 過高的解析度或步數設定
3. **結果不穩定**: 未設定 seed 或清除快取問題
4. **版本衝突**: diffusers/transformers 版本不匹配

## Next Steps（下一步）

1. **Scheduler 探索**: 嘗試不同 scheduler 對生圖品質的影響
2. **Prompt Engineering**: 學習有效的 prompt 結構與技巧
3. **ControlNet 整合**: 為 Stage 2 的條件控制做準備
4. **批次推論**: 開發 CSV/JSON 驅動的批次生圖流程

Smoke Test（冒煙測試）

In [None]:
# Smoke test for CI/CD pipeline
def smoke_test():
    """Generate one tiny SD1.5 image and check its count and size.

    Returns:
        The result dict from ``generate_sd15`` (``"images"`` and ``"metadata"``).

    Raises:
        AssertionError: If the call did not yield exactly one 256x256 image.
    """
    print("🧪 Running smoke test...")

    # generate_sd15 reads the notebook-level `sd15_pipe`. The memory-cleanup
    # cell that comes before this one is meant to delete it, so rebuild the
    # pipeline here when it is gone.
    if "sd15_pipe" not in globals():
        globals()["sd15_pipe"] = create_sd15_pipeline()

    # Minimal settings: 2 steps at 256x256 keep the check fast
    quick_result = generate_sd15(
        prompt="a simple cat",
        num_inference_steps=2,
        guidance_scale=5.0,
        width=256,
        height=256,
        seed=999,
    )

    # Verify image was generated
    images = quick_result["images"]
    assert len(images) == 1, f"expected 1 image, got {len(images)}"
    assert images[0].size == (256, 256), f"expected 256x256, got {images[0].size}"

    print("✅ Smoke test passed!")
    return quick_result


if SMOKE_MODE:
    smoke_result = smoke_test()