<a href="https://colab.research.google.com/github/PacktPublishing/Modern-Computer-Vision-with-PyTorch-2E/blob/main/Chapter16/Unet_Components_from_scratch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
try:
  from torch_snippets import *
except:
  %pip install -U diffusers torch_snippets lovely_tensors torchinfo

Summary of unet model used in "Implementing Diffusion from Scratch" section

In [4]:
from diffusers import UNet2DModel
from torchinfo import summary

net = UNet2DModel(
  sample_size=28, # the target image resolution
  in_channels=1, # the number of input channels, 3 for RGB images
  out_channels=1, # the number of output channels
  layers_per_block=1, # how many ResNet layers to use per UNet block block_out_channels=(32, 64, 128, 256), # Roughly matching our basic
  down_block_types=(
    "DownBlock2D", # a regular ResNet downsampling block "AttnDownBlock2D", # a ResNet downsampling block with spatial self-attention
    "AttnDownBlock2D",
    "AttnDownBlock2D",
    "AttnDownBlock2D",
  ),
  up_block_types=(
    "AttnUpBlock2D",
    "AttnUpBlock2D",
    "AttnUpBlock2D", # a ResNet upsampling block with spatial self- attention
    "UpBlock2D", # a regular ResNet upsampling block
  )
)

summary(net, depth=2)

Layer (type:depth-idx)                        Param #
UNet2DModel                                   --
├─Conv2d: 1-1                                 2,240
├─Timesteps: 1-2                              --
├─TimestepEmbedding: 1-3                      --
│    └─Linear: 2-1                            201,600
│    └─SiLU: 2-2                              --
│    └─Linear: 2-3                            803,712
├─ModuleList: 1-4                             --
│    └─DownBlock2D: 2-4                       1,557,248
│    └─AttnDownBlock2D: 2-5                   5,826,688
│    └─AttnDownBlock2D: 2-6                   13,557,152
│    └─AttnDownBlock2D: 2-7                   17,272,640
├─ModuleList: 1-5                             --
│    └─AttnUpBlock2D: 2-8                     59,838,912
│    └─AttnUpBlock2D: 2-9                     35,095,200
│    └─AttnUpBlock2D: 2-10                    15,870,400
│    └─UpBlock2D: 2-11                        3,818,304
├─UNetMidBlock2D: 1-6                  

Summary of unet model used in "Understanding Stable Diffusion" section, i.e., summary of pretrained model

In [5]:
from diffusers import StableDiffusionPipeline
model_id = "stabilityai/stable-diffusion-2"
pipe = StableDiffusionPipeline.from_pretrained(model_id)
pipe = pipe.to('cuda')

Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model_index.json:   0%|          | 0.00/537 [00:00<?, ?B/s]

Fetching 13 files:   0%|          | 0/13 [00:00<?, ?it/s]

text_encoder/config.json:   0%|          | 0.00/633 [00:00<?, ?B/s]

tokenizer/vocab.json:   0%|          | 0.00/1.06M [00:00<?, ?B/s]

tokenizer/tokenizer_config.json:   0%|          | 0.00/824 [00:00<?, ?B/s]

(…)ature_extractor/preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

scheduler/scheduler_config.json:   0%|          | 0.00/345 [00:00<?, ?B/s]

tokenizer/special_tokens_map.json:   0%|          | 0.00/460 [00:00<?, ?B/s]

unet/config.json:   0%|          | 0.00/909 [00:00<?, ?B/s]

tokenizer/merges.txt:   0%|          | 0.00/525k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.36G [00:00<?, ?B/s]

vae/config.json:   0%|          | 0.00/611 [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/3.46G [00:00<?, ?B/s]

diffusion_pytorch_model.safetensors:   0%|          | 0.00/335M [00:00<?, ?B/s]

Loading pipeline components...:   0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
from torch_snippets import *
from torch_snippets.trainer.hooks import print_module_ios_for

keep = {'CrossAttnDownBlock2D', 'CrossAttnUpBlock2D', 'UNetMidBlock2DCrossAttn', 'UpBlock2D', 'DownBlock2D'}

with print_module_ios_for(pipe.unet, print_only=keep, required_children='conv_in'):
  image = pipe("a dog sitting on the grass", num_inference_steps=1).images[0]


  0%|          | 0/1 [00:00<?, ?it/s]

══════════════════════════════════════════════════════════════════
Module Name: Conv2d. Child Name: conv_in
Input Kwargs: 

Input Args:
  1 - 🔦tensor[2, 4, 96, 96] n=73728 (0.3Mb) x∈[-3.869, 4.422] μ=0.004 σ=0.999 cuda:0 - ID:#f5c78a

Outputs: 
  1 - 🔦tensor[2, 320, 96, 96] n=5898240 (22Mb) x∈[-3.419, 3.250] μ=0.004 σ=0.350 cuda:0 - ID:#55229c

══════════════════════════════════════════════════════════════════
══════════════════════════════════════════════════════════════════
Module Name: CrossAttnDownBlock2D. Child Name: down_blocks.0
Input Kwargs: 
  hidden_states - 🔦tensor[2, 320, 96, 96] n=5898240 (22Mb) x∈[-3.419, 3.250] μ=0.004 σ=0.350 cuda:0 - ID:#55229c
  temb - 🔦tensor[2, 1280] n=2560 (10Kb) x∈[-2.014, 5.339] μ=-0.003 σ=0.256 cuda:0 - ID:#f0d6c5
  encoder_hidden_states - 🔦tensor[2, 77, 1024] n=157696 (0.6Mb) x∈[-6.571, 13.023] μ=-0.169 σ=1.032 cuda:0 - ID:#96f1c8
  attention_mask - NoneType
  cross_attention_kwargs - NoneType
  encoder_attention_mask - NoneType

Input Args:

O

In [9]:
summary(pipe.unet, depth=2)

Layer (type:depth-idx)                                            Param #
UNet2DConditionModel                                              --
├─Conv2d: 1-1                                                     11,840
├─Timesteps: 1-2                                                  --
├─TimestepEmbedding: 1-3                                          --
│    └─Linear: 2-1                                                410,880
│    └─SiLU: 2-2                                                  --
│    └─Linear: 2-3                                                1,639,680
├─ModuleList: 1-4                                                 --
│    └─CrossAttnDownBlock2D: 2-4                                  10,852,160
│    └─CrossAttnDownBlock2D: 2-5                                  37,473,920
│    └─CrossAttnDownBlock2D: 2-6                                  141,303,040
│    └─DownBlock2D: 2-7                                           62,277,120
├─ModuleList: 1-5                                