In [None]:
%load_ext autoreload
%autoreload 2

import torch
import clip
from torch.utils.data import DataLoader, random_split
import os

# Expose only the first GPU to this process when running on a multi-GPU machine.
# NOTE(review): this runs after `import torch`; presumably it still takes effect
# because CUDA has not been initialised yet at this point - confirm.
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Custom modules
from handsoncv.datasets import generate_clip_metadata, TFflowersCLIPDataset
from handsoncv.models import UNet 
from handsoncv.utils import DDPM, set_seed, seed_worker
from handsoncv.training import train_diffusion

# Hardware & Paths
# PROJECT_ROOT is resolved two levels above the current working directory.
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, "..", ".."))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# Folders we frequently use across the experiments' notebooks.
# All paths are built with os.path.join so separators stay consistent.
ROOT_PATH = os.path.join(PROJECT_ROOT, "Assignment-3")
ROOT_DATA = os.path.join(ROOT_PATH, "data")
DATA_DIR = os.path.join(ROOT_DATA, "cropped_flowers")
SAMPLE_DIR = os.path.join(ROOT_DATA, "05_images")
CSV_PATH = os.path.join(ROOT_DATA, "clip_embeddings_metadata.csv")

CHECKPOINTS_DIR = os.path.join(ROOT_PATH, "checkpoints")
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

# Reproducibility: seed through the SEED constant (not a repeated literal) so
# that changing SEED here also changes the seed actually applied in this cell.
SEED = 42
set_seed(SEED)

cuda
Seeds set to 42 for reproducibility.


In [2]:
# Load CLIP (ViT-B/32) and make sure the metadata CSV at CSV_PATH exists.
clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE)

# The CSV is only generated when it is missing; an existing file is reused as-is.
metadata_cached = os.path.exists(CSV_PATH)
if not metadata_cached:
    print("Generating CLIP metadata...")
    generate_clip_metadata(
        DATA_DIR,
        CSV_PATH,
        clip_model,
        clip_preprocess,
        DEVICE,
    )

In [3]:
# Cropped TF Flowers Data Loading
full_dataset = TFflowersCLIPDataset(CSV_PATH, img_size=32)

# 95% / 5% train/validation split; the validation set takes the remainder so
# the two sizes always add up to len(full_dataset).
train_size = int(0.95 * len(full_dataset))
val_size = len(full_dataset) - train_size

# Split with a dedicated, explicitly seeded generator. Without it, random_split
# draws from the global RNG, whose state depends on everything executed since
# the last set_seed call, so the split could differ between runs.
split_generator = torch.Generator().manual_seed(SEED)
train_ds, val_ds = random_split(
    full_dataset,
    [train_size, val_size],
    generator=split_generator,
)

# Create a Generator object to pass to the DataLoaders
g = torch.Generator()
g.manual_seed(SEED)

train_loader = DataLoader(train_ds, batch_size=128, shuffle=True, num_workers=2, worker_init_fn=seed_worker, generator=g)
val_loader = DataLoader(val_ds, batch_size=128, shuffle=False, num_workers=2, worker_init_fn=seed_worker, generator=g)

In [4]:
# ---- Training hyper-parameters ----
EPOCHS = 100
LEARNING_RATE = 1e-4
SUBSET_SIZE = len(train_ds) + len(val_ds)

# ---- Diffusion / model dimensions ----
T = 400        # length of the beta schedule
IMG_CH = 3
# The spatial size is read from the last dimension of the first training sample
IMG_SIZE = train_loader.dataset[0][0].shape[-1]
# Linearly spaced beta schedule with T entries, moved to the training device
BETAS = torch.linspace(start=0.0001, end=0.02, steps=T).to(DEVICE)
# For OpenAI's CLIP, the embedding width is exposed as model.visual.output_dim
CLIP_EMBED_DIM = clip_model.visual.output_dim

# Re-seed right before building the model so that it is initialised the same way on every run
set_seed(SEED)

ddpm = DDPM(BETAS, DEVICE)
model = UNet(
    T,
    IMG_CH,
    IMG_SIZE,
    down_chs=(256, 256, 512),
    t_embed_dim=8,
    c_embed_dim=CLIP_EMBED_DIM,
).to(DEVICE)
print("Num params: ", sum(map(torch.Tensor.numel, model.parameters())))

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

# Text prompts to generate images for; passed to train_diffusion for evaluation
text_list = [
    "A round white daisy with a yellow center",
    "An orange sunflower with a big brown center",
    "A deep red rose flower",
]

# ---- Run training ----
train_diffusion(
    model=model,
    ddpm=ddpm,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    epochs=EPOCHS,
    drop_prob=0.1,
    device=DEVICE,
    save_dir=CHECKPOINTS_DIR,
    sample_save_dir=SAMPLE_DIR,
    clip_model=clip_model,  # CLIP model used for evaluation
    text_list=text_list,    # prompts used for evaluation
)

Seeds set to 42 for reproducibility.
Num params:  44900355
Epoch 0: Train Loss: 0.8968 | Val Loss: 0.5058
Saved samples to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/data/05_images/sample_ep00.png
--- Saved new best model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 1: Train Loss: 0.3230 | Val Loss: 0.2326
--- Saved new best model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 2: Train Loss: 0.2180 | Val Loss: 0.1923
--- Saved new best model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 3: Train Loss: 0.1891 | Val Loss: 0.1900
--- Saved new best model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 4: Train Loss: 0.1787 | Val Loss: 0.1885
--- Saved new best model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-