In [1]:
%load_ext autoreload
%autoreload 2

import torch
import clip
import wandb
import torchvision.transforms as transforms
import os
import random

from torch.utils.data import DataLoader, Subset

# Expose only the first GPU to this process when the machine has several
os.environ.update({"CUDA_VISIBLE_DEVICES": "0"})

# Custom modules
from handsoncv.datasets import generate_clip_metadata, TFflowersCLIPDataset
from handsoncv.models import UNet 
from handsoncv.utils import DDPM, set_seed, seed_worker
from handsoncv.training import train_diffusion

# Hardware & Paths
# The project root is resolved two levels above the notebook's working directory.
NOTEBOOK_DIR = os.getcwd()
PROJECT_ROOT = os.path.abspath(os.path.join(NOTEBOOK_DIR, "..", ".."))
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# Folders we frequently use across the experiments' notebooks.
# Every path is built with os.path.join so separators stay consistent on all platforms.
ROOT_PATH = os.path.join(PROJECT_ROOT, "Assignment-3")
ROOT_DATA = os.path.join(ROOT_PATH, "data")
DATA_DIR = os.path.join(ROOT_DATA, "cropped_flowers")
SAMPLE_DIR = os.path.join(ROOT_DATA, "05_images")
CSV_PATH = os.path.join(ROOT_DATA, "clip_embeddings_metadata.csv")

CHECKPOINTS_DIR = os.path.join(ROOT_PATH, "checkpoints")
os.makedirs(CHECKPOINTS_DIR, exist_ok=True)

# Reproducibility: SEED is the single source of truth for every seeding call,
# so changing it here changes the global seed as well (no duplicated literal).
SEED = 42
set_seed(SEED)

# Base Configuration Parameters
BATCH_SIZE = 128

cuda
Seeds set to 42 for reproducibility.


In [2]:
# Load CLIP (ViT-B/32) together with its preprocessing pipeline on the selected device
clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE)

# Build the metadata CSV only when it is not already on disk
metadata_ready = os.path.exists(CSV_PATH)
if not metadata_ready:
    print("Generating CLIP metadata...")
    generate_clip_metadata(
        DATA_DIR,
        CSV_PATH,
        clip_model,
        clip_preprocess,
        DEVICE,
    )

In [None]:
# Base transforms used by both training and validation data
base_t = [
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    # Rescale to [-1, 1]: (t - 0.5) / 0.5 == (t * 2) - 1.
    # Normalize is used instead of a Lambda wrapping a `lambda` because a lambda
    # cannot be pickled, which breaks DataLoader workers (num_workers > 0) on
    # platforms that start workers with "spawn" (Windows / macOS).
    # A one-element mean/std broadcasts over however many channels the image has.
    transforms.Normalize(mean=(0.5,), std=(0.5,)),
]

# Training: random horizontal flip (augmentation) followed by the base pipeline
train_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    *base_t,
])

# Validation: base pipeline only, no augmentation
val_transform = transforms.Compose(base_t)

# Cropped TF Flowers Data Loading
# A transform-free dataset instance is created only to read the total sample count
temp_ds = TFflowersCLIPDataset(CSV_PATH)
dataset_size = len(temp_ds)
indices = list(range(dataset_size))
split = int(0.95 * dataset_size)  # 95% train / 5% validation

# Shuffle indices once with a dedicated RNG seeded from SEED.
# Using the global `random` state would make the split depend on how much of that
# state earlier cells consumed, and re-running this cell would silently produce a
# different train/val partition.
random.Random(SEED).shuffle(indices)
train_indices, val_indices = indices[:split], indices[split:]

# Create a Generator object to pass to the DataLoaders
g = torch.Generator()
g.manual_seed(SEED)

# Two separate dataset instances so each split gets its own transform
train_ds = Subset(TFflowersCLIPDataset(CSV_PATH, transform=train_transform), train_indices)
val_ds = Subset(TFflowersCLIPDataset(CSV_PATH, transform=val_transform), val_indices)

# Create dataloaders.
# Both loaders pass worker_init_fn and generator so worker seeding is handled the
# same way for training and validation.
train_loader = DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
    drop_last=True,  # keep every training batch at exactly BATCH_SIZE samples
)
val_loader = DataLoader(
    val_ds,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=2,
    worker_init_fn=seed_worker,
    generator=g,
)

In [4]:
# ---- Training hyper-parameters ----
EPOCHS = 130
LEARNING_RATE = 1e-4
SUBSET_SIZE = len(train_ds) + len(val_ds)  # samples across both splits

# ---- Diffusion schedule and model dimensions ----
T = 400  # length of the beta schedule below; also the first UNet argument
IMG_CH = 3
# Side length is read from the last dimension of the first training sample's first element
sample_tensor = train_loader.dataset[0][0]
IMG_SIZE = sample_tensor.shape[-1]
BETAS = torch.linspace(0.0001, 0.02, T).to(DEVICE)
# OpenAI's CLIP exposes the embedding width as model.visual.output_dim
CLIP_EMBED_DIM = clip_model.visual.output_dim

# Re-seed right before building the model so weight initialisation is the same on every run
set_seed(SEED)

ddpm = DDPM(BETAS, DEVICE)
unet_options = dict(
    down_chs=(256, 256, 512),
    t_embed_dim=8,
    c_embed_dim=CLIP_EMBED_DIM,
)
model = UNet(T, IMG_CH, IMG_SIZE, **unet_options).to(DEVICE)
num_params = sum(p.numel() for p in model.parameters())
print("Num params: ", num_params)

optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
plateau_options = dict(
    patience=15,  # wait 15 epochs without improvement before lowering the LR
    factor=0.5,   # halve the LR rather than cutting it more aggressively
    min_lr=5e-5,  # never let the LR drop below 5e-5
)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, "min", **plateau_options)

# Channel count read from the UNet's second down block; logged as the embedding size
bottleneck_block = model.down2.model[-2]
BOTTLE_EMB_CHANNELS = bottleneck_block.model[0].out_channels

# Text prompts passed to train_diffusion for its evaluation step
text_list = [
    "A round white daisy with a yellow center",
    "An orange sunflower with a big brown center",
    "A deep red rose flower"
]

# Value forwarded as train_diffusion(drop_prob=...); named so it can be logged with the run.
# NOTE(review): presumably the conditioning-dropout probability - confirm in train_diffusion.
DROP_PROB = 0.1

# The run is used as a context manager so it is always closed, even when training
# raises or is interrupted; a trailing wandb.finish() would be skipped in that case
# and leave the run open.
with wandb.init(
    project="diffusion-model-assessment-v2",
    name="ddpm_unet_training",
    config={
        "architecture": "ddpm_unet",
        "strategy": "generative_modeling",
        "downsample_mode": "maxpool",
        "embedding_size": BOTTLE_EMB_CHANNELS,
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "epochs": EPOCHS,
        "optimizer_type": "Adam",
        "subset_size": SUBSET_SIZE,
        "seed": SEED,
        # Extra hyper-parameters that define the run but were not logged before
        "timesteps": T,
        "drop_prob": DROP_PROB,
    }
) as run:
    # Execute Training
    train_diffusion(
        model=model,
        ddpm=ddpm,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        epochs=EPOCHS,
        device=DEVICE,
        drop_prob=DROP_PROB,
        save_dir=CHECKPOINTS_DIR,
        sample_save_dir=SAMPLE_DIR,
        clip_model=clip_model,   # CLIP model used for evaluation
        clip_preprocess=clip_preprocess,  # CLIP preprocessing used for evaluation
        text_list=text_list,   # text prompts used for evaluation
        scheduler=scheduler
    )

Seeds set to 42 for reproducibility.
Num params:  44900355


[34m[1mwandb[0m: Currently logged in as: [33mguarino-vanessa-emanuela[0m ([33mhandsoncv-research[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch 0: Train Loss: 0.9444 | Val Loss: 0.4709
Saved samples to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/data/05_images/sample_ep00.png
Epoch 0: Val Loss: 0.4709 | CLIP Score: 0.1871
Saved and logged samples for epoch 0
--- Saved new best Val model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
--- Saved new best CLIP model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 1: Train Loss: 0.3162 | Val Loss: 0.2457
--- Saved new best Val model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 2: Train Loss: 0.2413 | Val Loss: 0.2179
--- Saved new best Val model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3/checkpoints ---
Epoch 3: Train Loss: 0.1974 | Val Loss: 0.1980
--- Saved new best Val model to /home/vanessa/Documents/repos/Applied-Hands-On-Computer-Vision/Assignment-3

0,1
clip_score,▁▂▃▂▃▃▄▄▅▆▆▆▆█▇▇▇▇▇▇█▇▇▇▇▇▇
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█
epoch_time_sec,█▁▁▇▁▁▁▁█▁▁▁▁▁█▁▁▁▁▁▁▁▁▁▁▁▇▁▇▁▇▁▇▁▁▁▁▇▁▇
learning_rate,████████████████████▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
peak_gpu_mem_mb,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,█▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▃▃▃▃▂▂▂▂▁▂▂▂▁▁▁▁▂▁▂▂▂▂▁▁▁▁▁▂▁▁▂▁▁▁▂▁▁▁▁

0,1
clip_score,0.2846
epoch,129.0
epoch_time_sec,14.93703
learning_rate,5e-05
peak_gpu_mem_mb,6112.83984
train_loss,0.07176
val_loss,0.06453
