In [2]:
%cd /home/jrottmay/ml-dev
import torch; torch.manual_seed(0)
import torch.nn as nn
import torch.nn.functional as F
import torch.utils
import torch.distributions
import torchvision

/home/jrottmay/ml-dev


In [3]:
from modules import *

In [4]:
# Select the compute device: 'cuda' when torch reports a usable GPU, else 'cpu'.
if torch.cuda.is_available():
    device = 'cuda'
    # Print the name of GPU 0 and its allocated / reserved memory,
    # each divided by 1024**3 (== 2**30) and rounded to one decimal.
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0) / 2**30, 1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0) / 2**30, 1), 'GB')
else:
    device = 'cpu'

Tesla V100-SXM2-16GB
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


# Data

In [6]:
# Data-loading parameters.
# batch_size is the mini-batch size passed to both MNIST DataLoaders below.
batch_size = 128

In [7]:
# Build the MNIST loaders for the training split (train=True, the torchvision
# default) and the held-out split (train=False). Both read from / download to
# '~/data', convert images with ToTensor, use the shared batch_size and shuffle.
train_data, test_data = (
    torch.utils.data.DataLoader(
        torchvision.datasets.MNIST(
            '~/data',
            train=use_train_split,
            transform=torchvision.transforms.ToTensor(),
            download=True,
        ),
        batch_size=batch_size,
        shuffle=True,
    )
    for use_train_split in (True, False)
)

# Model Definition

In [15]:
# VAE PARAMETERS
# Weight of the KL term: mini-batch size divided by the number of training
# samples. len(train_data) counts *batches*, and the last batch can be partial,
# so len(train_data) * batch_size over-counts the dataset; use the dataset
# length directly instead.
kld_weight = batch_size / len(train_data.dataset)

# DDPM PARAMETERS
activation = F.relu
use_labels = True        # when True the UNet below is built with num_classes=10
schedule = "cosine"      # selects the branch of the beta-schedule cell
schedule_low = 1e-4      # lower / upper bounds handed to the beta-schedule cell
schedule_high = 2e-2
num_timesteps = 100      # length of the beta array
num_res_blocks = 2       # NOTE(review): not referenced in the visible cells
loss_type = "l2"         # NOTE(review): not referenced in the visible cells

# EMA settings forwarded to GaussianDiffusion.
ema_decay = 0.9999
ema_update_rate = 1

# Optimisation / logging settings. learning_rate is used for the Adam optimiser;
# NOTE(review): the visible train(...) call hard-codes its own iterations,
# checkpoint_rate and log_rate instead of reading these - confirm intent.
learning_rate = 2e-4
iterations = 100000
checkpoint_rate = 10000
log_rate = 1000
num_samples = 10
classes = torch.arange(10)
project_name = "Image Space Diffusion"
log_dir = "./logs"
img_dir = "./img"
wandb_dir = "./tmp"
entity="jan-rottmayer"
run_name = "testing_2"
log_to_wandb = False
# Set these to checkpoint paths to resume; None means no checkpoint is given.
model_checkpoint = None #f"{log_dir}/Image Space Diffusion-testing-iteration-100000-model.pth"
optim_checkpoint = None #f"{log_dir}/Image Space Diffusion-testing-iteration-100000-optim.pth"

In [9]:
def _cosine_betas(timesteps, s=0.008, max_beta=0.999):
    """Return the cosine noise schedule as an array of `timesteps` betas.

    alpha_bar(t) = cos^2(((t / T) + s) / (1 + s) * pi / 2), and
    beta_t = 1 - alpha_bar(t) / alpha_bar(t - 1), clipped to `max_beta` so the
    final steps do not reach beta = 1.
    """
    t = np.arange(timesteps + 1) / timesteps
    alpha_bar = np.cos((t + s) / (1 + s) * np.pi / 2) ** 2
    return np.minimum(1 - alpha_bar[1:] / alpha_bar[:-1], max_beta)


# Build the per-timestep noise variances (betas) for the diffusion process.
# NOTE(review): `np` is not imported explicitly in the visible cells; it
# presumably arrives through `from modules import *` - confirm.
if schedule == "cosine":
    # Previously this branch called np.linspace, i.e. it produced a *linear*
    # schedule despite the "cosine" label. schedule_low / schedule_high are
    # not used by the cosine schedule.
    betas = _cosine_betas(num_timesteps)
else:
    # Linear schedule; the bounds are rescaled by 1000 / num_timesteps.
    betas = np.linspace(
        schedule_low * 1000 / num_timesteps,
        schedule_high * 1000 / num_timesteps,
        num_timesteps
    )

# VAE

In [17]:
# Instantiate the VAE from `modules`. The three positional arguments are
# interpreted by VAE's own signature, which is not visible in this notebook.
vae = VAE(
    1, 16, 100,
    norm="bn",
    dropout=0.2,
    channel_multipliers=(1, 2, 4),
    # NOTE(review): the keyword is spelled "resultions" (the UNet call in this
    # notebook uses "attention_resolutions"), and (1) is the int 1, not a
    # one-element tuple - verify both against the VAE signature.
    attention_resultions=(1),
)

# Adam over all VAE parameters, using the shared learning rate.
vae_optim = torch.optim.Adam(params=vae.parameters(), lr=learning_rate)

In [36]:
# Train the VAE on the MNIST training loader. The loss callback forwards the
# model and a batch (x, y) to the model's own `loss` method.
# NOTE(review): the recorded progress bar reports "Test Loss=nan"; test_data is
# never passed here, which is presumably why - confirm against train()'s signature.
train(
    vae,
    vae_optim,
    train_data,
    lambda m, x, y: m.loss(x, y),
    # run identification
    project_name="MNIST_VAE",
    run_name="test_01",
    # schedule: total steps, then how often to checkpoint and to log
    iterations=100000,
    checkpoint_rate=10000,
    log_rate=1000,
    # callback handed to train() as chkpt_callback
    chkpt_callback=visualize_mnist_sample,
)

  7%|▋         | 7332/100000 [03:14<39:09, 39.44it/s, Train Loss=0.0122, Test Loss=nan]

# DDPM

In [14]:
# Denoising network for 1-channel images. Class conditioning (10 classes) is
# enabled only when use_labels is set in the config cell.
model = UNet(
    img_channels=1,
    base_channels=32,
    channel_mults=(1, 2, 4),
    time_emb_dim=128 * 2,
    norm="gn",
    dropout=0.1,
    # Use the activation chosen in the config cell rather than a hard-coded
    # F.relu, so changing the config actually changes the model.
    activation=activation,
    attention_resolutions=(1,),
    num_classes=10 if use_labels else None,
    initial_pad=0,
)

# Wrap the network in the diffusion process: image size (28, 28), 1 channel,
# the beta schedule computed earlier, and the EMA settings from the config cell.
diffusion = GaussianDiffusion(
    model, (28, 28), 1,
    betas,
    ema_decay=ema_decay,
    ema_update_rate=ema_update_rate,
    ema_start=2000,
)