In [1]:
!pip install wandb

Collecting wandb
  Downloading wandb-0.15.5-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
  Preparing metadata (setup.py) ... [?25ldone
Collecting Click!=8.0.0,>=7.1
  Downloading click-8.1.4-py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting appdirs>=1.4.3
  Downloading appdirs-1.4.4-py2.py3-none-any.whl (9.6 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting GitPython!=3.1.29,>=1.0.0
  Downloading GitPython-3.1.32-py3-none-any.whl (188 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m188.5/188.5 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.28.0-py2

# Defining the Model Training

## Dataset & Autoencoder

In [1]:
import torch
from configs.deepsvg.hierarchical_ordered import Config
from deepsvg import utils

# Checkpoint file loaded into the autoencoder below (path is relative to the working directory).
pretrained_path = "./hierarchical_ordered.pth.tar"

# Use the first CUDA device when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

# Build the model described by the config, move it to `device`, then load the checkpoint into it.
cfg = Config()
vae_model = cfg.make_model()
vae_model = vae_model.to(device)
utils.load_model(pretrained_path, vae_model)

# Last expression of the cell: puts the model in eval mode and displays its module tree.
vae_model.eval()

SVGTransformer(
  (encoder): Encoder(
    (embedding): SVGEmbedding(
      (command_embed): Embedding(7, 256)
      (arg_embed): Embedding(257, 64)
      (embed_fcn): Linear(in_features=704, out_features=256, bias=True)
      (pos_encoding): PositionalEncodingLUT(
        (dropout): Dropout(p=0.1, inplace=False)
        (pos_embed): Embedding(32, 256)
      )
    )
    (encoder): TransformerEncoder(
      (layers): ModuleList(
        (0): TransformerEncoderLayerImproved(
          (self_attn): MultiheadAttention(
            (out_proj): Linear(in_features=256, out_features=256, bias=True)
          )
          (linear1): Linear(in_features=256, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=512, out_features=256, bias=True)
          (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
          (dropout1): Dropout(p=0.1, inplace=F

In [2]:
import torch
from deepsvg.utils.utils import batchify
from deepsvg.difflib.tensor import SVGTensor
from deepsvg.svglib.svg import SVG

def encode(data, model):
    """Encode one sample with `model` and return its latent.

    :param data: mapping that provides an entry for every key in ``cfg.model_args``.
    :param model: module called as ``model(*args, encode_mode=True)``.
    :return: the model output with its two leading axes squeezed (each only if it has size 1).
    """
    # Pick the model inputs in the order the config declares and batch them on `device`.
    selected_inputs = (data[name] for name in cfg.model_args)
    batched_inputs = batchify(selected_inputs, device)

    # Inference only: no autograd graph is needed for the latent.
    with torch.no_grad():
        latent = model(*batched_inputs, encode_mode=True)
        latent = latent.squeeze(dim=0)
        return latent.squeeze(dim=0)

def decode(z, model, do_display=True, return_svg=False, return_png=False):
    """Greedily decode a latent into an SVG and either return or draw it.

    :param z: latent passed to ``model.greedy_sample(z=z)``.
    :param model: module exposing ``greedy_sample``; only the first sample of its
        output batch is used.
    :param do_display: forwarded to ``SVG.draw``.
    :param return_svg: when true, return the SVG object instead of drawing it.
    :param return_png: forwarded to ``SVG.draw``.
    :return: the SVG object if ``return_svg`` is set, otherwise whatever ``draw`` returns.
    """
    # Bbox is not imported by any cell of this notebook, so the original call raised
    # NameError; import it locally to keep the function self-contained.
    from deepsvg.svglib.geom import Bbox

    commands_y, args_y = model.greedy_sample(z=z)
    tensor_pred = SVGTensor.from_cmd_args(commands_y[0].cpu(), args_y[0].cpu())
    svg_path_sample = (
        SVG.from_tensor(tensor_pred.data, viewbox=Bbox(256), allow_empty=True)
        .normalize()
        .split_paths()
        .set_color("random")
    )

    if return_svg:
        return svg_path_sample

    return svg_path_sample.draw(do_display=do_display, return_png=return_png)

In [3]:
from deepsvg.svgtensor_dataset import load_dataset
import numpy as np
from torch.utils.data import DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

# Load the dataset described by cfg; the loader helper below reads it via
# dataset.get(i, model_args=[...]) and len(dataset).
dataset = load_dataset(cfg) # the DeepSVG dataset as {'commands': [...], 'args': [...]}


def dataloader_with_transformed_dataset(batch_n: int, length: int = None,
                                        validation_split: float = 0.2,
                                        shuffle_dataset: bool = True,
                                        random_seed: int = 42):
    """Encode dataset samples with ``vae_model`` and wrap them in train/validation loaders.

    :param batch_n: batch size of both loaders.
    :param length: number of leading samples to encode; a falsy value (``None`` or 0)
        means the whole dataset. Values larger than the dataset are clamped.
    :param validation_split: fraction of the encoded samples held out for validation.
    :param shuffle_dataset: whether to permute the indices before splitting.
    :param random_seed: seed of the permutation, so the split is reproducible.
    :return: ``(train_loader, validation_loader)``; both use ``drop_last=True``, so a
        subset smaller than ``batch_n`` yields no batches at all.
    :raises ValueError: if ``validation_split`` is outside ``[0, 1]``.
    """
    if not 0.0 <= validation_split <= 1.0:
        raise ValueError(f"validation_split must be within [0, 1], got {validation_split}")

    available = len(dataset)
    # Clamp so that dataset.get is never asked for an index past the end.
    data_len = min(length, available) if length else available

    # Each entry is [latent, label]; the label is removed from the sample before encoding.
    encoded_dataset_with_labels = []
    for i in range(data_len):
        xy = dataset.get(i, model_args=['commands', 'args', 'label'])
        label = xy.pop('label')
        encoded_dataset_with_labels.append([encode(xy, vae_model), label])

    # Creating data indices for training and validation splits:
    dataset_size = len(encoded_dataset_with_labels)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset:
        # A private RandomState yields the same permutation as seeding the global
        # NumPy generator, without silently reseeding it for the rest of the notebook.
        np.random.RandomState(random_seed).shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]

    # Creating PT data samplers and loaders:
    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = DataLoader(encoded_dataset_with_labels, batch_size=batch_n,
                              sampler=train_sampler, drop_last=True)
    validation_loader = DataLoader(encoded_dataset_with_labels, batch_size=batch_n,
                                   sampler=valid_sampler, drop_last=True)

    return train_loader, validation_loader

In [4]:
def num_classes(dataloader):
    """Count the distinct label values yielded by `dataloader`.

    :param dataloader: iterable of ``(inputs, labels)`` pairs where ``labels`` is a tensor.
    :return: number of different label values seen across all batches (0 if there are none).

    NOTE(review): this is a count of distinct values, not ``max(label) + 1``. Callers in
    this notebook pass it on as the model's number of classes; confirm that the labels
    in the loaded subset are contiguous from 0 before relying on that.
    """
    all_classes = set()

    for _, y in dataloader:
        # Detach, move to host memory and flatten first, so labels on an accelerator or
        # with extra axes are handled the same way as plain 1-D CPU batches.
        all_classes.update(y.detach().cpu().reshape(-1).tolist())

    return len(all_classes)

In [18]:
# Loaders for the interactive cells below: batches of 100 built from at most the first 1000 samples.
train_dataloader, valid_dataloader = dataloader_with_transformed_dataset(batch_n=100, length=1000)

## Model

In [5]:
from dit.diffusion import create_diffusion
from svgfusion import DiT

def create_model(predict_xstart=True, dropout=0.1, n_classes=56, depth=28, learn_sigma=True, num_heads=16):
    """Instantiate a DiT network together with its diffusion process.

    :param predict_xstart: forwarded to ``create_diffusion``.
    :param dropout: passed to ``DiT`` as ``class_dropout_prob``.
    :param n_classes: passed to ``DiT`` as ``num_classes``.
    :param depth: forwarded to ``DiT``.
    :param learn_sigma: forwarded to ``DiT``.
    :param num_heads: forwarded to ``DiT``.
    :return: ``(model, diffusion)`` with the model on CUDA when available (CPU otherwise)
        and switched to training mode.
    """
    target_device = "cuda" if torch.cuda.is_available() else "cpu"

    dit_options = dict(
        class_dropout_prob=dropout,
        num_classes=n_classes,
        depth=depth,
        learn_sigma=learn_sigma,
        num_heads=num_heads,
    )
    model = DiT(**dit_options)
    model.to(target_device)

    # Empty respacing keeps the library default (per the original note: 1000 steps, linear noise schedule).
    diffusion = create_diffusion(timestep_respacing="", predict_xstart=predict_xstart)

    # Training mode is required here: per the original note it enables the embedding
    # dropout used for classifier-free guidance.
    model.train()

    return model, diffusion

In [30]:
from dit.diffusion.gaussian_diffusion import LossType, ModelVarType, ModelMeanType

def mean_flat(tensor):
    """
    Average `tensor` over every axis except the leading (batch) one.
    """
    non_batch_axes = list(range(1, len(tensor.shape)))
    return tensor.mean(dim=non_batch_axes)

def training_losses(diffusion, model, x_start, t, model_kwargs=None, noise=None):
    """
    Compute the training loss terms for one batch at the given timesteps.

    :param diffusion: diffusion object supplying q_sample, _vb_terms_bpd,
        q_posterior_mean_variance and the loss / mean / variance type settings.
    :param model: the model to evaluate the loss on.
    :param x_start: the [N x C x ...] tensor of inputs.
    :param t: a batch of timestep indices.
    :param model_kwargs: optional dict of extra keyword arguments passed to the
        model, e.g. for conditioning.
    :param noise: optional Gaussian noise to use instead of freshly sampled noise.
    :return: a dict with the key "loss" holding a tensor of shape [N]; depending
        on the settings it may also hold "mse" and "vb".
    :raises NotImplementedError: for a loss type other than KL / RESCALED_KL /
        MSE / RESCALED_MSE.
    """
    model_kwargs = {} if model_kwargs is None else model_kwargs
    noise = torch.randn_like(x_start) if noise is None else noise
    x_t = diffusion.q_sample(x_start, t, noise=noise)

    terms = {}
    loss_type = diffusion.loss_type

    # Pure variational-bound objectives.
    if loss_type in (LossType.KL, LossType.RESCALED_KL):
        terms["loss"] = diffusion._vb_terms_bpd(
            model=model,
            x_start=x_start,
            x_t=x_t,
            t=t,
            clip_denoised=False,
            model_kwargs=model_kwargs,
        )["output"]
        if loss_type == LossType.RESCALED_KL:
            terms["loss"] *= diffusion.num_timesteps
        return terms

    if loss_type not in (LossType.MSE, LossType.RESCALED_MSE):
        raise NotImplementedError(loss_type)

    # MSE objectives, optionally combined with a variational-bound term for the variance.
    model_output = model(x_t, t, **model_kwargs)

    if diffusion.model_var_type in (ModelVarType.LEARNED, ModelVarType.LEARNED_RANGE):
        batch, channels = x_t.shape[:2]
        assert model_output.shape == (batch, channels * 2, *x_t.shape[2:])
        model_output, model_var_values = torch.split(model_output, channels, dim=1)
        # The variance is learned through the variational bound; detaching the mean
        # half keeps that term from influencing the mean prediction.
        frozen_out = torch.cat([model_output.detach(), model_var_values], dim=1)
        terms["vb"] = diffusion._vb_terms_bpd(
            model=lambda *args, r=frozen_out: r,
            x_start=x_start,
            x_t=x_t,
            t=t,
            clip_denoised=False,
        )["output"]
        if loss_type == LossType.RESCALED_MSE:
            # Divide by 1000 for equivalence with the initial implementation;
            # without the 1/1000 factor the VB term hurts the MSE term.
            terms["vb"] *= diffusion.num_timesteps / 1000.0

    # Regression target for each mean parameterisation (all candidates are evaluated).
    posterior_mean = diffusion.q_posterior_mean_variance(x_start=x_start, x_t=x_t, t=t)[0]
    candidates = {
        ModelMeanType.PREVIOUS_X: posterior_mean,
        ModelMeanType.START_X: x_start,
        ModelMeanType.EPSILON: noise,
    }
    target = candidates[diffusion.model_mean_type]
    assert model_output.shape == target.shape == x_start.shape

    terms["mse"] = mean_flat((target - model_output) ** 2)
    terms["loss"] = terms["mse"] + terms["vb"] if "vb" in terms else terms["mse"]

    return terms

In [42]:
# Hyper-parameters for a quick interactive check of the model wiring.
config = {
        'optimizer': 'adam',
        'predict_xstart': True,
        'learn_sigma': True,
        # Spelled to match the `use_scheduler` parameter of the sweep configuration.
        'use_scheduler': True,
        'num_heads': 16,
        'depth': 28,
        'dropout': 0.1,
        'epochs': 1,
        'learning_rate': 0.001,
        'batch_size': 100,
}

model, diffusion = create_model(dropout=config['dropout'], predict_xstart=config['predict_xstart'],
                                    n_classes=num_classes(train_dataloader), depth=config['depth'],
                                    learn_sigma=config['learn_sigma'], num_heads=config['num_heads'])

# One batch and one random timestep per sample; `tmp` and `t` are reused by the next cells.
tmp = next(iter(train_dataloader))
t = torch.randint(0, diffusion.num_timesteps, (tmp[0].shape[0],), device=device)

# Forward pass only; the displayed value is the shape of the model output.
model(tmp[0].to(device), t, tmp[1].to(device)).shape

torch.Size([100, 2, 256])

In [43]:
# Evaluate the standalone training_losses helper on the batch `tmp` and timesteps `t`
# from the previous cell, passing the labels as the model's `y` keyword; shows the mean loss.
training_losses(diffusion, model, tmp[0].to(device), t, {'y': tmp[1].to(device)})["loss"].mean()

tensor(0.7577, device='cuda:0', grad_fn=<MeanBackward0>)

In [54]:
# Shape check: the batch as loaded, then after squeeze() followed by unsqueeze(dim=0).
print(tmp[0].shape)
print(tmp[0].squeeze().unsqueeze(dim=0).shape)

torch.Size([100, 1, 256])
torch.Size([1, 100, 256])


## Training

In [6]:
import wandb
import torch
from torch.optim import AdamW
from torch.optim.lr_scheduler import ReduceLROnPlateau

def train():
    """Train one DiT diffusion model on the encoded latents and log to Weights & Biases.

    Hyper-parameters come from ``wandb.config``: the defaults below are used for a
    plain call, and a sweep agent overrides them with the values of its run.
    Logs ``batch_loss`` after every optimisation step and ``loss``, ``epoch`` and
    ``learning_rate`` once per epoch.

    :raises ValueError: if the training loader yields no batches (batch size larger
        than the training subset, since incomplete batches are dropped).
    """
    config_defaults = {
            # NOTE: not read below; AdamW is always used.
            'optimizer': 'adam',
            'predict_xstart': True,
            'learn_sigma': True,
            # Must be spelled exactly like the sweep parameter `use_scheduler`; the
            # earlier misspelling meant the sweep's value was never read and the
            # scheduler was always on.
            'use_scheduler': True,
            'num_heads': 16,
            'depth': 28,
            'dropout': 0.1,
            'epochs': 100,
            'learning_rate': 0.001,
            'batch_size': 100,
    }
    wandb.init(config=config_defaults)
    config = wandb.config

    # Only the training split is consumed here; the validation loader is discarded.
    train_dataloader, _ = dataloader_with_transformed_dataset(batch_n=config.batch_size, length=1000)
    num_batches = len(train_dataloader)
    if num_batches == 0:
        raise ValueError("training loader is empty: batch_size exceeds the number of training samples")

    model, diffusion = create_model(dropout=config.dropout, predict_xstart=config.predict_xstart,
                                    n_classes=num_classes(train_dataloader), depth=config.depth,
                                    learn_sigma=config.learn_sigma, num_heads=config.num_heads)

    latent_scale = 0.7128  # mean of std's of latents
    device = "cuda" if torch.cuda.is_available() else "cpu"

    optimizer = AdamW(model.parameters(), lr=config.learning_rate, weight_decay=0)
    scheduler = ReduceLROnPlateau(optimizer, 'min', patience=5)

    for epoch in range(config.epochs):
        running_loss = 0.0
        for x, y in train_dataloader:
            x = x.to(device)
            y = y.to(device)

            # Force a (batch, 1, features) layout. Unlike squeeze().unsqueeze(dim=1),
            # this keeps the batch axis intact when the batch holds a single sample.
            x = x.reshape(x.shape[0], 1, -1)
            x = x / latent_scale

            model_kwargs = dict(y=y)

            # One uniformly sampled diffusion timestep per sample.
            t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=device)

            loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
            loss = loss_dict["loss"].mean()

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

            wandb.log({"batch_loss": loss.item()})

        epoch_loss = running_loss / num_batches
        # The plateau scheduler is driven by the mean training loss of the epoch.
        if config.use_scheduler:
            scheduler.step(epoch_loss)
        wandb.log({"loss": epoch_loss, "epoch": epoch, 'learning_rate': optimizer.param_groups[0]['lr']})

In [60]:
# One stand-alone run using the defaults inside train(), then close the W&B run.
train()
wandb.finish()

VBox(children=(Label(value='0.011 MB of 0.011 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668309217008452, max=1.0…

24
torch.Size([24, 1, 256])
torch.Size([24, 1, 256])
torch.Size([24, 1, 256])
(24, 2, 256)


AssertionError: 

# Defining the Sweep

## Config

In [7]:
import wandb

# Authenticate this session with Weights & Biases before creating the sweep.
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mhasanjbara[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [8]:
# Search space of the hyper-parameter sweep: random search minimising the logged
# per-epoch `loss`. Keys under 'parameters' become attributes of wandb.config.
sweep_config = {
    'method': 'random',
    'metric': {
        'name': 'loss',
        'goal': 'minimize'
    },
    'parameters': {
        'optimizer': {
            'values': ['adam']
        },
        'predict_xstart': {
            'value': True,
        },
        'learn_sigma':{
            'value': True, # [True, False]
        },
        'use_scheduler':{
            'values': [True, False],
        },
        'num_heads': {
            'values': [16, 32, 64, 128, 256]
        },
        'depth': {
            'distribution': 'int_uniform',
            'min': 28,
            'max': 100,
        },
        'dropout': {
              'values': [0.3, 0.4, 0.5]
        },
        'epochs': {
            'value': 100
        },
        'learning_rate': {
            # a flat (uniform) distribution between 0.0001 and 0.01
            'distribution': 'uniform',
            'min': 0.0001,
            'max': 0.01
        },
        'batch_size': {
            # log-uniformly distributed values between 16 and 128,
            # quantized to multiples of q=8
            'distribution': 'q_log_uniform_values',
            'q': 8,
            'min': 16,
            'max': 128,
        }
    },
}

## Sweep

In [9]:
import wandb

# Register the sweep under the "svgfusion-sweep" project; the returned id is what the agent needs.
sweep_id = wandb.sweep(sweep_config, project="svgfusion-sweep")

Create sweep with ID: 64ivnjiq
Sweep URL: https://wandb.ai/hasanjbara/svgfusion-sweep/sweeps/64ivnjiq


In [None]:
import wandb

# Start an agent for the sweep: it calls train() once per run, for at most 100 runs.
wandb.agent(sweep_id, train, count=100)

[34m[1mwandb[0m: Agent Starting Run: f9i5202s with config:
[34m[1mwandb[0m: 	batch_size: 72
[34m[1mwandb[0m: 	depth: 29
[34m[1mwandb[0m: 	dropout: 0.5
[34m[1mwandb[0m: 	epochs: 100
[34m[1mwandb[0m: 	learn_sigma: True
[34m[1mwandb[0m: 	learning_rate: 0.00105478693528865
[34m[1mwandb[0m: 	num_heads: 128
[34m[1mwandb[0m: 	optimizer: adam
[34m[1mwandb[0m: 	predict_xstart: True
[34m[1mwandb[0m: 	use_scheduler: True


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.016668392849775653, max=1.0…